rdf-sak 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ require 'rdf/sak'
2
+ require 'xml-mixup'
3
+ require 'commander'
4
+
5
module RDF::SAK
  # Command-line interface to RDF::SAK, driven by the Commander gem.
  class CLI
    include XML::Mixup
    include Commander::Methods

    # bunch of data declarations etc we don't want to expose
    private

    # actual methods
    public

    # Initialize the command-line interface.
    #
    # configuration:
    #
    # directories: source, target, private
    # files (or file names): graph, rewrite_map, redirect_map, gone_map
    # URIs: base, aliases
    #
    # @param config [Hash] configuration data
    #   (NOTE(review): currently unused by this constructor)
    def initialize config: {}
    end

    # Vestigial entry point; hands control to Commander's +run!+.
    def run
      run!
    end
  end
end
@@ -0,0 +1,188 @@
1
+ require 'rdf/sak/version'
2
+ require 'set'
3
+ require 'descriptive_statistics'
4
+ require 'nokogiri'
5
+
6
# Single-pass collector of (X)HTML document statistics: character, word
# and block counts, tallies of structural element types, and the
# distribution of words per block. Markup is fed in either through a
# real SAX parse or by replaying SAX events from an existing DOM
# ({#pretend_sax}). Use {.scan}/{#scan} as the entry point.
class RDF::SAK::DocStats < Nokogiri::XML::SAX::Document
  private

  # quasi-blocks: only counted when they actually behave like blocks
  MAYBE = %i[dt dd li td th caption figcaption].freeze
  # elements whose text content is never counted
  SKIP = %i[html head title base link meta script].freeze
  BLOCKS = Set.new(%i[body p h1 h2 h3 h4 h5 h6 ul ol pre dl main header footer
    article section aside figure nav div noscript blockquote form hr
    table fieldset address] + MAYBE).freeze
  SECTIONS = Set.new(%i[body article section]).freeze
  IMAGES = Set.new(%i[img picture]).freeze
  VIDEOS = Set.new(%i[video]).freeze
  # frozen for consistency with the sibling constants above
  EMBEDS = Set.new(%i[embed object iframe]).freeze
  COUNTS = {
    sections: %i[body article section header footer nav aside],
    images: %i[img picture],
    videos: %i[video],
    embeds: %i[embed object iframe],
    tables: %i[table],
    lists: %i[ul ol dl],
    forms: %i[form],
    scripts: %i[script],
    sheets: %i[style],
  }.transform_values { |v| Set.new v }.freeze

  NODEXP = '/html:html/html:body[not(*)]|/html:html/html:body//*[not(*)]'.freeze
  XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
  XPATHNS = { html: XHTMLNS }.freeze

  # ok listen up fools here is the new html document stats algo:
  #
  # we want to count characters, words, blocks, and sections, as well
  # as gather stats on words per block (and probably blocks per
  # section).
  #
  # * we don't want to count blocks that only contain other blocks
  # * we don't want to count the text of sub-blocks in a superordinate
  #   block
  # * there are also quasi-blocks (MAYBE above: li, th/td, caption,
  #   etc.) that we may not ordinarily count, except if they
  #   themselves contain two or more adjacent blocks
  #
  # so: count a block only if it contains text and inline elements
  # (and only count the text and inline elements).

  # Replay SAX events for an already-parsed DOM subtree, so a DOM
  # input goes through the same accounting as a streamed parse.
  #
  # @param node [Nokogiri::XML::Node] document, element, text or CDATA
  def pretend_sax node
    case node.type
    when Nokogiri::XML::Node::DOCUMENT_NODE
      # a document gets start/end events wrapped around its children
      start_document
      node.children.each { |c| pretend_sax c }
      end_document
    when Nokogiri::XML::Node::ELEMENT_NODE
      # an element gets start/end element events wrapped around its
      # children
      prefix, uri = if ns = node.namespace
                      [ns.prefix, ns.href]
                    end
      # XXX bug fix: the original block read `[ns.prefix, ns.href]`,
      # replicating the element's own namespace for every declaration
      # in scope; each scope entry `n` must contribute its own pair
      nsdecls = node.namespace_scopes.map { |n| [n.prefix, n.href] }
      attrs = node.attribute_nodes.map do |a|
        an = a.name
        an = "#{a.namespace.prefix}:#{an}" if
          a.namespace and a.namespace.prefix
        [an, a.content]
      end
      start_element_namespace node.name, attrs, prefix, uri, nsdecls
      node.children.each { |c| pretend_sax c }
      end_element_namespace node.name, prefix, uri
    when Nokogiri::XML::Node::TEXT_NODE
      characters node.content
    when Nokogiri::XML::Node::CDATA_SECTION_NODE
      cdata_block node.content
    end
  end

  # Flush the accumulated text buffer into the counters if +name+ is a
  # block-level element and the buffer contains any words.
  def do_block name
    if BLOCKS.include? name.to_sym
      w = @text.strip.split
      t = w.join ' '

      unless w.empty?
        words = w.length
        @counts[:chars]  += t.length
        @counts[:words]  += words
        @counts[:blocks] += 1
        @wpb << words
        @stack << t
        @text = ''
      end
    end
  end

  # Discard any buffered text (used when leaving SKIP elements).
  def clear_text
    @text = ''
  end

  public

  # NOTE(review): these ivars are never assigned; the live tallies are
  # kept in the +@counts+ hash (see #counts) — presumably vestigial
  attr_reader :chars, :words, :blocks

  def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
    # only switch on for XHTML elements we actually count
    unless uri != XHTMLNS or SKIP.include? name.to_sym
      @on = true
      do_block name
    end
  end

  def end_element_namespace name, prefix = nil, uri = nil
    if uri == XHTMLNS
      SKIP.include?(name.to_sym) ? clear_text : do_block(name)
      COUNTS.each do |type, set|
        @counts[type] += 1 if set.include? name.to_sym
      end
      # body is counted as a section above; back it out
      @counts[:sections] -= 1 if name == 'body'
      @on = false if name == 'body'
    end
  end

  def characters string
    @text += string if @on
  end

  def cdata_block string
    characters string
  end

  # @return [Float] mean of words per block
  def mean
    @wpb.mean
  end

  # @return [Float] standard deviation of words per block
  def sd
    @wpb.standard_deviation
  end

  # @return [Array<Float>] min, quartiles and max of words per block
  def quartiles
    [0, 25, 50, 75, 100].map { |pct| @wpb.percentile(pct) }
  end

  # @return [Hash] a frozen copy of the raw tallies
  def counts
    @counts.dup.freeze
  end

  def initialize
    @on    = false
    @text  = ''
    @stack = [] # XXX i don't think we use this one
    @wpb   = [] # words-per-block samples
    @counts = %i[chars words blocks sections images videos embeds
      tables lists forms scripts sheets].map { |k| [k, 0] }.to_h
  end

  # Scan a document: a DOM node is replayed via {#pretend_sax},
  # anything else is handed to a real SAX parser.
  #
  # @param doc [Nokogiri::XML::Node, String, IO] the input document
  # @return [self]
  def scan doc
    if doc.is_a? Nokogiri::XML::Node
      pretend_sax doc
    else
      parser = Nokogiri::XML::SAX::Parser.new self
      parser.parse doc
    end

    self
  end

  # Convenience: construct an instance and scan in one call.
  def self.scan doc
    new.scan doc
  end

  # @return [Hash] summary statistics merged with the raw tallies
  def to_h
    { mean: mean, sd: sd, quartiles: quartiles }.merge counts
  end

  # NOTE(review): unimplemented stub; kept for interface stability
  def to_rdf uri: nil, subject: nil
  end
end
@@ -0,0 +1,772 @@
1
+ require 'rdf'
2
+ require 'rdf/sak/util'
3
+ require 'time'
4
+ require 'nokogiri'
5
+ require 'xml-mixup'
6
+
7
+ class RDF::SAK::Document
8
+ include XML::Mixup
9
+ include RDF::SAK::Util
10
+
11
+ private
12
+
13
+ XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
14
+ XPATHNS = { html: XHTMLNS }
15
+ XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze
16
+
17
+ # notice these are only RDFa attributes that take URIs
18
+ RDFA_ATTR = [:about, :resource, :typeof].freeze
19
+ LINK_ATTR = [:href, :src, :data, :action, :longdesc].freeze
20
+ LINK_XPATH = ('.//html:*[not(self::html:base)][%s]' %
21
+ (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a.to_s}" }.join('|')).freeze
22
+
23
+ OBJS = [:href, :src].freeze
24
+
25
+ # ancestor node always with (@property and not @content) and
26
+ # not @resource|@href|@src unless @rel|@rev
27
+ LITXP = ['(ancestor::*[@property][not(@content)]',
28
+ '[not(@resource|@href|@src) or @rel|@rev])[1]' ].join('').freeze
29
+ # note parentheses cause the index to be counted from the root
30
+
31
+ public
32
+
33
+ attr_reader :repo, :subject, :doc, :base, :prefixes
34
+
35
+ # Initialize a document context.
36
+ def initialize repo, doc, subject: nil, base: nil, resolve: nil,
37
+ prefixes: {}, transform: nil, scache: {}, ucache: {}
38
+ # coerce the document
39
+ doc = case doc
40
+ when Nokogiri::XML::Document then doc
41
+ when Nokogiri::XML::Node then Nokogiri::XML::Document.new << doc.dup
42
+ when String, IO, File, Pathname then Nokogiri.XML doc
43
+ else
44
+ raise ArgumentError, "Not sure what to do with #{doc.class}"
45
+ end
46
+
47
+ # we only try this if there is a subject defined, obvs
48
+ base ||= RDF::SAK::Util.canonical_uri repo, subject, rdf: false if subject
49
+
50
+ @repo = repo
51
+ @subject = subject
52
+ @doc = doc
53
+ @base = URI(base.to_s) if base # note this is a vanilla URI
54
+ @resolve = RDF::URI(resolve.to_s) if resolve # note this is an RDF::URI
55
+ @prefixes = prefixes
56
+ @transform = transform
57
+ @scache = scache
58
+ @ucache = ucache
59
+ end
60
+
61
+ def canonical_uuid uri, unique: true, published: false
62
+ RDF::SAK::Util.canonical_uuid @repo, uri, base: @base,
63
+ unique: unique, published: published, scache: @scache, ucache: @ucache
64
+ end
65
+
66
+ def canonical_uri subject,
67
+ unique: true, rdf: true, slugs: false, fragment: false
68
+ RDF::SAK::Util.canonical_uri @repo, subject, base: @base,
69
+ unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
70
+ end
71
+
72
+ def cmp_label a, b, labels: nil, supplant: true, reverse: false
73
+ RDF::SAK::Util.cmp_label @repo, a, b,
74
+ labels: labels, supplant: supplant, reverse: reverse
75
+ end
76
+
77
+ def asserted_types subject, type = nil
78
+ RDF::SAK::Util.asserted_types @repo, subject, type
79
+ end
80
+
81
+ def subjects_for predicate, object, entail: true, only: []
82
+ RDF::SAK::Util.subjects_for @repo, predicate, object,
83
+ entail: entail, only: only
84
+ end
85
+
86
+ def objects_for subject, predicate, entail: true, only: [], datatype: nil
87
+ RDF::SAK::Util.objects_for @repo, subject, predicate,
88
+ entail: entail, only: only, datatype: datatype
89
+ end
90
+
91
+ def struct_for subject, rev: false, only: [], uuids: false, canon: false
92
+ RDF::SAK::Util.struct_for @repo, subject,
93
+ rev: rev, only: only, uuids: uuids, canon: canon,
94
+ ucache: @ucache, scache: @scache
95
+ end
96
+
97
+ def label_for subject, candidates: nil, unique: true, type: nil,
98
+ lang: nil, desc: false, alt: false
99
+ RDF::SAK::Util.label_for @repo, subject, candidates: candidates,
100
+ unique: unique, type: type, lang: lang, desc: desc, alt: alt
101
+ end
102
+
103
+ def formats_for subject, predicate: RDF::Vocab::DC.format,
104
+ datatype: [RDF::XSD.token]
105
+ RDF::SAK::Util.formats_for @repo, subject,
106
+ predicate: predicate, datatype: datatype
107
+ end
108
+
109
+ def authors_for subject, unique: false, contrib: false
110
+ RDF::SAK::Util.authors_for @repo, subject, unique: unique, contrib: contrib
111
+ end
112
+
113
+ # proxy for context published
114
+ def published? subject = nil
115
+ return RDF::SAK::Util.published? @repo, subject, base: @base if subject
116
+ @published ||= RDF::SAK::Util.published? @repo, @subject, base: @base
117
+ end
118
+
119
+ def abbreviate term, prefixes: @prefixes,
120
+ vocab: nil, noop: true, sort: true
121
+ super term, prefixes: prefixes || {}, vocab: vocab, noop: noop, sort: sort
122
+ end
123
+
124
+ def base_for node = nil
125
+ node ||= @doc
126
+ doc = node.document
127
+ base = URI(@base.to_s)
128
+
129
+ return base unless doc.root
130
+
131
+ if doc.root.name.to_sym == :html
132
+ b = doc.at_xpath(
133
+ '(/html:html/html:head/html:base[@href])[1]/@href', XPATHNS
134
+ ).to_s.strip
135
+ b = URI(b)
136
+
137
+ base = b if b.absolute?
138
+ elsif b = doc.root.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
139
+ b = URI(b.to_s.strip)
140
+ base = b if b.absolute?
141
+ end
142
+
143
+
144
+ # warn({ orig_base: @base, resolve: resolve, base: base}.inspect)
145
+
146
+ # warn %i[scheme host port].map { |s| [s, base.send(s) == resolve.send(s)] }.to_h.inspect
147
+
148
+ # rewrite if aliased
149
+ if @resolve and resolve = URI(@resolve.to_s) and
150
+ %i[scheme host port].all? { |s| base.send(s) == resolve.send(s) }
151
+ tmp = base.dup
152
+ tmp.scheme = @base.scheme
153
+ tmp.host = @base.host
154
+ tmp.port = @base.port
155
+ base = tmp.normalize
156
+ end
157
+
158
+ base
159
+ end
160
+
161
+ def rewrite_links node = @doc, uuids: {}, uris: {}, &block
162
+ base = base_for node
163
+ if be = node.at_xpath('(/html:html/html:head/html:base[@href])[1]', XPATHNS)
164
+ be[:href] = base.to_s if base.to_s != be[:href]
165
+ end
166
+ count = 0
167
+ node.xpath(LINK_XPATH, XPATHNS).each do |elem|
168
+ LINK_ATTR.each do |attr|
169
+ attr = attr.to_s
170
+ next unless elem.has_attribute? attr
171
+
172
+ abs = base.merge uri_pp(elem[attr].strip) rescue nil
173
+ next unless abs
174
+
175
+ # bail out if this isn't http(s)
176
+ next if abs.scheme and !%w[http https].include? abs.scheme.downcase
177
+
178
+ # fix e.g. http->https
179
+ if abs.host == @base.host and abs.scheme != @base.scheme
180
+ tmp = @base.dup
181
+ tmp.path = abs.path
182
+ tmp.query = abs.query
183
+ tmp.fragment = abs.fragment
184
+ abs = tmp
185
+ end
186
+
187
+ # harvest path parameters
188
+ pp = split_pp abs, only: true
189
+
190
+ # coerce to rdf
191
+ abs = RDF::URI(abs.to_s)
192
+
193
+ # make an aliased copy we use to look up the uuid
194
+ aliased = if @resolve
195
+ tmp = abs.dup
196
+ tmp.scheme = @resolve.scheme
197
+ tmp.authority = @resolve.authority if @resolve.authority
198
+ tmp
199
+ else
200
+ abs
201
+ end
202
+
203
+ # warn "aliased #{abs} to #{aliased}" if @resolve
204
+
205
+
206
+ # round-trip to uuid and back if we can
207
+ if uuid = uris[abs] ||= canonical_uuid(aliased)
208
+ abs = uuids[uuid] ||= canonical_uri(uuid)
209
+ elsif cu = canonical_uri(abs)
210
+ # otherwise just find the canonical uri
211
+ abs = cu
212
+ end
213
+
214
+ # reinstate the path parameters
215
+ if !pp.empty? && split_pp(abs, only: true).empty?
216
+ abs = abs.dup
217
+ abs.path = ([abs.path] + pp).join(';')
218
+ end
219
+
220
+ elem[attr] = @base.route_to(abs.to_s).to_s
221
+ count += 1
222
+ end
223
+
224
+ block.call elem if block
225
+ end
226
+
227
+ count
228
+ end
229
+
230
+ # sponge the document for rdfa
231
+ def triples_for
232
+ end
233
+
234
+ def vocab_for node
235
+ if node[:vocab]
236
+ vocab = node[:vocab].strip
237
+ return nil if vocab == ''
238
+ return vocab
239
+ end
240
+ parent = node.parent
241
+ vocab_for parent if parent and parent.element?
242
+ end
243
+
244
+ def prefixes_for node, prefixes = {}
245
+ # start with namespaces
246
+ pfx = node.namespace_declarations.filter(&:prefix).map do |n|
247
+ [n.prefix.to_sym, n.href]
248
+ end.to_h
249
+
250
+ # then add @prefix overtop of the namespaces
251
+ if node[:prefix]
252
+ x = node[:prefix].strip.split(/\s+/)
253
+ a = []
254
+ b = []
255
+ x.each_index { |i| (i % 2 == 0 ? a : b).push x[i] }
256
+ a.map!(&:to_sym)
257
+ # if the size is uneven the values will be nil, so w drop em
258
+ pfx.merge! a.zip(b).to_h.reject { |_, v| v.nil? }
259
+ end
260
+
261
+ # since we're ascending the tree, input takes precedence
262
+ prefixes = pfx.merge prefixes
263
+
264
+ if node.parent and node.parent.element?
265
+ prefixes_for(node.parent, prefixes)
266
+ else
267
+ prefixes
268
+ end
269
+ end
270
+
271
+ # give us the rdf subject of the node itself
272
+ def subject_for node = nil, rdf: false, is_ancestor: false
273
+ node ||= @doc.root
274
+ raise 'Node must be an element' unless
275
+ node.is_a? Nokogiri::XML::Element
276
+
277
+ # first we check for an ancestor element with @property and no
278
+ # @content; if we find one then we reevaluate with that
279
+ # element as the starting point
280
+ if n = node.at_xpath(LITXP)
281
+ return subject_for n
282
+ end
283
+
284
+ # answer a bunch of helpful questions about this element
285
+ subject = nil
286
+ base = base_for node
287
+ parent = node.parent
288
+ ns_href = node.namespace.href if node.namespace
289
+ up_ok = %i{rel rev}.none? { |a| node[a] }
290
+ is_root = !parent or parent.document?
291
+ special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
292
+ (ns_href == XHTMLNS or /^(?:[^:]+:)?html$/xi === parent.name)
293
+
294
+ # if the node is being inspected as an ancestor to the
295
+ # original node, we have to check it backwards.
296
+ if is_ancestor
297
+ # ah right @resource gets special treatment
298
+ if subject = node[:resource]
299
+ subject.strip!
300
+ if m = /^\[(.*?)\]$/.match(subject)
301
+ end
302
+ else
303
+ OBJS.each do |attr|
304
+ if node[attr]
305
+ # merge with the root and return it
306
+ subject = base + node[attr]
307
+ break
308
+ end
309
+ end
310
+ end
311
+
312
+ return rdf ? RDF::URI(subject.to_s) : subject
313
+
314
+ # note if we are being called with is_ancestor, that means
315
+ # the original node (or indeed any of the nodes previously
316
+ # tested) have anything resembling a resource in them. this
317
+ # means @rel/@rev should be ignored, and we should keep
318
+ # looking for a subject.
319
+ end
320
+
321
+ if node[:about]
322
+
323
+ if m = /^_:(.*)$/.match(node[:about])
324
+ return RDF::Node(m[1])
325
+ end
326
+
327
+ # XXX resolve @about against potential curie
328
+ subject = base + node[:about]
329
+
330
+ elsif is_root
331
+ subject = base
332
+ elsif special
333
+ subject = subject_for parent
334
+ elsif node[:resource]
335
+ # XXX resolve @about against potential curie
336
+ subject = base + node[:resource]
337
+ elsif node[:href]
338
+ subject = base + node[:href]
339
+ elsif node[:src]
340
+ subject = base + node[:src]
341
+ elsif node[:typeof]
342
+ # bnode the typeof attr
343
+
344
+ # note we return bnodes irrespective of the rdf flag
345
+ return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
346
+ elsif node[:inlist]
347
+ # bnode the inlist attr
348
+ return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
349
+ elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
350
+ (is_ancestor && !up_ok)
351
+ # bnode the element
352
+ return RDF::Node('id-%016x' % node.pointer_id)
353
+ # elsif node[:id]
354
+ else
355
+ subject = subject_for parent, is_ancestor: true
356
+ end
357
+
358
+ rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)
359
+
360
+ end
361
+
362
+ # backlink structure
363
+ def generate_backlinks published: true, struct: nil,
364
+ ignore: nil, pattern: nil, terse: false
365
+ uri = canonical_uri(subject, rdf: false) || URI(uri_pp subject)
366
+ ignore = case ignore
367
+ when nil then Set.new
368
+ when Proc then ignore
369
+ when -> x { x.respond_to? :to_set } then ignore = ignore.to_set
370
+ else
371
+ raise 'ignore must be either a proc or amenable to a set'
372
+ end
373
+ nodes = {}
374
+ labels = {}
375
+ types = {}
376
+
377
+ if struct
378
+ struct.each do |p, subjects|
379
+ subjects.each do |s|
380
+ case ignore
381
+ when Proc then next if ignore.call s, p
382
+ when Set then next if ignore.include? s
383
+ end
384
+ preds = nodes[s] ||= Set.new
385
+ preds << p
386
+ types[s] ||= asserted_types s
387
+ labels[s] ||= label_for s
388
+ labels[p] ||= label_for p unless terse
389
+ end
390
+ end
391
+ else
392
+ @repo.query([nil, nil, subject]).each do |stmt|
393
+ s = stmt.subject
394
+ case ignore
395
+ when Proc then next if ignore.call stmt
396
+ when Set then next if ignore.include? s
397
+ end
398
+ preds = nodes[s] ||= Set.new
399
+ preds << (p = stmt.predicate)
400
+ types[s] ||= asserted_types s
401
+ labels[s] ||= label_for s
402
+ labels[p] ||= label_for p unless terse
403
+ end
404
+ end
405
+
406
+ # prune out nonmatching
407
+ nodes.select! { |k, _| pattern.match? k.to_s } if
408
+ pattern and pattern.is_a? Regexp
409
+
410
+ # prune out unpublished
411
+ nodes.select! { |k, _| published? k } if published
412
+
413
+ return if nodes.empty?
414
+
415
+ if terse
416
+ nodes.map do |rsrc, preds|
417
+ cu = canonical_uri(rsrc, rdf: false) or next
418
+ lab = labels[rsrc] || [nil, rsrc]
419
+ link = { nil => :link, rel: '', href: uri.route_to(cu),
420
+ rev: abbreviate(preds) }
421
+ link[:typeof] = abbreviate(types[rsrc]) if types[rsrc]
422
+ link[:title] = lab.last if lab.last
423
+ link
424
+ end.compact
425
+ else
426
+ li = nodes.sort do |a, b|
427
+ cmp_label a.first, b.first, labels: labels
428
+ end.map do |rsrc, preds|
429
+ cu = canonical_uri(rsrc, rdf: false) or next
430
+ lab = labels[rsrc] || [nil, rsrc]
431
+ lp = abbreviate(lab.first) if lab.first
432
+ ty = abbreviate(types[rsrc]) if types[rsrc]
433
+
434
+ { [{ [{ [lab[1].to_s] => :span, property: lp }] => :a, typeof: ty,
435
+ href: uri.route_to(cu), rev: abbreviate(preds) }] => :li }
436
+ end.compact
437
+
438
+ { [{ li => :ul }] => :nav }
439
+ end
440
+ end
441
+
442
+ # goofy twitter-specific metadata
443
+ def generate_twitter_meta
444
+ # get author
445
+ author = authors_for(subject, unique: true) or return
446
+
447
+ return unless author.is_a? RDF::Resource
448
+
449
+ # get author's twitter account
450
+ twitter = objects_for(author, RDF::Vocab::FOAF.account,
451
+ only: :resource).select { |t| t.to_s =~ /twitter\.com/
452
+ }.sort.first or return
453
+ twitter = URI(twitter.to_s).path.split(/\/+/)[1]
454
+ twitter = ?@ + twitter unless twitter.start_with? ?@
455
+
456
+ # get title
457
+ title = label_for(subject) or return
458
+
459
+ out = [
460
+ { nil => :meta, name: 'twitter:card', content: :summary },
461
+ { nil => :meta, name: 'twitter:site', content: twitter },
462
+ { nil => :meta, name: 'twitter:title', content: title[1].to_s }
463
+ ]
464
+
465
+ # get abstract
466
+ if desc = label_for(subject, desc: true)
467
+ out.push({ nil => :meta, name: 'twitter:description',
468
+ content: desc[1].to_s })
469
+ end
470
+
471
+ # get image (foaf:depiction)
472
+ img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
473
+ unless img.empty?
474
+ img = img[0].to_s
475
+ out.push({ nil => :meta, name: 'twitter:image', content: img })
476
+ out[0][:content] = :summary_large_image
477
+ end
478
+
479
+ # return the appropriate xml-mixup structure
480
+ out
481
+ end
482
+
483
+ def transform_xhtml published: true, titles: false
484
+ # before we do any more work make sure this is html
485
+ doc = @doc.dup 1
486
+ body = doc.at_xpath('//html:body[1]', XPATHNS) || doc.root
487
+
488
+ # eliminate comments
489
+ doc.xpath('//comment()[not(ancestor::html:script)]', XPATHNS).each do |c|
490
+ c.unlink
491
+ end
492
+
493
+ # initial stuff
494
+ struct = struct_for @subject, uuids: true, canon: true
495
+ rstruct = struct_for @subject, uuids: true, canon: true, rev: true
496
+ resources = {}
497
+ literals = {}
498
+ ufwd = {} # uuid => uri
499
+ urev = {} # uri => uuid
500
+ datatypes = Set.new
501
+ types = Set.new
502
+ authors = authors_for @subject
503
+ title = label_for @subject, candidates: struct
504
+ desc = label_for @subject, candidates: struct, desc: true
505
+
506
+ # warn struct
507
+
508
+ # rewrite content
509
+ title = title[1] if title
510
+ desc = desc[1] if desc
511
+
512
+ # `struct` and `rstruct` will contain all the links and
513
+ # metadata for forward and backward neighbours, respectively,
514
+ # which we need to mine (predicates, classes, datatypes) for
515
+ # prefixes among other things.
516
+
517
+ struct.each do |p, v|
518
+ v.each do |o|
519
+ if o.literal?
520
+ literals[o] ||= Set.new
521
+ literals[o].add p
522
+
523
+ # collect the datatype
524
+ datatypes.add o.datatype if o.has_datatype?
525
+ else
526
+ # normalize URIs
527
+ if o.to_s.start_with? 'urn:uuid:'
528
+ ufwd[o] ||= canonical_uri o
529
+ elsif cu = urev[o] || canonical_uuid(o)
530
+ o = urev[o] ||= cu
531
+ end
532
+
533
+ # collect the resource
534
+ resources[o] ||= Set.new
535
+ resources[o].add p
536
+
537
+ # add to type
538
+ types.add o if p == RDF::RDFV.type
539
+ end
540
+ end
541
+ end
542
+
543
+ urev.merge! ufwd.invert
544
+
545
+ labels = resources.keys.map do |k|
546
+ # turn this into a pair which subsequently gets turned into a hash
547
+ [k, label_for(k) ]
548
+ end.to_h
549
+
550
+ #warn labels
551
+
552
+ # handle the title
553
+ title ||= RDF::Literal('')
554
+ tm = { '#title' => title,
555
+ property: abbreviate(literals[title].to_a, vocab: XHV) }
556
+ if tl = title.language
557
+ tm['xml:lang'] = tl # if xmlns
558
+ tm['lang'] = tl
559
+ elsif tdt = title.datatype and tdt != RDF::XSD.string
560
+ tm[:datatype] = abbreviate(tdt)
561
+ end
562
+
563
+ # we accumulate a record of the links in the body so we know
564
+ # which ones to skip in the head
565
+ bodylinks = {}
566
+ rewrite_links body, uuids: ufwd, uris: urev do |elem|
567
+ vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
568
+ vocab = uri_pp(vocab.to_s) if vocab
569
+
570
+ if elem.key?('href') or elem.key?('src')
571
+ begin
572
+ vu = uri_pp(elem['href'] || elem['src'])
573
+ ru = RDF::URI(@base.merge(vu))
574
+ bodylinks[urev[ru] || ru] = true
575
+
576
+ if rel = resources[urev[ru] || ru]
577
+ elem['rel'] = (abbreviate rel, vocab: vocab).join ' '
578
+ end
579
+
580
+ label = labels[urev[ru] || ru]
581
+ if titles and label and
582
+ (!elem.key?('title') or elem['title'].strip == '')
583
+ elem['title'] = label[1].to_s
584
+ end
585
+ rescue URI::InvalidComponentError => e
586
+ warn "#{e}: #{vu} in #{@subject}"
587
+ end
588
+ end
589
+ end
590
+
591
+ # and now we do the head
592
+ links = []
593
+ resources.reject { |k, _| bodylinks[k] }.each do |k, v|
594
+ v = v.dup.delete RDF::RDFV.type
595
+ next if v.empty?
596
+ mts = formats_for k
597
+
598
+ # warn k, v.inspect
599
+
600
+ # warn k, mts.inspect
601
+
602
+ rel = abbreviate v.to_a, vocab: XHV
603
+ ru = @base.route_to(uri_pp (ufwd[k] || k).to_s)
604
+ ln = { nil => :link, rel: rel, href: ru.to_s }
605
+ if (label = labels[urev[k] || k])
606
+ ln[:title] = label[1].to_s
607
+ end
608
+
609
+ # add type=lol/wut
610
+ ln[:type] = mts.first.to_s unless mts.empty?
611
+
612
+ if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
613
+ ln[:type] = 'text/css'
614
+ elsif ln[:type] =~ /(java|ecma)script/i or
615
+ v.include?(RDF::Vocab::DC.requires)
616
+ ln[nil] = :script
617
+ ln[:src] = ln.delete :href
618
+ ln[:type] ||= 'text/javascript'
619
+ end
620
+ links.push ln
621
+ end
622
+
623
+ links.sort! do |a, b|
624
+ # sort by rel, then by href
625
+ # warn a.inspect, b.inspect
626
+ s = 0
627
+ [nil, :rel, :rev, :href, :title].each do |k|
628
+ s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
629
+ break if s != 0
630
+ end
631
+ s
632
+ end
633
+
634
+ # we want to duplicate links from particular subjects (eg the root)
635
+ (@duplicate || {}).sort do |a, b|
636
+ a.first <=> b.first
637
+ end.each do |s, preds|
638
+
639
+ o = {}
640
+ u = ufwd[s] ||= canonical_uuid s
641
+ s = urev[u] ||= canonical_uri u if u
642
+ f = {}
643
+
644
+ # do not include this subject as these links are already included!
645
+ next if u == @subject
646
+
647
+ # gather up the objects, then gather up the predicates
648
+
649
+ objects_for u || s, preds, only: :resource do |obj, rel|
650
+ # XXX do not know why += |= etc does not work
651
+ x = canonical_uuid(obj) || obj
652
+ urev[x] ||= canonical_uri x
653
+ y = o[x] ||= Set.new
654
+ o[x] = y | rel
655
+ f[x] = formats_for x
656
+ end
657
+
658
+ srel = @base.route_to((u ? urev[u] || s : s).to_s)
659
+
660
+ # now collect all the other predicates
661
+ o.keys.each do |obj|
662
+ hrel = @base.route_to((urev[obj] || obj).to_s)
663
+ o[obj] |= @repo.query([u || s, nil, obj]).predicates.to_set
664
+ rels = abbreviate o[obj].to_a, vocab: XHV
665
+ ln = { nil => :link, about: srel, rel: rels, href: hrel }
666
+ ln[:type] = f[obj].first if f[obj]
667
+
668
+ # add to links
669
+ links << ln
670
+ end
671
+ end
672
+
673
+ meta = []
674
+
675
+ # include author names as old school meta tags
676
+ authors.each do |a|
677
+ name = labels[urev[a] || a] or next
678
+ datatypes.add name[0] # a convenient place to chuck this
679
+ prop = abbreviate(name[0])
680
+ name = name[1]
681
+ about = @base.route_to((ufwd[a] || a).to_s)
682
+ tag = { nil => :meta, about: about.to_s, name: :author,
683
+ property: prop, content: name.to_s }
684
+
685
+ if name.has_datatype? and name.datatype != RDF::XSD.string
686
+ tag[:datatype] = abbreviate(name.datatype)
687
+ elsif name.has_language?
688
+ tag['xml:lang'] = tag[:lang] = name.language
689
+ end
690
+ meta.push tag
691
+ end
692
+
693
+ literals.each do |k, v|
694
+ next if k == title
695
+ rel = abbreviate v.to_a, vocab: XHV
696
+ elem = { nil => :meta, property: rel, content: k.to_s }
697
+ elem[:name] = :description if k == desc
698
+
699
+ if k.has_datatype?
700
+ datatypes.add k.datatype # so we get the prefix
701
+ elem[:datatype] = abbreviate k.datatype, vocab: XHV
702
+ end
703
+
704
+ meta.push(elem)
705
+ end
706
+
707
+ meta.sort! do |a, b|
708
+ s = 0
709
+ [:about, :property, :datatype, :content, :name].each do |k|
710
+ # warn a.inspect, b.inspect
711
+ s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
712
+ break if s != 0
713
+ end
714
+ s
715
+ end
716
+
717
+ # don't forget style tag
718
+ style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })
719
+
720
+ body = body.dup 1
721
+ body = { '#body' => body.children.to_a, about: '' }
722
+ body[:typeof] = abbreviate(types.to_a, vocab: XHV) unless
723
+ types.empty?
724
+
725
+
726
+
727
+ # prepare only the prefixes we need to resolve the data we need
728
+ rsc = abbreviate(
729
+ (struct.keys + resources.keys + datatypes.to_a +
730
+ types.to_a + rstruct.to_a.flatten).uniq, noop: false).map do |x|
731
+ next if x.nil?
732
+ x.split(?:)[0].to_sym
733
+ end.reject(&:nil?).to_set
734
+
735
+ # warn rsc
736
+
737
+ pfx = prefixes.select do |k, _|
738
+ rsc.include? k
739
+ end.transform_values { |v| v.to_s }
740
+
741
+ # XXX deal with the qb:Observation separately (just nuke it for now)
742
+ extra = generate_twitter_meta || []
743
+ bl_op = begin
744
+ bads = @repo.query(
745
+ [nil, RDF::SAK::CI.document, @subject]).subjects.to_set
746
+ nope = %w[top contents index].map { |x| RDF::Vocab::XHV[x] }
747
+ lambda { |s, p| bads.include? s or nope.include? p }
748
+ end
749
+ if bl = generate_backlinks(
750
+ published: published, pattern: /^urn:uuid:/, terse: true,
751
+ struct: rstruct, ignore: bl_op)
752
+ extra << bl #{ [bl] => :object }
753
+ end
754
+
755
+ # and now for the document
756
+ xf = @transform
757
+ doc = xhtml_stub(
758
+ base: @base, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
759
+ link: links, meta: meta, style: style, transform: xf,
760
+ extra: extra, body: body).document
761
+
762
+ # goddamn script tags and text/html
763
+ doc.xpath('//html:script[@src][not(node())]', XPATHNS).each do |script|
764
+ script << doc.create_text_node('')
765
+ end
766
+
767
+ doc
768
+ end
769
+
770
+
771
+
772
+ end