rdf-sak 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,35 @@
1
+ require 'rdf/sak'
2
+ require 'xml-mixup'
3
+ require 'commander'
4
+
5
module RDF::SAK
  # Command-line interface for RDF::SAK. Currently a skeleton: it
  # mixes in XML::Mixup and Commander but defines no commands yet.
  class CLI
    include XML::Mixup
    include Commander::Methods

    # bunch of data declarations etc we don't want to expose
    private

    # actual methods
    public

    # Construct a new CLI instance.
    #
    # @param config [Hash] configuration; expected to carry:
    #   directories: source, target, private;
    #   files (or file names): graph, rewrite_map, redirect_map, gone_map;
    #   URIs: base, aliases
    def initialize config: {}
    end

    # Vestigial entry point; just delegates to Commander's +run!+.
    def run
      run!
    end
  end
end
@@ -0,0 +1,188 @@
1
+ require 'rdf/sak/version'
2
+ require 'set'
3
+ require 'descriptive_statistics'
4
+ require 'nokogiri'
5
+
6
# Collects simple structure/readability statistics over an (X)HTML
# document: character, word, and block counts, a words-per-block
# distribution, and tallies of sections, images, videos, embeds,
# tables, lists, forms, scripts, and style sheets. It can be driven
# either by a real SAX parse or by walking an already-parsed Nokogiri
# tree (see #scan).
class RDF::SAK::DocStats < Nokogiri::XML::SAX::Document
  private

  # quasi-blocks: only counted as blocks when they contain prose
  MAYBE = %i[dt dd li td th caption figcaption]
  # elements whose text content we never count
  SKIP = %i[html head title base link meta script]
  # every element type treated as a text block
  BLOCKS = Set.new(%i[body p h1 h2 h3 h4 h5 h6 ul ol pre dl main header footer
    article section aside figure nav div noscript blockquote form hr
    table fieldset address] + MAYBE).freeze
  # NOTE(review): SECTIONS/IMAGES/VIDEOS/EMBEDS duplicate entries in
  # COUNTS and are not referenced below; kept for API compatibility.
  SECTIONS = Set.new(%i[body article section]).freeze
  IMAGES = Set.new(%i[img picture]).freeze
  VIDEOS = Set.new(%i[video]).freeze
  # frozen for consistency with the sibling constants
  EMBEDS = Set.new(%i[embed object iframe]).freeze
  # counter key => set of element names that increment it
  COUNTS = {
    sections: %i[body article section header footer nav aside],
    images: %i[img picture],
    videos: %i[video],
    embeds: %i[embed object iframe],
    tables: %i[table],
    lists: %i[ul ol dl],
    forms: %i[form],
    scripts: %i[script],
    sheets: %i[style],
  }.transform_values { |v| Set.new v }.freeze

  # XPath for leaf nodes under body (currently unreferenced here)
  NODEXP = '/html:html/html:body[not(*)]|/html:html/html:body//*[not(*)]'.freeze
  XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
  XPATHNS = { html: XHTMLNS }.freeze

  # ok listen up fools here is the new html document stats algo:

  # okay we want to count characters, words, blocks, and sections, as
  # well as gather stats on words per block (and probably blocks per section)

  # the problem is we don't want to count blocks that only contain other blocks

  # we also don't want to count the text of sub-blocks in a superordinate block

  # there are also quasi-blocks that we may not ordinarily count,
  # except if they themselves contain two or more adjacent
  # blocks. (examples: li, th/td, h1-6, caption/figcaption)

  # count the block only if it contains text and inline elements (and
  # only count the text and inline elements)

  # Walk an already-parsed Nokogiri tree and fire the same handler
  # methods a real SAX parse would, in document order.
  def pretend_sax node
    case node.type
    when Nokogiri::XML::Node::DOCUMENT_NODE
      # if node is a document run begin and end document and then run
      # for children
      start_document
      node.children.each { |c| pretend_sax c }
      end_document
    when Nokogiri::XML::Node::ELEMENT_NODE
      # if node is an element run begin and end element and run for children
      prefix, uri = if ns = node.namespace
                      [ns.prefix, ns.href]
                    end
      # XXX bug fix: this formerly read `[ns.prefix, ns.href]`, which
      # referenced the element's own namespace (possibly nil) instead
      # of each in-scope declaration `n`, so the declaration list was
      # wrong (or raised NoMethodError for un-namespaced elements).
      nsdecls = node.namespace_scopes.map { |n| [n.prefix, n.href] }
      attrs = node.attribute_nodes.map do |a|
        an = a.name
        an = "#{a.namespace.prefix}:#{an}" if
          a.namespace and a.namespace.prefix
        [an, a.content]
      end
      start_element_namespace node.name, attrs, prefix, uri, nsdecls
      node.children.each { |c| pretend_sax c }
      end_element_namespace node.name, prefix, uri
    when Nokogiri::XML::Node::TEXT_NODE
      characters node.content
    when Nokogiri::XML::Node::CDATA_SECTION_NODE
      cdata_block node.content
    end
  end

  # If +name+ is a block element and the accumulated text is
  # non-empty, fold the text into the character/word/block counters
  # and reset the accumulator.
  def do_block name
    if BLOCKS.include? name.to_sym
      w = @text.strip.split
      t = w.join ' '

      unless w.empty?
        words = w.length
        @counts[:chars] += t.length
        @counts[:words] += words
        @counts[:blocks] += 1
        @wpb << words
        @stack << t
        @text = ''
      end
    end
  end

  # Discard any accumulated text (used for SKIP elements).
  def clear_text
    @text = ''
  end

  public

  attr_reader :chars, :words, :blocks

  # SAX handler: begin collecting text once we are inside XHTML
  # content that is not a SKIP element.
  def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
    unless uri != XHTMLNS or SKIP.include? name.to_sym
      @on = true
      do_block name
    end
  end

  # SAX handler: close out the current block and bump the per-type
  # counters; body itself is not counted as a section.
  def end_element_namespace name, prefix = nil, uri = nil
    if uri == XHTMLNS
      SKIP.include?(name.to_sym) ? clear_text : do_block(name)
      COUNTS.each do |type, set|
        @counts[type] += 1 if set.include? name.to_sym
      end
      @counts[:sections] -= 1 if name == 'body'
      @on = false if name == 'body'
    end
  end

  # SAX handler: accumulate character data while inside countable content.
  def characters string
    @text += string if @on
  end

  # SAX handler: CDATA is treated the same as ordinary text.
  def cdata_block string
    characters string
  end

  # @return [Float] mean of words per block
  def mean
    @wpb.mean
  end

  # @return [Float] standard deviation of words per block
  def sd
    @wpb.standard_deviation
  end

  # @return [Array<Float>] min, quartiles, and max of words per block
  def quartiles
    [0, 25, 50, 75, 100].map { |pct| @wpb.percentile(pct) }
  end

  # @return [Hash] a frozen copy of the accumulated counters
  def counts
    @counts.dup.freeze
  end

  def initialize
    @on = false
    @text = ''
    @stack = [] # XXX i don't think we use this one
    @wpb = [] # words per block; stats come from descriptive_statistics
    @counts = %i[chars words blocks sections images videos embeds
      tables lists forms scripts sheets].map { |k| [k, 0] }.to_h
  end

  # Scan a document: a Nokogiri node is traversed in place, anything
  # else is handed to a real SAX parser. Returns self for chaining.
  def scan doc
    if doc.is_a? Nokogiri::XML::Node
      pretend_sax doc
    else
      parser = Nokogiri::XML::SAX::Parser.new self
      parser.parse doc
    end

    self
  end

  # Convenience: construct an instance and scan +doc+ in one call.
  def self.scan doc
    new.scan doc
  end

  # @return [Hash] summary statistics merged with the raw counters
  def to_h
    { mean: mean, sd: sd, quartiles: quartiles }.merge counts
  end

  # stub: intended to express the stats as RDF — TODO implement
  def to_rdf uri: nil, subject: nil
  end
end
@@ -0,0 +1,772 @@
1
+ require 'rdf'
2
+ require 'rdf/sak/util'
3
+ require 'time'
4
+ require 'nokogiri'
5
+ require 'xml-mixup'
6
+
7
# Wraps a Nokogiri (X)HTML document plus an RDF repository and
# provides RDFa-aware link rewriting, subject resolution, and a full
# document transform (#transform_xhtml). Most graph lookups delegate
# to RDF::SAK::Util with this instance's repo/base/caches bound.
class RDF::SAK::Document
  include XML::Mixup
  include RDF::SAK::Util

  private

  XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
  XPATHNS = { html: XHTMLNS }
  XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze

  # notice these are only RDFa attributes that take URIs
  RDFA_ATTR = [:about, :resource, :typeof].freeze
  LINK_ATTR = [:href, :src, :data, :action, :longdesc].freeze
  LINK_XPATH = ('.//html:*[not(self::html:base)][%s]' %
    (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a.to_s}" }.join('|')).freeze

  OBJS = [:href, :src].freeze

  # ancestor node always with (@property and not @content) and
  # not @resource|@href|@src unless @rel|@rev
  LITXP = ['(ancestor::*[@property][not(@content)]',
    '[not(@resource|@href|@src) or @rel|@rev])[1]' ].join('').freeze
  # note parentheses cause the index to be counted from the root

  public

  attr_reader :repo, :subject, :doc, :base, :prefixes

  # Initialize a document context.
  #
  # @param repo [RDF::Repository] the RDF graph to consult
  # @param doc [Nokogiri::XML::Document, Nokogiri::XML::Node, String,
  #   IO, File, Pathname] the document or something parseable into one
  # @param subject [RDF::Resource, nil] the document's subject
  # @param base [#to_s, nil] base URI; derived from subject when nil
  # @param resolve [#to_s, nil] aliased authority to resolve against
  # @param prefixes [Hash] prefix => vocab mappings for abbreviation
  # @param transform [Object, nil] transform handed to xhtml_stub
  # @param scache [Hash] subject cache shared with RDF::SAK::Util
  # @param ucache [Hash] uuid cache shared with RDF::SAK::Util
  # @raise [ArgumentError] if +doc+ cannot be coerced
  def initialize repo, doc, subject: nil, base: nil, resolve: nil,
      prefixes: {}, transform: nil, scache: {}, ucache: {}
    # coerce the document
    doc = case doc
          when Nokogiri::XML::Document then doc
          when Nokogiri::XML::Node then Nokogiri::XML::Document.new << doc.dup
          when String, IO, File, Pathname then Nokogiri.XML doc
          else
            raise ArgumentError, "Not sure what to do with #{doc.class}"
          end

    # we only try this if there is a subject defined, obvs
    base ||= RDF::SAK::Util.canonical_uri repo, subject, rdf: false if subject

    @repo = repo
    @subject = subject
    @doc = doc
    @base = URI(base.to_s) if base # note this is a vanilla URI
    @resolve = RDF::URI(resolve.to_s) if resolve # note this is an RDF::URI
    @prefixes = prefixes
    @transform = transform
    @scache = scache
    @ucache = ucache
  end

  # Resolve +uri+ to its canonical urn:uuid via the shared caches.
  def canonical_uuid uri, unique: true, published: false
    RDF::SAK::Util.canonical_uuid @repo, uri, base: @base,
      unique: unique, published: published, scache: @scache, ucache: @ucache
  end

  # Resolve +subject+ to its canonical (possibly slugged) URI.
  def canonical_uri subject,
      unique: true, rdf: true, slugs: false, fragment: false
    RDF::SAK::Util.canonical_uri @repo, subject, base: @base,
      unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
  end

  # Compare two resources by label (for sorting).
  def cmp_label a, b, labels: nil, supplant: true, reverse: false
    RDF::SAK::Util.cmp_label @repo, a, b,
      labels: labels, supplant: supplant, reverse: reverse
  end

  # rdf:type assertions for +subject+.
  def asserted_types subject, type = nil
    RDF::SAK::Util.asserted_types @repo, subject, type
  end

  # Subjects having +predicate+ pointing at +object+.
  def subjects_for predicate, object, entail: true, only: []
    RDF::SAK::Util.subjects_for @repo, predicate, object,
      entail: entail, only: only
  end

  # Objects of +predicate+ on +subject+.
  def objects_for subject, predicate, entail: true, only: [], datatype: nil
    RDF::SAK::Util.objects_for @repo, subject, predicate,
      entail: entail, only: only, datatype: datatype
  end

  # Predicate => objects structure for +subject+ (rev flips direction).
  def struct_for subject, rev: false, only: [], uuids: false, canon: false
    RDF::SAK::Util.struct_for @repo, subject,
      rev: rev, only: only, uuids: uuids, canon: canon,
      ucache: @ucache, scache: @scache
  end

  # Best label (or description when desc: true) for +subject+;
  # returns a [predicate, literal] pair per Util's convention.
  def label_for subject, candidates: nil, unique: true, type: nil,
      lang: nil, desc: false, alt: false
    RDF::SAK::Util.label_for @repo, subject, candidates: candidates,
      unique: unique, type: type, lang: lang, desc: desc, alt: alt
  end

  # Declared media types (dct:format by default) for +subject+.
  def formats_for subject, predicate: RDF::Vocab::DC.format,
      datatype: [RDF::XSD.token]
    RDF::SAK::Util.formats_for @repo, subject,
      predicate: predicate, datatype: datatype
  end

  # Authors (optionally contributors) for +subject+.
  def authors_for subject, unique: false, contrib: false
    RDF::SAK::Util.authors_for @repo, subject, unique: unique, contrib: contrib
  end

  # proxy for context published; memoizes only the no-argument case
  def published? subject = nil
    return RDF::SAK::Util.published? @repo, subject, base: @base if subject
    @published ||= RDF::SAK::Util.published? @repo, @subject, base: @base
  end

  # Abbreviate +term+ into CURIE form using this instance's prefixes
  # by default (delegates up to XML::Mixup's implementation).
  def abbreviate term, prefixes: @prefixes,
      vocab: nil, noop: true, sort: true
    super term, prefixes: prefixes || {}, vocab: vocab, noop: noop, sort: sort
  end

  # Compute the effective base URI for +node+ (or the whole document):
  # honours html:base / @xml:base when absolute, then un-aliases the
  # result back onto @base's authority when it matches @resolve.
  def base_for node = nil
    node ||= @doc
    doc = node.document
    base = URI(@base.to_s)

    return base unless doc.root

    if doc.root.name.to_sym == :html
      b = doc.at_xpath(
        '(/html:html/html:head/html:base[@href])[1]/@href', XPATHNS
      ).to_s.strip
      b = URI(b)

      base = b if b.absolute?
    elsif b = doc.root.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
      b = URI(b.to_s.strip)
      base = b if b.absolute?
    end


    # warn({ orig_base: @base, resolve: resolve, base: base}.inspect)

    # warn %i[scheme host port].map { |s| [s, base.send(s) == resolve.send(s)] }.to_h.inspect

    # rewrite if aliased
    if @resolve and resolve = URI(@resolve.to_s) and
        %i[scheme host port].all? { |s| base.send(s) == resolve.send(s) }
      tmp = base.dup
      tmp.scheme = @base.scheme
      tmp.host = @base.host
      tmp.port = @base.port
      base = tmp.normalize
    end

    base
  end

  # Rewrite every URI-bearing attribute under +node+ to a relative
  # reference routed from @base, round-tripping each target through
  # its uuid/canonical form where possible. Yields each matched
  # element to +block+ after rewriting. Returns the number of
  # attributes rewritten.
  # NOTE(review): uri_pp and split_pp are presumably supplied by
  # RDF::SAK::Util — not visible here, confirm their contracts there.
  def rewrite_links node = @doc, uuids: {}, uris: {}, &block
    base = base_for node
    if be = node.at_xpath('(/html:html/html:head/html:base[@href])[1]', XPATHNS)
      be[:href] = base.to_s if base.to_s != be[:href]
    end
    count = 0
    node.xpath(LINK_XPATH, XPATHNS).each do |elem|
      LINK_ATTR.each do |attr|
        attr = attr.to_s
        next unless elem.has_attribute? attr

        abs = base.merge uri_pp(elem[attr].strip) rescue nil
        next unless abs

        # bail out if this isn't http(s)
        next if abs.scheme and !%w[http https].include? abs.scheme.downcase

        # fix e.g. http->https
        if abs.host == @base.host and abs.scheme != @base.scheme
          tmp = @base.dup
          tmp.path = abs.path
          tmp.query = abs.query
          tmp.fragment = abs.fragment
          abs = tmp
        end

        # harvest path parameters
        pp = split_pp abs, only: true

        # coerce to rdf
        abs = RDF::URI(abs.to_s)

        # make an aliased copy we use to look up the uuid
        aliased = if @resolve
                    tmp = abs.dup
                    tmp.scheme = @resolve.scheme
                    tmp.authority = @resolve.authority if @resolve.authority
                    tmp
                  else
                    abs
                  end

        # warn "aliased #{abs} to #{aliased}" if @resolve


        # round-trip to uuid and back if we can
        if uuid = uris[abs] ||= canonical_uuid(aliased)
          abs = uuids[uuid] ||= canonical_uri(uuid)
        elsif cu = canonical_uri(abs)
          # otherwise just find the canonical uri
          abs = cu
        end

        # reinstate the path parameters
        if !pp.empty? && split_pp(abs, only: true).empty?
          abs = abs.dup
          abs.path = ([abs.path] + pp).join(';')
        end

        elem[attr] = @base.route_to(abs.to_s).to_s
        count += 1
      end

      block.call elem if block
    end

    count
  end

  # sponge the document for rdfa — stub, TODO implement
  def triples_for
  end

  # Resolve the in-scope RDFa @vocab for +node+ by walking up the
  # tree; an explicitly empty @vocab yields nil (vocab suppressed).
  def vocab_for node
    if node[:vocab]
      vocab = node[:vocab].strip
      return nil if vocab == ''
      return vocab
    end
    parent = node.parent
    vocab_for parent if parent and parent.element?
  end

  # Collect the in-scope prefix mappings for +node+: xmlns
  # declarations first, overlaid by RDFa @prefix, ascending to the
  # root (input/descendant mappings take precedence).
  def prefixes_for node, prefixes = {}
    # start with namespaces
    pfx = node.namespace_declarations.filter(&:prefix).map do |n|
      [n.prefix.to_sym, n.href]
    end.to_h

    # then add @prefix overtop of the namespaces
    if node[:prefix]
      x = node[:prefix].strip.split(/\s+/)
      a = []
      b = []
      x.each_index { |i| (i % 2 == 0 ? a : b).push x[i] }
      a.map!(&:to_sym)
      # if the size is uneven the values will be nil, so w drop em
      pfx.merge! a.zip(b).to_h.reject { |_, v| v.nil? }
    end

    # since we're ascending the tree, input takes precedence
    prefixes = pfx.merge prefixes

    if node.parent and node.parent.element?
      prefixes_for(node.parent, prefixes)
    else
      prefixes
    end
  end

  # give us the rdf subject of the node itself
  #
  # Follows (approximately) RDFa subject-resolution rules: literal
  # property ancestors, @about/@resource/@href/@src, the special
  # head/body elements, and bnodes for @typeof/@inlist, recursing to
  # the parent (with is_ancestor set) when nothing matches.
  def subject_for node = nil, rdf: false, is_ancestor: false
    node ||= @doc.root
    raise 'Node must be an element' unless
      node.is_a? Nokogiri::XML::Element

    # first we check for an ancestor element with @property and no
    # @content; if we find one then we reevaluate with that
    # element as the starting point
    if n = node.at_xpath(LITXP)
      return subject_for n
    end

    # answer a bunch of helpful questions about this element
    subject = nil
    base = base_for node
    parent = node.parent
    ns_href = node.namespace.href if node.namespace
    up_ok = %i{rel rev}.none? { |a| node[a] }
    is_root = !parent or parent.document?
    special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
      (ns_href == XHTMLNS or /^(?:[^:]+:)?html$/xi === parent.name)

    # if the node is being inspected as an ancestor to the
    # original node, we have to check it backwards.
    if is_ancestor
      # ah right @resource gets special treatment
      if subject = node[:resource]
        subject.strip!
        # NOTE(review): safe-CURIE match is captured but the branch
        # body is empty — handling looks unfinished; confirm intent
        if m = /^\[(.*?)\]$/.match(subject)
        end
      else
        OBJS.each do |attr|
          if node[attr]
            # merge with the root and return it
            subject = base + node[attr]
            break
          end
        end
      end

      return rdf ? RDF::URI(subject.to_s) : subject

      # note if we are being called with is_ancestor, that means
      # the original node (or indeed any of the nodes previously
      # tested) have anything resembling a resource in them. this
      # means @rel/@rev should be ignored, and we should keep
      # looking for a subject.
    end

    if node[:about]

      if m = /^_:(.*)$/.match(node[:about])
        return RDF::Node(m[1])
      end

      # XXX resolve @about against potential curie
      subject = base + node[:about]

    elsif is_root
      subject = base
    elsif special
      subject = subject_for parent
    elsif node[:resource]
      # XXX resolve @about against potential curie
      subject = base + node[:resource]
    elsif node[:href]
      subject = base + node[:href]
    elsif node[:src]
      subject = base + node[:src]
    elsif node[:typeof]
      # bnode the typeof attr

      # note we return bnodes irrespective of the rdf flag
      return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
    elsif node[:inlist]
      # bnode the inlist attr
      return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
    elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
        (is_ancestor && !up_ok)
      # bnode the element
      return RDF::Node('id-%016x' % node.pointer_id)
    # elsif node[:id]
    else
      subject = subject_for parent, is_ancestor: true
    end

    rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)

  end

  # backlink structure
  #
  # Build an xml-mixup structure of resources that point AT this
  # document's subject: either <link rev=…> stubs (terse: true) or a
  # <nav><ul> of labelled links. Sources come from +struct+ when
  # given, otherwise from a reverse query of the repo. +ignore+ may
  # be a Proc or anything convertible to a Set; +pattern+ filters
  # subjects by regexp; +published+ drops unpublished subjects.
  # Returns nil when nothing survives filtering.
  def generate_backlinks published: true, struct: nil,
      ignore: nil, pattern: nil, terse: false
    uri = canonical_uri(subject, rdf: false) || URI(uri_pp subject)
    ignore = case ignore
             when nil then Set.new
             when Proc then ignore
             when -> x { x.respond_to? :to_set } then ignore = ignore.to_set
             else
               raise 'ignore must be either a proc or amenable to a set'
             end
    nodes = {}
    labels = {}
    types = {}

    if struct
      struct.each do |p, subjects|
        subjects.each do |s|
          case ignore
          when Proc then next if ignore.call s, p
          when Set then next if ignore.include? s
          end
          preds = nodes[s] ||= Set.new
          preds << p
          types[s] ||= asserted_types s
          labels[s] ||= label_for s
          labels[p] ||= label_for p unless terse
        end
      end
    else
      # NOTE(review): the Proc arity differs between branches — here
      # it receives the whole statement, above it receives (s, p)
      @repo.query([nil, nil, subject]).each do |stmt|
        s = stmt.subject
        case ignore
        when Proc then next if ignore.call stmt
        when Set then next if ignore.include? s
        end
        preds = nodes[s] ||= Set.new
        preds << (p = stmt.predicate)
        types[s] ||= asserted_types s
        labels[s] ||= label_for s
        labels[p] ||= label_for p unless terse
      end
    end

    # prune out nonmatching
    nodes.select! { |k, _| pattern.match? k.to_s } if
      pattern and pattern.is_a? Regexp

    # prune out unpublished
    nodes.select! { |k, _| published? k } if published

    return if nodes.empty?

    if terse
      nodes.map do |rsrc, preds|
        cu = canonical_uri(rsrc, rdf: false) or next
        lab = labels[rsrc] || [nil, rsrc]
        link = { nil => :link, rel: '', href: uri.route_to(cu),
          rev: abbreviate(preds) }
        link[:typeof] = abbreviate(types[rsrc]) if types[rsrc]
        link[:title] = lab.last if lab.last
        link
      end.compact
    else
      li = nodes.sort do |a, b|
        cmp_label a.first, b.first, labels: labels
      end.map do |rsrc, preds|
        cu = canonical_uri(rsrc, rdf: false) or next
        lab = labels[rsrc] || [nil, rsrc]
        lp = abbreviate(lab.first) if lab.first
        ty = abbreviate(types[rsrc]) if types[rsrc]

        { [{ [{ [lab[1].to_s] => :span, property: lp }] => :a, typeof: ty,
          href: uri.route_to(cu), rev: abbreviate(preds) }] => :li }
      end.compact

      { [{ li => :ul }] => :nav }
    end
  end

  # goofy twitter-specific metadata
  #
  # Emit twitter:card/site/title/description/image <meta> stubs for
  # the subject, derived from the first author's twitter.com foaf
  # account, the subject's label/description, and foaf:depiction.
  # Returns nil when any required piece is missing.
  def generate_twitter_meta
    # get author
    author = authors_for(subject, unique: true) or return

    return unless author.is_a? RDF::Resource

    # get author's twitter account
    twitter = objects_for(author, RDF::Vocab::FOAF.account,
      only: :resource).select { |t| t.to_s =~ /twitter\.com/
    }.sort.first or return
    twitter = URI(twitter.to_s).path.split(/\/+/)[1]
    twitter = ?@ + twitter unless twitter.start_with? ?@

    # get title
    title = label_for(subject) or return

    out = [
      { nil => :meta, name: 'twitter:card', content: :summary },
      { nil => :meta, name: 'twitter:site', content: twitter },
      { nil => :meta, name: 'twitter:title', content: title[1].to_s }
    ]

    # get abstract
    if desc = label_for(subject, desc: true)
      out.push({ nil => :meta, name: 'twitter:description',
        content: desc[1].to_s })
    end

    # get image (foaf:depiction)
    img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
    unless img.empty?
      img = img[0].to_s
      out.push({ nil => :meta, name: 'twitter:image', content: img })
      out[0][:content] = :summary_large_image
    end

    # return the appropriate xml-mixup structure
    out
  end

  # Produce a fully regenerated XHTML document for the subject:
  # rewrites body links, mines the forward/reverse structs for head
  # <link>/<meta> metadata, prunes prefixes to those actually used,
  # attaches twitter meta and terse backlinks, and reassembles
  # everything through xhtml_stub.
  #
  # @param published [true, false] restrict backlinks to published
  # @param titles [true, false] copy labels into empty @title attrs
  # @return [Nokogiri::XML::Document] the transformed document
  def transform_xhtml published: true, titles: false
    # before we do any more work make sure this is html
    doc = @doc.dup 1
    body = doc.at_xpath('//html:body[1]', XPATHNS) || doc.root

    # eliminate comments
    doc.xpath('//comment()[not(ancestor::html:script)]', XPATHNS).each do |c|
      c.unlink
    end

    # initial stuff
    struct = struct_for @subject, uuids: true, canon: true
    rstruct = struct_for @subject, uuids: true, canon: true, rev: true
    resources = {}
    literals = {}
    ufwd = {} # uuid => uri
    urev = {} # uri => uuid
    datatypes = Set.new
    types = Set.new
    authors = authors_for @subject
    title = label_for @subject, candidates: struct
    desc = label_for @subject, candidates: struct, desc: true

    # warn struct

    # rewrite content
    title = title[1] if title
    desc = desc[1] if desc

    # `struct` and `rstruct` will contain all the links and
    # metadata for forward and backward neighbours, respectively,
    # which we need to mine (predicates, classes, datatypes) for
    # prefixes among other things.

    struct.each do |p, v|
      v.each do |o|
        if o.literal?
          literals[o] ||= Set.new
          literals[o].add p

          # collect the datatype
          datatypes.add o.datatype if o.has_datatype?
        else
          # normalize URIs
          if o.to_s.start_with? 'urn:uuid:'
            ufwd[o] ||= canonical_uri o
          elsif cu = urev[o] || canonical_uuid(o)
            o = urev[o] ||= cu
          end

          # collect the resource
          resources[o] ||= Set.new
          resources[o].add p

          # add to type
          types.add o if p == RDF::RDFV.type
        end
      end
    end

    urev.merge! ufwd.invert

    labels = resources.keys.map do |k|
      # turn this into a pair which subsequently gets turned into a hash
      [k, label_for(k) ]
    end.to_h

    #warn labels

    # handle the title
    # NOTE(review): when title falls back to the empty literal,
    # literals[title] may be nil here — confirm upstream guarantees
    title ||= RDF::Literal('')
    tm = { '#title' => title,
      property: abbreviate(literals[title].to_a, vocab: XHV) }
    if tl = title.language
      tm['xml:lang'] = tl # if xmlns
      tm['lang'] = tl
    elsif tdt = title.datatype and tdt != RDF::XSD.string
      tm[:datatype] = abbreviate(tdt)
    end

    # we accumulate a record of the links in the body so we know
    # which ones to skip in the head
    bodylinks = {}
    rewrite_links body, uuids: ufwd, uris: urev do |elem|
      vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
      vocab = uri_pp(vocab.to_s) if vocab

      if elem.key?('href') or elem.key?('src')
        begin
          vu = uri_pp(elem['href'] || elem['src'])
          ru = RDF::URI(@base.merge(vu))
          bodylinks[urev[ru] || ru] = true

          if rel = resources[urev[ru] || ru]
            elem['rel'] = (abbreviate rel, vocab: vocab).join ' '
          end

          label = labels[urev[ru] || ru]
          if titles and label and
              (!elem.key?('title') or elem['title'].strip == '')
            elem['title'] = label[1].to_s
          end
        rescue URI::InvalidComponentError => e
          warn "#{e}: #{vu} in #{@subject}"
        end
      end
    end

    # and now we do the head
    links = []
    resources.reject { |k, _| bodylinks[k] }.each do |k, v|
      # (Set#delete returns the set itself, so this drops rdf:type)
      v = v.dup.delete RDF::RDFV.type
      next if v.empty?
      mts = formats_for k

      # warn k, v.inspect

      # warn k, mts.inspect

      rel = abbreviate v.to_a, vocab: XHV
      ru = @base.route_to(uri_pp (ufwd[k] || k).to_s)
      ln = { nil => :link, rel: rel, href: ru.to_s }
      if (label = labels[urev[k] || k])
        ln[:title] = label[1].to_s
      end

      # add type=lol/wut
      ln[:type] = mts.first.to_s unless mts.empty?

      if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
        ln[:type] = 'text/css'
      elsif ln[:type] =~ /(java|ecma)script/i or
          v.include?(RDF::Vocab::DC.requires)
        ln[nil] = :script
        ln[:src] = ln.delete :href
        ln[:type] ||= 'text/javascript'
      end
      links.push ln
    end

    links.sort! do |a, b|
      # sort by rel, then by href
      # warn a.inspect, b.inspect
      s = 0
      [nil, :rel, :rev, :href, :title].each do |k|
        s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
        break if s != 0
      end
      s
    end

    # we want to duplicate links from particular subjects (eg the root)
    # NOTE(review): @duplicate is never assigned in this file —
    # presumably set by a subclass or external caller; confirm
    (@duplicate || {}).sort do |a, b|
      a.first <=> b.first
    end.each do |s, preds|

      o = {}
      u = ufwd[s] ||= canonical_uuid s
      s = urev[u] ||= canonical_uri u if u
      f = {}

      # do not include this subject as these links are already included!
      next if u == @subject

      # gather up the objects, then gather up the predicates

      objects_for u || s, preds, only: :resource do |obj, rel|
        # XXX do not know why += |= etc does not work
        x = canonical_uuid(obj) || obj
        urev[x] ||= canonical_uri x
        y = o[x] ||= Set.new
        o[x] = y | rel
        f[x] = formats_for x
      end

      srel = @base.route_to((u ? urev[u] || s : s).to_s)

      # now collect all the other predicates
      o.keys.each do |obj|
        hrel = @base.route_to((urev[obj] || obj).to_s)
        o[obj] |= @repo.query([u || s, nil, obj]).predicates.to_set
        rels = abbreviate o[obj].to_a, vocab: XHV
        ln = { nil => :link, about: srel, rel: rels, href: hrel }
        ln[:type] = f[obj].first if f[obj]

        # add to links
        links << ln
      end
    end

    meta = []

    # include author names as old school meta tags
    authors.each do |a|
      name = labels[urev[a] || a] or next
      datatypes.add name[0] # a convenient place to chuck this
      prop = abbreviate(name[0])
      name = name[1]
      about = @base.route_to((ufwd[a] || a).to_s)
      tag = { nil => :meta, about: about.to_s, name: :author,
        property: prop, content: name.to_s }

      if name.has_datatype? and name.datatype != RDF::XSD.string
        tag[:datatype] = abbreviate(name.datatype)
      elsif name.has_language?
        tag['xml:lang'] = tag[:lang] = name.language
      end
      meta.push tag
    end

    literals.each do |k, v|
      next if k == title
      rel = abbreviate v.to_a, vocab: XHV
      elem = { nil => :meta, property: rel, content: k.to_s }
      elem[:name] = :description if k == desc

      if k.has_datatype?
        datatypes.add k.datatype # so we get the prefix
        elem[:datatype] = abbreviate k.datatype, vocab: XHV
      end

      meta.push(elem)
    end

    meta.sort! do |a, b|
      s = 0
      [:about, :property, :datatype, :content, :name].each do |k|
        # warn a.inspect, b.inspect
        s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
        break if s != 0
      end
      s
    end

    # don't forget style tag
    style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })

    body = body.dup 1
    body = { '#body' => body.children.to_a, about: '' }
    body[:typeof] = abbreviate(types.to_a, vocab: XHV) unless
      types.empty?



    # prepare only the prefixes we need to resolve the data we need
    rsc = abbreviate(
      (struct.keys + resources.keys + datatypes.to_a +
        types.to_a + rstruct.to_a.flatten).uniq, noop: false).map do |x|
      next if x.nil?
      x.split(?:)[0].to_sym
    end.reject(&:nil?).to_set

    # warn rsc

    pfx = prefixes.select do |k, _|
      rsc.include? k
    end.transform_values { |v| v.to_s }

    # XXX deal with the qb:Observation separately (just nuke it for now)
    extra = generate_twitter_meta || []
    bl_op = begin
      bads = @repo.query(
        [nil, RDF::SAK::CI.document, @subject]).subjects.to_set
      nope = %w[top contents index].map { |x| RDF::Vocab::XHV[x] }
      lambda { |s, p| bads.include? s or nope.include? p }
    end
    if bl = generate_backlinks(
      published: published, pattern: /^urn:uuid:/, terse: true,
      struct: rstruct, ignore: bl_op)
      extra << bl #{ [bl] => :object }
    end

    # and now for the document
    xf = @transform
    doc = xhtml_stub(
      base: @base, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
      link: links, meta: meta, style: style, transform: xf,
      extra: extra, body: body).document

    # goddamn script tags and text/html
    doc.xpath('//html:script[@src][not(node())]', XPATHNS).each do |script|
      script << doc.create_text_node('')
    end

    doc
  end



end