rdf-sak 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +268 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/cleanup.xsl +14 -0
- data/example/matches.xhtml +11 -0
- data/example/transforms.ttl +58 -0
- data/lib/rdf-sak.rb +1 -0
- data/lib/rdf/sak.rb +2506 -0
- data/lib/rdf/sak/ci.rb +827 -0
- data/lib/rdf/sak/cli.rb +35 -0
- data/lib/rdf/sak/docstats.rb +188 -0
- data/lib/rdf/sak/document.rb +772 -0
- data/lib/rdf/sak/ibis.rb +248 -0
- data/lib/rdf/sak/mimemagic.rb +73 -0
- data/lib/rdf/sak/pav.rb +479 -0
- data/lib/rdf/sak/qb.rb +280 -0
- data/lib/rdf/sak/scovo.rb +51 -0
- data/lib/rdf/sak/tfo.rb +301 -0
- data/lib/rdf/sak/transform.rb +1172 -0
- data/lib/rdf/sak/urlrunner.rb +602 -0
- data/lib/rdf/sak/util.rb +2081 -0
- data/lib/rdf/sak/version.rb +5 -0
- data/rdf-sak.gemspec +60 -0
- metadata +366 -0
data/lib/rdf/sak/cli.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'rdf/sak'
|
2
|
+
require 'xml-mixup'
|
3
|
+
require 'commander'
|
4
|
+
|
5
|
+
module RDF::SAK
  # Skeleton of the command-line interface.
  #
  # Nothing is implemented yet beyond mixing in XML::Mixup and
  # Commander::Methods and a #run method that calls +run!+
  # (presumably supplied by Commander::Methods -- confirm).
  class CLI
    include XML::Mixup
    include Commander::Methods

    # Reserved for data declarations that should not be exposed;
    # nothing is declared here yet.
    private

    # Everything below is the public interface.
    public

    # Build the CLI object.
    #
    # The configuration is expected to describe:
    #
    # * directories: source, target, private
    # * files (or file names): graph, rewrite_map, redirect_map, gone_map
    # * URIs: base, aliases
    #
    # @param config [Hash] accepted but not used by the current body
    def initialize(config: {})
    end

    # Vestigial entry point; simply hands off to +run!+.
    def run
      run!
    end
  end
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
require 'rdf/sak/version'
|
2
|
+
require 'set'
|
3
|
+
require 'descriptive_statistics'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
class RDF::SAK::DocStats < Nokogiri::XML::SAX::Document
|
7
|
+
private
|
8
|
+
|
9
|
+
# Element names that are folded into BLOCKS below (presumably the
# "quasi-blocks" discussed in the comment that follows -- confirm).
MAYBE = %i[dt dd li td th caption figcaption].freeze

# Element names whose start is ignored and whose accumulated text is
# discarded when they end (see start/end_element_namespace).
SKIP = %i[html head title base link meta script].freeze

# Element names whose text is tallied as a block by do_block.
BLOCKS = Set.new(%i[body p h1 h2 h3 h4 h5 h6 ul ol pre dl main header footer
                    article section aside figure nav div noscript blockquote
                    form hr table fieldset address] + MAYBE).freeze

SECTIONS = Set.new(%i[body article section]).freeze
IMAGES   = Set.new(%i[img picture]).freeze
VIDEOS   = Set.new(%i[video]).freeze
EMBEDS   = Set.new(%i[embed object iframe]).freeze

# Maps each counter name to the set of element names that increment it
# when the element ends.
COUNTS = {
  sections: %i[body article section header footer nav aside],
  images:   %i[img picture],
  videos:   %i[video],
  embeds:   %i[embed object iframe],
  tables:   %i[table],
  lists:    %i[ul ol dl],
  forms:    %i[form],
  scripts:  %i[script],
  sheets:   %i[style],
}.transform_values { |v| Set.new(v).freeze }.freeze

# XPath selecting leaf elements inside the XHTML body.
NODEXP  = '/html:html/html:body[not(*)]|/html:html/html:body//*[not(*)]'.freeze
XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
XPATHNS = { html: XHTMLNS }.freeze
|
34
|
+
|
35
|
+
# ok listen up fools here is the new html document stats algo:
|
36
|
+
|
37
|
+
# okay we want to count characters, words, blocks, and sections, as
|
38
|
+
# well as gather stats on words per block (and probably blocks per section)
|
39
|
+
|
40
|
+
# the problem is we don't want to count blocks that only contain other blocks
|
41
|
+
|
42
|
+
# we also don't want to count the text of sub-blocks in a superordinate block
|
43
|
+
|
44
|
+
# there are also quasi-blocks that we may not ordinarily count,
|
45
|
+
# except if they themselves contain two or more adjacent
|
46
|
+
# blocks. (examples: li, th/td, h1-6, caption/figcaption)
|
47
|
+
|
48
|
+
# count the block only if it contains text and inline elements (and
|
49
|
+
# only count the text and inline elements)
|
50
|
+
|
51
|
+
# if
|
52
|
+
|
53
|
+
# we can also
|
54
|
+
|
55
|
+
# use xpath to find all the leaf node elements
|
56
|
+
#
|
57
|
+
|
58
|
+
# Replay an already-parsed Nokogiri node (and its subtree) as SAX
# callbacks on this handler, so a DOM can be scanned the same way a
# stream would be.
#
# Documents fire start_document/end_document around their children;
# elements fire start_element_namespace/end_element_namespace around
# theirs; text and CDATA nodes fire characters/cdata_block. Any other
# node type is ignored.
#
# @param node [Nokogiri::XML::Node] the node to walk
def pretend_sax(node)
  case node.type
  when Nokogiri::XML::Node::DOCUMENT_NODE
    start_document
    node.children.each { |c| pretend_sax c }
    end_document
  when Nokogiri::XML::Node::ELEMENT_NODE
    # the element's own namespace, if it has one
    own = node.namespace
    prefix, uri = own ? [own.prefix, own.href] : [nil, nil]

    # every namespace in scope, as [prefix, href] pairs; the block must
    # use its own parameter here, not the element's namespace
    scopes = node.namespace_scopes.map { |n| [n.prefix, n.href] }

    # attributes as [qualified name, value] pairs
    attrs = node.attribute_nodes.map do |a|
      an = a.name
      an = "#{a.namespace.prefix}:#{an}" if a.namespace && a.namespace.prefix
      [an, a.content]
    end

    start_element_namespace node.name, attrs, prefix, uri, scopes
    node.children.each { |c| pretend_sax c }
    end_element_namespace node.name, prefix, uri
  when Nokogiri::XML::Node::TEXT_NODE
    characters node.content
  when Nokogiri::XML::Node::CDATA_SECTION_NODE
    cdata_block node.content
  end
end
|
87
|
+
|
88
|
+
# Flush the accumulated text into the statistics if +name+ is a block
# element and the text contains at least one word.
#
# Whitespace is collapsed before counting. On a flush the character,
# word and block counters are bumped, the word count is appended to
# the words-per-block list, the normalized text is pushed on @stack,
# and the buffer is emptied.
#
# @param name [String, Symbol] element name
def do_block(name)
  return unless BLOCKS.include?(name.to_sym)

  tokens = @text.strip.split
  return if tokens.empty?

  normalized = tokens.join(' ')
  tally = tokens.length

  @counts[:chars]  += normalized.length
  @counts[:words]  += tally
  @counts[:blocks] += 1
  @wpb << tally
  @stack << normalized
  @text = ''
end
|
104
|
+
|
105
|
+
# Throw away whatever text has been buffered so far.
def clear_text; @text = ''; end
|
108
|
+
|
109
|
+
public
|
110
|
+
|
111
|
+
# The tallies live in the @counts hash; @chars, @words and @blocks are
# never assigned, so plain attr_readers would always return nil.

# @return [Integer] characters counted so far
def chars
  @counts[:chars]
end

# @return [Integer] words counted so far
def words
  @counts[:words]
end

# @return [Integer] non-empty blocks counted so far
def blocks
  @counts[:blocks]
end
|
112
|
+
|
113
|
+
# SAX callback for an opening tag.
#
# Only XHTML elements that are not in SKIP are considered: for those,
# text collection is switched on and any text buffered so far is
# offered to do_block under this element's name.
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
  return unless uri == XHTMLNS
  return if SKIP.include?(name.to_sym)

  @on = true
  do_block name
end
|
119
|
+
|
120
|
+
# SAX callback for a closing tag (XHTML elements only).
#
# Skipped elements discard the buffered text, everything else hands it
# to do_block. Every counter in COUNTS whose set contains the element
# is incremented. Closing <body> takes one back off :sections and
# switches text collection off.
def end_element_namespace(name, prefix = nil, uri = nil)
  return unless uri == XHTMLNS

  tag = name.to_sym
  if SKIP.include?(tag)
    clear_text
  else
    do_block(name)
  end

  COUNTS.each_pair do |kind, members|
    @counts[kind] += 1 if members.include?(tag)
  end

  return unless name == 'body'

  @counts[:sections] -= 1
  @on = false
end
|
130
|
+
|
131
|
+
# SAX callback for character data; appended to the buffer only while
# collection is switched on.
def characters(string)
  return unless @on

  @text += string
end
|
134
|
+
|
135
|
+
# SAX callback for CDATA sections; treated exactly like ordinary text.
def cdata_block(string)
  characters(string)
end
|
138
|
+
|
139
|
+
# Mean of the words-per-block samples.
#
# @return [Float] whatever +mean+ yields on the sample list
def mean
  samples = @wpb
  samples.mean
end
|
143
|
+
|
144
|
+
# Standard deviation of the words-per-block samples.
#
# @return [Float] whatever +standard_deviation+ yields on the sample list
def sd
  samples = @wpb
  samples.standard_deviation
end
|
148
|
+
|
149
|
+
# Five-number summary of words per block.
#
# @return [Array] the 0th, 25th, 50th, 75th and 100th percentiles of
#   the samples, in that order
def quartiles
  0.step(100, 25).map { |q| @wpb.percentile(q) }
end
|
153
|
+
|
154
|
+
# Snapshot of the counters.
#
# @return [Hash] a frozen shallow copy; the live hash is untouched
def counts
  snapshot = @counts.dup
  snapshot.freeze
end
|
157
|
+
|
158
|
+
# Start with collection off, an empty text buffer, no samples, and
# every counter at zero.
def initialize
  @on    = false
  @text  = ''
  @stack = [] # XXX i don't think we use this one
  @wpb   = []

  tallies = %i[chars words blocks sections images videos embeds
               tables lists forms scripts sheets]
  @counts = tallies.each_with_object({}) { |key, acc| acc[key] = 0 }
end
|
166
|
+
|
167
|
+
# Gather statistics from +doc+.
#
# A Nokogiri node is walked in memory via pretend_sax; anything else
# is handed to a Nokogiri SAX parser with this object as its handler.
#
# @param doc [Nokogiri::XML::Node, Object] parsed node or parser input
# @return [self]
def scan(doc)
  case doc
  when Nokogiri::XML::Node
    pretend_sax(doc)
  else
    Nokogiri::XML::SAX::Parser.new(self).parse(doc)
  end

  self
end
|
177
|
+
|
178
|
+
# Convenience constructor: make a fresh instance and scan +doc+ with it.
#
# @return [Object] whatever the instance-level scan returns
def self.scan(doc)
  instance = new
  instance.scan(doc)
end
|
181
|
+
|
182
|
+
# All statistics as one hash: mean, sd and quartiles, followed by the
# counters (a counter with the same key would win).
#
# @return [Hash]
def to_h
  summary = { mean: mean, sd: sd, quartiles: quartiles }
  summary.merge(counts)
end
|
185
|
+
|
186
|
+
# Not implemented yet: accepts its arguments and returns nil.
#
# @param uri [Object, nil] unused
# @param subject [Object, nil] unused
# @return [nil]
def to_rdf(uri: nil, subject: nil)
end
|
188
|
+
end
|
@@ -0,0 +1,772 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/sak/util'
|
3
|
+
require 'time'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'xml-mixup'
|
6
|
+
|
7
|
+
class RDF::SAK::Document
|
8
|
+
include XML::Mixup
|
9
|
+
include RDF::SAK::Util
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
# prefix map handed to XPath queries
XPATHNS = { html: XHTMLNS }.freeze
XHV     = 'http://www.w3.org/1999/xhtml/vocab#'.freeze

# notice these are only RDFa attributes that take URIs
RDFA_ATTR = %i[about resource typeof].freeze
LINK_ATTR = %i[href src data action longdesc].freeze

# any XHTML element other than <base> carrying one of the above
LINK_XPATH = format(
  './/html:*[not(self::html:base)][%s]',
  (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a}" }.join('|')
).freeze

OBJS = %i[href src].freeze

# ancestor node always with (@property and not @content) and
# not @resource|@href|@src unless @rel|@rev
LITXP = ['(ancestor::*[@property][not(@content)]',
         '[not(@resource|@href|@src) or @rel|@rev])[1]'].join.freeze
|
29
|
+
# note parentheses cause the index to be counted from the root
|
30
|
+
|
31
|
+
public
|
32
|
+
|
33
|
+
attr_reader :repo, :subject, :doc, :base, :prefixes
|
34
|
+
|
35
|
+
# Set up a document context.
#
# @param repo [Object] the graph the helper methods query
# @param doc [Nokogiri::XML::Document, Nokogiri::XML::Node, String, IO,
#   File, Pathname] used as-is, wrapped in a new document (nodes), or
#   parsed with Nokogiri.XML
# @param subject [Object, nil] subject of the document
# @param base [Object, nil] base URI; when absent and a subject is
#   given it is looked up via RDF::SAK::Util.canonical_uri
# @param resolve [Object, nil] alternate URI, stored as an RDF::URI
# @param prefixes [Hash] prefix mapping
# @param transform [Object, nil] stored in @transform
# @param scache [Hash] cache handed to the Util helpers
# @param ucache [Hash] cache handed to the Util helpers
# @raise [ArgumentError] if +doc+ is none of the accepted kinds
def initialize(repo, doc, subject: nil, base: nil, resolve: nil,
               prefixes: {}, transform: nil, scache: {}, ucache: {})
  # coerce the document
  coerced =
    if doc.is_a?(Nokogiri::XML::Document)
      doc
    elsif doc.is_a?(Nokogiri::XML::Node)
      Nokogiri::XML::Document.new << doc.dup
    elsif [String, IO, File, Pathname].any? { |klass| doc.is_a?(klass) }
      Nokogiri.XML(doc)
    else
      raise ArgumentError, "Not sure what to do with #{doc.class}"
    end

  # the base can only be looked up when there is a subject
  if subject
    base ||= RDF::SAK::Util.canonical_uri(repo, subject, rdf: false)
  end

  @repo      = repo
  @subject   = subject
  @doc       = coerced
  @base      = URI(base.to_s) if base # note this is a vanilla URI
  @resolve   = RDF::URI(resolve.to_s) if resolve # note this is an RDF::URI
  @prefixes  = prefixes
  @transform = transform
  @scache    = scache
  @ucache    = ucache
end
|
60
|
+
|
61
|
+
# Instance shorthand for RDF::SAK::Util.canonical_uuid, supplying this
# document's repository, base and both caches.
def canonical_uuid(uri, unique: true, published: false)
  opts = { base: @base, unique: unique, published: published,
           scache: @scache, ucache: @ucache }
  RDF::SAK::Util.canonical_uuid(@repo, uri, **opts)
end
|
65
|
+
|
66
|
+
# Instance shorthand for RDF::SAK::Util.canonical_uri, supplying this
# document's repository and base.
def canonical_uri(subject, unique: true, rdf: true, slugs: false, fragment: false)
  RDF::SAK::Util.canonical_uri(
    @repo, subject,
    base: @base, unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
  )
end
|
71
|
+
|
72
|
+
# Instance shorthand for RDF::SAK::Util.cmp_label against this
# document's repository.
def cmp_label(a, b, labels: nil, supplant: true, reverse: false)
  opts = { labels: labels, supplant: supplant, reverse: reverse }
  RDF::SAK::Util.cmp_label(@repo, a, b, **opts)
end
|
76
|
+
|
77
|
+
# Instance shorthand for RDF::SAK::Util.asserted_types against this
# document's repository.
def asserted_types(subject, type = nil)
  RDF::SAK::Util.asserted_types(@repo, subject, type)
end
|
80
|
+
|
81
|
+
# Instance shorthand for RDF::SAK::Util.subjects_for against this
# document's repository.
def subjects_for(predicate, object, entail: true, only: [])
  opts = { entail: entail, only: only }
  RDF::SAK::Util.subjects_for(@repo, predicate, object, **opts)
end
|
85
|
+
|
86
|
+
# Instance shorthand for RDF::SAK::Util.objects_for against this
# document's repository.
#
# A block given to this method is passed on to the utility; without
# the explicit forward it would stop here and never reach it.
# NOTE(review): confirm RDF::SAK::Util.objects_for yields to a block.
def objects_for(subject, predicate, entail: true, only: [], datatype: nil, &block)
  RDF::SAK::Util.objects_for(@repo, subject, predicate,
                             entail: entail, only: only, datatype: datatype,
                             &block)
end
|
90
|
+
|
91
|
+
# Instance shorthand for RDF::SAK::Util.struct_for, supplying this
# document's repository and both caches.
def struct_for(subject, rev: false, only: [], uuids: false, canon: false)
  opts = { rev: rev, only: only, uuids: uuids, canon: canon,
           ucache: @ucache, scache: @scache }
  RDF::SAK::Util.struct_for(@repo, subject, **opts)
end
|
96
|
+
|
97
|
+
# Instance shorthand for RDF::SAK::Util.label_for against this
# document's repository.
def label_for(subject, candidates: nil, unique: true, type: nil,
              lang: nil, desc: false, alt: false)
  opts = { candidates: candidates, unique: unique, type: type,
           lang: lang, desc: desc, alt: alt }
  RDF::SAK::Util.label_for(@repo, subject, **opts)
end
|
102
|
+
|
103
|
+
# Instance shorthand for RDF::SAK::Util.formats_for against this
# document's repository. By default it looks at dc:format values typed
# as xsd:token.
def formats_for(subject, predicate: RDF::Vocab::DC.format,
                datatype: [RDF::XSD.token])
  opts = { predicate: predicate, datatype: datatype }
  RDF::SAK::Util.formats_for(@repo, subject, **opts)
end
|
108
|
+
|
109
|
+
# Instance shorthand for RDF::SAK::Util.authors_for against this
# document's repository.
def authors_for(subject, unique: false, contrib: false)
  opts = { unique: unique, contrib: contrib }
  RDF::SAK::Util.authors_for(@repo, subject, **opts)
end
|
112
|
+
|
113
|
+
# Proxy for RDF::SAK::Util.published? in this document's context.
#
# With an argument the answer for that subject is computed every time.
# Without one the answer for the document's own subject is computed
# once and remembered -- including a negative answer, which a plain
# `||=` would recompute on every call.
#
# @param subject [Object, nil] resource to test; defaults to @subject
# @return [Object] whatever the utility reports
def published?(subject = nil)
  return RDF::SAK::Util.published?(@repo, subject, base: @base) if subject

  unless defined?(@published)
    @published = RDF::SAK::Util.published?(@repo, @subject, base: @base)
  end
  @published
end
|
118
|
+
|
119
|
+
# Same as the inherited abbreviate, except that the prefix mapping
# defaults to this document's @prefixes and a nil mapping is replaced
# by an empty hash before delegating.
def abbreviate(term, prefixes: @prefixes, vocab: nil, noop: true, sort: true)
  prefixes ||= {}
  super(term, prefixes: prefixes, vocab: vocab, noop: noop, sort: sort)
end
|
123
|
+
|
124
|
+
# Work out the effective base URI for +node+ (default: the document).
#
# Starts from the configured @base. An (X)HTML document may override
# it with an absolute <base href>; any other document with an absolute
# xml:base on its root element. A declared base that is relative,
# blank or not parseable as a URI is ignored rather than allowed to
# raise. Finally, if the result shares scheme, host and port with
# @resolve, those three components are swapped for @base's.
#
# NOTE(review): xml:base is looked up from the document root, not from
# +node+ itself -- confirm that is intended.
#
# @param node [Nokogiri::XML::Node, nil]
# @return [URI]
def base_for(node = nil)
  node ||= @doc
  doc = node.document
  base = URI(@base.to_s)

  return base unless doc.root

  declared =
    if doc.root.name.to_sym == :html
      doc.at_xpath(
        '(/html:html/html:head/html:base[@href])[1]/@href', XPATHNS
      )
    else
      doc.root.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
    end

  begin
    candidate = URI(declared.to_s.strip)
    base = candidate if candidate.absolute?
  rescue URI::InvalidURIError
    # malformed declaration: keep the configured base
  end

  # rewrite if aliased
  if @resolve and resolve = URI(@resolve.to_s) and
      %i[scheme host port].all? { |s| base.send(s) == resolve.send(s) }
    tmp = base.dup
    tmp.scheme = @base.scheme
    tmp.host   = @base.host
    tmp.port   = @base.port
    base = tmp.normalize
  end

  base
end
|
160
|
+
|
161
|
+
# Rewrite the link-bearing attributes (LINK_ATTR) of every element
# under +node+ that matches LINK_XPATH, in place.
#
# Each value is resolved against the effective base, skipped unless it
# is http(s), coerced onto @base's scheme when only the scheme differs,
# mapped to its canonical UUID and back to a canonical URI where
# possible, and written back relative to @base. Path parameters that
# the canonical URI lost are reattached. A <base href> element, if
# present, is brought in line with the effective base first.
#
# @param node [Nokogiri::XML::Node] subtree to process
# @param uuids [Hash] cache, UUID => canonical URI (mutated)
# @param uris [Hash] cache, URI => UUID (mutated)
# @yield [elem] each matched element, after its attributes are handled
# @return [Integer] number of attribute values rewritten
def rewrite_links(node = @doc, uuids: {}, uris: {}, &block)
  base = base_for node

  base_elem = node.at_xpath('(/html:html/html:head/html:base[@href])[1]', XPATHNS)
  if base_elem
    base_elem[:href] = base.to_s if base.to_s != base_elem[:href]
  end

  rewritten = 0

  node.xpath(LINK_XPATH, XPATHNS).each do |elem|
    LINK_ATTR.each do |sym|
      attr = sym.to_s
      next unless elem.has_attribute? attr

      abs = begin
              base.merge uri_pp(elem[attr].strip)
            rescue StandardError
              nil
            end
      next unless abs

      # bail out if this isn't http(s)
      next if abs.scheme && !%w[http https].include?(abs.scheme.downcase)

      # fix e.g. http->https
      if abs.host == @base.host && abs.scheme != @base.scheme
        fixed = @base.dup
        fixed.path     = abs.path
        fixed.query    = abs.query
        fixed.fragment = abs.fragment
        abs = fixed
      end

      # harvest path parameters
      params = split_pp abs, only: true

      # coerce to rdf
      abs = RDF::URI(abs.to_s)

      # make an aliased copy we use to look up the uuid
      aliased = abs
      if @resolve
        aliased = abs.dup
        aliased.scheme = @resolve.scheme
        aliased.authority = @resolve.authority if @resolve.authority
      end

      # round-trip to uuid and back if we can
      uuid = (uris[abs] ||= canonical_uuid(aliased))
      if uuid
        abs = (uuids[uuid] ||= canonical_uri(uuid))
      elsif (canon = canonical_uri(abs))
        # otherwise just find the canonical uri
        abs = canon
      end

      # reinstate the path parameters
      if !params.empty? && split_pp(abs, only: true).empty?
        abs = abs.dup
        abs.path = ([abs.path] + params).join(';')
      end

      elem[attr] = @base.route_to(abs.to_s).to_s
      rewritten += 1
    end

    block.call elem if block
  end

  rewritten
end
|
229
|
+
|
230
|
+
# Placeholder for extracting the RDFa triples from the document; the
# body is empty, so it currently returns nil.
def triples_for()
end
|
233
|
+
|
234
|
+
# Find the vocab in force for +node+: the nearest @vocab on the node
# or its element ancestors.
#
# @param node [Nokogiri::XML::Node]
# @return [String, nil] the stripped value, or nil if the nearest
#   @vocab is blank or no element up the chain carries one
def vocab_for(node)
  declared = node[:vocab]
  if declared
    declared = declared.strip
    return declared == '' ? nil : declared
  end

  up = node.parent
  return unless up && up.element?

  vocab_for up
end
|
243
|
+
|
244
|
+
# Collect the prefix mappings in scope at +node+ by climbing to the root.
#
# At each element the prefixed namespace declarations are taken first,
# then overlaid with the pairs found in @prefix. RDFa writes those
# pairs as "name: iri", so the trailing colon is removed from the name
# to give the same keys as the namespace declarations. A dangling name
# with no IRI is dropped.
#
# @param node [Nokogiri::XML::Node] element to start from
# @param prefixes [Hash{Symbol=>String}] mappings from closer to the
#   starting node, which take precedence
# @return [Hash{Symbol=>String}]
def prefixes_for(node, prefixes = {})
  # start with namespaces
  pfx = node.namespace_declarations.select(&:prefix).map do |n|
    [n.prefix.to_sym, n.href]
  end.to_h

  # then add @prefix overtop of the namespaces
  if node[:prefix]
    node[:prefix].strip.split(/\s+/).each_slice(2) do |name, iri|
      next if iri.nil? # uneven list: the last name has no IRI

      pfx[name.chomp(':').to_sym] = iri
    end
  end

  # since we're ascending the tree, input takes precedence
  prefixes = pfx.merge prefixes

  parent = node.parent
  if parent && parent.element?
    prefixes_for(parent, prefixes)
  else
    prefixes
  end
end
|
270
|
+
|
271
|
+
# Give us the RDF subject of the node itself.
#
# Fixes relative to the earlier version of this method:
# * `is_root` and `special` used `or`/`and`, which bind more loosely
#   than `=`, so only the left operand was ever assigned;
# * the re-evaluation from an ancestor literal lost the `rdf:` flag;
# * an ancestor with no @resource/@href/@src returned immediately with
#   nothing instead of carrying on looking.
#
# @param node [Nokogiri::XML::Element, nil] defaults to the root element
# @param rdf [Boolean] return an RDF::URI rather than a URI
# @param is_ancestor [Boolean] true when +node+ is being examined on
#   behalf of a descendant
# @return [URI, RDF::URI, RDF::Node, String] blank nodes are returned
#   irrespective of +rdf+; an ancestor's @resource is returned as the
#   stripped attribute string when +rdf+ is false
# @raise [RuntimeError] if +node+ is not an element
def subject_for(node = nil, rdf: false, is_ancestor: false)
  node ||= @doc.root
  raise 'Node must be an element' unless
    node.is_a? Nokogiri::XML::Element

  # first we check for an ancestor element with @property and no
  # @content; if we find one then we reevaluate with that
  # element as the starting point
  if (n = node.at_xpath(LITXP))
    return subject_for n, rdf: rdf
  end

  # answer a bunch of helpful questions about this element
  subject = nil
  base    = base_for node
  parent  = node.parent
  ns_href = node.namespace.href if node.namespace
  up_ok   = %i[rel rev].none? { |a| node[a] }
  is_root = !parent || parent.document?
  special = /^(?:[^:]+:)?(?:head|body)$/i.match?(node.name) &&
    (ns_href == XHTMLNS ||
     (!parent.nil? && /^(?:[^:]+:)?html$/xi.match?(parent.name)))

  # if the node is being inspected as an ancestor to the
  # original node, we have to check it backwards.
  if is_ancestor
    # ah right @resource gets special treatment
    if (subject = node[:resource])
      subject = subject.strip
      # TODO: safe CURIEs ([prefix:ref]) are not resolved yet
    else
      attr = OBJS.find { |a| node[a] }
      # merge with the root and return it
      subject = base + node[attr] if attr
    end

    if subject
      return rdf ? RDF::URI(subject.to_s) : subject
    end

    # note if we are still here, neither the original node nor any
    # of the nodes previously tested has anything resembling a
    # resource in it. this means @rel/@rev should be ignored, and
    # we should keep looking for a subject.
  end

  if node[:about]
    if (m = /^_:(.*)$/.match(node[:about]))
      return RDF::Node(m[1])
    end

    # XXX resolve @about against potential curie
    subject = base + node[:about]
  elsif is_root
    subject = base
  elsif special
    subject = subject_for parent
  elsif node[:resource]
    # XXX resolve @resource against potential curie
    subject = base + node[:resource]
  elsif node[:href]
    subject = base + node[:href]
  elsif node[:src]
    subject = base + node[:src]
  elsif node[:typeof]
    # bnode the typeof attr
    # note we return bnodes irrespective of the rdf flag
    return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
  elsif node[:inlist]
    # bnode the inlist attr
    return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
  elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
        (is_ancestor && !up_ok)
    # bnode the element
    return RDF::Node('id-%016x' % node.pointer_id)
  else
    subject = subject_for parent, is_ancestor: true
  end

  rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)
end
|
361
|
+
|
362
|
+
# Build the backlink structure for this document's subject.
#
# The resources pointing at the subject come either from +struct+
# (predicate => subjects) or from querying the repository for
# statements whose object is the subject.
#
# @param published [Boolean] keep only resources that published? accepts
# @param struct [Hash, nil] precomputed reverse structure
# @param ignore [Proc, #to_set, nil] resources to leave out. A Proc is
#   called with (subject, predicate) when +struct+ is given and with
#   the statement otherwise.
#   NOTE(review): the two call shapes differ -- confirm callers cope.
# @param pattern [Regexp, nil] keep only resources whose string form matches
# @param terse [Boolean] emit a flat list of <link> specs instead of a
#   <nav> list
# @return [Array<Hash>, Hash, nil] an XML::Mixup-style structure, or
#   nil when nothing is left after pruning
# @raise [RuntimeError] if +ignore+ is neither a Proc nor set-like
def generate_backlinks(published: true, struct: nil,
                       ignore: nil, pattern: nil, terse: false)
  uri = canonical_uri(subject, rdf: false) || URI(uri_pp(subject))

  ignore =
    if ignore.nil?
      Set.new
    elsif ignore.is_a?(Proc)
      ignore
    elsif ignore.respond_to?(:to_set)
      ignore.to_set
    else
      raise 'ignore must be either a proc or amenable to a set'
    end

  nodes  = {}
  labels = {}
  types  = {}

  # record one (resource, predicate) pair and cache its types and labels
  remember = lambda do |s, p|
    (nodes[s] ||= Set.new) << p
    types[s]  ||= asserted_types s
    labels[s] ||= label_for s
    labels[p] ||= label_for p unless terse
  end

  if struct
    struct.each do |p, subjects|
      subjects.each do |s|
        skip = case ignore
               when Proc then ignore.call(s, p)
               when Set  then ignore.include?(s)
               end
        next if skip

        remember.call(s, p)
      end
    end
  else
    @repo.query([nil, nil, subject]).each do |stmt|
      s = stmt.subject
      skip = case ignore
             when Proc then ignore.call(stmt)
             when Set  then ignore.include?(s)
             end
      next if skip

      remember.call(s, stmt.predicate)
    end
  end

  # prune out nonmatching
  nodes.select! { |k, _| pattern.match? k.to_s } if pattern.is_a?(Regexp)

  # prune out unpublished
  nodes.select! { |k, _| published? k } if published

  return if nodes.empty?

  if terse
    nodes.map do |rsrc, preds|
      cu = canonical_uri(rsrc, rdf: false)
      next unless cu

      lab  = labels[rsrc] || [nil, rsrc]
      link = { nil => :link, rel: '', href: uri.route_to(cu),
               rev: abbreviate(preds) }
      link[:typeof] = abbreviate(types[rsrc]) if types[rsrc]
      link[:title]  = lab.last if lab.last
      link
    end.compact
  else
    ordered = nodes.sort { |a, b| cmp_label a.first, b.first, labels: labels }

    li = ordered.map do |rsrc, preds|
      cu = canonical_uri(rsrc, rdf: false)
      next unless cu

      lab = labels[rsrc] || [nil, rsrc]
      lp  = lab.first ? abbreviate(lab.first) : nil
      ty  = types[rsrc] ? abbreviate(types[rsrc]) : nil

      span   = { [lab[1].to_s] => :span, property: lp }
      anchor = { [span] => :a, typeof: ty,
                 href: uri.route_to(cu), rev: abbreviate(preds) }
      { [anchor] => :li }
    end.compact

    { [{ li => :ul }] => :nav }
  end
end
|
441
|
+
|
442
|
+
# Twitter card metadata for this document's subject.
#
# Gives up (returns nil) unless there is a single author that is a
# resource, that author has a foaf:account on twitter.com whose URI
# path yields a handle, and the subject has a label. An account URI
# with no path segment counts as "no handle" instead of raising.
#
# @return [Array<Hash>, nil] XML::Mixup-style <meta> specs
def generate_twitter_meta
  # get author
  author = authors_for(subject, unique: true)
  return unless author
  return unless author.is_a? RDF::Resource

  # get author's twitter account (the lowest-sorting one if several)
  account = objects_for(author, RDF::Vocab::FOAF.account, only: :resource)
            .select { |t| t.to_s =~ /twitter\.com/ }.min
  return unless account

  # first path segment is the handle
  handle = URI(account.to_s).path.split(%r{/+})[1]
  return if handle.nil? || handle.empty?

  handle = "@#{handle}" unless handle.start_with?('@')

  # get title
  title = label_for(subject)
  return unless title

  out = [
    { nil => :meta, name: 'twitter:card',  content: :summary },
    { nil => :meta, name: 'twitter:site',  content: handle },
    { nil => :meta, name: 'twitter:title', content: title[1].to_s }
  ]

  # get abstract
  if (desc = label_for(subject, desc: true))
    out.push({ nil => :meta, name: 'twitter:description',
               content: desc[1].to_s })
  end

  # get image (foaf:depiction); having one upgrades the card type
  img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
  unless img.empty?
    out.push({ nil => :meta, name: 'twitter:image', content: img[0].to_s })
    out[0][:content] = :summary_large_image
  end

  # return the appropriate xml-mixup structure
  out
end
|
482
|
+
|
483
|
+
# Produce a new XHTML document from @doc, decorated with metadata about
# @subject drawn from the repository.
#
# In outline: copy the document and strip comments; gather the
# subject's forward and reverse structure; rewrite the links in the
# body (annotating them with @rel and, optionally, @title); turn the
# remaining resources into <link>/<script> specs and the literals and
# authors into <meta> specs; add twitter metadata and terse backlinks;
# and hand everything to xhtml_stub.
#
# @param published [Boolean] passed on to generate_backlinks
# @param titles [Boolean] fill in missing or blank @title on body links
#   from the target's label
# @return [Object] the +document+ of whatever xhtml_stub returns
#   (presumably a Nokogiri::XML::Document -- confirm)
def transform_xhtml published: true, titles: false
  # before we do any more work make sure this is html
  # NOTE(review): nothing below actually checks that; if there is no
  # html:body the root element is used as the body instead.
  doc = @doc.dup 1
  body = doc.at_xpath('//html:body[1]', XPATHNS) || doc.root

  # eliminate comments (except those inside script elements)
  doc.xpath('//comment()[not(ancestor::html:script)]', XPATHNS).each do |c|
    c.unlink
  end

  # initial stuff
  struct = struct_for @subject, uuids: true, canon: true
  rstruct = struct_for @subject, uuids: true, canon: true, rev: true
  resources = {}
  literals = {}
  ufwd = {} # uuid => uri
  urev = {} # uri => uuid
  datatypes = Set.new
  types = Set.new
  authors = authors_for @subject
  title = label_for @subject, candidates: struct
  desc = label_for @subject, candidates: struct, desc: true

  # warn struct

  # rewrite content
  # label_for results are indexed pairs; element 1 is used as the value
  title = title[1] if title
  desc = desc[1] if desc

  # `struct` and `rstruct` will contain all the links and
  # metadata for forward and backward neighbours, respectively,
  # which we need to mine (predicates, classes, datatypes) for
  # prefixes among other things.

  struct.each do |p, v|
    v.each do |o|
      if o.literal?
        # literal => set of predicates it appears under
        literals[o] ||= Set.new
        literals[o].add p

        # collect the datatype
        datatypes.add o.datatype if o.has_datatype?
      else
        # normalize URIs
        if o.to_s.start_with? 'urn:uuid:'
          ufwd[o] ||= canonical_uri o
        elsif cu = urev[o] || canonical_uuid(o)
          o = urev[o] ||= cu
        end

        # collect the resource (resource => set of predicates)
        resources[o] ||= Set.new
        resources[o].add p

        # add to type
        types.add o if p == RDF::RDFV.type
      end
    end
  end

  # make every uuid => uri pair available in the other direction too
  urev.merge! ufwd.invert

  labels = resources.keys.map do |k|
    # turn this into a pair which subsequently gets turned into a hash
    [k, label_for(k) ]
  end.to_h

  #warn labels

  # handle the title
  title ||= RDF::Literal('')
  tm = { '#title' => title,
    property: abbreviate(literals[title].to_a, vocab: XHV) }
  if tl = title.language
    tm['xml:lang'] = tl # if xmlns
    tm['lang'] = tl
  elsif tdt = title.datatype and tdt != RDF::XSD.string
    tm[:datatype] = abbreviate(tdt)
  end

  # we accumulate a record of the links in the body so we know
  # which ones to skip in the head
  bodylinks = {}
  rewrite_links body, uuids: ufwd, uris: urev do |elem|
    vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
    vocab = uri_pp(vocab.to_s) if vocab

    if elem.key?('href') or elem.key?('src')
      begin
        vu = uri_pp(elem['href'] || elem['src'])
        ru = RDF::URI(@base.merge(vu))
        bodylinks[urev[ru] || ru] = true

        # annotate the link with the predicates relating it to the subject
        if rel = resources[urev[ru] || ru]
          elem['rel'] = (abbreviate rel, vocab: vocab).join ' '
        end

        label = labels[urev[ru] || ru]
        if titles and label and
            (!elem.key?('title') or elem['title'].strip == '')
          elem['title'] = label[1].to_s
        end
      rescue URI::InvalidComponentError => e
        # report the bad link and carry on
        warn "#{e}: #{vu} in #{@subject}"
      end
    end
  end

  # and now we do the head
  links = []
  resources.reject { |k, _| bodylinks[k] }.each do |k, v|
    # rdf:type is expressed via @typeof on the body, not as a link
    v = v.dup.delete RDF::RDFV.type
    next if v.empty?
    mts = formats_for k

    # warn k, v.inspect

    # warn k, mts.inspect

    rel = abbreviate v.to_a, vocab: XHV
    ru = @base.route_to(uri_pp (ufwd[k] || k).to_s)
    ln = { nil => :link, rel: rel, href: ru.to_s }
    if (label = labels[urev[k] || k])
      ln[:title] = label[1].to_s
    end

    # add type=lol/wut
    ln[:type] = mts.first.to_s unless mts.empty?

    if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
      ln[:type] = 'text/css'
    elsif ln[:type] =~ /(java|ecma)script/i or
        v.include?(RDF::Vocab::DC.requires)
      # scripts become <script src> rather than <link href>
      ln[nil] = :script
      ln[:src] = ln.delete :href
      ln[:type] ||= 'text/javascript'
    end
    links.push ln
  end

  links.sort! do |a, b|
    # sort by rel, then by href
    # warn a.inspect, b.inspect
    s = 0
    [nil, :rel, :rev, :href, :title].each do |k|
      s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
      break if s != 0
    end
    s
  end

  # we want to duplicate links from particular subjects (eg the root)
  # NOTE(review): @duplicate is not assigned anywhere in the code
  # visible here, so this loop may never run -- confirm.
  (@duplicate || {}).sort do |a, b|
    a.first <=> b.first
  end.each do |s, preds|

    o = {}
    # NOTE(review): ufwd/urev are used here in the opposite direction
    # to their "uuid => uri" / "uri => uuid" declarations -- confirm.
    u = ufwd[s] ||= canonical_uuid s
    s = urev[u] ||= canonical_uri u if u
    f = {}

    # do not include this subject as these links are already included!
    next if u == @subject

    # gather up the objects, then gather up the predicates

    # NOTE(review): this relies on objects_for handing the block on to
    # the underlying utility -- confirm.
    objects_for u || s, preds, only: :resource do |obj, rel|
      # XXX do not know why += |= etc does not work
      x = canonical_uuid(obj) || obj
      urev[x] ||= canonical_uri x
      y = o[x] ||= Set.new
      o[x] = y | rel
      f[x] = formats_for x
    end

    srel = @base.route_to((u ? urev[u] || s : s).to_s)

    # now collect all the other predicates
    o.keys.each do |obj|
      hrel = @base.route_to((urev[obj] || obj).to_s)
      o[obj] |= @repo.query([u || s, nil, obj]).predicates.to_set
      rels = abbreviate o[obj].to_a, vocab: XHV
      ln = { nil => :link, about: srel, rel: rels, href: hrel }
      ln[:type] = f[obj].first if f[obj]

      # add to links
      links << ln
    end
  end

  meta = []

  # include author names as old school meta tags
  authors.each do |a|
    name = labels[urev[a] || a] or next
    datatypes.add name[0] # a convenient place to chuck this
    prop = abbreviate(name[0])
    name = name[1]
    about = @base.route_to((ufwd[a] || a).to_s)
    tag = { nil => :meta, about: about.to_s, name: :author,
      property: prop, content: name.to_s }

    if name.has_datatype? and name.datatype != RDF::XSD.string
      tag[:datatype] = abbreviate(name.datatype)
    elsif name.has_language?
      tag['xml:lang'] = tag[:lang] = name.language
    end
    meta.push tag
  end

  # every literal except the title becomes a <meta>
  literals.each do |k, v|
    next if k == title
    rel = abbreviate v.to_a, vocab: XHV
    elem = { nil => :meta, property: rel, content: k.to_s }
    elem[:name] = :description if k == desc

    if k.has_datatype?
      datatypes.add k.datatype # so we get the prefix
      elem[:datatype] = abbreviate k.datatype, vocab: XHV
    end

    meta.push(elem)
  end

  # deterministic order: about, property, datatype, content, name
  meta.sort! do |a, b|
    s = 0
    [:about, :property, :datatype, :content, :name].each do |k|
      # warn a.inspect, b.inspect
      s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
      break if s != 0
    end
    s
  end

  # don't forget style tag
  style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })

  body = body.dup 1
  body = { '#body' => body.children.to_a, about: '' }
  body[:typeof] = abbreviate(types.to_a, vocab: XHV) unless
    types.empty?



  # prepare only the prefixes we need to resolve the data we need
  rsc = abbreviate(
    (struct.keys + resources.keys + datatypes.to_a +
      types.to_a + rstruct.to_a.flatten).uniq, noop: false).map do |x|
    next if x.nil?
    x.split(?:)[0].to_sym
  end.reject(&:nil?).to_set

  # warn rsc

  pfx = prefixes.select do |k, _|
    rsc.include? k
  end.transform_values { |v| v.to_s }

  # XXX deal with the qb:Observation separately (just nuke it for now)
  extra = generate_twitter_meta || []
  # filter for the backlinks: leave out subjects related to this
  # document by ci:document and the XHV top/contents/index predicates
  bl_op = begin
            bads = @repo.query(
              [nil, RDF::SAK::CI.document, @subject]).subjects.to_set
            nope = %w[top contents index].map { |x| RDF::Vocab::XHV[x] }
            lambda { |s, p| bads.include? s or nope.include? p }
          end
  if bl = generate_backlinks(
    published: published, pattern: /^urn:uuid:/, terse: true,
    struct: rstruct, ignore: bl_op)
    extra << bl #{ [bl] => :object }
  end

  # and now for the document
  xf = @transform
  doc = xhtml_stub(
    base: @base, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
    link: links, meta: meta, style: style, transform: xf,
    extra: extra, body: body).document

  # goddamn script tags and text/html
  # (give each empty <script src> an empty text child)
  doc.xpath('//html:script[@src][not(node())]', XPATHNS).each do |script|
    script << doc.create_text_node('')
  end

  doc
end
|
769
|
+
|
770
|
+
|
771
|
+
|
772
|
+
end
|