rdf-sak 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +268 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/cleanup.xsl +14 -0
- data/example/matches.xhtml +11 -0
- data/example/transforms.ttl +58 -0
- data/lib/rdf-sak.rb +1 -0
- data/lib/rdf/sak.rb +2506 -0
- data/lib/rdf/sak/ci.rb +827 -0
- data/lib/rdf/sak/cli.rb +35 -0
- data/lib/rdf/sak/docstats.rb +188 -0
- data/lib/rdf/sak/document.rb +772 -0
- data/lib/rdf/sak/ibis.rb +248 -0
- data/lib/rdf/sak/mimemagic.rb +73 -0
- data/lib/rdf/sak/pav.rb +479 -0
- data/lib/rdf/sak/qb.rb +280 -0
- data/lib/rdf/sak/scovo.rb +51 -0
- data/lib/rdf/sak/tfo.rb +301 -0
- data/lib/rdf/sak/transform.rb +1172 -0
- data/lib/rdf/sak/urlrunner.rb +602 -0
- data/lib/rdf/sak/util.rb +2081 -0
- data/lib/rdf/sak/version.rb +5 -0
- data/rdf-sak.gemspec +60 -0
- metadata +366 -0
data/lib/rdf/sak/cli.rb
ADDED
@@ -0,0 +1,35 @@
require 'rdf/sak'
require 'xml-mixup'
require 'commander'

module RDF::SAK
  # This is a command-line interface
  #
  # NOTE(review): this class is currently a skeleton: the constructor
  # ignores its +config+ and {#run} simply delegates to Commander's
  # +run!+. The comments below sketch the intended configuration
  # surface — confirm against the gem's README before relying on it.
  class CLI
    include XML::Mixup
    include Commander::Methods

    # bunch of data declarations etc we don't want to expose
    private

    # actual methods
    public

    # constructor

    # configuration:

    # directories: source, target, private
    # files (or file names): graph, rewrite_map, redirect_map, gone_map
    # URIs: base, aliases

    # Initialize the CLI.
    #
    # @param config [Hash] configuration options (currently unused)
    def initialize config: {}
    end

    # vestigial

    # Entry point; hands control to Commander's runner.
    def run
      run!
    end
  end
end
|
@@ -0,0 +1,188 @@
require 'rdf/sak/version'
require 'set'
require 'descriptive_statistics'
require 'nokogiri'

# Collects prose/structure statistics from an (X)HTML document via SAX
# events: characters, words, blocks, plus counts of sections, images,
# videos, embeds, tables, lists, forms, scripts and style sheets, and
# descriptive statistics (mean, standard deviation, quartiles) over
# words per block.
class RDF::SAK::DocStats < Nokogiri::XML::SAX::Document
  private

  # quasi-blocks: only counted when they directly contain prose
  MAYBE = %i[dt dd li td th caption figcaption]
  # elements whose text content is never counted
  SKIP = %i[html head title base link meta script]
  # the full set of block-level elements we recognize
  BLOCKS = Set.new(%i[body p h1 h2 h3 h4 h5 h6 ul ol pre dl main header footer
    article section aside figure nav div noscript blockquote form hr
    table fieldset address] + MAYBE).freeze
  SECTIONS = Set.new(%i[body article section]).freeze
  IMAGES = Set.new(%i[img picture]).freeze
  VIDEOS = Set.new(%i[video]).freeze
  # frozen for consistency with the other constant sets
  EMBEDS = Set.new(%i[embed object iframe]).freeze
  # element-name sets keyed by the counter each one increments
  COUNTS = {
    sections: %i[body article section header footer nav aside],
    images: %i[img picture],
    videos: %i[video],
    embeds: %i[embed object iframe],
    tables: %i[table],
    lists: %i[ul ol dl],
    forms: %i[form],
    scripts: %i[script],
    sheets: %i[style],
  }.transform_values { |v| Set.new v }.freeze

  NODEXP = '/html:html/html:body[not(*)]|/html:html/html:body//*[not(*)]'.freeze
  XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
  XPATHNS = { html: XHTMLNS }.freeze

  # ok listen up fools here is the new html document stats algo:

  # okay we want to count characters, words, blocks, and sections, as
  # well as gather stats on words per block (and probably blocks per section)

  # the problem is we don't want to count blocks that only contain other blocks

  # we also don't want to count the text of sub-blocks in a superordinate block

  # there are also quasi-blocks that we may not ordinarily count,
  # except if they themselves contain two or more adjacent
  # blocks. (examples: li, th/td, h1-6, caption/figcaption)

  # count the block only if it contains text and inline elements (and
  # only count the text and inline elements)

  # Walk a DOM (sub)tree and synthesize the SAX events this handler
  # expects, so that already-parsed documents can be scanned without
  # reserializing.
  #
  # @param node [Nokogiri::XML::Node] the node to traverse
  def pretend_sax node
    case node.type
    when Nokogiri::XML::Node::DOCUMENT_NODE
      # if node is a document run begin and end document and then run
      # for children
      start_document
      node.children.each { |c| pretend_sax c }
      end_document
    when Nokogiri::XML::Node::ELEMENT_NODE
      # if node is an element run begin and end element and run for children
      prefix, uri = if ns = node.namespace
                      [ns.prefix, ns.href]
                    end
      # FIX: the original mapped with the outer `ns` instead of the
      # block variable, which produced identical pairs for every scope
      # and raised NoMethodError on elements without a namespace.
      ns = node.namespace_scopes.map { |n| [n.prefix, n.href] }
      attrs = node.attribute_nodes.map do |a|
        an = a.name
        an = "#{a.namespace.prefix}:#{an}" if
          a.namespace and a.namespace.prefix
        [an, a.content]
      end
      start_element_namespace node.name, attrs, prefix, uri, ns
      node.children.each { |c| pretend_sax c }
      end_element_namespace node.name, prefix, uri
    when Nokogiri::XML::Node::TEXT_NODE
      characters node.content
    when Nokogiri::XML::Node::CDATA_SECTION_NODE
      cdata_block node.content
    end
  end

  # If +name+ is a block-level element, fold the text accumulated so
  # far into the character/word/block counters and reset the buffer.
  def do_block name
    if BLOCKS.include? name.to_sym
      w = @text.strip.split
      t = w.join ' '

      unless w.empty?
        words = w.length
        @counts[:chars] += t.length
        @counts[:words] += words
        @counts[:blocks] += 1
        @wpb << words
        @stack << t
        @text = ''
      end
    end
  end

  # Discard any accumulated text (used for SKIP elements).
  def clear_text
    @text = ''
  end

  public

  # FIX: these were declared via attr_reader but the instance
  # variables were never assigned, so they always returned nil; the
  # live values are kept in @counts.
  def chars;  @counts[:chars];  end
  def words;  @counts[:words];  end
  def blocks; @counts[:blocks]; end

  # SAX callback: begin accumulating text inside XHTML content,
  # flushing any pending block at a block boundary.
  def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
    unless uri != XHTMLNS or SKIP.include? name.to_sym
      @on = true
      do_block name
    end
  end

  # SAX callback: close out a block and bump the per-element counters.
  def end_element_namespace name, prefix = nil, uri = nil
    if uri == XHTMLNS
      SKIP.include?(name.to_sym) ? clear_text : do_block(name)
      COUNTS.each do |type, set|
        @counts[type] += 1 if set.include? name.to_sym
      end
      # body is in the sections set but shouldn't itself count as one
      @counts[:sections] -= 1 if name == 'body'
      @on = false if name == 'body'
    end
  end

  # SAX callback: buffer character data while inside countable content.
  def characters string
    @text += string if @on
  end

  # SAX callback: CDATA is treated the same as character data.
  def cdata_block string
    characters string
  end

  # @return [Float] mean of words per block
  def mean
    @wpb.mean
  end

  # @return [Float] standard deviation of words per block
  def sd
    @wpb.standard_deviation
  end

  # @return [Array<Numeric>] min, 25th/50th/75th percentile, max of
  #   words per block
  def quartiles
    [0, 25, 50, 75, 100].map { |pct| @wpb.percentile(pct) }
  end

  # @return [Hash] a frozen snapshot of the raw counters
  def counts
    @counts.dup.freeze
  end

  # Set up empty accumulators.
  def initialize
    @on = false
    @text = ''
    @stack = [] # XXX i don't think we use this one
    @wpb = []
    @counts = %i[chars words blocks sections images videos embeds
      tables lists forms scripts sheets].map { |k| [k, 0] }.to_h
  end

  # Scan a document: either walk an already-parsed Nokogiri node via
  # {#pretend_sax}, or feed a string/IO through a real SAX parser.
  #
  # @param doc [Nokogiri::XML::Node, String, IO] the document
  # @return [self]
  def scan doc
    if doc.is_a? Nokogiri::XML::Node
      pretend_sax doc
    else
      parser = Nokogiri::XML::SAX::Parser.new self
      parser.parse doc
    end

    self
  end

  # Convenience constructor: scan in one shot.
  def self.scan doc
    new.scan doc
  end

  # @return [Hash] all statistics as a plain hash
  def to_h
    { mean: mean, sd: sd, quartiles: quartiles }.merge counts
  end

  # NOTE(review): stub — not yet implemented.
  def to_rdf uri: nil, subject: nil
  end
end
@@ -0,0 +1,772 @@
require 'rdf'
require 'rdf/sak/util'
require 'time'
require 'nokogiri'
require 'xml-mixup'

# A document context: pairs a Nokogiri (X)HTML document with an RDF
# repository and a subject URI, and can rewrite its links, harvest
# prefixes/subjects from RDFa, and regenerate the document's head
# (links, meta, twitter cards, backlinks) from the graph.
class RDF::SAK::Document
  include XML::Mixup
  include RDF::SAK::Util

  private

  XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
  XPATHNS = { html: XHTMLNS }
  XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze

  # notice these are only RDFa attributes that take URIs
  RDFA_ATTR = [:about, :resource, :typeof].freeze
  LINK_ATTR = [:href, :src, :data, :action, :longdesc].freeze
  LINK_XPATH = ('.//html:*[not(self::html:base)][%s]' %
    (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a.to_s}" }.join('|')).freeze

  OBJS = [:href, :src].freeze

  # ancestor node always with (@property and not @content) and
  # not @resource|@href|@src unless @rel|@rev
  LITXP = ['(ancestor::*[@property][not(@content)]',
    '[not(@resource|@href|@src) or @rel|@rev])[1]' ].join('').freeze
  # note parentheses cause the index to be counted from the root

  public

  attr_reader :repo, :subject, :doc, :base, :prefixes

  # Initialize a document context.
  #
  # @param repo [RDF::Repository] the graph to consult
  # @param doc [Nokogiri::XML::Document, Nokogiri::XML::Node, String,
  #   IO, File, Pathname] the document (coerced to a Nokogiri document)
  # @param subject [RDF::Resource] the document's subject URI
  # @param base [#to_s] base URI; derived from +subject+ when omitted
  # @param resolve [#to_s] authority to resolve aliased URIs against
  # @param prefixes [Hash] prefix => vocab mapping for abbreviation
  # @param transform [Object] transform handed through to xhtml_stub
  # @param scache [Hash] shared subject cache (mutated in place)
  # @param ucache [Hash] shared UUID cache (mutated in place)
  # @raise [ArgumentError] if +doc+ cannot be coerced
  def initialize repo, doc, subject: nil, base: nil, resolve: nil,
      prefixes: {}, transform: nil, scache: {}, ucache: {}
    # coerce the document
    doc = case doc
          when Nokogiri::XML::Document then doc
          when Nokogiri::XML::Node then Nokogiri::XML::Document.new << doc.dup
          when String, IO, File, Pathname then Nokogiri.XML doc
          else
            raise ArgumentError, "Not sure what to do with #{doc.class}"
          end

    # we only try this if there is a subject defined, obvs
    base ||= RDF::SAK::Util.canonical_uri repo, subject, rdf: false if subject

    @repo = repo
    @subject = subject
    @doc = doc
    @base = URI(base.to_s) if base # note this is a vanilla URI
    @resolve = RDF::URI(resolve.to_s) if resolve # note this is an RDF::URI
    @prefixes = prefixes
    @transform = transform
    @scache = scache
    @ucache = ucache
  end

  # Delegate to Util.canonical_uuid with this context's repo/base/caches.
  def canonical_uuid uri, unique: true, published: false
    RDF::SAK::Util.canonical_uuid @repo, uri, base: @base,
      unique: unique, published: published, scache: @scache, ucache: @ucache
  end

  # Delegate to Util.canonical_uri with this context's repo/base.
  def canonical_uri subject,
      unique: true, rdf: true, slugs: false, fragment: false
    RDF::SAK::Util.canonical_uri @repo, subject, base: @base,
      unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
  end

  # Delegate to Util.cmp_label (label-based comparator for sorting).
  def cmp_label a, b, labels: nil, supplant: true, reverse: false
    RDF::SAK::Util.cmp_label @repo, a, b,
      labels: labels, supplant: supplant, reverse: reverse
  end

  # Delegate to Util.asserted_types for this repo.
  def asserted_types subject, type = nil
    RDF::SAK::Util.asserted_types @repo, subject, type
  end

  # Delegate to Util.subjects_for for this repo.
  def subjects_for predicate, object, entail: true, only: []
    RDF::SAK::Util.subjects_for @repo, predicate, object,
      entail: entail, only: only
  end

  # Delegate to Util.objects_for for this repo.
  def objects_for subject, predicate, entail: true, only: [], datatype: nil
    RDF::SAK::Util.objects_for @repo, subject, predicate,
      entail: entail, only: only, datatype: datatype
  end

  # Delegate to Util.struct_for, threading through the shared caches.
  def struct_for subject, rev: false, only: [], uuids: false, canon: false
    RDF::SAK::Util.struct_for @repo, subject,
      rev: rev, only: only, uuids: uuids, canon: canon,
      ucache: @ucache, scache: @scache
  end

  # Delegate to Util.label_for for this repo.
  def label_for subject, candidates: nil, unique: true, type: nil,
      lang: nil, desc: false, alt: false
    RDF::SAK::Util.label_for @repo, subject, candidates: candidates,
      unique: unique, type: type, lang: lang, desc: desc, alt: alt
  end

  # Delegate to Util.formats_for (media types of a resource).
  def formats_for subject, predicate: RDF::Vocab::DC.format,
      datatype: [RDF::XSD.token]
    RDF::SAK::Util.formats_for @repo, subject,
      predicate: predicate, datatype: datatype
  end

  # Delegate to Util.authors_for for this repo.
  def authors_for subject, unique: false, contrib: false
    RDF::SAK::Util.authors_for @repo, subject, unique: unique, contrib: contrib
  end

  # proxy for context published
  # With an argument, checks that subject; with none, memoizes the
  # published state of this document's own subject.
  def published? subject = nil
    return RDF::SAK::Util.published? @repo, subject, base: @base if subject
    @published ||= RDF::SAK::Util.published? @repo, @subject, base: @base
  end

  # Abbreviate a term using this context's prefixes (falls through to
  # the mixin implementation via super).
  def abbreviate term, prefixes: @prefixes,
      vocab: nil, noop: true, sort: true
    super term, prefixes: prefixes || {}, vocab: vocab, noop: noop, sort: sort
  end

  # Compute the effective base URI for +node+: html <base href>, or an
  # ancestor @xml:base, falling back to this context's @base; aliased
  # authorities are rewritten back to @base's scheme/host/port.
  #
  # @param node [Nokogiri::XML::Node] defaults to the whole document
  # @return [URI] the effective base
  def base_for node = nil
    node ||= @doc
    doc = node.document
    base = URI(@base.to_s)

    return base unless doc.root

    if doc.root.name.to_sym == :html
      b = doc.at_xpath(
        '(/html:html/html:head/html:base[@href])[1]/@href', XPATHNS
      ).to_s.strip
      b = URI(b)

      base = b if b.absolute?
    elsif b = doc.root.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
      b = URI(b.to_s.strip)
      base = b if b.absolute?
    end


    # warn({ orig_base: @base, resolve: resolve, base: base}.inspect)

    # warn %i[scheme host port].map { |s| [s, base.send(s) == resolve.send(s)] }.to_h.inspect

    # rewrite if aliased
    if @resolve and resolve = URI(@resolve.to_s) and
        %i[scheme host port].all? { |s| base.send(s) == resolve.send(s) }
      tmp = base.dup
      tmp.scheme = @base.scheme
      tmp.host = @base.host
      tmp.port = @base.port
      base = tmp.normalize
    end

    base
  end

  # Rewrite every link-bearing attribute under +node+ to a relative
  # reference against @base, round-tripping each target through its
  # canonical UUID where one exists. Path parameters are preserved.
  # An optional block is invoked with each candidate element (whether
  # or not any attribute was rewritten).
  #
  # @param node [Nokogiri::XML::Node] subtree to rewrite
  # @param uuids [Hash] uuid => uri cache (mutated in place)
  # @param uris [Hash] uri => uuid cache (mutated in place)
  # @return [Integer] number of attributes rewritten
  def rewrite_links node = @doc, uuids: {}, uris: {}, &block
    base = base_for node
    if be = node.at_xpath('(/html:html/html:head/html:base[@href])[1]', XPATHNS)
      be[:href] = base.to_s if base.to_s != be[:href]
    end
    count = 0
    node.xpath(LINK_XPATH, XPATHNS).each do |elem|
      LINK_ATTR.each do |attr|
        attr = attr.to_s
        next unless elem.has_attribute? attr

        abs = base.merge uri_pp(elem[attr].strip) rescue nil
        next unless abs

        # bail out if this isn't http(s)
        next if abs.scheme and !%w[http https].include? abs.scheme.downcase

        # fix e.g. http->https
        if abs.host == @base.host and abs.scheme != @base.scheme
          tmp = @base.dup
          tmp.path = abs.path
          tmp.query = abs.query
          tmp.fragment = abs.fragment
          abs = tmp
        end

        # harvest path parameters
        pp = split_pp abs, only: true

        # coerce to rdf
        abs = RDF::URI(abs.to_s)

        # make an aliased copy we use to look up the uuid
        aliased = if @resolve
                    tmp = abs.dup
                    tmp.scheme = @resolve.scheme
                    tmp.authority = @resolve.authority if @resolve.authority
                    tmp
                  else
                    abs
                  end

        # warn "aliased #{abs} to #{aliased}" if @resolve


        # round-trip to uuid and back if we can
        if uuid = uris[abs] ||= canonical_uuid(aliased)
          abs = uuids[uuid] ||= canonical_uri(uuid)
        elsif cu = canonical_uri(abs)
          # otherwise just find the canonical uri
          abs = cu
        end

        # reinstate the path parameters
        if !pp.empty? && split_pp(abs, only: true).empty?
          abs = abs.dup
          abs.path = ([abs.path] + pp).join(';')
        end

        elem[attr] = @base.route_to(abs.to_s).to_s
        count += 1
      end

      block.call elem if block
    end

    count
  end

  # sponge the document for rdfa
  # NOTE(review): stub — not yet implemented.
  def triples_for
  end

  # Resolve the effective RDFa @vocab for +node+ by walking up the
  # tree; an explicitly empty @vocab yields nil.
  def vocab_for node
    if node[:vocab]
      vocab = node[:vocab].strip
      return nil if vocab == ''
      return vocab
    end
    parent = node.parent
    vocab_for parent if parent and parent.element?
  end

  # Accumulate the prefix mappings in scope at +node+: XML namespace
  # declarations overlaid with RDFa @prefix, ascending to the root.
  # Mappings supplied by the caller (i.e. from descendants) win.
  #
  # @return [Hash{Symbol=>String}] prefix => URI
  def prefixes_for node, prefixes = {}
    # start with namespaces
    pfx = node.namespace_declarations.filter(&:prefix).map do |n|
      [n.prefix.to_sym, n.href]
    end.to_h

    # then add @prefix overtop of the namespaces
    if node[:prefix]
      x = node[:prefix].strip.split(/\s+/)
      a = []
      b = []
      x.each_index { |i| (i % 2 == 0 ? a : b).push x[i] }
      a.map!(&:to_sym)
      # if the size is uneven the values will be nil, so w drop em
      pfx.merge! a.zip(b).to_h.reject { |_, v| v.nil? }
    end

    # since we're ascending the tree, input takes precedence
    prefixes = pfx.merge prefixes

    if node.parent and node.parent.element?
      prefixes_for(node.parent, prefixes)
    else
      prefixes
    end
  end

  # give us the rdf subject of the node itself
  # Resolves the RDFa subject for +node+ per @about/@resource/@href/
  # @src/@typeof/@inlist semantics, recursing to ancestors as needed.
  #
  # @param node [Nokogiri::XML::Element] defaults to the document root
  # @param rdf [true, false] return RDF::URI rather than URI
  # @param is_ancestor [true, false] internal flag: node is being
  #   inspected as an ancestor of the original node
  # @raise [RuntimeError] if +node+ is not an element
  def subject_for node = nil, rdf: false, is_ancestor: false
    node ||= @doc.root
    raise 'Node must be an element' unless
      node.is_a? Nokogiri::XML::Element

    # first we check for an ancestor element with @property and no
    # @content; if we find one then we reevaluate with that
    # element as the starting point
    if n = node.at_xpath(LITXP)
      return subject_for n
    end

    # answer a bunch of helpful questions about this element
    subject = nil
    base = base_for node
    parent = node.parent
    ns_href = node.namespace.href if node.namespace
    up_ok = %i{rel rev}.none? { |a| node[a] }
    is_root = !parent or parent.document?
    special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
      (ns_href == XHTMLNS or /^(?:[^:]+:)?html$/xi === parent.name)

    # if the node is being inspected as an ancestor to the
    # original node, we have to check it backwards.
    if is_ancestor
      # ah right @resource gets special treatment
      if subject = node[:resource]
        subject.strip!
        # NOTE(review): this safe-CURIE branch is empty in the
        # original — bracketed @resource values are not resolved.
        if m = /^\[(.*?)\]$/.match(subject)
        end
      else
        OBJS.each do |attr|
          if node[attr]
            # merge with the root and return it
            subject = base + node[attr]
            break
          end
        end
      end

      return rdf ? RDF::URI(subject.to_s) : subject

      # note if we are being called with is_ancestor, that means
      # the original node (or indeed any of the nodes previously
      # tested) have anything resembling a resource in them. this
      # means @rel/@rev should be ignored, and we should keep
      # looking for a subject.
    end

    if node[:about]

      if m = /^_:(.*)$/.match(node[:about])
        return RDF::Node(m[1])
      end

      # XXX resolve @about against potential curie
      subject = base + node[:about]

    elsif is_root
      subject = base
    elsif special
      subject = subject_for parent
    elsif node[:resource]
      # XXX resolve @about against potential curie
      subject = base + node[:resource]
    elsif node[:href]
      subject = base + node[:href]
    elsif node[:src]
      subject = base + node[:src]
    elsif node[:typeof]
      # bnode the typeof attr

      # note we return bnodes irrespective of the rdf flag
      return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
    elsif node[:inlist]
      # bnode the inlist attr
      return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
    elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
        (is_ancestor && !up_ok)
      # bnode the element
      return RDF::Node('id-%016x' % node.pointer_id)
    # elsif node[:id]
    else
      subject = subject_for parent, is_ancestor: true
    end

    rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)

  end

  # backlink structure
  # Build an XML::Mixup structure (or, when terse, an array of <link>
  # specs) for resources that point AT this document's subject.
  #
  # @param published [true, false] only include published subjects
  # @param struct [Hash, nil] a precomputed reverse struct to mine
  #   instead of querying the repo
  # @param ignore [Proc, #to_set, nil] subjects (or subject/predicate
  #   pairs, for a Proc) to skip
  # @param pattern [Regexp, nil] keep only matching subjects
  # @param terse [true, false] emit bare <link>s instead of a <nav>
  # @return [Hash, Array, nil] nil when there are no backlinks
  def generate_backlinks published: true, struct: nil,
      ignore: nil, pattern: nil, terse: false
    uri = canonical_uri(subject, rdf: false) || URI(uri_pp subject)
    ignore = case ignore
             when nil then Set.new
             when Proc then ignore
             when -> x { x.respond_to? :to_set } then ignore = ignore.to_set
             else
               raise 'ignore must be either a proc or amenable to a set'
             end
    nodes = {}
    labels = {}
    types = {}

    if struct
      struct.each do |p, subjects|
        subjects.each do |s|
          case ignore
          when Proc then next if ignore.call s, p
          when Set then next if ignore.include? s
          end
          preds = nodes[s] ||= Set.new
          preds << p
          types[s] ||= asserted_types s
          labels[s] ||= label_for s
          labels[p] ||= label_for p unless terse
        end
      end
    else
      @repo.query([nil, nil, subject]).each do |stmt|
        s = stmt.subject
        case ignore
        when Proc then next if ignore.call stmt
        when Set then next if ignore.include? s
        end
        preds = nodes[s] ||= Set.new
        preds << (p = stmt.predicate)
        types[s] ||= asserted_types s
        labels[s] ||= label_for s
        labels[p] ||= label_for p unless terse
      end
    end

    # prune out nonmatching
    nodes.select! { |k, _| pattern.match? k.to_s } if
      pattern and pattern.is_a? Regexp

    # prune out unpublished
    nodes.select! { |k, _| published? k } if published

    return if nodes.empty?

    if terse
      nodes.map do |rsrc, preds|
        cu = canonical_uri(rsrc, rdf: false) or next
        lab = labels[rsrc] || [nil, rsrc]
        link = { nil => :link, rel: '', href: uri.route_to(cu),
          rev: abbreviate(preds) }
        link[:typeof] = abbreviate(types[rsrc]) if types[rsrc]
        link[:title] = lab.last if lab.last
        link
      end.compact
    else
      li = nodes.sort do |a, b|
        cmp_label a.first, b.first, labels: labels
      end.map do |rsrc, preds|
        cu = canonical_uri(rsrc, rdf: false) or next
        lab = labels[rsrc] || [nil, rsrc]
        lp = abbreviate(lab.first) if lab.first
        ty = abbreviate(types[rsrc]) if types[rsrc]

        { [{ [{ [lab[1].to_s] => :span, property: lp }] => :a, typeof: ty,
          href: uri.route_to(cu), rev: abbreviate(preds) }] => :li }
      end.compact

      { [{ li => :ul }] => :nav }
    end
  end

  # goofy twitter-specific metadata
  # Produce twitter-card <meta> specs for the subject: card type,
  # author's @handle, title, optional description and image. Returns
  # nil when the author or their twitter account can't be resolved.
  def generate_twitter_meta
    # get author
    author = authors_for(subject, unique: true) or return

    return unless author.is_a? RDF::Resource

    # get author's twitter account
    twitter = objects_for(author, RDF::Vocab::FOAF.account,
      only: :resource).select { |t| t.to_s =~ /twitter\.com/
    }.sort.first or return
    twitter = URI(twitter.to_s).path.split(/\/+/)[1]
    twitter = ?@ + twitter unless twitter.start_with? ?@

    # get title
    title = label_for(subject) or return

    out = [
      { nil => :meta, name: 'twitter:card', content: :summary },
      { nil => :meta, name: 'twitter:site', content: twitter },
      { nil => :meta, name: 'twitter:title', content: title[1].to_s }
    ]

    # get abstract
    if desc = label_for(subject, desc: true)
      out.push({ nil => :meta, name: 'twitter:description',
        content: desc[1].to_s })
    end

    # get image (foaf:depiction)
    img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
    unless img.empty?
      img = img[0].to_s
      out.push({ nil => :meta, name: 'twitter:image', content: img })
      out[0][:content] = :summary_large_image
    end

    # return the appropriate xml-mixup structure
    out
  end

  # Regenerate the document as XHTML: strips comments, rewrites body
  # links, mines the subject's forward/reverse structs for head
  # <link>/<meta> elements, prunes prefixes to those actually used,
  # and reassembles everything with xhtml_stub.
  #
  # @param published [true, false] restrict backlinks to published
  # @param titles [true, false] fill empty @title from labels
  # @return [Nokogiri::XML::Document] the transformed document
  def transform_xhtml published: true, titles: false
    # before we do any more work make sure this is html
    doc = @doc.dup 1
    body = doc.at_xpath('//html:body[1]', XPATHNS) || doc.root

    # eliminate comments
    doc.xpath('//comment()[not(ancestor::html:script)]', XPATHNS).each do |c|
      c.unlink
    end

    # initial stuff
    struct = struct_for @subject, uuids: true, canon: true
    rstruct = struct_for @subject, uuids: true, canon: true, rev: true
    resources = {}
    literals = {}
    ufwd = {} # uuid => uri
    urev = {} # uri => uuid
    datatypes = Set.new
    types = Set.new
    authors = authors_for @subject
    title = label_for @subject, candidates: struct
    desc = label_for @subject, candidates: struct, desc: true

    # warn struct

    # rewrite content
    title = title[1] if title
    desc = desc[1] if desc

    # `struct` and `rstruct` will contain all the links and
    # metadata for forward and backward neighbours, respectively,
    # which we need to mine (predicates, classes, datatypes) for
    # prefixes among other things.

    struct.each do |p, v|
      v.each do |o|
        if o.literal?
          literals[o] ||= Set.new
          literals[o].add p

          # collect the datatype
          datatypes.add o.datatype if o.has_datatype?
        else
          # normalize URIs
          if o.to_s.start_with? 'urn:uuid:'
            ufwd[o] ||= canonical_uri o
          elsif cu = urev[o] || canonical_uuid(o)
            o = urev[o] ||= cu
          end

          # collect the resource
          resources[o] ||= Set.new
          resources[o].add p

          # add to type
          types.add o if p == RDF::RDFV.type
        end
      end
    end

    urev.merge! ufwd.invert

    labels = resources.keys.map do |k|
      # turn this into a pair which subsequently gets turned into a hash
      [k, label_for(k) ]
    end.to_h

    #warn labels

    # handle the title
    title ||= RDF::Literal('')
    tm = { '#title' => title,
      property: abbreviate(literals[title].to_a, vocab: XHV) }
    if tl = title.language
      tm['xml:lang'] = tl # if xmlns
      tm['lang'] = tl
    elsif tdt = title.datatype and tdt != RDF::XSD.string
      tm[:datatype] = abbreviate(tdt)
    end

    # we accumulate a record of the links in the body so we know
    # which ones to skip in the head
    bodylinks = {}
    rewrite_links body, uuids: ufwd, uris: urev do |elem|
      vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
      vocab = uri_pp(vocab.to_s) if vocab

      if elem.key?('href') or elem.key?('src')
        begin
          vu = uri_pp(elem['href'] || elem['src'])
          ru = RDF::URI(@base.merge(vu))
          bodylinks[urev[ru] || ru] = true

          if rel = resources[urev[ru] || ru]
            elem['rel'] = (abbreviate rel, vocab: vocab).join ' '
          end

          label = labels[urev[ru] || ru]
          if titles and label and
              (!elem.key?('title') or elem['title'].strip == '')
            elem['title'] = label[1].to_s
          end
        rescue URI::InvalidComponentError => e
          warn "#{e}: #{vu} in #{@subject}"
        end
      end
    end

    # and now we do the head
    links = []
    resources.reject { |k, _| bodylinks[k] }.each do |k, v|
      v = v.dup.delete RDF::RDFV.type
      next if v.empty?
      mts = formats_for k

      # warn k, v.inspect

      # warn k, mts.inspect

      rel = abbreviate v.to_a, vocab: XHV
      ru = @base.route_to(uri_pp (ufwd[k] || k).to_s)
      ln = { nil => :link, rel: rel, href: ru.to_s }
      if (label = labels[urev[k] || k])
        ln[:title] = label[1].to_s
      end

      # add type=lol/wut
      ln[:type] = mts.first.to_s unless mts.empty?

      if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
        ln[:type] = 'text/css'
      elsif ln[:type] =~ /(java|ecma)script/i or
          v.include?(RDF::Vocab::DC.requires)
        ln[nil] = :script
        ln[:src] = ln.delete :href
        ln[:type] ||= 'text/javascript'
      end
      links.push ln
    end

    links.sort! do |a, b|
      # sort by rel, then by href
      # warn a.inspect, b.inspect
      s = 0
      [nil, :rel, :rev, :href, :title].each do |k|
        s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
        break if s != 0
      end
      s
    end

    # we want to duplicate links from particular subjects (eg the root)
    (@duplicate || {}).sort do |a, b|
      a.first <=> b.first
    end.each do |s, preds|

      o = {}
      u = ufwd[s] ||= canonical_uuid s
      s = urev[u] ||= canonical_uri u if u
      f = {}

      # do not include this subject as these links are already included!
      next if u == @subject

      # gather up the objects, then gather up the predicates

      objects_for u || s, preds, only: :resource do |obj, rel|
        # XXX do not know why += |= etc does not work
        x = canonical_uuid(obj) || obj
        urev[x] ||= canonical_uri x
        y = o[x] ||= Set.new
        o[x] = y | rel
        f[x] = formats_for x
      end

      srel = @base.route_to((u ? urev[u] || s : s).to_s)

      # now collect all the other predicates
      o.keys.each do |obj|
        hrel = @base.route_to((urev[obj] || obj).to_s)
        o[obj] |= @repo.query([u || s, nil, obj]).predicates.to_set
        rels = abbreviate o[obj].to_a, vocab: XHV
        ln = { nil => :link, about: srel, rel: rels, href: hrel }
        ln[:type] = f[obj].first if f[obj]

        # add to links
        links << ln
      end
    end

    meta = []

    # include author names as old school meta tags
    authors.each do |a|
      name = labels[urev[a] || a] or next
      datatypes.add name[0] # a convenient place to chuck this
      prop = abbreviate(name[0])
      name = name[1]
      about = @base.route_to((ufwd[a] || a).to_s)
      tag = { nil => :meta, about: about.to_s, name: :author,
        property: prop, content: name.to_s }

      if name.has_datatype? and name.datatype != RDF::XSD.string
        tag[:datatype] = abbreviate(name.datatype)
      elsif name.has_language?
        tag['xml:lang'] = tag[:lang] = name.language
      end
      meta.push tag
    end

    literals.each do |k, v|
      next if k == title
      rel = abbreviate v.to_a, vocab: XHV
      elem = { nil => :meta, property: rel, content: k.to_s }
      elem[:name] = :description if k == desc

      if k.has_datatype?
        datatypes.add k.datatype # so we get the prefix
        elem[:datatype] = abbreviate k.datatype, vocab: XHV
      end

      meta.push(elem)
    end

    meta.sort! do |a, b|
      s = 0
      [:about, :property, :datatype, :content, :name].each do |k|
        # warn a.inspect, b.inspect
        s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
        break if s != 0
      end
      s
    end

    # don't forget style tag
    style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })

    body = body.dup 1
    body = { '#body' => body.children.to_a, about: '' }
    body[:typeof] = abbreviate(types.to_a, vocab: XHV) unless
      types.empty?



    # prepare only the prefixes we need to resolve the data we need
    rsc = abbreviate(
      (struct.keys + resources.keys + datatypes.to_a +
        types.to_a + rstruct.to_a.flatten).uniq, noop: false).map do |x|
      next if x.nil?
      x.split(?:)[0].to_sym
    end.reject(&:nil?).to_set

    # warn rsc

    pfx = prefixes.select do |k, _|
      rsc.include? k
    end.transform_values { |v| v.to_s }

    # XXX deal with the qb:Observation separately (just nuke it for now)
    extra = generate_twitter_meta || []
    bl_op = begin
      bads = @repo.query(
        [nil, RDF::SAK::CI.document, @subject]).subjects.to_set
      nope = %w[top contents index].map { |x| RDF::Vocab::XHV[x] }
      lambda { |s, p| bads.include? s or nope.include? p }
    end
    if bl = generate_backlinks(
        published: published, pattern: /^urn:uuid:/, terse: true,
        struct: rstruct, ignore: bl_op)
      extra << bl #{ [bl] => :object }
    end

    # and now for the document
    xf = @transform
    doc = xhtml_stub(
      base: @base, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
      link: links, meta: meta, style: style, transform: xf,
      extra: extra, body: body).document

    # goddamn script tags and text/html
    doc.xpath('//html:script[@src][not(node())]', XPATHNS).each do |script|
      script << doc.create_text_node('')
    end

    doc
  end



end
|