rdf-microdata 2.2.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,22 +8,23 @@ module RDF::Microdata
8
8
  #
9
9
  # Based on processing rules, amended with the following:
10
10
  #
11
- # @see http://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html
12
- # @author [Gregg Kellogg](http://greggkellogg.net/)
11
+ # @see https://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html
12
+ # @author [Gregg Kellogg](https://greggkellogg.net/)
13
13
  class Reader < RDF::Reader
14
14
  format Format
15
15
  include Expansion
16
16
  include RDF::Util::Logger
17
17
  URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)
18
- DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json"))
19
18
 
20
19
  # @private
21
20
  class CrawlFailure < StandardError; end
22
21
 
23
- # @!attribute [r] implementation
24
22
  # @return [Module] Returns the HTML implementation module for this reader instance.
25
23
  attr_reader :implementation
26
24
 
25
+ # @return [Hash{Object => RDF::Resource}] maps RDF elements (items) to resources
26
+ attr_reader :memory
27
+
27
28
  ##
28
29
  # Returns the base URI determined by this reader.
29
30
  #
@@ -36,109 +37,38 @@ module RDF::Microdata
36
37
  @options[:base_uri]
37
38
  end
38
39
 
39
- # Interface to registry
40
- class Registry
41
- # @return [RDF::URI] Prefix of vocabulary
42
- attr_reader :uri
43
-
44
- # @return [Hash] properties
45
- attr_reader :properties
46
-
47
- ##
48
- # Initialize the registry from a URI or file path
49
- #
50
- # @param [String] registry_uri
51
- def self.load_registry(registry_uri)
52
- return if @registry_uri == registry_uri
53
-
54
- json = RDF::Util::File.open_file(registry_uri) { |f| JSON.load(f) }
55
-
56
- @prefixes = {}
57
- json.each do |prefix, elements|
58
- next unless elements.is_a?(Hash)
59
- properties = elements.fetch("properties", {})
60
- @prefixes[prefix] = Registry.new(prefix, properties)
61
- end
62
- @registry_uri = registry_uri
63
- end
64
-
65
- ##
66
- # Initialize registry for a particular prefix URI
67
- #
68
- # @param [RDF::URI] prefixURI
69
- # @param [Hash] properties ({})
70
- def initialize(prefixURI, properties = {})
71
- @uri = prefixURI
72
- @properties = properties
73
- @property_base = prefixURI.to_s
74
- # Append a '#' for fragment if necessary
75
- @property_base += '#' unless %w(/ #).include?(@property_base[-1,1])
76
- end
77
-
78
- ##
79
- # Find a registry entry given a type URI
80
- #
81
- # @param [RDF::URI] type
82
- # @return [Registry]
83
- def self.find(type)
84
- @prefixes ||= {}
85
- k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 }
86
- @prefixes[k] if k
87
- end
88
-
89
- ##
90
- # Generate a predicateURI given a `name`
91
- #
92
- # @param [#to_s] name
93
- # @param [Hash{}] ec Evaluation Context
94
- # @return [RDF::URI]
95
- def predicateURI(name, ec)
96
- u = RDF::URI(name)
97
- # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_
98
- return u if u.absolute?
99
-
100
- n = frag_escape(name)
101
- if ec[:current_type].nil?
102
- # 2) If current type from context is null, there can be no current vocabulary.
103
- # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name
104
- u = RDF::URI(ec[:document_base].to_s)
105
- u.fragment = frag_escape(name)
106
- u
107
- else
108
- # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/).
109
- RDF::URI(@property_base + n)
110
- end
111
- end
40
+ ##
41
+ # Reader options
42
+ # @see https://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method
43
+ def self.options
44
+ super + [
45
+ RDF::CLI::Option.new(
46
+ symbol: :rdfa,
47
+ datatype: TrueClass,
48
+ on: ["--rdfa"],
49
+ description: "Transform and parse as RDFa.") {true},
50
+ ]
51
+ end
112
52
 
113
- ##
114
- # Yield a equivalentProperty or subPropertyOf if appropriate
115
- #
116
- # @param [RDF::URI] predicateURI
117
- # @yield equiv
118
- # @yieldparam [RDF::URI] equiv
119
- def expand(predicateURI)
120
- tok = tokenize(predicateURI)
121
- if @properties[tok].is_a?(Hash)
122
- value = @properties[tok].fetch("subPropertyOf", nil)
123
- value ||= @properties[tok].fetch("equivalentProperty", nil)
124
-
125
- Array(value).each {|equiv| yield RDF::URI(equiv)}
53
+ ##
54
+ # Redirect for RDFa Reader given `:rdfa` option
55
+ #
56
+ # @private
57
+ def self.new(input = nil, **options, &block)
58
+ klass = if options[:rdfa]
59
+ # Requires rdf-rdfa gem to be loaded
60
+ begin
61
+ require 'rdf/rdfa'
62
+ rescue LoadError
63
+ raise ReaderError, "Use of RDFa-based reader requires rdf-rdfa gem"
126
64
  end
65
+ RdfaReader
66
+ else
67
+ self
127
68
  end
128
-
129
- ##
130
- # Turn a predicateURI into a simple token
131
- # @param [RDF::URI] predicateURI
132
- # @return [String]
133
- def tokenize(predicateURI)
134
- predicateURI.to_s.sub(@property_base, '')
135
- end
136
-
137
- ##
138
- # Fragment escape a name
139
- def frag_escape(name)
140
- name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase}
141
- end
69
+ reader = klass.allocate
70
+ reader.send(:initialize, input, **options, &block)
71
+ reader
142
72
  end
143
73
 
144
74
  ##
@@ -164,7 +94,7 @@ module RDF::Microdata
164
94
  # @yieldparam [RDF::Reader] reader
165
95
  # @yieldreturn [void] ignored
166
96
  # @raise [Error] Raises `RDF::ReaderError` when validating
167
- def initialize(input = $stdin, options = {}, &block)
97
+ def initialize(input = $stdin, **options, &block)
168
98
  super do
169
99
  @library = :nokogiri
170
100
 
@@ -173,17 +103,17 @@ module RDF::Microdata
173
103
  self.extend(@implementation)
174
104
 
175
105
  input.rewind if input.respond_to?(:rewind)
176
- initialize_html(input, options) rescue log_fatal($!.message, exception: RDF::ReaderError)
106
+ initialize_html(input, **options) rescue log_fatal($!.message, exception: RDF::ReaderError)
177
107
 
178
108
  log_error("Empty document") if root.nil?
179
109
  log_error(doc_errors.map(&:message).uniq.join("\n")) if !doc_errors.empty?
180
110
 
181
- log_debug(@doc, "library = #{@library}")
111
+ log_debug('', "library = #{@library}")
182
112
 
183
113
  # Load registry
184
114
  begin
185
- registry_uri = options[:registry] || DEFAULT_REGISTRY
186
- log_debug(@doc, "registry = #{registry_uri.inspect}")
115
+ registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY
116
+ log_debug('', "registry = #{registry_uri.inspect}")
187
117
  Registry.load_registry(registry_uri)
188
118
  rescue JSON::ParserError => e
189
119
  log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?)
@@ -270,6 +200,7 @@ module RDF::Microdata
270
200
  # Parsing a Microdata document (this is *not* the recursive method)
271
201
  def parse_whole_document(doc, base)
272
202
  base = doc_base(base)
203
+ @memory = {}
273
204
  options[:base_uri] = if (base)
274
205
  # Strip any fragment from base
275
206
  base = base.to_s.split('#').first
@@ -280,15 +211,9 @@ module RDF::Microdata
280
211
 
281
212
  log_info(nil) {"parse_whole_doc: base='#{base}'"}
282
213
 
283
- ec = {
284
- memory: {},
285
- current_type: nil,
286
- current_vocabulary: nil,
287
- document_base: base,
288
- }
289
214
  # 1) For each element that is also a top-level item, Generate the triples for that item using the evaluation context.
290
215
  getItems.each do |el|
291
- log_depth {generate_triples(el, ec)}
216
+ log_depth {generate_triples(el, Registry.new(nil))}
292
217
  end
293
218
 
294
219
  log_info(doc, "parse_whole_doc: traversal complete")
@@ -298,12 +223,11 @@ module RDF::Microdata
298
223
  # Generate triples for an item
299
224
  #
300
225
  # @param [RDF::Resource] item
301
- # @param [Hash{Symbol => Object}] ec
226
+ # @param [Registry] vocab
302
227
  # @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory
303
- # @option ec [RDF::Resource] :current_type
228
+ # @option ec [RDF::Resource] :current_vocabulary
304
229
  # @return [RDF::Resource]
305
- def generate_triples(item, ec = {})
306
- memory = ec[:memory]
230
+ def generate_triples(item, vocab)
307
231
  # 1) If there is an entry for item in memory, then let subject be the subject of that entry. Otherwise, if item has a global identifier and that global identifier is an absolute URL, let subject be that global identifier. Otherwise, let subject be a new blank node.
308
232
  subject = if memory.include?(item.node)
309
233
  memory[item.node][:subject]
@@ -312,12 +236,13 @@ module RDF::Microdata
312
236
  end || RDF::Node.new
313
237
  memory[item.node] ||= {}
314
238
 
315
- log_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"}
239
+ log_debug(item) {"gentrips(2): subject=#{subject.inspect}, vocab: #{vocab.inspect}"}
316
240
 
317
241
  # 2) Add a mapping from item to subject in memory, if there isn't one already.
318
242
  memory[item.node][:subject] ||= subject
319
243
 
320
244
  # 3) For each type returned from element.itemType of the element defining the item.
245
+ # 4) Set vocab to the first value returned from element.itemType of the element defining the item.
321
246
  type = nil
322
247
  item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t|
323
248
  # 3.1. If type is an absolute URL, generate the following triple:
@@ -325,36 +250,26 @@ module RDF::Microdata
325
250
  add_triple(item, subject, RDF.type, t)
326
251
  end
327
252
 
328
- # 4) Set type to the first value returned from element.itemType of the element defining the item.
329
-
330
- # 5) Otherwise, set type to current type from the Evaluation Context if not empty.
331
- type ||= ec[:current_type]
332
- log_debug(item) {"gentrips(5): type=#{type.inspect}"}
333
-
334
- # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the URI prefix, set vocab as that URI prefix.
335
- vocab = Registry.find(type)
336
-
337
- # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from the path component of type.
338
- vocab ||= begin
339
- type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1')
340
- log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"}
341
- Registry.new(type_vocab)
253
+ # 6) If the registry contains a URI prefix that is a character for character match of vocab up to the length of the URI prefix, set vocab as that URI prefix.
254
+ if type || vocab.nil?
255
+ vocab = Registry.find(type) || begin
256
+ type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless type.nil?
257
+ log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"}
258
+ Registry.new(type_vocab)
259
+ end
342
260
  end
343
261
 
344
- # 8) Update evaluation context setting current vocabulary to vocab.
345
- ec[:current_vocabulary] = vocab
262
+ # Otherwise, use vocab from evaluation context
263
+ log_debug(item) {"gentrips(8): vocab: #{vocab.inspect}"}
346
264
 
347
265
  # 9. For each element _element_ that has one or more property names and is one of the properties of the item _item_, run the following substep:
348
266
  props = item_properties(item)
349
267
  # 9.1. For each name name in element's property names, run the following substeps:
350
268
  props.each do |element|
351
269
  element.attribute('itemprop').to_s.split(' ').compact.each do |name|
352
- log_debug(item) {"gentrips(9.1): name=#{name.inspect}, type=#{type}"}
353
- # 9.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
354
- ec_new = ec.merge({current_type: type, current_vocabulary: vocab})
355
-
270
+ log_debug(item) {"gentrips(9.1): name=#{name.inspect}, vocab=#{vocab.inspect}"}
356
271
  # 9.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate.
357
- predicate = vocab.predicateURI(name, ec_new)
272
+ predicate = vocab.predicateURI(name, base_uri)
358
273
 
359
274
  # 9.1.3) Let value be the property value of element.
360
275
  value = property_value(element)
@@ -362,7 +277,7 @@ module RDF::Microdata
362
277
 
363
278
  # 9.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps.
364
279
  if value.is_a?(Hash)
365
- value = generate_triples(element, ec_new)
280
+ value = generate_triples(element, vocab)
366
281
  log_debug(item) {"gentrips(9.1.4): value=#{value.inspect}"}
367
282
  end
368
283
 
@@ -384,11 +299,9 @@ module RDF::Microdata
384
299
  props.each do |element|
385
300
  element.attribute('itemprop-reverse').to_s.split(' ').compact.each do |name|
386
301
  log_debug(item) {"gentrips(10.1): name=#{name.inspect}"}
387
- # 10.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
388
- ec_new = ec.merge({current_type: type, current_vocabulary: vocab})
389
302
 
390
303
  # 10.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate.
391
- predicate = vocab.predicateURI(name, ec_new)
304
+ predicate = vocab.predicateURI(name, base_uri)
392
305
 
393
306
  # 10.1.3) Let value be the property value of element.
394
307
  value = property_value(element)
@@ -396,7 +309,7 @@ module RDF::Microdata
396
309
 
397
310
  # 10.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps.
398
311
  if value.is_a?(Hash)
399
- value = generate_triples(element, ec_new)
312
+ value = generate_triples(element, vocab)
400
313
  log_debug(item) {"gentrips(10.1.4): value=#{value.inspect}"}
401
314
  elsif value.is_a?(RDF::Literal)
402
315
  # 10.1.5) Otherwise, if value is a literal, ignore the value and continue to the next name; it is an error for the value of @itemprop-reverse to be a literal
@@ -432,13 +345,13 @@ module RDF::Microdata
432
345
  # To crawl the properties of an element root with a list memory, the user agent must run the following steps. These steps either fail or return a list with a count of errors. The count of errors is used as part of the authoring conformance criteria below.
433
346
  #
434
347
  # @param [Nokogiri::XML::Element] root
435
- # @param [Array<Nokogiri::XML::Element>] memory
348
+ # @param [Array<Nokogiri::XML::Element>] memo
436
349
  # @param [Boolean] reverse crawl reverse properties
437
350
  # @return [Array<Nokogiri::XML::Element>]
438
351
  # Resultant elements
439
- def crawl_properties(root, memory, reverse)
440
- # 1. If root is in memory, then the algorithm fails; abort these steps.
441
- raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root)
352
+ def crawl_properties(root, memo, reverse)
353
+ # 1. If root is in memo, then the algorithm fails; abort these steps.
354
+ raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memo.include?(root)
442
355
 
443
356
  # 2. Collect all the elements in the item root; let results be the resulting list of elements, and errors be the resulting count of errors.
444
357
  results = elements_in_item(root)
@@ -447,13 +360,13 @@ module RDF::Microdata
447
360
  # 3. Remove any elements from results that do not have an @itemprop (@itemprop-reverse) attribute specified.
448
361
  results = results.select {|e| e.has_attribute?(reverse ? 'itemprop-reverse' : 'itemprop')}
449
362
 
450
- # 4. Let new memory be a new list consisting of the old list memory with the addition of root.
451
- raise CrawlFailure, "itemref recursion" if memory.detect {|n| root.node.object_id == n.node.object_id}
452
- new_memory = memory + [root]
363
+ # 4. Let new memo be a new list consisting of the old list memo with the addition of root.
364
+ raise CrawlFailure, "itemref recursion" if memo.detect {|n| root.node.object_id == n.node.object_id}
365
+ new_memo = memo + [root]
453
366
 
454
- # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memory as the memory.
367
+ # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memo as the memo.
455
368
  results.select {|e| e.has_attribute?('itemscope')}.each do |element|
456
- log_depth {crawl_properties(element, new_memory, reverse)}
369
+ log_depth {crawl_properties(element, new_memo, reverse)}
457
370
  end
458
371
 
459
372
  results
@@ -469,7 +382,7 @@ module RDF::Microdata
469
382
  def elements_in_item(root)
470
383
  # Let results and pending be empty lists of elements.
471
384
  # Let errors be zero.
472
- results, memory, errors = [], [], 0
385
+ results, memo, errors = [], [], 0
473
386
 
474
387
  # Add all the children elements of root to pending.
475
388
  pending = root.elements
@@ -487,13 +400,13 @@ module RDF::Microdata
487
400
 
488
401
  # Loop: Remove an element from pending and let current be that element.
489
402
  while current = pending.shift
490
- if memory.include?(current)
403
+ if memo.include?(current)
491
404
  raise CrawlFailure, "elements_in_item: results already includes #{current.inspect}"
492
405
  elsif !current.has_attribute?('itemscope')
493
406
  # If current is not already in results and current does not have an itemscope attribute, then: add all the child elements of current to pending.
494
407
  pending += current.elements
495
408
  end
496
- memory << current
409
+ memo << current
497
410
 
498
411
  # If current is not already in results, then: add current to results.
499
412
  results << current unless results.include?(current)
@@ -510,7 +423,7 @@ module RDF::Microdata
510
423
  value = case
511
424
  when element.has_attribute?('itemscope')
512
425
  {}
513
- when element.name == 'meta'
426
+ when element.has_attribute?('content')
514
427
  RDF::Literal.new(element.attribute('content').to_s, language: element.language)
515
428
  when %w(data meter).include?(element.name) && element.attribute('value')
516
429
  # Lexically scan value and assign appropriate type, otherwise, leave untyped
@@ -3,7 +3,7 @@ module RDF::Microdata
3
3
  ##
4
4
  # Nokogiri implementation of an HTML parser.
5
5
  #
6
- # @see http://nokogiri.org/
6
+ # @see https://nokogiri.org/
7
7
  module Nokogiri
8
8
  ##
9
9
  # Returns the name of the underlying XML library.
@@ -103,6 +103,12 @@ module RDF::Microdata
103
103
  NodeSetProxy.new(@node.elements, self)
104
104
  end
105
105
 
106
+ ##
107
+ # Rational debug output
108
+ def to_str
109
+ @node.path
110
+ end
111
+
106
112
  ##
107
113
  # Proxy for everything else to @node
108
114
  def method_missing(method, *args)
@@ -172,7 +178,7 @@ module RDF::Microdata
172
178
  #
173
179
  # @param [Hash{Symbol => Object}] options
174
180
  # @return [void]
175
- def initialize_html(input, options = {})
181
+ def initialize_html(input, **options)
176
182
  require 'nokogiri' unless defined?(::Nokogiri)
177
183
  @doc = case input
178
184
  when ::Nokogiri::XML::Document
@@ -188,7 +194,7 @@ module RDF::Microdata
188
194
  begin
189
195
  require 'nokogumbo' unless defined?(::Nokogumbo)
190
196
  input = input.read if input.respond_to?(:read)
191
- ::Nokogiri::HTML5(input.force_encoding(options[:encoding]))
197
+ ::Nokogiri::HTML5(input.force_encoding(options[:encoding]), max_parse_errors: 1000)
192
198
  rescue LoadError
193
199
  ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
194
200
  end
@@ -206,7 +212,9 @@ module RDF::Microdata
206
212
  ##
207
213
  # Document errors
208
214
  def doc_errors
209
- @doc.errors.reject {|e| e.to_s =~ /The doctype must be the first token in the document/}
215
+ @doc.errors.reject do |e|
216
+ e.to_s =~ %r{(The doctype must be the first token in the document)|(Expected a doctype token)|(Unexpected '\?' where start tag name is expected)}
217
+ end
210
218
  end
211
219
 
212
220
  ##
@@ -224,7 +232,7 @@ module RDF::Microdata
224
232
  ##
225
233
  # Based on Microdata element.getItems
226
234
  #
227
- # @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
235
+ # @see https://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
228
236
  def getItems
229
237
  @doc.css('[itemscope]').select {|el| !el.has_attribute?('itemprop')}.map {|n| NodeProxy.new(n)}
230
238
  end