rdf-microdata 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/AUTHORS +1 -0
- data/README +80 -0
- data/UNLICENSE +24 -0
- data/VERSION +1 -0
- data/lib/rdf/microdata.rb +34 -0
- data/lib/rdf/microdata/extensions.rb +34 -0
- data/lib/rdf/microdata/format.rb +21 -0
- data/lib/rdf/microdata/reader.rb +488 -0
- data/lib/rdf/microdata/version.rb +18 -0
- data/lib/rdf/microdata/vocab.rb +5 -0
- metadata +141 -0
data/AUTHORS
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
* Gregg Kellogg <gregg@kellogg-assoc.com>
|
data/README
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# RDF::Microdata reader
|
2
|
+
|
3
|
+
[Microdata][] parser for RDF.rb.
|
4
|
+
|
5
|
+
## DESCRIPTION
|
6
|
+
RDF::Microdata is a Microdata reader for Ruby using the [RDF.rb][RDF.rb] library suite.
|
7
|
+
|
8
|
+
## FEATURES
|
9
|
+
RDF::Microdata parses [Microdata][] into statements or triples.
|
10
|
+
|
11
|
+
* Microdata parser.
|
12
|
+
* Uses Nokogiri for parsing HTML
|
13
|
+
|
14
|
+
Install with 'gem install rdf-microdata'
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
### Reading RDF data in the Microdata format
|
19
|
+
|
20
|
+
graph = RDF::Graph.load("etc/foaf.html", :format => :microdata)
|
21
|
+
|
22
|
+
## Dependencies
|
23
|
+
* [RDF.rb](http://rubygems.org/gems/rdf) (>= 0.3.3)
|
24
|
+
* [Nokogiri](http://rubygems.org/gems/nokogiri) (>= 1.4.4)
|
25
|
+
|
26
|
+
## Documentation
|
27
|
+
Full documentation available on [RubyForge](http://rdf.rubyforge.org/microdata)
|
28
|
+
|
29
|
+
### Principal Classes
|
30
|
+
* {RDF::Microdata::Format}
|
31
|
+
* {RDF::Microdata::HTML}
|
32
|
+
Asserts :html format, text/html mime-type and .html file extension.
|
33
|
+
* {RDF::Microdata::Reader}
|
34
|
+
|
35
|
+
### Additional vocabularies
|
36
|
+
|
37
|
+
## TODO
|
38
|
+
* Add support for LibXML and REXML bindings, and use the best available
|
39
|
+
* Consider a SAX-based parser for improved performance
|
40
|
+
|
41
|
+
## Resources
|
42
|
+
* [RDF.rb][RDF.rb]
|
43
|
+
* [Documentation](http://rdf.rubyforge.org/microdata)
|
44
|
+
* [History](file:file.History.html)
|
45
|
+
* [Microdata][]
|
46
|
+
|
47
|
+
## Author
|
48
|
+
* [Gregg Kellogg](http://github.com/gkellogg) - <http://kellogg-assoc.com/>
|
49
|
+
|
50
|
+
## Contributing
|
51
|
+
|
52
|
+
* Do your best to adhere to the existing coding conventions and idioms.
|
53
|
+
* Don't use hard tabs, and don't leave trailing whitespace on any line.
|
54
|
+
* Do document every method you add using [YARD][] annotations. Read the
|
55
|
+
[tutorial][YARD-GS] or just look at the existing code for examples.
|
56
|
+
* Don't touch the `.gemspec`, `VERSION` or `AUTHORS` files. If you need to
|
57
|
+
change them, do so on your private branch only.
|
58
|
+
* Do feel free to add yourself to the `CREDITS` file and the corresponding
|
59
|
+
list in the `README`. Alphabetical order applies.
|
60
|
+
* Do note that in order for us to merge any non-trivial changes (as a rule
|
61
|
+
of thumb, additions larger than about 15 lines of code), we need an
|
62
|
+
explicit [public domain dedication][PDD] on record from you.
|
63
|
+
|
64
|
+
## License
|
65
|
+
|
66
|
+
This is free and unencumbered public domain software. For more information,
|
67
|
+
see <http://unlicense.org/> or the accompanying {file:UNLICENSE} file.
|
68
|
+
|
69
|
+
## FEEDBACK
|
70
|
+
|
71
|
+
* gregg@kellogg-assoc.com
|
72
|
+
* <http://rubygems.org/rdf-microdata>
|
73
|
+
* <http://github.com/gkellogg/rdf-microdata>
|
74
|
+
* <http://lists.w3.org/Archives/Public/public-rdf-ruby/>
|
75
|
+
|
76
|
+
[RDF.rb]: http://rdf.rubyforge.org/
|
77
|
+
[YARD]: http://yardoc.org/
|
78
|
+
[YARD-GS]: http://rubydoc.info/docs/yard/file/docs/GettingStarted.md
|
79
|
+
[PDD]: http://lists.w3.org/Archives/Public/public-rdf-ruby/2010May/0013.html
|
80
|
+
[Microdata]: http://www.w3.org/TR/2011/WD-microdata-20110525/ "HTML Microdata"
|
data/UNLICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
This is free and unencumbered software released into the public domain.
|
2
|
+
|
3
|
+
Anyone is free to copy, modify, publish, use, compile, sell, or
|
4
|
+
distribute this software, either in source code form or as a compiled
|
5
|
+
binary, for any purpose, commercial or non-commercial, and by any
|
6
|
+
means.
|
7
|
+
|
8
|
+
In jurisdictions that recognize copyright laws, the author or authors
|
9
|
+
of this software dedicate any and all copyright interest in the
|
10
|
+
software to the public domain. We make this dedication for the benefit
|
11
|
+
of the public at large and to the detriment of our heirs and
|
12
|
+
successors. We intend this dedication to be an overt act of
|
13
|
+
relinquishment in perpetuity of all present and future rights to this
|
14
|
+
software under copyright law.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
19
|
+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
20
|
+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
21
|
+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
|
24
|
+
For more information, please refer to <http://unlicense.org/>
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,34 @@
|
|
1
|
+
$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..')))
|
2
|
+
require 'rdf'
|
3
|
+
|
4
|
+
module RDF
|
5
|
+
##
|
6
|
+
# **`RDF::Microdata`** is a Microdata plugin for RDF.rb.
|
7
|
+
#
|
8
|
+
# @example Requiring the `RDF::Microdata` module
|
9
|
+
# require 'rdf/microdata'
|
10
|
+
#
|
11
|
+
# @example Parsing RDF statements from an HTML file
|
12
|
+
# RDF::Microdata::Reader.open("etc/foaf.html") do |reader|
|
13
|
+
# reader.each_statement do |statement|
|
14
|
+
# puts statement.inspect
|
15
|
+
# end
|
16
|
+
# end
|
17
|
+
#
|
18
|
+
# @see http://rdf.rubyforge.org/
|
19
|
+
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/
|
20
|
+
#
|
21
|
+
# @author [Gregg Kellogg](http://kellogg-assoc.com/)
|
22
|
+
module Microdata
|
23
|
+
require 'rdf/microdata/format'
|
24
|
+
require 'rdf/microdata/vocab'
|
25
|
+
autoload :Profile, 'rdf/microdata/profile'
|
26
|
+
autoload :Reader, 'rdf/microdata/reader'
|
27
|
+
autoload :VERSION, 'rdf/microdata/version'
|
28
|
+
|
29
|
+
def self.debug?; @debug; end
|
30
|
+
def self.debug=(value); @debug = value; end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
require 'rdf/microdata/extensions'
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
class Nokogiri::XML::Node
  ##
  # Language in effect for this node: its own lang / xml:lang attribute or,
  # failing that, the language of the nearest ancestor element.
  # A truthy result is memoized in @lang.
  def language
    @lang ||= begin
      own = attribute('lang') || attributes["lang"] || attributes["xml:lang"]
      own || (parent && parent.element? && parent.language)
    end
  end

  ##
  # xml:base in effect for this node, inherited from ancestor elements.
  # A failed lookup is memoized as +false+ so it is not repeated; callers still get nil.
  def base
    @base = (attributes['xml:base'] || (parent && parent.element? && parent.base) || false) if @base.nil?

    @base || nil
  end

  ##
  # Slash-separated path of element names from the document down to this node
  # (attributes are appended as "@name"). Used to give context to debug output.
  def display_path
    @display_path ||= if is_a?(Nokogiri::XML::Document)
      ""
    elsif is_a?(Nokogiri::XML::Element)
      parent ? "#{parent.display_path}/#{name}" : name
    elsif is_a?(Nokogiri::XML::Attr)
      "#{parent.display_path}@#{name}"
    end
  end
end
|
32
|
+
|
33
|
+
class Nokogiri::XML::Document
|
34
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module RDF::Microdata
|
2
|
+
##
|
3
|
+
# Microdata format specification.
|
4
|
+
#
|
5
|
+
# @example Obtaining a Microdata format class
|
6
|
+
# RDF::Format.for(:microdata) #=> RDF::Microdata::Format
|
7
|
+
# RDF::Format.for("etc/foaf.html")
|
8
|
+
# RDF::Format.for(:file_name => "etc/foaf.html")
|
9
|
+
# RDF::Format.for(:file_extension => "html")
|
10
|
+
# RDF::Format.for(:content_type => "text/html")
|
11
|
+
#
|
12
|
+
# @example Obtaining serialization format MIME types
|
13
|
+
# RDF::Format.content_types #=> {"text/html" => [RDF::Microdata::Format]}
|
14
|
+
#
|
15
|
+
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/
|
16
|
+
class Format < RDF::Format
|
17
|
+
content_encoding 'utf-8'
|
18
|
+
content_type 'text/html', :extension => :html
|
19
|
+
reader { RDF::Microdata::Reader }
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,488 @@
|
|
1
|
+
require 'nokogiri' # FIXME: Implement using different modules as in RDF::TriX
|
2
|
+
|
3
|
+
module RDF::Microdata
|
4
|
+
##
|
5
|
+
# A Microdata parser in Ruby
|
6
|
+
#
|
7
|
+
# Based on processing rules described here:
|
8
|
+
# @see http://dev.w3.org/html5/md/
|
9
|
+
#
|
10
|
+
# @author [Gregg Kellogg](http://kellogg-assoc.com/)
|
11
|
+
class Reader < RDF::Reader
|
12
|
+
format Format
|
13
|
+
XHTML = "http://www.w3.org/1999/xhtml"
|
14
|
+
URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)
|
15
|
+
|
16
|
+
class CrawlFailure < StandardError #:nodoc:
|
17
|
+
end
|
18
|
+
|
19
|
+
##
|
20
|
+
# Initializes the Microdata reader instance.
|
21
|
+
#
|
22
|
+
# @param [Nokogiri::HTML::Document, Nokogiri::XML::Document, IO, File, String] input
|
23
|
+
# the input stream to read
|
24
|
+
# @param [Hash{Symbol => Object}] options
|
25
|
+
# any additional options
|
26
|
+
# @option options [Encoding] :encoding (Encoding::UTF_8)
|
27
|
+
# the encoding of the input stream (Ruby 1.9+)
|
28
|
+
# @option options [Boolean] :validate (false)
|
29
|
+
# whether to validate the parsed statements and values
|
30
|
+
# @option options [Boolean] :canonicalize (false)
|
31
|
+
# whether to canonicalize parsed literals
|
32
|
+
# @option options [Boolean] :intern (true)
|
33
|
+
# whether to intern all parsed URIs
|
34
|
+
# @option options [#to_s] :base_uri (nil)
|
35
|
+
# the base URI to use when resolving relative URIs
|
36
|
+
# @option options [Array] :debug
|
37
|
+
# Array to place debug messages
|
38
|
+
# @return [reader]
|
39
|
+
# @yield [reader] `self`
|
40
|
+
# @yieldparam [RDF::Reader] reader
|
41
|
+
# @yieldreturn [void] ignored
|
42
|
+
# @raise [Error]:: Raises RDF::ReaderError if _validate_
|
43
|
+
def initialize(input = $stdin, options = {}, &block)
  super do
    # Optional Array that collects debug messages (see #add_debug)
    @debug = options[:debug]

    @doc = case input
    when Nokogiri::HTML::Document, Nokogiri::XML::Document
      # Already parsed: use it as-is
      input
    else
      # Encoding precedence: explicit :encoding, then the charset reported by the input, then utf-8
      options[:encoding] ||= input.charset if input.respond_to?(:charset)
      options[:encoding] ||= 'utf-8'

      Nokogiri::HTML.parse(input, @base_uri.to_s, options[:encoding])
    end

    if @doc.nil? || @doc.root.nil?
      add_error(nil, "Empty document")
      raise RDF::ReaderError, "Empty Document"
    end

    # Ignore parser errors of the form "Tag <name> invalid" for audio, source, track, video and time;
    # only the remaining errors are reported (and only when validating).
    errors = @doc.errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
    add_error(nil, "Syntax errors:\n#{errors}") if !errors.empty? && validate?

    block.call(self) if block_given?
  end
end
|
70
|
+
|
71
|
+
##
# Iterates the given block for each RDF statement in the input.
#
# When called without a block, an Enumerator over the statements is returned
# instead of failing with NoMethodError on the first generated statement.
#
# @yield  [statement]
# @yieldparam [RDF::Statement] statement
# @return [void, Enumerator]
def each_statement(&block)
  return enum_for(:each_statement) unless block_given?

  # Statements are handed to this callback as they are generated
  @callback = block

  parse_whole_document(@doc, @base_uri)
end
|
83
|
+
|
84
|
+
##
# Iterates the given block for each RDF triple in the input.
#
# When called without a block, an Enumerator over the triples is returned
# instead of failing with NoMethodError on the first statement.
#
# @yield  [subject, predicate, object]
# @yieldparam [RDF::Resource] subject
# @yieldparam [RDF::URI]      predicate
# @yieldparam [RDF::Value]    object
# @return [void, Enumerator]
def each_triple(&block)
  return enum_for(:each_triple) unless block_given?

  each_statement do |statement|
    block.call(*statement.to_triple)
  end
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
# Returns the blank node registered under +value+ (keyed by its string form),
# allocating and remembering a new RDF::Node the first time a key is requested.
def bnode(value = nil)
  cache = (@bnode_cache ||= {})
  key = value.to_s
  cache[key] ||= RDF::Node.new(value)
end
|
105
|
+
|
106
|
+
# Builds the context label used in debug output: the base URI in angle brackets
# followed by the node's display path (Nokogiri nodes) or its string form (anything else).
def node_path(node)
  prefix = "<#{@base_uri}>"
  prefix + (node.is_a?(Nokogiri::XML::Node) ? node.display_path : node.to_s)
end
|
113
|
+
|
114
|
+
# Records a debug message: printed to stdout when RDF::Microdata.debug? is set,
# and appended to the :debug Array when one was supplied to the reader.
#
# @param [XML Node, any] node:: XML Node or string for showing context
# @param [String] message::
def add_debug(node, message)
  # Built lazily so node_path is only evaluated when a sink is active
  line = lambda { "#{node_path(node)}: #{message}" }

  puts line.call if ::RDF::Microdata::debug?
  @debug << line.call if @debug.is_a?(Array)
end
|
122
|
+
|
123
|
+
# Logs +message+ as a debug event and, when the reader is validating,
# raises it as an RDF::ReaderError.
def add_error(node, message)
  add_debug(node, message)
  return unless validate?

  raise RDF::ReaderError, message
end
|
127
|
+
|
128
|
+
# Builds a statement from the given terms, logs its N-Triples form and hands it
# to the callback registered by #each_statement.
#
# @param [Nokogiri::XML::Node, any] node:: XML Node or string for showing context
# @param [URI, BNode] subject:: the subject of the statement
# @param [URI] predicate:: the predicate of the statement
# @param [URI, BNode, Literal] object:: the object of the statement
# @return [Object]:: whatever the statement callback returns
def add_triple(node, subject, predicate, object)
  triple = RDF::Statement.new(subject, predicate, object)
  serialized = RDF::NTriples.serialize(triple)
  add_debug(node, "statement: #{serialized}")
  @callback.call(triple)
end
|
141
|
+
|
142
|
+
# Parses a whole Microdata document (this is *not* the recursive method).
#
# Determines the document base, emits the document-level triples (title, rel links,
# meta name/content pairs, blockquote/q citations) and finally one md:item triple
# for every top-level microdata item.
#
# @param [Nokogiri::XML::Document] doc  parsed document to walk
# @param [RDF::URI, #to_s, nil] base
#   base URI; replaced by the href of html>head>base when that element exists
# @return [void]
def parse_whole_document(doc, base)
  base_el = doc.at_css('html>head>base')
  base = base_el.attribute('href').to_s.split('#').first if base_el

  if (base)
    # Strip any fragment from base
    base = base.to_s.split('#').first
    base = @options[:base_uri] = uri(base)
    add_debug(base_el, "parse_whole_doc: base='#{base}'")
  else
    base = RDF::URI("")
  end

  ##
  # 1. If the title element is not null, then generate the following triple:
  #
  #   subject: the document's current address
  #   predicate: http://purl.org/dc/terms/title
  #   object: the concatenation of the data of all the child text nodes of the title element,
  #           in tree order, as a plain literal, with the language information set from
  #           the language of the title element, if it is not unknown.
  doc.css('html>head>title').each do |title|
    # Language comes from the element/ancestor lookup (Node#language), as for other literals
    object = RDF::Literal.new(title.inner_text, :language => title.language)
    add_triple(title, base, RDF::DC.title, object)
  end

  # 2. For each a, area, and link element in the Document, run these substeps:
  #
  # * If the element does not have a rel attribute, then skip this element.
  # * If the element does not have an href attribute, then skip this element.
  # * If resolving the element's href attribute relative to the element is not successful,
  #   then skip this element.
  doc.css('a, area, link').each do |el|
    rel, href = el.attribute('rel'), el.attribute('href')
    next unless rel && href
    href = uri(href, el.base || base)
    add_debug(el, "a: rel=#{rel.inspect}, href=#{href}")

    # Otherwise, split the value of the element's rel attribute on spaces, obtaining list of tokens.
    # Coalesce duplicate tokens in list of tokens.
    tokens = rel.to_s.split(/\s+/).map do |tok|
      # Convert each token in list of tokens that does not contain a U+003A COLON characters (:)
      # to ASCII lowercase.
      tok =~ /:/ ? tok : tok.downcase
    end.uniq

    # If list of tokens contains both the tokens alternate and stylesheet,
    # then remove them both and replace them with the single (uppercase) token
    # ALTERNATE-STYLESHEET.
    if tokens.include?('alternate') && tokens.include?('stylesheet')
      tokens = tokens - %w(alternate stylesheet)
      tokens << 'ALTERNATE-STYLESHEET'
    end

    tokens.each do |tok|
      tok_uri = RDF::URI(tok)
      if tok !~ /:/
        # For each token token in list of tokens that contains no U+003A COLON characters (:),
        # generate the following triple:
        add_triple(el, base, RDF::XHV[tok.gsub('#', '%23')], href)
      elsif tok_uri.absolute?
        # For each token token in list of tokens that is an absolute URL, generate the following triple:
        add_triple(el, base, tok_uri, href)
      end
    end
  end

  # 3. For each meta element in the Document that has a name attribute and a content attribute,
  doc.css('meta[name][content]').each do |el|
    name, content = el.attribute('name'), el.attribute('content')
    name = name.to_s
    name_uri = uri(name, el.base || base)
    add_debug(el, "meta: name=#{name.inspect}")
    if name !~ /:/
      # If the value of the name attribute contains no U+003A COLON characters (:),
      # generate the following triple:
      add_triple(el, base, RDF::XHV[name.downcase.gsub('#', '%23')], RDF::Literal(content, :language => el.language))
    elsif name_uri.absolute?
      # If the value of the name attribute is an absolute URL,
      # generate the following triple:
      add_triple(el, base, name_uri, RDF::Literal(content, :language => el.language))
    end
  end

  # 4. For each blockquote and q element in the Document that has a cite attribute that resolves
  #    successfully relative to the element, generate the following triple:
  doc.css('blockquote[cite], q[cite]').each do |el|
    object = uri(el.attribute('cite'), el.base || base)
    add_debug(el, "blockquote: cite=#{object}")
    add_triple(el, base, RDF::DC.source, object)
  end

  # 5. Let memory be a mapping of items to subjects, initially empty.
  # 6. For each element that is also a top-level microdata item, run the following steps:
  #    * Generate the triples for the item. Pass a reference to memory as the item/subject list.
  #      Let result be the subject returned.
  #    * Generate the following triple:
  #      subject the document's current address
  #      predicate http://www.w3.org/1999/xhtml/microdata#item
  #      object result
  memory = {}
  doc.css('[itemscope]').
    reject {|el| el.has_attribute?('itemprop')}.
    each do |el|
      object = generate_triples(el, memory)
      add_triple(el, base, RDF::MD.item, object)
    end

  add_debug(doc, "parse_whole_doc: traversal complete")
end
|
254
|
+
|
255
|
+
##
# Generate triples for an item and, recursively, for any nested items that
# appear as its property values.
#
# @param [Nokogiri::XML::Element] item
#   element carrying the itemscope attribute
# @param [Hash{Nokogiri::XML::Element => Hash}] memory
#   per-item bookkeeping; the :subject entry holds the resource already assigned to an item
# @param [Hash{Symbol => Object}] options
# @option options [RDF::Resource] :fallback_type
#   type of the enclosing item, used when this item has no absolute itemtype
# @option options [RDF::Resource] :fallback_name
#   property name under which this item was reached
# @return [RDF::Resource] the subject used for the item
def generate_triples(item, memory, options = {})
  fallback_type = options[:fallback_type]
  fallback_name = options[:fallback_name]

  # 1. If there is an entry for item in memory, then let subject be the subject of that entry.
  #    Otherwise, if item has a global identifier and that global identifier is an absolute URL,
  #    let subject be that global identifier. Otherwise, let subject be a new blank node.
  subject = if memory.include?(item)
    memory[item][:subject]
  elsif item.has_attribute?('itemid')
    uri(item.attribute('itemid'))
  end || RDF::Node.new
  memory[item] ||= {}

  add_debug(item, "gentrips(2): subject=#{subject.inspect}")

  # 2. Add a mapping from item to subject in memory, if there isn't one already.
  memory[item][:subject] ||= subject

  # 3. If item has an item type and that item type is an absolute URL, let type be that item type.
  #    Otherwise, let type be the empty string.
  type = uri(item.attribute('itemtype'))
  type = '' unless type.absolute?

  if type != ''
    add_triple(item, subject, RDF.type, type)
    # 4.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
    type += '#' unless type.to_s.include?('#')
    # 4.3. If type does not have a : after its #, append a : to type.
    type += ':' unless type.to_s.match(/\#:/)
  elsif fallback_type
    add_debug(item, "gentrips(5.2): fallback_type=#{fallback_type}, fallback_name=#{fallback_name}")
    type = fallback_type
    # 5.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
    type += '#' unless type.to_s.include?('#')
    # 5.3. If type does not have a : after its #, append a : to type.
    type += ':' unless type.to_s.match(/\#:/)
    # 5.4. If the last character of type is not a :, %20 to type.
    type += '%20' unless type.to_s[-1] == ':'
    # 5.5. Append the fragment-escaped value of fallback name to type.
    type += fallback_name.to_s.gsub('#', '%23')
  end

  add_debug(item, "gentrips(6): type=#{type.inspect}")

  # 6. For each element _element_ that has one or more property names and is one of the
  #    properties of the item _item_, in the order those elements are given by the algorithm
  #    that returns the properties of an item, run the following substep:
  props = item_properties(item)

  # 6.1. For each name name in element's property names, run the following substeps:
  props.each do |element|
    element.attribute('itemprop').to_s.split(' ').each do |name|
      add_debug(element, "gentrips(6.1): name=#{name.inspect}")
      # If type is the empty string and name is not an absolute URL, then abort these substeps.
      name_uri = RDF::URI(name)
      next if type == '' && !name_uri.absolute?

      value = property_value(element)
      add_debug(element, "gentrips(6.1.2) value=#{value.inspect}")

      # A Hash is the marker property_value returns for a nested item: recurse to get its subject
      if value.is_a?(Hash)
        value = generate_triples(element, memory, :fallback_type => type, :fallback_name => name)
      end

      add_debug(element, "gentrips(6.1.3): value=#{value.inspect}")

      # Absolute names are used as-is; colon-free names are minted in the md: vocabulary from type.
      # Any other name yields no predicate and therefore no triple.
      predicate = if name_uri.absolute?
        name_uri
      elsif !name.include?(':')
        s = type.to_s
        s += '%20' unless s[-1] == ':'
        s += name
        RDF::MD[s.gsub('#', '%23')]
      end
      add_debug(element, "gentrips(6.1.5): predicate=#{predicate}")

      add_triple(element, subject, predicate, value) if predicate
    end
  end

  subject
end
|
346
|
+
|
347
|
+
##
# To find the properties of an item defined by the element root, the user agent must try
# to crawl the properties of the element root, with an empty list as the value of memory:
# if this fails, then the properties of the item defined by the element root is an empty
# list; otherwise, it is the returned list.
#
# A crawl that reports one or more errors is treated as a failure as well.
#
# @param [Nokogiri::XML::Element] item
# @return [Array<Nokogiri::XML::Element>]
#   List of property elements for an item; empty when the crawl fails
# @raise [RDF::ReaderError] via add_error, when the crawl fails and the reader is validating
def item_properties(item)
  add_debug(item, "item_properties")
  results, errors = crawl_properties(item, [])
  raise CrawlFailure, "item_props: errors=#{errors}" if errors > 0
  results
rescue CrawlFailure => e
  # Report against the item being crawled (no other element is in scope here)
  add_error(item, e.message)
  []
end
|
365
|
+
|
366
|
+
##
# To crawl the properties of an element root with a list memory, the user agent must run
# the following steps. These steps either fail or return a list with a count of errors.
# The count of errors is used as part of the authoring conformance criteria below.
#
# @param [Nokogiri::XML::Element] root
# @param [Array<Nokogiri::XML::Element>] memory
#   elements already being crawled on the current path; left unmodified
# @return [Array<Array<Nokogiri::XML::Element>, Integer>]
#   Resultant elements and error count
# @raise [CrawlFailure] if root is already in memory (a reference cycle)
def crawl_properties(root, memory)
  # 1. If root is in memory, then the algorithm fails; abort these steps.
  raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root)

  # 2. Collect all the elements in the item root; let results be the resulting
  #    list of elements, and errors be the resulting count of errors.
  results, errors = elements_in_item(root)
  add_debug(root, "crawl_properties results=#{results.inspect}, errors=#{errors}")

  # 3. Remove any elements from results that do not have an itemprop attribute specified.
  results = results.select {|e| e.has_attribute?('itemprop')}

  # 4. Let new memory be a new list consisting of the old list memory with the addition of root.
  new_memory = memory + [root]

  # 5. For each element in results that has an itemscope attribute specified,
  #    crawl the properties of the element, with new memory as the memory.
  #    (Iterates over a separate list, so results can be pruned safely inside the loop.)
  results.select {|e| e.has_attribute?('itemscope')}.each do |element|
    begin
      crawl_properties(element, new_memory)
    rescue CrawlFailure => e
      # If this fails, then remove the element from results and increment errors.
      # (If it succeeds, the return value is discarded.)
      results.delete(element)
      add_error(element, e.message)
      errors += 1
    end
  end

  [results, errors]
end
|
406
|
+
|
407
|
+
##
# To collect all the elements in the item root, the user agent must run these steps.
# They return a list of elements and a count of errors.
#
# @param [Nokogiri::XML::Element] root
# @return [Array<Array<Nokogiri::XML::Element>, Integer>]
#   Resultant elements and error count
def elements_in_item(root)
  # Let errors be zero and results be an empty list of elements.
  errors = 0
  results = []

  # Pending starts out as all the child elements of root.
  pending = root.elements

  # Each token of root's itemref attribute (split on spaces) names an element by ID;
  # the first element in the document with that ID, if any, is added to pending.
  root.attribute('itemref').to_s.split(' ').each do |id|
    add_debug(root, "elements_in_item itemref id #{id}")
    referenced = @doc.at_css("##{id}")
    pending << referenced if referenced
  end
  add_debug(root, "elements_in_item pending #{pending.inspect}")

  # Loop: take the next element off pending and call it current.
  while (current = pending.shift)
    if results.include?(current)
      # Seen before: count an error and leave results as they are.
      add_error(current, "elements_in_item: results already includes #{current.inspect}")
      errors += 1
      next
    end

    # Descend into current unless it starts a nested item of its own.
    pending += current.elements unless current.has_attribute?('itemscope')

    # current was not in results yet, so record it.
    results << current
  end

  [results, errors]
end
|
451
|
+
|
452
|
+
##
# Determines the value of a property element.
#
# @param [Nokogiri::XML::Element] element
# @return [Hash, String, RDF::URI, RDF::Literal]
#   an empty Hash when the element is itself an item (itemscope), the content attribute
#   for meta, a resolved URI for elements carrying src/href/data, a DateTime literal for
#   time elements with a datetime attribute, otherwise a literal of the element text
def property_value(element)
  add_debug(element, "property_value(#{element.inspect})")
  return {} if element.has_attribute?('itemscope')

  tag = element.name
  if tag == 'meta'
    element.attribute('content').to_s
  elsif %w(audio embed iframe img source track video).include?(tag)
    uri(element.attribute('src'), element.base)
  elsif %w(a area link).include?(tag)
    uri(element.attribute('href'), element.base)
  elsif tag == 'object'
    uri(element.attribute('data'), element.base)
  elsif tag == 'time' && element.has_attribute?('datetime')
    RDF::Literal::DateTime.new(element.attribute('datetime'))
  else
    RDF::Literal.new(element.text, :language => element.language)
  end
end
|
473
|
+
|
474
|
+
# Builds an RDF::URI from +value+, joined onto +base+ when one is given, then applies
# the reader's validate / canonicalize / intern options.
# FIXME: what about xml:base relative to element?
def uri(value, base = nil)
  resolved = if base
    base_uri = base.is_a?(RDF::URI) ? base : uri(base)
    base_uri.join(value)
  else
    RDF::URI(value)
  end

  resolved.validate! if validate?
  resolved.canonicalize! if canonicalize?
  intern? ? RDF::URI.intern(resolved) : resolved
end
|
487
|
+
end
|
488
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
##
# Version information, read once at load time from the VERSION file located
# three directories above this file.
module RDF::Microdata::VERSION
  VERSION_FILE = File.join(File.expand_path(File.dirname(__FILE__)), "..", "..", "..", "VERSION")
  # chomp rather than chop: only a trailing line terminator is removed, so the last
  # version digit survives when the file does not end with a newline.
  MAJOR, MINOR, TINY, EXTRA = File.read(VERSION_FILE).chomp.split(".")

  STRING = [MAJOR, MINOR, TINY, EXTRA].compact.join('.')

  ##
  # @return [String]
  def self.to_s() STRING end

  ##
  # @return [String]
  def self.to_str() STRING end

  ##
  # @return [Array<String>] the dot-separated components of the version string
  def self.to_a() STRING.split(".") end
end
|
metadata
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rdf-microdata
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Gregg Kellogg
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-29 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: rdf
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.3.3
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.4.4
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: yard
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 0.6.0
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rspec
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: 2.5.0
|
58
|
+
type: :development
|
59
|
+
version_requirements: *id004
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: rdf-spec
|
62
|
+
prerelease: false
|
63
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.3.2
|
69
|
+
type: :development
|
70
|
+
version_requirements: *id005
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: rdf-n3
|
73
|
+
prerelease: false
|
74
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.3.3
|
80
|
+
type: :development
|
81
|
+
version_requirements: *id006
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: rdf-isomorphic
|
84
|
+
prerelease: false
|
85
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 0.3.4
|
91
|
+
type: :development
|
92
|
+
version_requirements: *id007
|
93
|
+
description: Microdata reader for Ruby.
|
94
|
+
email: public-rdf-ruby@w3.org
|
95
|
+
executables: []
|
96
|
+
|
97
|
+
extensions: []
|
98
|
+
|
99
|
+
extra_rdoc_files: []
|
100
|
+
|
101
|
+
files:
|
102
|
+
- AUTHORS
|
103
|
+
- README
|
104
|
+
- UNLICENSE
|
105
|
+
- VERSION
|
106
|
+
- lib/rdf/microdata/extensions.rb
|
107
|
+
- lib/rdf/microdata/format.rb
|
108
|
+
- lib/rdf/microdata/reader.rb
|
109
|
+
- lib/rdf/microdata/version.rb
|
110
|
+
- lib/rdf/microdata/vocab.rb
|
111
|
+
- lib/rdf/microdata.rb
|
112
|
+
has_rdoc: false
|
113
|
+
homepage: http://github.com/gkellogg/rdf-microdata
|
114
|
+
licenses:
|
115
|
+
- Public Domain
|
116
|
+
post_install_message:
|
117
|
+
rdoc_options: []
|
118
|
+
|
119
|
+
require_paths:
|
120
|
+
- lib
|
121
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: 1.8.1
|
127
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
128
|
+
none: false
|
129
|
+
requirements:
|
130
|
+
- - ">="
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: "0"
|
133
|
+
requirements: []
|
134
|
+
|
135
|
+
rubyforge_project: rdf-microdata
|
136
|
+
rubygems_version: 1.6.2
|
137
|
+
signing_key:
|
138
|
+
specification_version: 3
|
139
|
+
summary: Microdata reader for Ruby.
|
140
|
+
test_files: []
|
141
|
+
|