microdata 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +41 -0
- data/Rakefile +10 -0
- data/bin/microdata.rb +8 -0
- data/lib/microdata.rb +42 -0
- data/lib/microdata/document.rb +22 -0
- data/lib/microdata/item.rb +65 -0
- data/lib/microdata/itemprop.rb +91 -0
- data/lib/microdata/version.rb +3 -0
- data/microdata.gemspec +25 -0
- data/test/data/example.html +22 -0
- data/test/test_helper.rb +2 -0
- data/test/test_parse.rb +36 -0
- metadata +114 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jason Ronallo
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Microdata
|
2
|
+
|
3
|
+
Ruby library for extracting HTML5 Microdata
|
4
|
+
|
5
|
+
## Story
|
6
|
+
|
7
|
+
Most of the code here was extracted from [Mida](https://github.com/LawrenceWoodman/mida) by Lawrence Woodman. This was done in order to have a simpler, more generic Microdata parser without all the vocabulary awareness and other features. This gem is also tested under Ruby 1.9.3 and Ruby 2.0.0, though it could be better tested.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
This library has not been released to RubyGems.org yet. Once it is, the intention is for it to be installable as follows.
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'microdata'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install microdata
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
The command-line tool `microdata.rb` can be run as follows:
|
28
|
+
|
29
|
+
```
|
30
|
+
microdata.rb http://d.lib.ncsu.edu/collections/catalog/mc00383-001-ff0006-001-001_0038
|
31
|
+
```
|
32
|
+
|
33
|
+
Output is in pretty JSON format.
|
34
|
+
|
35
|
+
## Contributing
|
36
|
+
|
37
|
+
1. Fork it
|
38
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
39
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
40
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
41
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/bin/microdata.rb
ADDED
data/lib/microdata.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require "microdata/version"
|
2
|
+
require "nokogiri"
|
3
|
+
require "microdata/item"
|
4
|
+
require "microdata/document"
|
5
|
+
require "microdata/itemprop"
|
6
|
+
require 'open-uri'
|
7
|
+
require 'json'
|
8
|
+
require 'uri'
|
9
|
+
|
10
|
+
# Top-level helpers for extracting HTML5 Microdata from a local file or a
# remote page.
module Microdata

  # Unused draft of the element => attribute table. The table actually in use
  # is Microdata::Itemprop::NON_TEXTCONTENT_ELEMENTS.
  # PROPERTY_VALUES = {
  #   meta:     'content',
  #   audio:    'src',
  #   embed:    'src',
  #   iframe:   'src',
  #   img:      'src',
  #   source:   'src',
  #   video:    'src',
  #   a:        'href',
  #   area:     'href',
  #   link:     'href',
  #   object:   'data',
  #   time:     'datetime'
  # }

  # Read the document at +location+ and return its top-level items.
  # [location] A local file path or a URL understood by open-uri. It is also
  #            handed to the parser as the page url used to resolve
  #            relative links.
  # Returns an Array of Microdata::Item.
  def self.get_items(location)
    open_location(location) do |content|
      # Document extracts its items while it is constructed, so read the
      # cached result instead of walking the tree a second time.
      Microdata::Document.new(content, location).items
    end
  end

  # Read the document at +location+ and return its items as a pretty-printed
  # JSON String of the form {"items": [...]}.
  # [location] Same as for get_items
  def self.to_json(location)
    items = get_items(location).map { |item| item.to_hash }
    JSON.pretty_generate({ items: items })
  end

  # Open +location+ and yield the resulting IO, closing it afterwards.
  #
  # The dispatch mirrors open-uri, except that anything which is not a URL is
  # opened with File.open rather than Kernel#open. Kernel#open would run a
  # shell command for a location starting with "|", and it is no longer
  # patched by open-uri to understand URLs on Ruby 3.
  def self.open_location(location, &block)
    # Pathname, URI and similar objects know how to open themselves.
    return location.open(&block) if location.respond_to?(:open)

    if location.respond_to?(:to_str) &&
       location =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://} &&
       (uri = URI.parse(location)).respond_to?(:open)
      uri.open(&block)
    else
      File.open(location, &block)
    end
  end
  private_class_method :open_location

end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Microdata
  # A parsed HTML page together with the top-level Microdata items found
  # in it.
  class Document

    # Array of Microdata::Item built when the document was created.
    attr_reader :items

    # Parse +content+ and extract its top-level items.
    # [content]  The markup to parse; passed straight to Nokogiri::HTML
    # [page_url] The url of the page, handed to every Item that is created
    def initialize(content, page_url=nil)
      @doc = Nokogiri::HTML(content)
      @page_url = page_url
      @items = extract_items
    end

    # Build an Item for every element that carries an itemscope attribute
    # but no itemprop attribute, i.e. every item that is not itself a
    # property of another item.
    # Returns an Array of Microdata::Item (empty when nothing matches).
    def extract_items
      # The search result is always a (possibly empty) node set, so no nil
      # guard is needed before iterating.
      @doc.search('//*[@itemscope and not(@itemprop)]').map do |itemscope|
        Item.new(itemscope, @page_url)
      end
    end

  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module Microdata
  # One Microdata item: its types, its optional id and the properties
  # collected from the elements nested inside it.
  class Item
    attr_reader :type, :properties, :id

    # [top_node] The element carrying the itemscope attribute
    # [page_url] The url of the page, forwarded to Itemprop.parse
    def initialize(top_node, page_url)
      @top_node   = top_node
      @page_url   = page_url
      @properties = {}
      @type       = extract_itemtype
      @id         = extract_itemid
      parse_elements(extract_elements(@top_node))
    end

    # Plain-Hash form of the item. Nested items are converted recursively.
    # Keys, in order: :id (only when present), :type, :properties.
    def to_hash
      result = id ? { id: id } : {}
      result[:type] = type
      result[:properties] = properties.each_with_object({}) do |(name, values), converted|
        converted[name] = values.map do |entry|
          entry.is_a?(Item) ? entry.to_hash : entry
        end
      end
      result
    end

    private

    # Direct child elements of +node+.
    def extract_elements(node)
      node.search('./*')
    end

    # Value of the itemid attribute, or nil when it is absent.
    def extract_itemid
      itemid = @top_node.attribute('itemid')
      itemid ? itemid.value : nil
    end

    # The itemtype attribute split on spaces, or nil when it is absent.
    def extract_itemtype
      itemtype = @top_node.attribute('itemtype')
      itemtype ? itemtype.value.split(' ') : nil
    end

    def parse_elements(elements)
      elements.each do |child|
        parse_element(child)
      end
    end

    # Record +element+ when it is a property or starts an item, then walk
    # its children unless it starts an item of its own.
    def parse_element(element)
      starts_item = element.attribute('itemscope')
      is_property = element.attribute('itemprop')
      children    = extract_elements(element)

      add_itemprop(element) if starts_item || is_property

      # A nested itemscope owns its descendants, so stop descending there.
      return if starts_item
      parse_elements(children) if children
    end

    # Add an 'itemprop' to the properties
    def add_itemprop(itemprop)
      Itemprop.parse(itemprop, @page_url).each do |name, value|
        @properties[name] ||= []
        @properties[name] << value
      end
    end

  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Microdata
  # Class that parses itemprop elements
  class Itemprop

    # Elements whose property value comes from an attribute instead of
    # from their text content, mapped to the name of that attribute.
    NON_TEXTCONTENT_ELEMENTS = {
      'a' => 'href',        'area' => 'href',
      'audio' => 'src',     'embed' => 'src',
      'iframe' => 'src',    'img' => 'src',
      'link' => 'href',     'meta' => 'content',
      'object' => 'data',   'source' => 'src',
      'time' => 'datetime', 'track' => 'src',
      'video' => 'src'
    }.freeze

    # Attributes whose values are urls and are therefore made absolute.
    URL_ATTRIBUTES = ['data', 'href', 'src'].freeze

    # A Hash representing the properties.
    # Hash is of the form {'property name' => 'value'}
    attr_reader :properties

    # Create a new Itemprop object
    # [element]  The itemprop element to be parsed
    # [page_url] The url of the page, including filename, used to form
    #            absolute urls
    def initialize(element, page_url=nil)
      @element, @page_url = element, page_url
      @properties = extract_properties
    end

    # Parse the element and return a hash representing the properties.
    # Hash is of the form {'property name' => 'value'}
    # [element]  The itemprop element to be parsed
    # [page_url] The url of the page, including filename, used to form
    #            absolute urls
    def self.parse(element, page_url=nil)
      self.new(element, page_url).properties
    end

    private

    # Map every name listed in the itemprop attribute to the element's value.
    def extract_properties
      extract_property_names.each_with_object({}) do |name, memo|
        memo[name] = extract_property
      end
    end

    # Resolve +url+ against the page url.
    # Returns +url+ unchanged when it is already absolute, when no page url
    # is known, or when either url cannot be parsed or merged. A malformed
    # link in the page must not abort the whole extraction.
    def make_absolute_url(url)
      return url if @page_url.nil?
      return url unless URI.parse(url).relative?
      URI.parse(@page_url).merge(url).to_s
    rescue URI::Error
      url
    end

    def non_textcontent_element?(element)
      NON_TEXTCONTENT_ELEMENTS.has_key?(element)
    end

    def url_attribute?(attribute)
      URL_ATTRIBUTES.include?(attribute)
    end

    # Names listed in the itemprop attribute; empty when there is none.
    def extract_property_names
      itemprop_attr = @element.attribute('itemprop')
      itemprop_attr ? itemprop_attr.value.split() : []
    end

    # The element's text content with surrounding whitespace removed.
    def extract_text_content
      @element.inner_text.strip
    end

    # Value of a property that is not itself an item: the designated
    # attribute for the elements in NON_TEXTCONTENT_ELEMENTS (urls made
    # absolute), the text content for everything else.
    def extract_property_value
      tag = @element.name
      return extract_text_content unless non_textcontent_element?(tag)

      attribute_name = NON_TEXTCONTENT_ELEMENTS[tag]
      attribute = @element.attribute(attribute_name)
      if attribute.nil?
        # Per the Microdata spec a missing attribute yields the empty
        # string, except for <time>, which falls back to its text content.
        return tag == 'time' ? extract_text_content : ''
      end

      value = attribute.value
      url_attribute?(attribute_name) ? make_absolute_url(value) : value
    end

    # A nested Item when the element has an itemscope attribute, otherwise
    # the plain property value.
    def extract_property
      if @element.attribute('itemscope')
        Item.new(@element, @page_url)
      else
        extract_property_value
      end
    end

  end
end
|
data/microdata.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Put this checkout's lib/ directory on the load path so the version file
# can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'microdata/version'

Gem::Specification.new do |gem|
  # Identity and authorship
  gem.name        = 'microdata'
  gem.version     = Microdata::VERSION
  gem.authors     = ['Jason Ronallo']
  gem.email       = ['jronallo@gmail.com']
  gem.description = 'HTML5 Microdata extractor'
  gem.summary     = 'Ruby library for extracting HTML5 Microdata'
  gem.homepage    = ''
  gem.license     = 'MIT'

  # Package contents: every file tracked by git, with executables and test
  # files picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependency
  gem.add_dependency 'nokogiri'

  # Development-only dependencies
  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'rake'
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html>
|
3
|
+
<!-- shameless -->
|
4
|
+
<head>
|
5
|
+
<title>Jason Ronallo</title>
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<span itemscope itemtype="http://schema.org/Person"
|
10
|
+
itemid="http://ronallo.com#me">
|
11
|
+
<a itemprop="url" href="http://twitter.com/ronallo">
|
12
|
+
<span itemprop="name">Jason Ronallo</span>
|
13
|
+
</a> is the
|
14
|
+
<span itemprop="jobTitle">Associate Head of Digital Library Initiatives</span> at
|
15
|
+
<span itemprop="affiliation" itemscope itemtype="http://schema.org/Library" itemid="http://lib.ncsu.edu">
|
16
|
+
<span itemprop="name">
|
17
|
+
<a itemprop="url" href="http://www.lib.ncsu.edu">NCSU Libraries</a>
|
18
|
+
</span>
|
19
|
+
</span>.
|
20
|
+
</span>
|
21
|
+
</body>
|
22
|
+
</html>
|
data/test/test_helper.rb
ADDED
data/test/test_parse.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Parses test/data/example.html and checks the extracted Person item and the
# Library item nested under its 'affiliation' property.
class TestParse < Test::Unit::TestCase

  def setup
    @items = Microdata.get_items('test/data/example.html')
  end

  def test_top_item_type
    assert_equal ['http://schema.org/Person'], person.type
  end

  def test_top_item_id
    assert_equal "http://ronallo.com#me", person.id
  end

  def test_top_item_properties
    found = person.properties
    assert_equal ["Jason Ronallo"], found['name']
    assert_equal ["http://twitter.com/ronallo"], found['url']
    assert_equal ["Associate Head of Digital Library Initiatives"], found['jobTitle']
  end

  def test_nested_item
    assert_equal ['http://schema.org/Library'], library.type
    assert_equal "http://lib.ncsu.edu", library.id
  end

  def test_nested_item_properties
    found = library.properties
    assert_equal ['NCSU Libraries'], found['name']
    assert_equal ['http://www.lib.ncsu.edu'], found['url']
  end

  private

  # First top-level item of the fixture.
  def person
    @items.first
  end

  # First value of the top item's 'affiliation' property.
  def library
    person.properties['affiliation'][0]
  end

end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: microdata
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jason Ronallo
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: bundler
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '1.3'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '1.3'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rake
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: HTML5 Microdata extractor
|
63
|
+
email:
|
64
|
+
- jronallo@gmail.com
|
65
|
+
executables:
|
66
|
+
- microdata.rb
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- .gitignore
|
71
|
+
- Gemfile
|
72
|
+
- LICENSE.txt
|
73
|
+
- README.md
|
74
|
+
- Rakefile
|
75
|
+
- bin/microdata.rb
|
76
|
+
- lib/microdata.rb
|
77
|
+
- lib/microdata/document.rb
|
78
|
+
- lib/microdata/item.rb
|
79
|
+
- lib/microdata/itemprop.rb
|
80
|
+
- lib/microdata/version.rb
|
81
|
+
- microdata.gemspec
|
82
|
+
- test/data/example.html
|
83
|
+
- test/test_helper.rb
|
84
|
+
- test/test_parse.rb
|
85
|
+
homepage: ''
|
86
|
+
licenses:
|
87
|
+
- MIT
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 1.8.25
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Ruby library for extracting HTML5 Microdata
|
110
|
+
test_files:
|
111
|
+
- test/data/example.html
|
112
|
+
- test/test_helper.rb
|
113
|
+
- test/test_parse.rb
|
114
|
+
has_rdoc:
|