hypermicrodata 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.travis.yml +8 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +100 -0
- data/Rakefile +10 -0
- data/bin/hypermicrodata.rb +25 -0
- data/hypermicrodata.gemspec +28 -0
- data/lib/hypermicrodata.rb +37 -0
- data/lib/hypermicrodata/document.rb +27 -0
- data/lib/hypermicrodata/extract.rb +22 -0
- data/lib/hypermicrodata/item.rb +113 -0
- data/lib/hypermicrodata/itemprop_parser.rb +114 -0
- data/lib/hypermicrodata/link.rb +7 -0
- data/lib/hypermicrodata/property.rb +27 -0
- data/lib/hypermicrodata/rails/html_based_json_renderer.rb +35 -0
- data/lib/hypermicrodata/serializer/base.rb +24 -0
- data/lib/hypermicrodata/serializer/hal.rb +47 -0
- data/lib/hypermicrodata/serializer/jsonld.rb +44 -0
- data/lib/hypermicrodata/serializer/uber.rb +100 -0
- data/lib/hypermicrodata/submit_button.rb +105 -0
- data/lib/hypermicrodata/version.rb +3 -0
- data/lib/uberous/uber.rb +104 -0
- data/test/data/example.html +22 -0
- data/test/data/example_itemref.html +16 -0
- data/test/data/example_with_no_itemscope.html +22 -0
- data/test/test_helper.rb +3 -0
- data/test/test_itemref.rb +19 -0
- data/test/test_json.rb +15 -0
- data/test/test_parse.rb +36 -0
- metadata +139 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6aa222d1d9f2fd94e7eabda85a111de9b63b17ba
|
4
|
+
data.tar.gz: 624be0e7d6c825c69ed224508f6286da2911cd8e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 094a2d0285349d16ff74308ce8756d5a2510f67c0ab564bd93112823c488bc0eeee030725feb87fa6b2f89e2ab4805407a2a536ded47262c3125f81ea1cd9901
|
7
|
+
data.tar.gz: 6753e62b18ea5b2e4b0550b5fcaaf2eeb5f3101efbb61c5af745901285a3d4621761e2b75a8201e670274350cab0a5a2f7bebac8570e4e87357a6e581626b700
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jason Ronallo, Toru KAWAMURA
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
# Hypermicrodata
|
2
|
+
|
3
|
+
Ruby library for extracting HTML5 Microdata with Hypermedia
|
4
|
+
|
5
|
+
[Build Status](https://travis-ci.org/tkawa/hypermicrodata)
|
6
|
+
|
7
|
+
## Story
|
8
|
+
|
9
|
+
Most of the code here was extracted from [Mida](https://github.com/LawrenceWoodman/mida) by Lawrence Woodman. This was done in order to have a simpler, more generic Microdata parser without all the vocabulary awareness and other features. This gem is also tested under Ruby 1.9.3 and Ruby 2.0.0, though it could be better tested.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
This library has not been released to RubyGems.org yet; once it is, you will be able to install it as follows.
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'hypermicrodata'
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
$ bundle
|
22
|
+
|
23
|
+
Or install it yourself as:
|
24
|
+
|
25
|
+
$ gem install hypermicrodata
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Basic
|
30
|
+
|
31
|
+
```
|
32
|
+
json = Hypermicrodata::Extract.new(html).to_json(:uber)
|
33
|
+
```
|
34
|
+
|
35
|
+
Supported formats are:
|
36
|
+
|
37
|
+
- application/vnd.amundsen-uber+json (:uber)
|
38
|
+
- application/hal+json (:hal)
|
39
|
+
- application/json (:plain)
|
40
|
+
|
41
|
+
### Rails Integration
|
42
|
+
|
43
|
+
When you use this in Rails, you don't need to extract data manually.
|
44
|
+
|
45
|
+
/app/controllers/people_controller.rb
|
46
|
+
|
47
|
+
```
|
48
|
+
class PeopleController < ApplicationController
|
49
|
+
before_action :set_message, only: %i(show edit update destroy)
|
50
|
+
include Hypermicrodata::Rails::HtmlBasedJsonRenderer
|
51
|
+
...
|
52
|
+
end
|
53
|
+
```
|
54
|
+
|
55
|
+
/app/views/people/show.html.haml
|
56
|
+
|
57
|
+
```
|
58
|
+
.person{itemscope: true, itemtype: 'http://schema.org/Person',
|
59
|
+
itemid: person_url(@person), data: {main_item: true}}
|
60
|
+
.media
|
61
|
+
.media-image.pull-left
|
62
|
+
= image_tag @person.picture_path, alt: '', itemprop: 'image'
|
63
|
+
.media-body
|
64
|
+
%h1.media-heading
|
65
|
+
%span{itemprop: 'name'}= @person.name
|
66
|
+
= link_to 'collection', people_path, rel: 'collection', itemprop: 'isPartOf'
|
67
|
+
```
|
68
|
+
|
69
|
+
And you can serve the following JSON:
|
70
|
+
|
71
|
+
```
|
72
|
+
GET /people/1 HTTP/1.1
|
73
|
+
Host: www.example.com
|
74
|
+
Accept: application/vnd.amundsen-uber+json
|
75
|
+
```
|
76
|
+
|
77
|
+
```
|
78
|
+
{
|
79
|
+
"uber": {
|
80
|
+
"version": "1.0",
|
81
|
+
"data": [{
|
82
|
+
"url": "http://www.example.com/people/1",
|
83
|
+
"name": "Person",
|
84
|
+
"data": [
|
85
|
+
{ "name": "image", "value": "/assets/bob.png" },
|
86
|
+
{ "name": "name", "value": "Bob Smith" },
|
87
|
+
{ "name": "isPartOf", "rel": "collection", "url": "/people" }
|
88
|
+
]
|
89
|
+
}]
|
90
|
+
}
|
91
|
+
}
|
92
|
+
```
|
93
|
+
|
94
|
+
## Contributing
|
95
|
+
|
96
|
+
1. Fork it
|
97
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
98
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
99
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
100
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#! /usr/bin/env ruby

# hypermicrodata.rb
# Extract HTML5 Microdata and output JSON
#
# Usage: hypermicrodata.rb LOCATION
#
# LOCATION is handed to open-uri, so it may be a local path or a URL.
# NOTE(review): Kernel#open semantics apply to non-URL arguments (a leading
# "|" spawns a subprocess), so LOCATION must come from a trusted source.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
require 'hypermicrodata'
require 'open-uri'
require 'json'

location = ARGV[0]
abort "Usage: #{File.basename($0)} LOCATION" if location.nil?

# Ruby 3.0 removed URL support from the bare Kernel#open that open-uri used to
# patch; URI.open is the replacement where it exists (Ruby 2.5+).
opener = URI.respond_to?(:open) ? URI : Kernel

# Document parses the stream inside its constructor, so the handle can be
# closed as soon as the block returns.
document = opener.open(location) do |content|
  Hypermicrodata::Document.new(content, location)
end

# Document#initialize already extracted the items; reuse them instead of
# walking the DOM a second time.
items = document.items

# Check nil before empty? so that a nil result cannot raise NoMethodError.
if items.nil? || items.empty?
  puts "No Microdata items found."
  itemprops = document.doc.search('//*[@itemprop]')
  unless itemprops.empty?
    puts "There are some itemprops, which means no top level items with an itemscope have been found."
  end
else
  hash = {}
  hash[:items] = items.map do |item|
    item.to_hash
  end
  puts JSON.pretty_generate(hash)
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for hypermicrodata. The version constant is loaded from
# lib/, so lib/ is put on the load path first.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'hypermicrodata/version'

Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name          = 'hypermicrodata'
  gem.version       = Hypermicrodata::VERSION
  gem.authors       = ['Jason Ronallo', 'Toru KAWAMURA']
  gem.email         = ['jronallo@gmail.com', 'tkawa@4bit.net']
  gem.description   = 'HTML5 Microdata extractor with Hypermedia'
  gem.summary       = 'Ruby library for extracting HTML5 Microdata with Hypermedia'
  gem.homepage      = 'https://github.com/tkawa/hypermicrodata'
  gem.license       = 'MIT'

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies (no version constraints).
  %w[nokogiri mechanize halibut multi_json].each do |runtime_dep|
    gem.add_dependency runtime_dep
  end

  # Development-only dependencies.
  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'rake'
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require "hypermicrodata/version"
|
2
|
+
require "uberous/uber"
|
3
|
+
require "nokogiri"
|
4
|
+
require "mechanize"
|
5
|
+
require "hypermicrodata/item"
|
6
|
+
require "hypermicrodata/document"
|
7
|
+
require "hypermicrodata/property"
|
8
|
+
require "hypermicrodata/link"
|
9
|
+
require "hypermicrodata/itemprop_parser"
|
10
|
+
require "hypermicrodata/submit_button"
|
11
|
+
require "hypermicrodata/serializer/base"
|
12
|
+
require "hypermicrodata/serializer/hal"
|
13
|
+
require "hypermicrodata/serializer/uber"
|
14
|
+
require "hypermicrodata/extract"
|
15
|
+
require "hypermicrodata/rails/html_based_json_renderer"
|
16
|
+
require 'open-uri'
|
17
|
+
require 'json'
|
18
|
+
require 'uri'
|
19
|
+
|
20
|
+
module Hypermicrodata

  # Opens +location+ and returns the items of the parsed document.
  #
  # location - a String handed to open-uri; it is also passed to Document as
  #            the page URL.
  #
  # NOTE(review): for non-URL strings this ends up in Kernel#open, where a
  # leading "|" spawns a subprocess. Only pass trusted locations.
  #
  # Returns whatever Document#items returns for the parsed content.
  def self.get_items(location)
    # Ruby 3.0 removed URL support from the bare Kernel#open that open-uri
    # used to patch; URI.open is the replacement where it exists (Ruby 2.5+).
    opener = URI.respond_to?(:open) ? URI : Kernel

    # Block form closes the handle once parsing is done. Document parses and
    # extracts inside its constructor, so nothing reads the stream afterwards
    # and #items avoids a second extraction pass.
    opener.open(location) do |content|
      Hypermicrodata::Document.new(content, location).items
    end
  end

  # Returns a pretty-printed JSON String of the form {"items": [...]}, where
  # each entry is the #to_hash of one item found at +location+.
  def self.to_json(location)
    hash = { :items => get_items(location).map { |item| item.to_hash } }
    JSON.pretty_generate(hash)
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Hypermicrodata
  # Wraps a Nokogiri-parsed HTML document and the Items built from its
  # itemscope nodes.
  class Document

    attr_reader :items, :doc

    # content           - HTML handed to Nokogiri::HTML.
    # page_url          - passed through unchanged to every Item (may be nil).
    # filter_xpath_attr - optional XPath predicate fragment such as
    #                     "@data-main-item"; when given, itemscope nodes
    #                     matching it are preferred over the default selection.
    #
    # Items are extracted immediately and exposed through #items.
    def initialize(content, page_url=nil, filter_xpath_attr=nil)
      @doc = Nokogiri::HTML(content)
      @page_url = page_url
      @filter_xpath_attr = filter_xpath_attr
      @items = extract_items
    end

    # Builds one Item per selected itemscope node.
    #
    # If a filter is set and matches at least one itemscope node, only those
    # nodes are used. Otherwise the fallback selects itemscope nodes that do
    # not themselves carry an itemprop attribute.
    #
    # Returns a new collection of Item objects on every call.
    def extract_items
      itemscopes = []
      if @filter_xpath_attr
        itemscopes = @doc.xpath("//*[#{@filter_xpath_attr} and @itemscope]")
        # Diagnostics belong on stderr: stdout may be carrying the JSON output.
        warn "XPath //*[#{@filter_xpath_attr}] is not found. root node is used." if itemscopes.empty?
      end
      itemscopes = @doc.xpath('self::*[@itemscope] | .//*[@itemscope and not(@itemprop)]') if itemscopes.empty?

      itemscopes.map do |itemscope|
        Item.new(itemscope, @page_url)
      end
    end

  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Hypermicrodata
  # Parses an HTML string once and renders the result as JSON through one of
  # the serializer classes.
  class Extract
    # html    - HTML source handed to Document.
    # options - Hash; recognised keys:
    #           :location       - stored and passed to Document and serializers
    #           :profile_path   - stored and passed to serializers
    #           :data_attr_name - suffix of the data-* attribute used as the
    #                             Document filter (defaults to 'main-item')
    def initialize(html, options = {})
      @location = options[:location]
      @profile_path = options[:profile_path]
      attr_suffix = options[:data_attr_name] || 'main-item'
      @document = Hypermicrodata::Document.new(html, @location, "@data-#{attr_suffix}")
    end

    # format  - :hal or :uber select the matching serializer; anything else
    #           (including the default :plain) uses Serializer::Base.
    # options - forwarded untouched to the serializer's #to_json.
    #
    # Returns whatever the chosen serializer's #to_json returns.
    def to_json(format = :plain, options = {})
      serializer_class =
        case format
        when :hal  then Hypermicrodata::Serializer::Hal
        when :uber then Hypermicrodata::Serializer::Uber
        else            Hypermicrodata::Serializer::Base
        end

      serializer = serializer_class.new(@document, @location, @profile_path)
      serializer.to_json(options)
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Hypermicrodata
  # One microdata item rooted at a DOM node.
  #
  # On construction the node's itemtype/itemid are read, elements referenced
  # by its itemref attribute are parsed, and then its child elements are
  # walked. Results are grouped into two Hashes of Arrays:
  #   properties - keyed by property name
  #   links      - keyed by rel
  class Item
    attr_reader :type, :properties, :links, :id

    # top_node - the node this item is built from; must respond to
    #            #attribute and #search.
    # page_url - passed through to ItempropParser and FormParser.
    def initialize(top_node, page_url)
      @top_node = top_node
      @type = extract_itemtype
      @id = extract_itemid
      @properties = {}
      @links = {}
      @page_url = page_url
      add_itemref_properties(@top_node)
      parse_elements(extract_elements(@top_node))
    end

    # Returns a Hash with :properties and :links always present and :id /
    # :type only when set. Entries that carry a nested item are replaced by
    # that item's own #to_hash; all others contribute their value.
    def to_hash
      hash = {}
      hash[:id] = id if id
      hash[:type] = type if type
      hash[:properties] = serialize_groups(properties)
      hash[:links] = serialize_groups(links)
      hash
    end

    # Returns every property and link object of this item as one flat Array
    # with duplicates removed (Array#|).
    def all_properties_and_links
      properties.values.flatten | links.values.flatten
    end

    private

    # Converts a {key => [entry, ...]} Hash into {key => [plain value, ...]}.
    def serialize_groups(groups)
      groups.each_with_object({}) do |(key, entries), result|
        result[key] = entries.map do |entry|
          entry.item ? entry.item.to_hash : entry.value
        end
      end
    end

    # Direct child elements of +node+.
    def extract_elements(node)
      node.search('./*')
    end

    # Value of the itemid attribute, or nil when absent.
    def extract_itemid
      (value = @top_node.attribute('itemid')) ? value.value : nil
    end

    # Array of space-separated itemtype values, or nil when absent.
    def extract_itemtype
      (value = @top_node.attribute('itemtype')) ? value.value.split(' ') : nil
    end

    def parse_elements(elements)
      elements.each {|element| parse_element(element)}
    end

    # Records +element+ if it is a property, link element or form, then
    # recurses into its children unless the element opens its own itemscope.
    def parse_element(element)
      itemscope = element.attribute('itemscope')
      itemprop = element.attribute('itemprop')
      internal_elements = extract_elements(element)
      add_itemprop(element) if itemscope || itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
      add_form(element) if element.name == 'form'
      parse_elements(internal_elements) if internal_elements && !itemscope
    end

    # Add an 'itemprop' to the properties
    #
    # A link that has neither names nor rels is filed under the generic
    # 'link' rel; otherwise the property is filed under each of its names
    # and each of its rels.
    def add_itemprop(element)
      property = ItempropParser.parse(element, @page_url)
      if property.link? && property.names.empty? && property.rels.empty?
        (@links['link'] ||= []) << property
      else
        property.names.each { |name| (@properties[name] ||= []) << property }
        property.rels.each { |rel| (@links[rel] ||= []) << property }
      end
    end

    # Add any properties referred to by 'itemref'
    def add_itemref_properties(element)
      itemref = element.attribute('itemref')
      if itemref
        itemref.value.split(' ').each {|id| parse_elements(find_with_id(id))}
      end
    end

    # Records every submit button of a form under each of its names and
    # under each of its rels ('submit' when it declares none).
    def add_form(element)
      submit_buttons = FormParser.parse(element, @page_url)
      submit_buttons.each do |submit_button|
        submit_button.names.each { |name| (@properties[name] ||= []) << submit_button }
        if submit_button.rels.empty?
          (@links['submit'] ||= []) << submit_button
        else
          submit_button.rels.each { |rel| (@links[rel] ||= []) << submit_button }
        end
      end
    end

    # Find an element with a matching id
    #
    # The id comes straight from the document's itemref attribute, so it is
    # quoted safely instead of being interpolated raw into the expression.
    def find_with_id(id)
      @top_node.search("//*[@id=#{xpath_string_literal(id)}]")
    end

    # Renders +text+ as an XPath 1.0 string literal. XPath has no escape
    # sequences, so a value containing both quote kinds is assembled with
    # concat() from single-quoted pieces joined by a double-quoted "'".
    def xpath_string_literal(text)
      return "'#{text}'" unless text.include?("'")
      return "\"#{text}\"" unless text.include?('"')

      pieces = text.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q(, "'", ))})"
    end

  end
end
|