hypermicrodata 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.travis.yml +8 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +100 -0
- data/Rakefile +10 -0
- data/bin/hypermicrodata.rb +25 -0
- data/hypermicrodata.gemspec +28 -0
- data/lib/hypermicrodata.rb +37 -0
- data/lib/hypermicrodata/document.rb +27 -0
- data/lib/hypermicrodata/extract.rb +22 -0
- data/lib/hypermicrodata/item.rb +113 -0
- data/lib/hypermicrodata/itemprop_parser.rb +114 -0
- data/lib/hypermicrodata/link.rb +7 -0
- data/lib/hypermicrodata/property.rb +27 -0
- data/lib/hypermicrodata/rails/html_based_json_renderer.rb +35 -0
- data/lib/hypermicrodata/serializer/base.rb +24 -0
- data/lib/hypermicrodata/serializer/hal.rb +47 -0
- data/lib/hypermicrodata/serializer/jsonld.rb +44 -0
- data/lib/hypermicrodata/serializer/uber.rb +100 -0
- data/lib/hypermicrodata/submit_button.rb +105 -0
- data/lib/hypermicrodata/version.rb +3 -0
- data/lib/uberous/uber.rb +104 -0
- data/test/data/example.html +22 -0
- data/test/data/example_itemref.html +16 -0
- data/test/data/example_with_no_itemscope.html +22 -0
- data/test/test_helper.rb +3 -0
- data/test/test_itemref.rb +19 -0
- data/test/test_json.rb +15 -0
- data/test/test_parse.rb +36 -0
- metadata +139 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6aa222d1d9f2fd94e7eabda85a111de9b63b17ba
|
4
|
+
data.tar.gz: 624be0e7d6c825c69ed224508f6286da2911cd8e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 094a2d0285349d16ff74308ce8756d5a2510f67c0ab564bd93112823c488bc0eeee030725feb87fa6b2f89e2ab4805407a2a536ded47262c3125f81ea1cd9901
|
7
|
+
data.tar.gz: 6753e62b18ea5b2e4b0550b5fcaaf2eeb5f3101efbb61c5af745901285a3d4621761e2b75a8201e670274350cab0a5a2f7bebac8570e4e87357a6e581626b700
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jason Ronallo, Toru KAWAMURA
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
# Hypermicrodata
|
2
|
+
|
3
|
+
Ruby library for extracting HTML5 Microdata with Hypermedia
|
4
|
+
|
5
|
+
[![Build Status](https://travis-ci.org/tkawa/hypermicrodata.png)](https://travis-ci.org/tkawa/hypermicrodata)
|
6
|
+
|
7
|
+
## Story
|
8
|
+
|
9
|
+
Most of the code here was extracted from [Mida](https://github.com/LawrenceWoodman/mida) by Lawrence Woodman. This was done in order to have a simpler, more generic Microdata parser without all the vocabulary awareness and other features. This gem is also tested under Ruby 1.9.3 and Ruby 2.0.0, though it could be better tested.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
This library has not been released to RubyGems.org yet; once it is, you will be able to install it as follows.
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'hypermicrodata'
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
$ bundle
|
22
|
+
|
23
|
+
Or install it yourself as:
|
24
|
+
|
25
|
+
$ gem install hypermicrodata
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Basic
|
30
|
+
|
31
|
+
```
|
32
|
+
json = Hypermicrodata::Extract.new(html).to_json(:uber)
|
33
|
+
```
|
34
|
+
|
35
|
+
Supported formats are
|
36
|
+
|
37
|
+
- application/vnd.amundsen-uber+json (:uber)
|
38
|
+
- application/hal+json (:hal)
|
39
|
+
- application/json (:plain)
|
40
|
+
|
41
|
+
### Rails Integration
|
42
|
+
|
43
|
+
When you use this in Rails, you don't need to extract data manually.
|
44
|
+
|
45
|
+
/app/controllers/people_controller.rb
|
46
|
+
|
47
|
+
```
|
48
|
+
class PeopleController < ApplicationController
|
49
|
+
before_action :set_message, only: %i(show edit update destroy)
|
50
|
+
include Hypermicrodata::Rails::HtmlBasedJsonRenderer
|
51
|
+
...
|
52
|
+
end
|
53
|
+
```
|
54
|
+
|
55
|
+
/app/views/people/show.html.haml
|
56
|
+
|
57
|
+
```
|
58
|
+
.person{itemscope: true, itemtype: 'http://schema.org/Person',
|
59
|
+
itemid: person_url(@person), data: {main_item: true}}
|
60
|
+
.media
|
61
|
+
.media-image.pull-left
|
62
|
+
= image_tag @person.picture_path, alt: '', itemprop: 'image'
|
63
|
+
.media-body
|
64
|
+
%h1.media-heading
|
65
|
+
%span{itemprop: 'name'}= @person.name
|
66
|
+
= link_to 'collection', people_path, rel: 'collection', itemprop: 'isPartOf'
|
67
|
+
```
|
68
|
+
|
69
|
+
And you can serve the following JSON:
|
70
|
+
|
71
|
+
```
|
72
|
+
GET /people/1 HTTP/1.1
|
73
|
+
Host: www.example.com
|
74
|
+
Accept: application/vnd.amundsen-uber+json
|
75
|
+
```
|
76
|
+
|
77
|
+
```
|
78
|
+
{
|
79
|
+
"uber": {
|
80
|
+
"version": "1.0",
|
81
|
+
"data": [{
|
82
|
+
"url": "http://www.example.com/people/1",
|
83
|
+
"name": "Person",
|
84
|
+
"data": [
|
85
|
+
{ "name": "image", "value": "/assets/bob.png" },
|
86
|
+
{ "name": "name", "value": "Bob Smith" },
|
87
|
+
{ "name": "isPartOf", "rel": "collection", "url": "/people" }
|
88
|
+
]
|
89
|
+
}]
|
90
|
+
}
|
91
|
+
}
|
92
|
+
```
|
93
|
+
|
94
|
+
## Contributing
|
95
|
+
|
96
|
+
1. Fork it
|
97
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
98
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
99
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
100
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#! /usr/bin/env ruby

# hypermicrodata.rb
# Extract HTML5 Microdata and output JSON
#
# Usage: hypermicrodata.rb LOCATION
# LOCATION is either a local path or an http(s)/ftp URL.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
require 'hypermicrodata'

location = ARGV[0]
abort "Usage: #{File.basename($PROGRAM_NAME)} LOCATION" if location.nil? || location.empty?

# Kernel#open stopped fetching URLs in Ruby 3.0, so prefer URI.open when the
# running Ruby provides it; older Rubies still go through open-uri's
# Kernel#open override.
content =
  if location =~ %r{\A(?:https?|ftp)://}i && URI.respond_to?(:open)
    URI.open(location)
  else
    open(location)
  end

document = Hypermicrodata::Document.new(content, location)
# The constructor has already extracted the items; reuse them instead of
# walking the document a second time.
items = document.items

# Check nil before empty? so a nil result cannot raise NoMethodError.
if items.nil? || items.empty?
  puts "No Microdata items found."
  itemprops = document.doc.search('//*[@itemprop]')
  unless itemprops.empty?
    puts "There are some itemprops, which means no top level items with an itemscope have been found."
  end
else
  hash = { :items => items.map { |item| item.to_hash } }
  puts JSON.pretty_generate(hash)
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for hypermicrodata.

# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'hypermicrodata/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'hypermicrodata'
  spec.version     = Hypermicrodata::VERSION
  spec.license     = 'MIT'
  spec.homepage    = 'https://github.com/tkawa/hypermicrodata'
  spec.summary     = 'Ruby library for extracting HTML5 Microdata with Hypermedia'
  spec.description = 'HTML5 Microdata extractor with Hypermedia'

  # People
  spec.authors = ['Jason Ronallo', 'Toru KAWAMURA']
  spec.email   = ['jronallo@gmail.com', 'tkawa@4bit.net']

  # Packaged files: everything tracked by git, with executables and tests
  # picked out by directory.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependencies
  %w[nokogiri mechanize halibut multi_json].each do |gem_name|
    spec.add_dependency gem_name
  end

  # Development dependencies
  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rake'
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require "hypermicrodata/version"
|
2
|
+
require "uberous/uber"
|
3
|
+
require "nokogiri"
|
4
|
+
require "mechanize"
|
5
|
+
require "hypermicrodata/item"
|
6
|
+
require "hypermicrodata/document"
|
7
|
+
require "hypermicrodata/property"
|
8
|
+
require "hypermicrodata/link"
|
9
|
+
require "hypermicrodata/itemprop_parser"
|
10
|
+
require "hypermicrodata/submit_button"
|
11
|
+
require "hypermicrodata/serializer/base"
|
12
|
+
require "hypermicrodata/serializer/hal"
|
13
|
+
require "hypermicrodata/serializer/uber"
|
14
|
+
require "hypermicrodata/extract"
|
15
|
+
require "hypermicrodata/rails/html_based_json_renderer"
|
16
|
+
require 'open-uri'
|
17
|
+
require 'json'
|
18
|
+
require 'uri'
|
19
|
+
|
20
|
+
# Top-level convenience API: fetch a document from a location and return its
# Microdata items, either as objects or as pretty-printed JSON.
module Hypermicrodata

  # Extracts the Microdata items found at +location+.
  #
  # location - a local file path or an http(s)/ftp URL; it is also passed to
  #            the Document as its page URL.
  #
  # Returns the Document's items. The underlying IO is closed before
  # returning (the Document is built from it inside the block).
  def self.get_items(location)
    open_location(location) do |content|
      # Document#initialize already extracts and stores the items, so read
      # them back rather than extracting a second time.
      Hypermicrodata::Document.new(content, location).items
    end
  end

  # Returns a pretty-printed JSON string of the form {"items": [...]} where
  # each entry is the +to_hash+ of an item found at +location+.
  def self.to_json(location)
    hash = { :items => get_items(location).map { |item| item.to_hash } }
    JSON.pretty_generate hash
  end

  # Opens +location+ and yields the IO to the block, closing it afterwards.
  #
  # Remote URLs go through open-uri (URI.open where available, otherwise the
  # Kernel.open override that older open-uri versions install). Everything
  # else is opened strictly as a file, so a string such as "| cmd" is never
  # run as a subprocess the way bare Kernel#open would.
  def self.open_location(location, &block)
    if location.to_s =~ %r{\A(?:https?|ftp)://}i
      URI.respond_to?(:open) ? URI.open(location, &block) : Kernel.open(location, &block)
    else
      File.open(location, &block)
    end
  end
  private_class_method :open_location

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Hypermicrodata
  # Parses HTML with Nokogiri and collects the Microdata items found in it.
  class Document

    # items - the Items extracted when the document was constructed.
    # doc   - the parsed Nokogiri document.
    attr_reader :items, :doc

    # content           - HTML handed to Nokogiri::HTML.
    # page_url          - passed unchanged to every Item that is created.
    # filter_xpath_attr - optional XPath predicate fragment (for example
    #                     "@data-main-item"); when given, only itemscope
    #                     elements matching it are used, if there are any.
    def initialize(content, page_url=nil, filter_xpath_attr=nil)
      @doc = Nokogiri::HTML(content)
      @page_url = page_url
      @filter_xpath_attr = filter_xpath_attr
      @items = extract_items
    end

    # Builds an Item for each selected itemscope element.
    #
    # With a filter, elements matching "//*[<filter> and @itemscope]" are
    # used. When there is no filter, or the filter matches nothing, the
    # selection falls back to every itemscope element that is not itself an
    # itemprop.
    #
    # Returns a new Array of Items on every call.
    def extract_items
      itemscopes = []
      if @filter_xpath_attr
        itemscopes = @doc.xpath("//*[#{@filter_xpath_attr} and @itemscope]")
        # Diagnostic goes to stderr so it never mixes with data written to
        # stdout by the caller.
        warn "XPath //*[#{@filter_xpath_attr}] is not found. root node is used." if itemscopes.empty?
      end
      itemscopes = @doc.xpath('self::*[@itemscope] | .//*[@itemscope and not(@itemprop)]') if itemscopes.empty?

      itemscopes.map { |itemscope| Item.new(itemscope, @page_url) }
    end

  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Hypermicrodata
  # Builds a Document from raw HTML and renders it as JSON through one of
  # the serializers.
  class Extract
    # html    - HTML source handed to Hypermicrodata::Document.
    # options - Hash; recognised keys:
    #           :location       - passed to the Document and the serializer
    #           :profile_path   - passed to the serializer
    #           :data_attr_name - suffix of the data-* attribute used as the
    #                             Document filter (defaults to 'main-item')
    def initialize(html, options = {})
      @location     = options[:location]
      @profile_path = options[:profile_path]
      data_attr_name = options[:data_attr_name] || 'main-item'
      @document = Hypermicrodata::Document.new(html, @location, "@data-#{data_attr_name}")
    end

    # format  - :hal or :uber; any other value selects the base serializer.
    # options - forwarded to the serializer's to_json.
    #
    # Returns whatever the chosen serializer's to_json returns.
    def to_json(format = :plain, options = {})
      serializer_class =
        if format == :hal
          Hypermicrodata::Serializer::Hal
        elsif format == :uber
          Hypermicrodata::Serializer::Uber
        else
          Hypermicrodata::Serializer::Base
        end
      serializer_class.new(@document, @location, @profile_path).to_json(options)
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Hypermicrodata
  # One Microdata item rooted at a single element.
  #
  # On construction the element's descendants (and any elements named by its
  # itemref attribute) are walked; what ItempropParser and FormParser return
  # for them is grouped by name into +properties+ and by rel into +links+.
  class Item
    # type       - Array of the space-separated itemtype values, or nil.
    # id         - the itemid attribute value, or nil.
    # properties - Hash of property name => Array of property objects.
    # links      - Hash of rel => Array of link/submit-button objects.
    attr_reader :type, :properties, :links, :id

    # top_node - element the item is rooted at; it is queried through
    #            +attribute+ and +search+.
    # page_url - passed unchanged to ItempropParser.parse / FormParser.parse.
    def initialize(top_node, page_url)
      @top_node = top_node
      @type = extract_itemtype
      @id = extract_itemid
      @properties = {}
      @links = {}
      @page_url = page_url
      add_itemref_properties(@top_node)
      parse_elements(extract_elements(@top_node))
    end

    # Returns a Hash with :properties and :links (always present) plus :id
    # and :type when set. Entries that carry a nested item are expanded
    # recursively through that item's to_hash; others contribute their value.
    def to_hash
      hash = {}
      hash[:id] = id if id
      hash[:type] = type if type
      hash[:properties] = serialize_groups(properties)
      hash[:links] = serialize_groups(links)
      hash
    end

    # Returns every property and link object once (an object registered under
    # both a name and a rel is not repeated).
    def all_properties_and_links
      properties.values.flatten | links.values.flatten
    end

    private

    # Converts a name => [entries] Hash into name => [serialised values].
    def serialize_groups(groups)
      groups.each_with_object({}) do |(key, entries), result|
        result[key] = entries.map { |entry| entry.item ? entry.item.to_hash : entry.value }
      end
    end

    # Direct child elements of +node+.
    def extract_elements(node)
      node.search('./*')
    end

    def extract_itemid
      (value = @top_node.attribute('itemid')) ? value.value : nil
    end

    def extract_itemtype
      (value = @top_node.attribute('itemtype')) ? value.value.split(' ') : nil
    end

    def parse_elements(elements)
      elements.each { |element| parse_element(element) }
    end

    # Registers +element+ if it is an itemscope, an itemprop or a link
    # element, registers its submit buttons if it is a form, then descends
    # into its children. Children of a nested itemscope are not visited here.
    def parse_element(element)
      itemscope = element.attribute('itemscope')
      itemprop = element.attribute('itemprop')
      internal_elements = extract_elements(element)
      add_itemprop(element) if itemscope || itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
      add_form(element) if element.name == 'form'
      parse_elements(internal_elements) if internal_elements && !itemscope
    end

    # Add an 'itemprop' to the properties
    #
    # A link with neither names nor rels is filed under the generic 'link'
    # rel; anything else is filed under each of its names and each of its rels.
    def add_itemprop(element)
      property = ItempropParser.parse(element, @page_url)
      if property.link? && property.names.empty? && property.rels.empty?
        (@links['link'] ||= []) << property
      else
        property.names.each { |name| (@properties[name] ||= []) << property }
        property.rels.each { |rel| (@links[rel] ||= []) << property }
      end
    end

    # Add any properties referred to by 'itemref'
    def add_itemref_properties(element)
      itemref = element.attribute('itemref')
      return unless itemref
      itemref.value.split(' ').each { |id| parse_elements(find_with_id(id)) }
    end

    # Registers every submit button FormParser finds in the form: under each
    # of its names as a property, and under each of its rels as a link (or
    # under 'submit' when it has no rels).
    def add_form(element)
      submit_buttons = FormParser.parse(element, @page_url)
      submit_buttons.each do |submit_button|
        submit_button.names.each { |name| (@properties[name] ||= []) << submit_button }
        if submit_button.rels.empty?
          (@links['submit'] ||= []) << submit_button
        else
          submit_button.rels.each { |rel| (@links[rel] ||= []) << submit_button }
        end
      end
    end

    # Find an element with a matching id
    def find_with_id(id)
      @top_node.search("//*[@id=#{xpath_literal(id)}]")
    end

    # Quotes +text+ as an XPath 1.0 string literal. XPath has no escape
    # character, so the quote style is chosen to avoid the quotes in the
    # text, and concat() is used when the text contains both kinds.
    def xpath_literal(text)
      return "'#{text}'" unless text.include?("'")
      return %("#{text}") unless text.include?('"')
      pieces = text.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q(, "'", ))})"
    end

  end
end
|