microdata 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in microdata.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jason Ronallo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,41 @@
1
+ # Microdata
2
+
3
+ Ruby library for extracting HTML5 Microdata
4
+
5
+ ## Story
6
+
7
+ Most of the code here was extracted from [Mida](https://github.com/LawrenceWoodman/mida) by Lawrence Woodman. The goal was a simpler, more generic Microdata parser without Mida's vocabulary awareness and other features. This gem is tested under Ruby 1.9.3 and Ruby 2.0.0, though the test coverage could be better.
8
+
9
+ ## Installation
10
+
11
+ This library has not been released to RubyGems.org yet. Once it is, the intention is for it to be installable as follows.
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'microdata'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install microdata
24
+
25
+ ## Usage
26
+
27
+ The command-line tool `microdata.rb` can be run as follows:
28
+
29
+ ```
30
+ microdata.rb http://d.lib.ncsu.edu/collections/catalog/mc00383-001-ff0006-001-001_0038
31
+ ```
32
+
33
+ Output is in pretty JSON format.
34
+
35
+ ## Contributing
36
+
37
+ 1. Fork it
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
40
+ 4. Push to the branch (`git push origin my-new-feature`)
41
+ 5. Create new Pull Request
@@ -0,0 +1,10 @@
1
# Rake tasks for the microdata gem.
require 'bundler/gem_tasks'
require 'rake/testtask'

# Define the test task with the test directory on the load path, so test
# files can `require 'test_helper'`.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running plain `rake` runs the test task.
desc 'Run tests'
task default: :test
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby

# microdata.rb
# Extract HTML5 Microdata and output JSON
#
# Usage: microdata.rb LOCATION
#   LOCATION is passed unchanged to Microdata.to_json.
$LOAD_PATH.unshift File.expand_path('../lib', File.dirname(__FILE__))
require 'microdata'

location = ARGV[0]

# Without this guard a missing argument is handed to the library as nil and
# fails there with a backtrace instead of a usage hint.
if location.nil? || location.empty?
  abort "Usage: #{File.basename($PROGRAM_NAME)} LOCATION"
end

puts Microdata.to_json(location)
@@ -0,0 +1,42 @@
1
+ require "microdata/version"
2
+ require "nokogiri"
3
+ require "microdata/item"
4
+ require "microdata/document"
5
+ require "microdata/itemprop"
6
+ require 'open-uri'
7
+ require 'json'
8
+ require 'uri'
9
+
10
# Top-level entry points for extracting HTML5 Microdata from a document.
module Microdata

  # The element-name => attribute table that used to be sketched here in a
  # commented-out PROPERTY_VALUES hash lives in
  # Microdata::Itemprop::NON_TEXTCONTENT_ELEMENTS.

  # Locations matching this pattern are fetched through open-uri; anything
  # else is treated as a path on the local filesystem.
  REMOTE_LOCATION = %r{\A(?:https?|ftp)://}i

  # Extract the top-level items of the document at +location+.
  # [location] URL (http, https or ftp) or local file path of the HTML
  #            document; it is also used as the page url for the items
  # Returns the items collected by Microdata::Document.
  def self.get_items(location)
    open_location(location) do |io|
      # Document extracts its items on construction, so read them back
      # instead of running the extraction a second time.
      Microdata::Document.new(io, location).items
    end
  end

  # Extract the items of the document at +location+ and render them as
  # pretty-printed JSON of the form {"items": [...]}.
  # [location] same as for get_items
  def self.to_json(location)
    hash = { :items => get_items(location).map { |item| item.to_hash } }
    JSON.pretty_generate hash
  end

  # Open +location+ and yield the IO to the block, closing it afterwards.
  # Kernel#open is deliberately avoided: it runs a subprocess for strings
  # starting with "|" and no longer opens URLs on Ruby 3.0 and later.
  def self.open_location(location, &block)
    target = location.to_s
    if REMOTE_LOCATION =~ target
      URI.parse(target).open(&block)
    else
      File.open(target, &block)
    end
  end
  private_class_method :open_location

end
@@ -0,0 +1,22 @@
1
module Microdata
  # Parses an HTML document and collects its top-level Microdata items.
  class Document

    # The top-level items, extracted once when the document is created
    attr_reader :items

    # Create a new Document
    # [content]  The HTML to parse; handed straight to Nokogiri::HTML
    # [page_url] The url of the page, passed on to every Item
    def initialize(content, page_url=nil)
      @doc = Nokogiri::HTML(content)
      @page_url = page_url
      @items = extract_items
    end

    # Build an Item for every element that has an itemscope attribute but
    # no itemprop attribute. Elements carrying both are not matched here;
    # Itemprop turns them into nested Items while a parent Item is parsed.
    # Always returns an Array, empty when the document has no such element.
    def extract_items
      @doc.search('//*[@itemscope and not(@itemprop)]').map do |itemscope|
        Item.new(itemscope, @page_url)
      end
    end

  end
end
@@ -0,0 +1,65 @@
1
module Microdata
  # One Microdata item: an element carrying itemscope together with the
  # properties found among its descendants.
  class Item
    attr_reader :type, :properties, :id

    # [top_node] The element that carries the itemscope attribute
    # [page_url] The url of the page, handed on to Itemprop
    def initialize(top_node, page_url)
      @top_node = top_node
      @page_url = page_url
      @properties = {}
      @type = extract_itemtype
      @id = extract_itemid
      parse_elements(extract_elements(@top_node))
    end

    # Plain-Hash form of the item. Nested Items are converted recursively;
    # :id is only present when the item has an itemid.
    def to_hash
      serialized = {}
      properties.each do |name, values|
        serialized[name] = values.map do |value|
          value.is_a?(Item) ? value.to_hash : value
        end
      end

      result = {}
      result[:id] = id if id
      result[:type] = type
      result[:properties] = serialized
      result
    end

    private

    # Direct child elements of +node+
    def extract_elements(node)
      node.search('./*')
    end

    # Value of the itemid attribute, or nil when it is absent
    def extract_itemid
      attribute = @top_node.attribute('itemid')
      attribute.value if attribute
    end

    # Space-separated itemtype values as an Array, or nil when absent
    def extract_itemtype
      attribute = @top_node.attribute('itemtype')
      attribute.value.split(' ') if attribute
    end

    def parse_elements(elements)
      elements.each { |element| parse_element(element) }
    end

    # Record the element's properties, then walk its children unless the
    # element opens an item of its own (its descendants belong to that item).
    def parse_element(element)
      scoped = element.attribute('itemscope')
      named = element.attribute('itemprop')
      children = extract_elements(element)

      add_itemprop(element) if scoped || named
      return if scoped || !children

      parse_elements(children)
    end

    # Add an 'itemprop' to the properties
    def add_itemprop(itemprop)
      Itemprop.parse(itemprop, @page_url).each do |name, value|
        @properties[name] ||= []
        @properties[name] << value
      end
    end

  end
end
@@ -0,0 +1,91 @@
1
module Microdata
  # Class that parses itemprop elements
  class Itemprop

    # Elements whose property value comes from an attribute instead of
    # their text content, mapped to the name of that attribute.
    NON_TEXTCONTENT_ELEMENTS = {
      'a' => 'href',        'area' => 'href',
      'audio' => 'src',     'embed' => 'src',
      'iframe' => 'src',    'img' => 'src',
      'link' => 'href',     'meta' => 'content',
      'object' => 'data',   'source' => 'src',
      'time' => 'datetime', 'track' => 'src',
      'video' => 'src'
    }.freeze

    # Attributes whose values are urls and are made absolute
    URL_ATTRIBUTES = ['data', 'href', 'src'].freeze

    # A Hash representing the properties.
    # Hash is of the form {'property name' => 'value'}
    attr_reader :properties

    # Create a new Itemprop object
    # [element] The itemprop element to be parsed
    # [page_url] The url of the page, including filename, used to form
    #            absolute urls
    def initialize(element, page_url=nil)
      @element, @page_url = element, page_url
      @properties = extract_properties
    end

    # Parse the element and return a hash representing the properties.
    # Hash is of the form {'property name' => 'value'}
    # [element] The itemprop element to be parsed
    # [page_url] The url of the page, including filename, used to form
    #            absolute urls
    def self.parse(element, page_url=nil)
      self.new(element, page_url).properties
    end

    private

    # Map every name listed in the itemprop attribute to the element's
    # value. The value is computed once and shared by all names, so a nested
    # Item is not re-parsed per name, and it is not computed at all when the
    # element names no property.
    def extract_properties
      prop_names = extract_property_names
      return {} if prop_names.empty?

      value = extract_property
      prop_names.each_with_object({}) do |name, memo|
        memo[name] = value
      end
    end

    # Resolve +url+ against the page url. The url is returned unchanged when
    # it is already absolute, when no page url is known, or when either url
    # cannot be parsed or merged (for example a malformed href, or a page
    # "url" that is really a local file path).
    def make_absolute_url(url)
      return url unless URI.parse(url).relative?
      return url if @page_url.nil?

      URI.parse(@page_url).merge(url).to_s
    rescue URI::Error
      url
    end

    def non_textcontent_element?(element)
      NON_TEXTCONTENT_ELEMENTS.has_key?(element)
    end

    def url_attribute?(attribute)
      URL_ATTRIBUTES.include?(attribute)
    end

    # Names listed in the itemprop attribute, or [] when it is absent
    def extract_property_names
      itemprop_attr = @element.attribute('itemprop')
      itemprop_attr ? itemprop_attr.value.split() : []
    end

    # The element's stripped text content
    def text_content
      @element.inner_text.strip
    end

    # Value of a property that is not itself an item: the designated
    # attribute for the elements in NON_TEXTCONTENT_ELEMENTS, the text
    # content for everything else.
    def extract_property_value
      tag = @element.name
      return text_content unless non_textcontent_element?(tag)

      attribute = NON_TEXTCONTENT_ELEMENTS[tag]
      attribute_node = @element.attribute(attribute)
      if attribute_node.nil?
        # A missing attribute yields the empty string; <time> instead falls
        # back to its text content, as the Microdata spec prescribes.
        return tag == 'time' ? text_content : ''
      end

      value = attribute_node.value
      url_attribute?(attribute) ? make_absolute_url(value) : value
    end

    # An element with itemscope is a nested Item; anything else is a
    # plain value.
    def extract_property
      if @element.attribute('itemscope')
        Item.new(@element, @page_url)
      else
        extract_property_value
      end
    end

  end
end
@@ -0,0 +1,3 @@
1
module Microdata
  # Version of the microdata gem; read by microdata.gemspec
  VERSION = '0.0.1'
end
@@ -0,0 +1,25 @@
1
# coding: utf-8

# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'microdata/version'

Gem::Specification.new do |gem|
  gem.name        = 'microdata'
  gem.version     = Microdata::VERSION
  gem.authors     = ['Jason Ronallo']
  gem.email       = ['jronallo@gmail.com']
  gem.summary     = 'Ruby library for extracting HTML5 Microdata'
  gem.description = 'HTML5 Microdata extractor'
  gem.homepage    = ''
  gem.license     = 'MIT'

  # Package every file tracked by git; executables and test files are
  # picked out of that list by directory prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_dependency 'nokogiri'

  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'rake'
end
@@ -0,0 +1,22 @@
1
+ <!doctype html>
2
+ <html>
3
+ <!-- shameless -->
4
+ <head>
5
+ <title>Jason Ronallo</title>
6
+ </head>
7
+
8
+ <body>
9
+ <span itemscope itemtype="http://schema.org/Person"
10
+ itemid="http://ronallo.com#me">
11
+ <a itemprop="url" href="http://twitter.com/ronallo">
12
+ <span itemprop="name">Jason Ronallo</span>
13
+ </a> is the
14
+ <span itemprop="jobTitle">Associate Head of Digital Library Initiatives</span> at
15
+ <span itemprop="affiliation" itemscope itemtype="http://schema.org/Library" itemid="http://lib.ncsu.edu">
16
+ <span itemprop="name">
17
+ <a itemprop="url" href="http://www.lib.ncsu.edu">NCSU Libraries</a>
18
+ </span>
19
+ </span>.
20
+ </span>
21
+ </body>
22
+ </html>
@@ -0,0 +1,2 @@
1
+ require 'test/unit'
2
+ require 'microdata'
@@ -0,0 +1,36 @@
1
+ require 'test_helper'
2
+
3
# Parses test/data/example.html and checks the extracted items.
class TestParse < Test::Unit::TestCase

  # Resolved relative to this file so the tests do not depend on the
  # directory they are started from.
  FIXTURE = File.expand_path('../data/example.html', __FILE__)

  def setup
    @items = Microdata.get_items(FIXTURE)
  end

  def test_top_item_type
    assert_equal ['http://schema.org/Person'], @items.first.type
  end

  def test_top_item_id
    assert_equal "http://ronallo.com#me", @items.first.id
  end

  def test_top_item_properties
    properties = @items.first.properties
    assert_equal ["Jason Ronallo"], properties['name']
    assert_equal ["http://twitter.com/ronallo"], properties['url']
    assert_equal ["Associate Head of Digital Library Initiatives"], properties['jobTitle']
  end

  def test_nested_item
    item = @items.first.properties['affiliation'][0]
    assert_equal ['http://schema.org/Library'], item.type
    assert_equal "http://lib.ncsu.edu", item.id
  end

  # The nested item's own properties are kept on the nested item and do not
  # leak into the top-level item.
  def test_nested_item_properties
    properties = @items.first.properties['affiliation'][0].properties
    assert_equal ['NCSU Libraries'], properties['name']
    assert_equal ['http://www.lib.ncsu.edu'], properties['url']
  end

end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: microdata
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason Ronallo
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: bundler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '1.3'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '1.3'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rake
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: HTML5 Microdata extractor
63
+ email:
64
+ - jronallo@gmail.com
65
+ executables:
66
+ - microdata.rb
67
+ extensions: []
68
+ extra_rdoc_files: []
69
+ files:
70
+ - .gitignore
71
+ - Gemfile
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/microdata.rb
76
+ - lib/microdata.rb
77
+ - lib/microdata/document.rb
78
+ - lib/microdata/item.rb
79
+ - lib/microdata/itemprop.rb
80
+ - lib/microdata/version.rb
81
+ - microdata.gemspec
82
+ - test/data/example.html
83
+ - test/test_helper.rb
84
+ - test/test_parse.rb
85
+ homepage: ''
86
+ licenses:
87
+ - MIT
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 1.8.25
107
+ signing_key:
108
+ specification_version: 3
109
+ summary: Ruby library for extracting HTML5 Microdata
110
+ test_files:
111
+ - test/data/example.html
112
+ - test/test_helper.rb
113
+ - test/test_parse.rb
114
+ has_rdoc: