microdata 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Resolve gems from the public RubyGems index.
source 'https://rubygems.org'

# All dependencies are declared in microdata.gemspec; load them from there.
gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jason Ronallo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,41 @@
1
+ # Microdata
2
+
3
+ Ruby library for extracting HTML5 Microdata
4
+
5
+ ## Story
6
+
7
+ Most of the code here was extracted from [Mida](https://github.com/LawrenceWoodman/mida) by Lawrence Woodman. This was done in order to have a simpler, more generic Microdata parser without all the vocabulary awareness and other features. This gem is also tested under Ruby 1.9.3 and Ruby 2.0.0, though it could be better tested.
8
+
9
+ ## Installation
10
+
11
+ This library has not been released to RubyGems.org yet; once it is, the intention is for it to be installed as follows.
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'microdata'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install microdata
24
+
25
+ ## Usage
26
+
27
+ The command-line tool `microdata.rb` can be run as follows:
28
+
29
+ ```
30
+ microdata.rb http://d.lib.ncsu.edu/collections/catalog/mc00383-001-ff0006-001-001_0038
31
+ ```
32
+
33
+ Output is in pretty JSON format.
34
+
35
+ ## Contributing
36
+
37
+ 1. Fork it
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
40
+ 4. Push to the branch (`git push origin my-new-feature`)
41
+ 5. Create new Pull Request
@@ -0,0 +1,10 @@
1
# Load the task definitions shipped with Bundler and Rake's test-task helper.
require 'bundler/gem_tasks'
require 'rake/testtask'

# Define the :test task with test/ on the load path, so test files can
# `require 'test_helper'`.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running plain `rake` runs the test suite.
desc 'Run tests'
task default: :test
@@ -0,0 +1,8 @@
1
#! /usr/bin/env ruby

# microdata.rb
# Extract HTML5 Microdata and output JSON
#
# Usage: microdata.rb LOCATION
#
# LOCATION is handed unchanged to Microdata.to_json, and the resulting
# pretty-printed JSON is written to standard output.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
require 'microdata'

location = ARGV[0]

# Without an argument Microdata.to_json would be called with nil and fail
# with an unhelpful TypeError; print a usage line and exit non-zero instead.
abort "Usage: #{File.basename($PROGRAM_NAME)} LOCATION" if location.nil? || location.empty?

puts Microdata.to_json(location)
@@ -0,0 +1,42 @@
1
+ require "microdata/version"
2
+ require "nokogiri"
3
+ require "microdata/item"
4
+ require "microdata/document"
5
+ require "microdata/itemprop"
6
+ require 'open-uri'
7
+ require 'json'
8
+ require 'uri'
9
+
10
# Entry points for extracting HTML5 Microdata from a URL or a local file.
module Microdata

  # Currently unused; kept commented out as in the original source.
  # PROPERTY_VALUES = {
  #   meta:     'content',
  #   audio:    'src',
  #   embed:    'src',
  #   iframe:   'src',
  #   img:      'src',
  #   source:   'src',
  #   video:    'src',
  #   a:        'href',
  #   area:     'href',
  #   link:     'href',
  #   object:   'data',
  #   time:     'datetime'
  # }

  # Read +location+ and return the top-level Microdata items found in it.
  #
  # [location] A URL that open-uri can fetch (e.g. http/https) or the path
  #            of a local HTML file. The same value is passed on as the
  #            page URL used to resolve relative URL properties.
  #
  # Returns the Array of Microdata::Item built by Microdata::Document.
  # Raises whatever File.open / open-uri raise when the location cannot be
  # read (e.g. Errno::ENOENT, OpenURI::HTTPError).
  def self.get_items(location)
    open_location(location) do |io|
      # Document extracts its items inside the constructor, so reading
      # #items avoids a second extraction pass and everything is built
      # before the handle is closed at the end of this block.
      Microdata::Document.new(io, location).items
    end
  end

  # Return the items found at +location+ as a pretty-printed JSON String of
  # the form {"items": [...]}, where each entry is Item#to_hash.
  def self.to_json(location)
    item_hashes = get_items(location).map { |item| item.to_hash }
    JSON.pretty_generate(:items => item_hashes)
  end

  # Yield an IO for +location+ and close it afterwards.
  #
  # Kernel#open is deliberately avoided: it runs strings starting with "|"
  # as shell commands, and since Ruby 3.0 it no longer opens URLs. Remote
  # locations go through the URI object's own #open (provided by open-uri);
  # everything else is treated as a plain file name.
  def self.open_location(location, &block)
    uri = remote_uri(location)
    uri ? uri.open(&block) : File.open(location, &block)
  end
  private_class_method :open_location

  # Return a URI object when +location+ parses as a URI that open-uri knows
  # how to fetch, otherwise nil (including when it is not a valid URI at
  # all, as is the case for many file names).
  def self.remote_uri(location)
    uri = URI.parse(location.to_s)
    uri if uri.is_a?(OpenURI::OpenRead)
  rescue URI::InvalidURIError
    nil
  end
  private_class_method :remote_uri

end
@@ -0,0 +1,22 @@
1
module Microdata
  # An HTML document parsed with Nokogiri, exposing its top-level
  # Microdata items.
  class Document

    # The result of #extract_items, computed once when the document is built.
    attr_reader :items

    # [content]  The HTML, handed as-is to Nokogiri::HTML
    # [page_url] Passed through to every Item that gets created
    def initialize(content, page_url=nil)
      @page_url = page_url
      @doc = Nokogiri::HTML(content)
      @items = extract_items
    end

    # Build an Item for every element that carries an itemscope attribute
    # but no itemprop attribute, i.e. items that are not themselves the
    # property of another item.
    def extract_items
      top_level_nodes = @doc.search('//*[@itemscope and not(@itemprop)]')
      return unless top_level_nodes

      top_level_nodes.map { |node| Item.new(node, @page_url) }
    end

  end
end
@@ -0,0 +1,65 @@
1
module Microdata
  # One Microdata item: the itemtype and itemid of its itemscope element
  # plus the properties collected from the elements beneath it.
  class Item
    # type       - Array of the whitespace-separated itemtype values, or nil
    # properties - Hash of property name => Array of values (Strings or Items)
    # id         - the itemid attribute value, or nil
    attr_reader :type, :properties, :id

    # [top_node] The element carrying the itemscope attribute
    # [page_url] Handed to Itemprop for every property that is parsed
    def initialize(top_node, page_url)
      @top_node = top_node
      @type = extract_itemtype
      @id = extract_itemid
      @properties = {}
      @page_url = page_url
      parse_elements(extract_elements(@top_node))
    end

    # Plain-Hash form of the item with the keys :id (only when present),
    # :type and :properties. Nested Items are converted recursively.
    def to_hash
      converted = {}
      properties.each do |name, values|
        converted[name] = values.map { |entry| entry.is_a?(Item) ? entry.to_hash : entry }
      end

      result = {}
      result[:id] = id if id
      result[:type] = type
      result[:properties] = converted
      result
    end

    private

    # Direct child elements of +node+.
    def extract_elements(node)
      node.search('./*')
    end

    def extract_itemid
      attr = @top_node.attribute('itemid')
      attr.value if attr
    end

    def extract_itemtype
      attr = @top_node.attribute('itemtype')
      attr.value.split(' ') if attr
    end

    def parse_elements(elements)
      elements.each { |child| parse_element(child) }
    end

    # Record +element+ as a property when it has an itemscope or itemprop
    # attribute, then walk its children, unless it has an itemscope
    # attribute, in which case it is not descended into from here.
    def parse_element(element)
      scoped = element.attribute('itemscope')
      prop = element.attribute('itemprop')
      children = extract_elements(element)
      add_itemprop(element) if scoped || prop
      parse_elements(children) unless scoped || !children
    end

    # Add an 'itemprop' to the properties
    def add_itemprop(itemprop)
      Itemprop.parse(itemprop, @page_url).each do |name, value|
        @properties[name] ||= []
        @properties[name] << value
      end
    end

  end
end
@@ -0,0 +1,91 @@
1
module Microdata
  # Class that parses itemprop elements
  class Itemprop

    # Elements whose property value is taken from an attribute instead of
    # their text content, mapped to the name of that attribute.
    NON_TEXTCONTENT_ELEMENTS = {
      'a' => 'href',        'area' => 'href',
      'audio' => 'src',     'embed' => 'src',
      'iframe' => 'src',    'img' => 'src',
      'link' => 'href',     'meta' => 'content',
      'object' => 'data',   'source' => 'src',
      'time' => 'datetime', 'track' => 'src',
      'video' => 'src'
    }.freeze

    # Attributes whose values are URLs and are resolved against the page URL.
    URL_ATTRIBUTES = %w[data href src].freeze

    # A Hash representing the properties.
    # Hash is of the form {'property name' => 'value'}
    attr_reader :properties

    # Create a new Itemprop object
    # [element] The itemprop element to be parsed
    # [page_url] The url of the page, including filename, used to form
    #            absolute urls
    def initialize(element, page_url=nil)
      @element, @page_url = element, page_url
      @properties = extract_properties
    end

    # Parse the element and return a hash representing the properties.
    # Hash is of the form {'property name' => 'value'}
    # [element] The itemprop element to be parsed
    # [page_url] The url of the page, including filename, used to form
    #            absolute urls
    def self.parse(element, page_url=nil)
      new(element, page_url).properties
    end

    private

    # One entry per name listed in the itemprop attribute; each name gets
    # its own extracted value.
    def extract_properties
      extract_property_names.each_with_object({}) do |name, memo|
        memo[name] = extract_property
      end
    end

    # Resolve +url+ against the page URL. The url is returned unchanged
    # when it is already absolute, when there is no page URL, or when no
    # valid absolute URL can be formed (malformed url, malformed or
    # relative page URL).
    def make_absolute_url(url)
      # Parsing happens inside the rescued method body so that a malformed
      # attribute value cannot abort extraction of the whole document.
      return url unless URI.parse(url).relative?
      return url if @page_url.nil?

      URI.parse(@page_url).merge(url).to_s
    rescue URI::Error
      url
    end

    def non_textcontent_element?(element)
      NON_TEXTCONTENT_ELEMENTS.key?(element)
    end

    def url_attribute?(attribute)
      URL_ATTRIBUTES.include?(attribute)
    end

    # Names listed in the itemprop attribute, split on whitespace
    # ([] when the element has no itemprop attribute).
    def extract_property_names
      itemprop_attr = @element.attribute('itemprop')
      itemprop_attr ? itemprop_attr.value.split : []
    end

    # Value of a property that is not itself an item: the mapped attribute
    # for the elements in NON_TEXTCONTENT_ELEMENTS, otherwise the stripped
    # text content.
    def extract_property_value
      tag = @element.name
      return @element.inner_text.strip unless non_textcontent_element?(tag)

      attribute = NON_TEXTCONTENT_ELEMENTS[tag]
      attribute_node = @element.attribute(attribute)
      # Per the Microdata spec the value is the empty string when the
      # element lacks the attribute (e.g. <a itemprop="url"> without href).
      return '' unless attribute_node

      value = attribute_node.value
      url_attribute?(attribute) ? make_absolute_url(value) : value
    end

    # An element with an itemscope attribute is a nested Item; anything
    # else yields a plain String value.
    def extract_property
      if @element.attribute('itemscope')
        Item.new(@element, @page_url)
      else
        extract_property_value
      end
    end

  end
end
@@ -0,0 +1,3 @@
1
module Microdata
  # Version of the gem; microdata.gemspec reads this constant.
  VERSION = '0.0.1'
end
@@ -0,0 +1,25 @@
1
# coding: utf-8
# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'microdata/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name    = "microdata"
  gem.version = Microdata::VERSION
  gem.authors = ["Jason Ronallo"]
  gem.email   = ["jronallo@gmail.com"]

  # Descriptive metadata
  gem.summary     = %q{Ruby library for extracting HTML5 Microdata}
  gem.description = %q{HTML5 Microdata extractor}
  gem.homepage    = ""
  gem.license     = "MIT"

  # Package every file tracked by git; executables and test files are
  # picked out of that list by directory prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependency
  gem.add_dependency "nokogiri"

  # Development-only dependencies
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
end
@@ -0,0 +1,22 @@
1
+ <!doctype html>
2
+ <html>
3
+ <!-- shameless -->
4
+ <head>
5
+ <title>Jason Ronallo</title>
6
+ </head>
7
+
8
+ <body>
9
+ <span itemscope itemtype="http://schema.org/Person"
10
+ itemid="http://ronallo.com#me">
11
+ <a itemprop="url" href="http://twitter.com/ronallo">
12
+ <span itemprop="name">Jason Ronallo</span>
13
+ </a> is the
14
+ <span itemprop="jobTitle">Associate Head of Digital Library Initiatives</span> at
15
+ <span itemprop="affiliation" itemscope itemtype="http://schema.org/Library" itemid="http://lib.ncsu.edu">
16
+ <span itemprop="name">
17
+ <a itemprop="url" href="http://www.lib.ncsu.edu">NCSU Libraries</a>
18
+ </span>
19
+ </span>.
20
+ </span>
21
+ </body>
22
+ </html>
@@ -0,0 +1,2 @@
1
+ require 'test/unit'
2
+ require 'microdata'
@@ -0,0 +1,36 @@
1
+ require 'test_helper'
2
+
3
# Parses the example fixture and checks the extracted top-level item, its
# properties and its nested item.
class TestParse < Test::Unit::TestCase

  # Resolved relative to this file so the suite passes no matter which
  # directory it is started from.
  EXAMPLE_HTML = File.expand_path('../data/example.html', __FILE__)

  def setup
    @items = Microdata.get_items(EXAMPLE_HTML)
  end

  def test_top_item_type
    assert_equal ['http://schema.org/Person'], @items.first.type
  end

  def test_top_item_id
    assert_equal "http://ronallo.com#me", @items.first.id
  end

  def test_top_item_properties
    properties = @items.first.properties
    assert_equal ["Jason Ronallo"], properties['name']
    assert_equal ["http://twitter.com/ronallo"], properties['url']
    assert_equal ["Associate Head of Digital Library Initiatives"], properties['jobTitle']
  end

  def test_nested_item
    item = @items.first.properties['affiliation'][0]
    assert_equal ['http://schema.org/Library'], item.type
    assert_equal "http://lib.ncsu.edu", item.id
  end

  def test_nested_item_properties
    properties = @items.first.properties['affiliation'][0].properties
    assert_equal ['NCSU Libraries'], properties['name']
    assert_equal ['http://www.lib.ncsu.edu'], properties['url']
  end

end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: microdata
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason Ronallo
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: bundler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '1.3'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '1.3'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rake
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: HTML5 Microdata extractor
63
+ email:
64
+ - jronallo@gmail.com
65
+ executables:
66
+ - microdata.rb
67
+ extensions: []
68
+ extra_rdoc_files: []
69
+ files:
70
+ - .gitignore
71
+ - Gemfile
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/microdata.rb
76
+ - lib/microdata.rb
77
+ - lib/microdata/document.rb
78
+ - lib/microdata/item.rb
79
+ - lib/microdata/itemprop.rb
80
+ - lib/microdata/version.rb
81
+ - microdata.gemspec
82
+ - test/data/example.html
83
+ - test/test_helper.rb
84
+ - test/test_parse.rb
85
+ homepage: ''
86
+ licenses:
87
+ - MIT
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 1.8.25
107
+ signing_key:
108
+ specification_version: 3
109
+ summary: Ruby library for extracting HTML5 Microdata
110
+ test_files:
111
+ - test/data/example.html
112
+ - test/test_helper.rb
113
+ - test/test_parse.rb
114
+ has_rdoc: