hmachine 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -2
- data/Gemfile +6 -4
- data/Gemfile.lock +51 -0
- data/README.md +123 -9
- data/Rakefile +12 -3
- data/bin/hmachine +99 -0
- data/hmachine.gemspec +132 -0
- data/lib/hmachine.rb +121 -12
- data/lib/hmachine/microformat.rb +39 -20
- data/lib/hmachine/microformat/adr.rb +22 -0
- data/lib/hmachine/microformat/geo.rb +48 -0
- data/lib/hmachine/microformat/hcard.rb +169 -11
- data/lib/hmachine/microformat/rellicense.rb +20 -0
- data/lib/hmachine/microformat/reltag.rb +38 -0
- data/lib/hmachine/microformat/votelinks.rb +42 -0
- data/lib/hmachine/microformat/xfn.rb +54 -0
- data/lib/hmachine/microformat/xmdp.rb +14 -0
- data/lib/hmachine/microformat/xoxo.rb +69 -0
- data/lib/hmachine/pattern.rb +26 -0
- data/lib/hmachine/pattern/abbr.rb +21 -0
- data/lib/hmachine/pattern/datetime.rb +75 -0
- data/lib/hmachine/pattern/typevalue.rb +32 -0
- data/lib/hmachine/pattern/url.rb +32 -0
- data/lib/hmachine/pattern/valueclass.rb +51 -0
- data/lib/hmachine/posh.rb +3 -0
- data/lib/hmachine/posh/anchor.rb +40 -0
- data/lib/hmachine/posh/base.rb +204 -0
- data/lib/hmachine/posh/definition_list.rb +41 -0
- data/test/fixtures/huffduffer.html +466 -0
- data/test/fixtures/likeorhate.html +48 -0
- data/test/fixtures/rel_license.html +4 -0
- data/test/fixtures/test-fixture/hcard/hcard1.html +147 -0
- data/test/fixtures/test-fixture/hcard/hcard11.html +123 -0
- data/test/fixtures/test-fixture/hcard/hcard12.html +178 -0
- data/test/fixtures/test-fixture/hcard/hcard17.html +165 -0
- data/test/fixtures/test-fixture/hcard/hcard2.html +264 -0
- data/test/fixtures/test-fixture/hcard/hcard3.html +144 -0
- data/test/fixtures/test-fixture/hcard/hcard4.html +117 -0
- data/test/fixtures/test-fixture/hcard/hcard5.html +119 -0
- data/test/fixtures/test-fixture/hcard/hcard6.html +188 -0
- data/test/fixtures/test-fixture/hcard/hcard7.html +188 -0
- data/test/fixtures/test-fixture/hcard/hcard8.html +130 -0
- data/test/fixtures/test-fixture/hcard/hcard9.html +111 -0
- data/test/fixtures/test-fixture/hcard/hcard99.html +215 -0
- data/test/fixtures/test-fixture/value-class-date-time/value-dt-test-YYYY-MM-DD--HH-MM.html +9 -0
- data/test/fixtures/test-fixture/value-class-date-time/value-dt-test-abbr-YYYY-MM-DD--HH-MM.html +4 -0
- data/test/fixtures/xfn.html +198 -0
- data/test/fixtures/xmdp.html +32 -0
- data/test/fixtures/xoxo.html +51 -0
- data/test/hmachine_test.rb +122 -6
- data/test/microformat/adr_test.rb +47 -0
- data/test/microformat/geo_test.rb +66 -0
- data/test/microformat/hcard_test.rb +487 -20
- data/test/microformat/rellicense_test.rb +36 -0
- data/test/microformat/reltag_test.rb +61 -0
- data/test/microformat/votelinks_test.rb +44 -0
- data/test/microformat/xfn_test.rb +28 -0
- data/test/microformat/xmdp_test.rb +16 -0
- data/test/microformat/xoxo_test.rb +51 -0
- data/test/microformat_test.rb +12 -34
- data/test/pattern/date_time_test.rb +55 -0
- data/test/pattern/value_class_test.rb +33 -0
- data/test/pattern_test.rb +132 -0
- data/test/posh/anchor_test.rb +41 -0
- data/test/posh/base_test.rb +150 -0
- data/test/posh/definition_list_test.rb +38 -0
- data/test/test_helper.rb +24 -6
- metadata +93 -15
- data/lib/hmachine/microformat/base.rb +0 -17
@@ -0,0 +1,20 @@
|
|
1
|
+
module HMachine
  module Microformat
    # rel-license microformat: an anchor or link element whose +rel+
    # attribute carries the +license+ token.
    #
    # +selector+, +validate+ and +url+ are supplied by POSH::Anchor or its
    # ancestors (defined outside this file).
    class RelLicense < POSH::Anchor
      FRIENDLY_NAME = "rel-license"
      WIKI_URL = 'http://microformats.org/wiki/rel-license'
      XMDP = 'http://microformats.org/profile/rel-license'

      selector 'a[rel~="license"], link[rel~="license"]'

      # A node qualifies when it has a rel attribute and one of its
      # whitespace-separated tokens is exactly "license".
      validate do |anchor|
        rel = anchor['rel']
        rel && rel.split.include?('license')
      end

      # The license is simply the URL the anchor points at.
      alias_method :license, :url

      # String form of a rel-license is its license URL.
      def to_s
        license
      end

    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module HMachine
  module Microformat
    # rel-tag microformat: an anchor whose +rel+ attribute carries the
    # +tag+ token. The tag name is the last path segment of the href.
    #
    # +selector+, +validate+ and +node+ are supplied by POSH::Anchor or its
    # ancestors (defined outside this file).
    class RelTag < POSH::Anchor
      FRIENDLY_NAME = "rel-tag"
      WIKI_URL = "http://microformats.org/wiki/rel-tag"
      XMDP = 'http://microformats.org/profile/rel-tag'

      selector 'a[rel~="tag"]'

      validate {|a| a['rel'] && a['rel'].split.include?('tag') }

      # Single-entry hash mapping the tag name (last "/"-separated segment
      # of the href) to the full href. Memoized.
      def tag
        @tag ||= { node['href'].split('/').last => node['href'] }
      end

      # The tag name as a String.
      #
      # Array#to_s only joined elements on Ruby 1.8; on 1.9+ it returns the
      # inspect form (e.g. '["ruby"]'). Taking the single key explicitly
      # gives the bare name on every Ruby version.
      def name
        tag.keys.first.to_s
      end

      def to_s
        name
      end

      # The tag URL as a String (see #name for why +first+ is used rather
      # than Array#to_s).
      def url
        tag.values.first.to_s
      end

      def to_h
        tag
      end

      def inspect
        "<#{self.class}:#{hash}: '#{tag}'>"
      end

    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module HMachine
  module Microformat
    # VoteLinks microformat: an anchor whose +rev+ attribute carries one of
    # the vote-for / vote-against / vote-abstain tokens.
    #
    # +selector+, +validate+, +node+, +url+ and +title+ are supplied by
    # POSH::Anchor or its ancestors (defined outside this file).
    class VoteLinks < POSH::Anchor
      FRIENDLY_NAME = "VoteLinks"
      WIKI_URL = 'http://microformats.org/wiki/vote-links'
      XMDP = 'http://microformats.org/profile/vote-links'

      # The rev tokens that make an anchor a vote link.
      VOTE_TYPES = %w(vote-for vote-against vote-abstain).freeze

      selector 'a[rev~="vote-for"], a[rev~="vote-against"], a[rev~="vote-abstain"]'

      # Valid when at least one rev token is a recognised vote type.
      #
      # The previous implementation used +reject+ and therefore accepted any
      # rev value that did not contain all three vote types at once; it also
      # used +return+ inside the block, which is not safe in a proc.
      # A missing rev attribute yields false.
      validate do |a|
        revs = a['rev'].to_s.split
        VOTE_TYPES.any? { |vote| revs.include?(vote) }
      end

      # Single-entry hash: vote type => [url, title] with nils removed.
      # Memoized.
      def vote
        @vote ||= { type => [url, title].compact }
      end

      # The first rev token that starts with "vote-", or nil if there is none.
      def type
        node['rev'].split(' ').find { |vote| vote.index('vote-') == 0 }
      end

      def for?
        type == 'vote-for'
      end

      def against?
        type == 'vote-against'
      end

      def abstain?
        type == 'vote-abstain'
      end

    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module HMachine
  module Microformat
    # XFN (XHTML Friends Network): anchors whose +rel+ attribute carries one
    # or more relationship tokens.
    #
    # +search+, +validate+, +extract_from+ and +rel+ are supplied by
    # POSH::Anchor or its ancestors (defined outside this file); +rel+ is
    # used here as an Array of tokens.
    class XFN < POSH::Anchor
      FRIENDLY_NAME = "XFN"
      WIKI_URL = 'http://microformats.org/wiki/XFN'
      XMDP = 'http://gmpg.org/xfn/11'

      # Relationship tokens grouped by category.
      @@friendship = %w( contact acquaintance friend)
      @@physical = %w( met )
      @@professional = %w( co-worker colleague )
      @@geographical = %w( co-resident neighbor )
      @@family = %w( child parent sibling spouse kin )
      @@romantic = %w( muse crush date sweetheart )
      @@identity = %w( me )

      @@relationships = @@friendship + @@physical + @@professional + @@geographical + @@family + @@romantic + @@identity

      # Find every anchor carrying any known relationship token.
      search do |doc|
        doc.css(@@relationships.collect { |rel| "a[rel~='#{rel}']" }.join(', '))
      end

      # Valid when at least one rel token is a known relationship.
      #
      # The previous implementation used +reject+, which made the check true
      # for practically any rel value, and used +return+ inside the block.
      # A missing rel attribute yields false.
      validate do |a|
        !(@@relationships & a['rel'].to_s.split).empty?
      end

      # Performant way to parse identity relationships.
      #
      # Returns nil when the document has no rel="me" anchors, the single
      # extracted value when there is exactly one, otherwise an Array.
      def self.parse_me(document)
        nodes = document.css("a[rel~='me']")
        return if nodes.empty?

        contents = nodes.collect { |node| extract_from(node) }
        (contents.length == 1) ? contents.first : contents
      end

      # Define one predicate per category (friendship?, family?, ...): true
      # when the anchor's rel tokens intersect that category's token list.
      # A string eval is used so the class variables resolve in this class.
      %w(friendship physical professional geographical family romantic identity).each do |type|
        class_eval %Q{
          def #{type}?
            !(@@#{type} & rel).empty?
          end
        }
      end
      alias me? identity?
      alias met? physical?

      def inspect
        "<#{self.class}:#{hash}: '#{rel.join(', ')}'>"
      end

    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module HMachine
  module Microformat
    # XMDP (XHTML Meta Data Profiles): a definition list marked up as
    # <dl class="profile">.
    #
    # +search+ and +validate+ are supplied by POSH::DefinitionList or its
    # ancestors (defined outside this file).
    class XMDP < POSH::DefinitionList
      FRIENDLY_NAME = "XMDP"
      WIKI_URL = 'http://microformats.org/wiki/XMDP'
      XMDP = 'http://gmpg.org/xmdp/1'

      # Locate every profile definition list in the document.
      search do |document|
        document.css('dl.profile')
      end

      # A node is a profile only if it is itself a dl.profile element.
      validate do |list|
        list.matches?('dl.profile')
      end

    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module HMachine
  module Microformat
    # XOXO outline microformat: an ordered or unordered list with class
    # +xoxo+ or +blogroll+, converted into nested Ruby arrays and hashes.
    #
    # +selector+ and +node+ are supplied by POSH::Base (defined outside
    # this file).
    class XOXO < POSH::Base
      FRIENDLY_NAME = "XOXO"
      WIKI_URL = 'http://microformats.org/wiki/xoxo'
      XMDP = 'http://microformats.org/profile/xoxo'

      selector 'ol.xoxo, ul.xoxo, ol.blogroll, ul.blogroll'

      # Recursively convert +node+'s children into an outline Array.
      #
      # * li with no element children - its text is merged into the
      #   current level
      # * li with element children    - becomes a nested Array
      # * ol / ul                     - becomes a nested Array
      # * dl                          - becomes a Hash (term => definition)
      # * a                           - becomes a Hash with :url, :text and
      #                                 optional :rel, :type, :title
      # * any other element           - its stripped text content
      # * non-blank text node         - its stripped text
      #
      # Returns the Array for this level.
      def self.build_outline(node)
        tree = []
        node.children.each do |child|
          if child.elem?
            case child.node_name
            when 'li'
              items = build_outline(child)
              if child.children.any? { |li| li.elem? }
                tree << items
              else
                # Merge rather than nest. concat is used instead of the
                # former Array#| so that repeated items are not dropped.
                tree.concat(items)
              end
            when 'ol', 'ul'
              tree << build_outline(child)
            when 'dl'
              tree << build_definition_list(child)
            when 'a'
              tree << build_link(child)
            else
              tree << child.content.strip
            end
          elsif child.text? && !child.content.strip.empty?
            tree << child.content.strip
          end
        end
        tree
      end

      # Convert a dl element into a Hash of stripped dt text => dd value.
      # A dd containing elements is outlined recursively; otherwise its text
      # content is used. A dt not immediately followed by a dd is skipped
      # (previously this raised NoMethodError on nil).
      def self.build_definition_list(list)
        definition_list = {}
        list.css('dt').each do |key|
          definition = key.next_element
          next unless definition && definition.node_name.eql?('dd')

          nested = definition.children.any? { |dd| dd.elem? }
          definition_list[key.content.strip] = nested ? build_outline(definition) : definition.content.to_s
        end
        definition_list
      end

      # Convert an anchor element into a Hash describing the link.
      def self.build_link(anchor)
        link = { :url => anchor['href'], :text => anchor.content.strip }
        link[:rel] = anchor['rel'].split(' ') if anchor['rel']
        link[:type] = anchor['type'] if anchor['type']
        link[:title] = anchor['title'] if anchor['title']
        link
      end

      private_class_method :build_definition_list, :build_link

      # The outline for this list, built once and memoized.
      def outline
        @outline ||= self.class.build_outline(node)
      end

      def to_a
        outline
      end

      def [](index)
        outline[index]
      end

      # True when the list's class attribute includes "blogroll".
      def blogroll?
        node['class'].split.include?('blogroll')
      end

    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'hmachine/pattern/url'
|
2
|
+
require 'hmachine/pattern/datetime'
|
3
|
+
require 'hmachine/pattern/abbr'
|
4
|
+
require 'hmachine/pattern/valueclass'
|
5
|
+
require 'hmachine/pattern/typevalue'
|
6
|
+
|
7
|
+
module HMachine
  module Pattern

    # Look up the design-pattern module registered under +name+.
    #
    # +name+ is passed through HMachine.normalize (defined outside this
    # file) and the result is compared against the known pattern symbols.
    #
    # Returns the matching HMachine::Pattern module.
    # Raises RuntimeError when +name+ is not a known pattern.
    def self.map(name)
      key = HMachine.normalize(name)

      return HMachine::Pattern::ValueClass if [:value_class, :valueclass].include?(key)
      return HMachine::Pattern::Abbr       if key == :abbr
      return HMachine::Pattern::URL        if [:uri, :url].include?(key)
      return HMachine::Pattern::TypeValue  if key == :typevalue

      raise "#{name} is not a recognized markup design pattern."
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module HMachine
  module Pattern
    # abbr design pattern: the machine-readable value lives in the +title+
    # attribute of an <abbr> element.
    #
    # +search+, +validate+, +extract+ and +valid?+ come from the HMachine
    # module this module extends (defined outside this file).
    module Abbr
      extend HMachine
      WIKI_URL = 'http://microformats.org/wiki/abbr-design-pattern'

      # Every abbr element that carries a title attribute.
      search do |element|
        element.css('abbr[title]')
      end

      # Truthy only for an <abbr> node that has a title attribute.
      validate do |abbr|
        abbr.node_name.eql?('abbr') && abbr['title']
      end

      # For a valid abbr, use the title: parsed by the DateTime pattern when
      # it is a valid datetime, otherwise the raw title string.
      # For any other node, fall back to its stripped text content.
      extract do |node|
        if valid?(node)
          title = node['title']
          if DateTime.valid?(title)
            DateTime.extract_from(title)
          else
            title
          end
        else
          node.content.strip
        end
      end

    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'time'
|
2
|
+
|
3
|
+
module HMachine
  module Pattern
    # Date/time normalisation helpers built on Date._parse and Time.parse.
    #
    # +validate+ and +extract+ come from the HMachine module this module
    # extends (defined outside this file).
    module DateTime
      extend HMachine

      # Is this string a simple date?
      def self.date?(string)
        !date(string).nil?
      end

      # Is this string a simple time?
      def self.time?(string)
        !time(string).nil?
      end

      # Normalize ISO8601 Dates
      #
      # Returns a zero-padded "YYYY-MM-DD" String, or nil when the input has
      # no year, has neither a month nor an ordinal day, or the ordinal day
      # is out of range. A missing day of month defaults to 1.
      def self.date(datestring)
        parsed = Date._parse(datestring)
        year = parsed[:year]
        return unless year

        if parsed[:yday]
          begin
            ordinal = Date.ordinal(year, parsed[:yday])
          rescue ArgumentError
            # Out-of-range ordinal day: not a usable date.
            return
          end
          month = ordinal.month
          day = ordinal.day
        elsif parsed[:mon]
          month = parsed[:mon]
          day = parsed[:mday] || 1
        else
          return
        end

        '%d-%02d-%02d' % [year, month, day]
      end

      # Normalize ISO8601 Times
      #
      # Returns a "THH:MM:SS<zone>" String, or nil when the input has no
      # hour. Missing minutes and seconds default to 0. When the input has
      # no zone, the local UTC offset is appended as "+HH:MM"/"-HH:MM".
      def self.time(timestring)
        parsed = Date._parse(timestring)
        hour = parsed[:hour]
        return unless hour

        zone = parsed[:zone] || local_zone
        'T%02d:%02d:%02d%s' % [hour, parsed[:min] || 0, parsed[:sec] || 0, zone]
      end

      # The local UTC offset formatted as "+HH:MM" or "-HH:MM".
      # Time#utc_offset is an Integer number of seconds, so it must be
      # converted before being used as a zone designator.
      def self.local_zone
        offset = Time.now.utc_offset
        sign = offset < 0 ? '-' : '+'
        hours, minutes = (offset.abs / 60).divmod(60)
        '%s%02d:%02d' % [sign, hours, minutes]
      end
      private_class_method :local_zone

      # Build a normalized iso8601 datetime string
      #
      # Concatenates the date and time parts; either may be absent, so the
      # result can be date-only, time-only, or empty.
      def self.iso8601(datetime)
        "#{date(datetime)}#{time(datetime)}"
      end

      # True when the normalised string can be parsed by Time.parse, false
      # when Time.parse rejects it, nil when nothing could be normalised.
      validate do |datetime|
        stamp = iso8601(datetime)
        unless stamp.empty?
          begin
            Time.parse(stamp).respond_to?(:iso8601)
          rescue ArgumentError
            # An out-of-bounds error means a false positive
            false
          end
        end
      end

      # Parse the normalised string into a Time.
      extract do |datetime|
        Time.parse(iso8601(datetime))
      end

    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module HMachine
  module Pattern
    # type/value pattern: a property whose markup may contain one or more
    # +.type+ sub-elements qualifying the value.
    #
    # +search+, +extract+, +found_in?+ and +find_in+ come from the HMachine
    # module this module extends (defined outside this file).
    module TypeValue
      extend HMachine

      # Outermost .type elements only: a .type whose parent is itself a
      # .type is skipped.
      search do |doc|
        doc.css('.type').reject {|type| type.parent.matches?('.type') }
      end

      # With .type elements present, returns { :type => ..., :value => ... },
      # where :type is a single normalised value or an Array of them.
      # Without any, returns just the value.
      extract do |node|
        if found_in?(node)
          # Each type element is unlinked (removed from the document) while
          # it is read, and this happens before get_value(node) runs below.
          # NOTE(review): presumably so the type text does not end up in the
          # value - confirm against the tests before reordering.
          types = find_in(node).collect {|type| HMachine.normalize Pattern::ValueClass.extract_from(type.unlink) }
          types = (types.length == 1) ? types.first : types
          {:type => types, :value => get_value(node)}
        else
          get_value(node)
        end
      end

      # Extract the value of +node+: via the URL pattern when the node is a
      # valid URL carrier, otherwise via the value-class pattern.
      def self.get_value(node)
        if Pattern::URL.valid?(node)
          Pattern::URL.extract_from(node)
        else
          Pattern::ValueClass.extract_from(node)
        end
      end

    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module HMachine
  module Pattern
    # URL pattern: pulls a URL out of the attribute that carries it on
    # a / area (href), img (src) and object (data) elements.
    #
    # +validate+, +extract+ and +valid?+ come from the HMachine module this
    # module extends (defined outside this file).
    module URL
      extend HMachine

      validate {|node| node.matches?("a[href] ,area[href], img[src], object[data]") }

      # Returns the normalised URL for a valid node, otherwise nil.
      extract do |url|
        if valid?(url)
          value = case url.node_name
                  when 'a', 'area' then url['href']
                  when 'img'       then url['src']
                  when 'object'    then url['data']
                  end
          normalize(value) if value
        end
      end

      # Normalise +url+ with URI#normalize.
      #
      # For mailto: URIs only the address part is returned: the scheme and
      # any "?..." header section are removed. A bare "mailto:" yields an
      # empty String (previously this raised NoMethodError on nil).
      #
      # Exceptions raised by URI.parse for malformed input are not rescued.
      def self.normalize(url)
        uri = URI.parse(url).normalize.to_s
        return uri unless uri.index('mailto:').eql?(0)

        uri.split('mailto:')[1].to_s.split('?').first.to_s
      end

    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module HMachine
  module Pattern
    # value-class pattern: a property's value is assembled from descendant
    # elements marked +.value+ or +.value-title[title]+.
    #
    # +search+, +validate+, +extract+, +found_in?+ and +find_in+ come from
    # the HMachine module this module extends (defined outside this file).
    module ValueClass
      extend HMachine
      WIKI_URL = 'http://microformats.org/wiki/value-class-pattern'

      # Outermost value elements only: an element whose parent is itself a
      # .value is skipped.
      search do |element|
        element.css('.value, .value-title[title]').reject {|val| val.parent.matches?('.value') }
      end

      validate {|value| value.matches?('.value, .value-title[title]') }

      # * With value elements: each piece is normalised through
      #   DateTime.iso8601 when it is a valid datetime, the pieces are
      #   joined, and the joined string is parsed as a datetime if valid.
      # * Otherwise a valid abbr node goes through the Abbr pattern.
      # * Otherwise the node's own text is used.
      extract do |node|
        if found_in?(node)
          pieces = get_values(node).collect { |val| DateTime.valid?(val) ? DateTime.iso8601(val) : val }
          normalize_values = pieces.join
          DateTime.valid?(normalize_values) ? DateTime.extract_from(normalize_values) : normalize_values
        elsif Abbr.valid?(node)
          Abbr.extract_from(node)
        else
          get_text(node)
        end
      end

      # Collect the stripped value of every value element under +node+:
      # img/area alt or object data when present, then the title of an abbr
      # or .value-title element, otherwise the text content.
      #
      # The .value-title check requires a title attribute, matching the
      # search and validate selectors; a title-less .value-title now falls
      # back to its text instead of raising on nil.
      def self.get_values(node)
        find_in(node).collect do |val|
          embedded = embedded_text(val)
          if embedded
            embedded
          elsif Abbr.valid?(val) || val.matches?('.value-title[title]')
            val['title'].strip
          else
            val.content.strip
          end
        end
      end

      # Stripped text for a single node: img/area alt or object data when
      # present, otherwise the text content.
      def self.get_text(node)
        embedded_text(node) || node.content.strip
      end

      # Stripped alt (img, area) or data (object) attribute, or nil when the
      # node is another element or lacks that attribute.
      def self.embedded_text(node)
        case node.node_name
        when 'img', 'area' then node['alt'] && node['alt'].strip
        when 'object'      then node['data'] && node['data'].strip
        end
      end
      private_class_method :embedded_text

    end
  end
end
|