uformats 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +8 -0
- data/COPYING +4 -0
- data/README +36 -0
- data/lib/uformats.rb +158 -0
- data/lib/uformats/basic.rb +268 -0
- data/lib/uformats/hcalendar.rb +23 -0
- data/lib/uformats/hcard.rb +100 -0
- data/lib/uformats/hreview.rb +182 -0
- data/lib/uformats/pluralizer.rb +32 -0
- data/test/all.rb +4 -0
- data/test/basic_test.rb +441 -0
- data/test/hcalendar_test.rb +157 -0
- data/test/hcard_test.rb +438 -0
- data/test/hreview_test.rb +500 -0
- data/test/pluralizer_test.rb +24 -0
- data/test/test_helper.rb +13 -0
- data/test/uformats_test.rb +235 -0
- metadata +70 -0
data/CHANGES
ADDED
data/COPYING
ADDED
data/README
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
uformats
|
2
|
+
|
3
|
+
ABOUT
|
4
|
+
|
5
|
+
A library for parsing various microformats under Ruby:
|
6
|
+
|
7
|
+
* hReview (0.3)
|
8
|
+
* hCard
|
9
|
+
* hCalendar (vevent only; limited support)
|
10
|
+
* rel-tag
|
11
|
+
* rel-license
|
12
|
+
* include-pattern
|
13
|
+
|
14
|
+
PREREQUISITES
|
15
|
+
|
16
|
+
htmlentities >= 3.0.1 (http://htmlentities.rubyforge.org/)
|
17
|
+
|
18
|
+
If you want to try the sample code, you will also need:
|
19
|
+
|
20
|
+
tidy (http://rubyforge.org/projects/tidy)
|
21
|
+
|
22
|
+
You'll also need to install the tidy library for your operating system. For OS
|
23
|
+
X, it's available via Darwin Ports (port install tidy). For Windows, get it
|
24
|
+
from http://dev.int64.org/tidy.html and rename 'tidy.dll' to 'libtidy.dll'.
|
25
|
+
For Linux etc. it should be available via the repositories.
|
26
|
+
|
27
|
+
INSTALLATION
|
28
|
+
|
29
|
+
To install (requires root/admin privileges):
|
30
|
+
|
31
|
+
# ruby setup.rb
|
32
|
+
|
33
|
+
To test:
|
34
|
+
|
35
|
+
$ ruby setup.rb test
|
36
|
+
|
data/lib/uformats.rb
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'uformats/pluralizer'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
module Microformats
|
6
|
+
|
7
|
+
#
# Provides method access to an internal Hash.
#
# Almost every inherited public instance method is undefined below, so calls
# to unknown names end up in method_missing, which hands the receiver and the
# method name to Pluralizer.singularized_lookup.
#
class MethodHash

  # Keep only the double-underscore methods (e.g. __send__, __id__).
  instance_methods.each { |m| undef_method m unless m =~ /^__/ }

  #
  # hash:: initial contents; duplicated so the caller's Hash is not shared.
  #
  def initialize(hash={})
    @hash = hash.dup
  end

  # Store v under key k.
  def []=(k,v)
    @hash[k] = v
  end

  # Fetch the value stored under key k (nil if absent).
  def [](k)
    return @hash[k]
  end

  # Any other message is resolved by Pluralizer; extra arguments are ignored.
  def method_missing(sym, *args)
    return Pluralizer.singularized_lookup(self, sym)
  end

  #
  # Iterate over the stored pairs.
  #
  # A block declaring exactly two parameters receives key and value as two
  # arguments; every other block receives a single [key, value] pair.
  # Without a block, the underlying Hash enumerator is returned.
  #
  def each(&blk)
    return @hash.each unless blk
    wants_two = (blk.arity == 2)
    @hash.each do |k, v|
      if wants_two
        yield k, v
      else
        yield [k, v]
      end
    end
  end
end # MethodHash
|
43
|
+
|
44
|
+
#
# Expand all include-pattern objects in a document.
#
# Every element with class "include" whose 'data' attribute starts with '#'
# is replaced by a deep clone of the element carrying that id; the clone's
# own 'id' attribute is removed. Elements whose target cannot be found are
# left untouched. The document is modified in place and returned.
#
def expand_include_patterns!(rexml_source)
  each_element_by_class(rexml_source, :include) do |target|
    data = target.attribute('data')
    next unless data && data.value =~ /^#/
    source_id = data.value.sub(/^#/, '')

    # Compare ids in Ruby rather than interpolating the (document-supplied)
    # id into an XPath string, where a quote character would break the query.
    source = nil
    rexml_source.each_element('//*[@id]') do |candidate|
      if candidate.attribute('id').value == source_id
        source = candidate
        break
      end
    end
    next unless source

    replacement = source.deep_clone
    replacement.attributes.delete('id')
    target.replace_with(replacement)
  end
  return rexml_source
end
|
61
|
+
|
62
|
+
#
# Yield each element matching sym at or below the given node.
#
# sym is converted with Microformats.sym_to_css_class. The XPath contains()
# test is only a coarse pre-filter; an element is yielded only when the
# class name appears as a complete whitespace-separated token of its class
# attribute, so "fn" does not match "fn-extra".
#
def each_element_by_class(rexml_source, sym)
  css_class = Microformats.sym_to_css_class(sym)
  rexml_source.each_element("descendant-or-self::*[contains(@class,'#{css_class}')]") do |element|
    if element.attribute('class').value.split.include?(css_class)
      yield element
    end
  end
end
|
73
|
+
|
74
|
+
#
# Returns the first element matching sym at or below the given node, or nil
# if none exists. Matching is delegated to each_element_by_class.
#
def first_element_by_class(rexml_source, sym)
  # Returning from inside the block stops the search at the first hit.
  each_element_by_class(rexml_source, sym) do |element|
    return element
  end
  return nil
end
|
83
|
+
|
84
|
+
#
# Return a list of the rel-tag values found above the given node.
#
# The XPath used selects <a rel="tag"> elements that are direct children of
# any ancestor of rexml_element. Each one is converted with rel_tag; links
# for which rel_tag returns nil are skipped. Returns nil (not an empty
# list) when no tag is found.
#
def rel_tags_above_element(rexml_element)
  tags = []
  rexml_element.each_element('ancestor::*/a[@rel="tag"]') do |tagged_element|
    tag = rel_tag(tagged_element)
    tags << tag if tag
  end
  return tags.empty? ? nil : tags
end
|
96
|
+
|
97
|
+
#
# Return a list of the rel-tag values found at or below the given node.
#
# Every <a rel="tag"> selected by the descendant-or-self XPath is converted
# with rel_tag; links for which rel_tag returns nil are skipped. Returns nil
# (not an empty list) when no tag is found.
#
def rel_tags_below_element(rexml_element)
  tags = []
  rexml_element.each_element('descendant-or-self::a[@rel="tag"]') do |tagged_element|
    tag = rel_tag(tagged_element)
    tags << tag if tag
  end
  return tags.empty? ? nil : tags
end
|
108
|
+
|
109
|
+
#
# Extract the tag name from a rel-tag link: the last path segment of its
# href, with any query string or fragment removed and URL-decoded with
# CGI.unescape. Returns nil when the element has no href attribute.
#
def rel_tag(rexml_element)
  href_attribute = rexml_element.attribute('href')
  return nil unless href_attribute

  # Work on copies (sub, not gsub!) so the string held by the attribute is
  # never modified. The query/fragment is removed first so that a '/' inside
  # it cannot be mistaken for a path separator.
  path = href_attribute.value.sub(/[?#].*\z/m, '')
  return CGI.unescape(path.sub(%r!\A.*/!m, ''))
end
|
118
|
+
|
119
|
+
#
# Return a list of the href values of all rel-license links found at or
# below the given element. Links without an href are skipped. Returns nil
# (not an empty list) when none is found.
#
def rel_licenses(rexml_element)
  licenses = []
  rexml_element.each_element('descendant-or-self::a[@rel="license"]') do |license_element|
    href = license_element.attribute('href')
    licenses << href.value if href
  end
  return licenses.empty? ? nil : licenses
end
|
131
|
+
|
132
|
+
#
# Convert a Ruby symbol to a CSS class, e.g. :foo_bar => "foo-bar"
#
def sym_to_css_class(sym)
  # tr is the direct tool for a one-character-for-one-character swap.
  return sym.to_s.tr('_', '-')
end
|
138
|
+
|
139
|
+
#
# Convert a CSS class to a Ruby symbol, e.g. "foo-bar" => :foo_bar
#
def css_class_to_sym(css_class)
  # tr is the direct tool for a one-character-for-one-character swap.
  return css_class.tr('-', '_').to_sym
end
|
145
|
+
|
146
|
+
#
# Returns a list of symbols corresponding to an element's classes, or an
# empty list when the element has no class attribute.
#
def classes_for_element(rexml_element)
  if (css_class = rexml_element.attribute('class'))
    # split without arguments divides on runs of whitespace, so repeated
    # spaces, tabs or newlines between class names never yield empty tokens.
    return css_class.value.split.map{ |e| css_class_to_sym(e) }
  else
    return []
  end
end
|
156
|
+
|
157
|
+
extend self
|
158
|
+
end
|
@@ -0,0 +1,268 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'cgi'
|
3
|
+
require 'uformats'
|
4
|
+
require 'uformats/pluralizer'
|
5
|
+
require 'htmlentities'
|
6
|
+
require 'time'
|
7
|
+
require 'uri'
|
8
|
+
|
9
|
+
module Microformats
|
10
|
+
class BasicNestedFormat
|
11
|
+
|
12
|
+
alias_method :__class__, :class
|
13
|
+
|
14
|
+
class << self
|
15
|
+
|
16
|
+
#
# Define or return a hash describing the structure of the derived microformat,
# using symbols, hashes, and arrays.
#
# E.g.
#
#   structure({
#     :foo => :integer,           # singular integral value
#     :bar => [:string],          # singular or plural string value
#     :baz => {                   # substructure below 'baz'
#       :created_on => :datetime  # 'created-on' is interpreted as a date/time value
#     }
#   })
#
# When a structure is given, a reader method is defined on the class for
# every top-level key, plus one for the Pluralizer.pluralize form of each key
# whose value is an Array. Each reader looks its own name up in the
# instance's @tree via Pluralizer.singularized_lookup.
#
# Called without an argument, returns the previously stored structure.
#
def structure(structure=nil)
  if structure
    @structure = structure
    legal_keys = []
    structure.each do |key, value|
      legal_keys << key
      legal_keys << Pluralizer.pluralize(key) if value.is_a?(Array)
    end
    legal_keys.each do |key|
      # define_method avoids evaluating generated source text.
      define_method(key) do
        Pluralizer.singularized_lookup(@tree, key.to_sym)
      end
    end
  end
  return @structure
end
|
46
|
+
|
47
|
+
#
# By default, the name of the class is converted to lowercase and used as the
# CSS class identifying this format (e.g. HReview => "hreview"). This method
# can be used to override this.
#
# ident:: optional replacement identifier; when given it is stored and
#         returned by this and later calls.
#
def identifier(ident=nil)
  @identifier_class = ident if ident
  # Only the last component of a namespaced class name is used.
  return @identifier_class ||= self.to_s.split('::').last.downcase.to_sym
end
|
56
|
+
|
57
|
+
#
# Yields a new instance for each of the matching microformats within
# the given document.
#
# source:: anything responding to each_element is used as-is; anything else
#          is converted with to_s and parsed by REXML::Document.new.
# url::    passed through unchanged to each new instance.
#
# Include-patterns are expanded in the document (in place) before the
# search; matching uses the class returned by identifier.
#
def each(source, url=nil)
  xml_source = if source.respond_to?(:each_element)
    source
  else
    REXML::Document.new(source.to_s)
  end
  Microformats.expand_include_patterns!(xml_source)
  Microformats.each_element_by_class(xml_source, identifier) do |element|
    yield new(element.dup, url)
  end
end
|
72
|
+
|
73
|
+
#
# Returns a new instance for the first matching microformat in the
# given document, or nil when the document contains none.
#
def first(source, url=nil)
  # Returning from inside the block stops the search at the first hit.
  each(source, url) do |mf|
    return mf
  end
  # Without this, the return value of each would leak out to the caller
  # when nothing matched.
  return nil
end
|
82
|
+
|
83
|
+
end # class << self
|
84
|
+
|
85
|
+
#
# source:: anything responding to each_element is stored as-is; anything
#          else is converted with to_s and parsed by REXML::Document.new.
# url::    stored in @url; absolute_url uses it as the base for resolving
#          links.
#
# The source is parsed according to the class-level structure into @tree,
# then the post_process! hook is run.
#
def initialize(source, url=nil)
  @source = if source.respond_to?(:each_element)
    source
  else
    REXML::Document.new(source.to_s)
  end
  @url = url
  @tree = parse(__class__.structure, @source, MethodHash.new)
  post_process!
end
|
95
|
+
attr_reader :source
|
96
|
+
|
97
|
+
#
# Hook called at the end of initialize, after the tree has been parsed.
# Does nothing here.
#
def post_process!
end
|
99
|
+
|
100
|
+
#
# Build a tree of values from source according to structure.
#
# structure::    hash of tag => substructure (see the class-level structure).
# source::       node to search.
# default_tree:: container to fill; a new MethodHash when omitted.
#
# A tag is stored only when at least one element with that class exists at
# or below source. Tags whose substructure is an Array are stored under
# their Pluralizer.pluralize key. Returns the filled tree.
#
def parse(structure, source, default_tree=nil)
  tree = default_tree || MethodHash.new
  structure.each do |tag, substructure|
    key = substructure.is_a?(Array) ? Pluralizer.pluralize(tag) : tag
    # parse_tag scans source for the tag itself, so one call per tag is
    # enough; it only has to be skipped when nothing matches.
    next unless Microformats.first_element_by_class(source, tag)
    tree[key] = parse_tag(substructure, source, tag)
  end
  return tree
end
|
114
|
+
|
115
|
+
#
# Process the elements with class tag found at or below source.
#
# substructure:: a Hash (nested structure, handled by parse), a type symbol
#                (dispatched to the matching process_as_<type> method), or a
#                one-element Array wrapping either of those to mark the tag
#                as plural.
#
# For a plural tag, returns the list of processed values of all matching
# elements. Otherwise returns the processed value of the first matching
# element, or nil when nothing matches.
#
def parse_tag(substructure, source, tag)
  is_plural = substructure.is_a?(Array)
  elements = nil
  if is_plural
    substructure = substructure[0]
    elements = []
  end

  Microformats.each_element_by_class(source, tag) do |element|
    if substructure.is_a? Hash
      this = parse(substructure, element)
      # Special case for values that can be encapsulated or not
      unless this[:value]
        ignored_classes = substructure.keys - [:value]
        v = case substructure[:value]
            when :value then process_as_value(element, ignored_classes)
            when :email then process_value_as_email(element)
            end
        this[:value] = v if v
      end
    else
      this = self.__send__("process_as_#{substructure}".to_sym, element)
    end

    # A singular tag is satisfied by the first matching element.
    return this unless is_plural
    elements << this
  end
  return elements
end
|
148
|
+
|
149
|
+
#
# Return the text value of an element.
#
# * <abbr> with a title attribute: the stripped title.
# * <img> with an alt attribute: the stripped alt text.
# * anything else (including abbr/img lacking that attribute): the
#   concatenated text of the element's children, recursively, with runs of
#   whitespace collapsed to a single space and entities decoded by
#   HTMLEntities.
#
# ignored_classes:: child elements carrying any of these classes (as
#                   symbols) are left out of the text.
#
def process_as_string(element, ignored_classes=[])
  case element.fully_expanded_name
  when 'abbr'
    title = element.attribute('title')
    return title.value.strip if title
  when 'img'
    alt = element.attribute('alt')
    return alt.value.strip if alt
  end

  buffer = ''
  element.children.each do |child|
    case child
    when REXML::Text
      buffer << child.to_s
    else
      has_classes = Microformats.classes_for_element(child)
      if (has_classes & ignored_classes).empty?
        buffer << process_as_string(child, ignored_classes)
      end
    end
  end
  return HTMLEntities.decode_entities(buffer.strip.gsub(/\s+/um, ' '))
end
|
171
|
+
|
172
|
+
#
# Return the element's text as an Integer, or nil unless the whole text
# consists of decimal digits.
#
def process_as_integer(element)
  text = process_as_string(element)
  return nil unless text
  # \A and \z anchor the whole string; ^ and $ would accept one matching
  # line inside multi-line text.
  return nil unless text =~ /\A\d+\z/
  return text.to_i(10)
end
|
178
|
+
|
179
|
+
#
# Return the element's text as a Float, or nil unless the whole text is
# digits optionally followed by a decimal point and more digits.
#
def process_as_float(element)
  text = process_as_string(element)
  return nil unless text
  # \A and \z anchor the whole string; ^ and $ would accept one matching
  # line inside multi-line text.
  return nil unless text =~ /\A\d+(?:\.\d+)?\z/
  return text.to_f
end
|
185
|
+
|
186
|
+
#
# Return the element's content as markup: the to_s form of every child,
# concatenated, with surrounding whitespace stripped.
#
def process_as_xhtml(element)
  markup = element.children.map{ |c| c.to_s }.join
  return markup.strip
end
|
189
|
+
|
190
|
+
#
# Return the element's text as a Time, or nil when the text has no run of
# two or more digits or cannot be parsed.
#
# Partial dates are completed before parsing:
#   YYYY-MM    => first day of the month, midnight UTC
#   YYYYMM     => same, compact form
#   YYYY-MM-DD => midnight UTC
#   YYYYMMDD   => same, compact form
# Repeated '+' or '-' characters are collapsed to one.
#
def process_as_datetime(element)
  text = process_as_string(element)
  return nil unless text && text =~ /\d{2,}/

  # Build a new string instead of appending to the one returned by
  # process_as_string.
  completed = case text
    when /\A\d{4}-\d{2}\z/       then text + '-01T00:00:00Z'
    when /\A\d{6}\z/             then text + '01T000000Z'
    when /\A\d{4}-\d{2}-\d{2}\z/ then text + 'T00:00:00Z'
    when /\A\d{8}\z/             then text + 'T000000Z'
    else text
  end
  completed = completed.gsub(/\++/, '+').gsub(/\-+/, '-')

  begin
    return Time.parse(completed)
  rescue StandardError
    # Best effort: unparseable text yields nil rather than an exception.
    return nil
  end
end
|
205
|
+
|
206
|
+
#
# Return the URL carried by the element: the stripped href (preferred) or
# src attribute of the first element at or below it that has either one,
# passed through absolute_url. Returns nil when no such element exists.
#
def process_as_url(element)
  element.each_element('descendant-or-self::*[@href or @src]') do |subelement|
    link = subelement.attribute('href') || subelement.attribute('src')
    return absolute_url(link.value.strip) if link
  end
  return nil
end
|
216
|
+
|
217
|
+
#
# Return the string value of the first element with class "value" at or
# below the given element; when there is none, the string value of the
# element itself. ignored_classes is passed on to process_as_string.
#
def process_as_value(element, ignored_classes=[])
  # Returning from inside the block stops at the first "value" element.
  Microformats.each_element_by_class(element, :value) do |e|
    return process_as_string(e, ignored_classes)
  end
  return process_as_string(element, ignored_classes)
end
|
223
|
+
|
224
|
+
#
# Return the e-mail address carried by the element.
#
# When process_as_url finds a link, a leading "mailto:" (any letter case)
# and, for such links, everything from the first '?' on are removed; other
# links are returned unchanged. Without a link, the element's string value
# is returned.
#
def process_as_email(element)
  url = process_as_url(element)
  return process_as_string(element) unless url
  return url unless url =~ /\Amailto:/i
  # Drop the scheme, then any "?subject=..." style query.
  return Regexp.last_match.post_match.sub(/\?.*\z/m, '')
end
|
232
|
+
|
233
|
+
#
# Like process_as_email, but falls back to process_as_value (rather than
# process_as_string) when the element carries no link.
#
# For a link, a leading "mailto:" (any letter case) and, for such links,
# everything from the first '?' on are removed; other links are returned
# unchanged.
#
def process_value_as_email(element)
  email = process_as_url(element)
  return process_as_value(element) unless email
  return email unless email =~ /\Amailto:/i
  # Drop the scheme, then any "?subject=..." style query.
  return Regexp.last_match.post_match.sub(/\?.*\z/m, '')
end
|
238
|
+
|
239
|
+
#
# Resolve relative_url against the URL given at construction (@url).
#
# Returns relative_url unchanged when no base URL was given, or when the
# two cannot be combined (URI.join raises a URI::Error), so that a single
# malformed link in a document does not abort parsing.
#
def absolute_url(relative_url)
  return relative_url unless @url
  begin
    return URI.join(@url, relative_url).to_s
  rescue URI::Error
    return relative_url
  end
end
|
243
|
+
|
244
|
+
#
# Instead of worrying about calling methods on nil, lookup provides an
# easy way of accessing deeply nested data.
#
# E.g.
#
#   object.somethings[1].other             # => error if somethings[1] is nil
#   object.lookup(:somethings, 1, :other)  # => nil if any level is nil
#
# Each Symbol in list is sent as a method call to the current level; any
# other value is used as an argument to []. With an empty list the whole
# tree is returned.
#
def lookup(*list)
  tree = @tree
  list.each do |level|
    tree = if level.is_a?(Symbol)
      tree.__send__(level)
    else
      tree.__send__(:[], level)
    end
    # Stop at the first missing (nil or false) level.
    return nil unless tree
  end
  return tree
end
|
265
|
+
|
266
|
+
end # BasicNestedFormat
|
267
|
+
|
268
|
+
end # Microformats
|