uformats 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +8 -0
- data/COPYING +4 -0
- data/README +36 -0
- data/lib/uformats.rb +158 -0
- data/lib/uformats/basic.rb +268 -0
- data/lib/uformats/hcalendar.rb +23 -0
- data/lib/uformats/hcard.rb +100 -0
- data/lib/uformats/hreview.rb +182 -0
- data/lib/uformats/pluralizer.rb +32 -0
- data/test/all.rb +4 -0
- data/test/basic_test.rb +441 -0
- data/test/hcalendar_test.rb +157 -0
- data/test/hcard_test.rb +438 -0
- data/test/hreview_test.rb +500 -0
- data/test/pluralizer_test.rb +24 -0
- data/test/test_helper.rb +13 -0
- data/test/uformats_test.rb +235 -0
- metadata +70 -0
data/CHANGES
ADDED
data/COPYING
ADDED
data/README
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
uformats
|
2
|
+
|
3
|
+
ABOUT
|
4
|
+
|
5
|
+
A library for parsing various microformats under Ruby:
|
6
|
+
|
7
|
+
* hReview (0.3)
|
8
|
+
* hCard
|
9
|
+
* hCalendar (vevent only; limited support)
|
10
|
+
* rel-tag
|
11
|
+
* rel-license
|
12
|
+
* include-pattern
|
13
|
+
|
14
|
+
PREREQUISITES
|
15
|
+
|
16
|
+
htmlentities >= 3.0.1 (http://htmlentities.rubyforge.org/)
|
17
|
+
|
18
|
+
If you want to try the sample code, you will also need:
|
19
|
+
|
20
|
+
tidy (http://rubyforge.org/projects/tidy)
|
21
|
+
|
22
|
+
You'll also need to install the tidy library for your operating system. For OS
|
23
|
+
X, it's available via Darwin Ports (port install tidy). For Windows, get it
|
24
|
+
from http://dev.int64.org/tidy.html and rename 'tidy.dll' to 'libtidy.dll'.
|
25
|
+
For Linux etc. it should be available via the repositories.
|
26
|
+
|
27
|
+
INSTALLATION
|
28
|
+
|
29
|
+
To install (requires root/admin privileges):
|
30
|
+
|
31
|
+
# ruby setup.rb
|
32
|
+
|
33
|
+
To test:
|
34
|
+
|
35
|
+
$ ruby setup.rb test
|
36
|
+
|
data/lib/uformats.rb
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'uformats/pluralizer'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
module Microformats
|
6
|
+
|
7
|
+
#
# Provides method access to an internal Hash.
#
# Every public instance method whose name does not start with a double
# underscore is removed, so ordinary method names fall through to
# method_missing instead of being answered by inherited methods.
#
class MethodHash

  instance_methods.each { |m| undef_method m unless m =~ /^__/ }

  # hash:: initial contents; duplicated, so the caller's Hash is not shared.
  def initialize(hash={})
    @hash = hash.dup
  end

  # Store v under key k in the internal Hash.
  def []=(k,v)
    @hash[k] = v
  end

  # Return whatever the internal Hash holds for key k.
  def [](k)
    @hash[k]
  end

  # Any other method name is delegated to Pluralizer.singularized_lookup
  # with this object and the method name; extra arguments are ignored.
  def method_missing(sym, *args)
    Pluralizer.singularized_lookup(self, sym)
  end

  #
  # Iterate over the stored entries.
  #
  # A block declaring exactly two parameters receives key and value as two
  # separate arguments. Every other block shape (one parameter, a splat,
  # optional parameters, ...) receives a single [key, value] pair; previously
  # such blocks, apart from the one-parameter case, were never called at all.
  #
  def each(&blk)
    if blk.arity == 2
      @hash.each { |k, v| yield k, v }
    else
      @hash.each { |k, v| yield [k, v] }
    end
  end

end # MethodHash
|
43
|
+
|
44
|
+
#
# Expand all include-pattern objects in a document.
#
# Every element carrying the "include" class whose data attribute starts
# with "#" is replaced by a deep clone of the element whose id equals the
# rest of that attribute; the clone's id attribute is deleted. Placeholders
# whose target is not found are left as they are. The (modified) source is
# returned.
#
def expand_include_patterns!(rexml_source)
  each_element_by_class(rexml_source, :include) do |placeholder|
    data = placeholder.attribute('data')
    next unless data && data.value =~ /^#/

    referenced_id = data.value.sub(/^#/, '')
    original = rexml_source.elements["//*[@id='#{referenced_id}']"]
    next unless original

    copy = original.deep_clone
    copy.attributes.delete('id')
    placeholder.replace_with(copy)
  end
  rexml_source
end
|
61
|
+
|
62
|
+
#
# Yield each element matching sym at or below the given node.
#
# sym is converted to a CSS class name with sym_to_css_class. The XPath
# contains() test is only a coarse pre-filter; an element is yielded only
# when one of the whitespace-separated tokens of its class attribute equals
# the class name exactly. (A word-boundary regexp is not enough here: it
# would accept "fn-org" when looking for "fn", because "-" counts as a
# boundary.)
#
def each_element_by_class(rexml_source, sym)
  css_class = Microformats.sym_to_css_class(sym)
  rexml_source.each_element("descendant-or-self::*[contains(@class,'#{css_class}')]") do |element|
    class_names = element.attribute('class').value.split
    yield element if class_names.include?(css_class)
  end
end
|
73
|
+
|
74
|
+
#
# Returns the first element that each_element_by_class yields for sym at or
# below the given node, or nil when it yields nothing.
#
def first_element_by_class(rexml_source, sym)
  each_element_by_class(rexml_source, sym) { |match| return match }
  nil
end
|
83
|
+
|
84
|
+
#
# Return the tag names of all rel-tag links (a elements with rel="tag") that
# are children of an ancestor of the given node. Links for which rel_tag
# gives nil are skipped. Returns nil rather than an empty list when nothing
# is found.
#
def rel_tags_above_element(rexml_element)
  found = []
  rexml_element.each_element('ancestor::*/a[@rel="tag"]') do |anchor|
    name = rel_tag(anchor)
    found.push(name) if name
  end
  found.empty? ? nil : found
end
|
96
|
+
|
97
|
+
#
# Return the tag names of all rel-tag links (a elements with rel="tag")
# found at or below the given node. Links for which rel_tag gives nil are
# skipped. Returns nil rather than an empty list when nothing is found.
#
def rel_tags_below_element(rexml_element)
  found = []
  rexml_element.each_element('descendant-or-self::a[@rel="tag"]') do |anchor|
    name = rel_tag(anchor)
    found.push(name) if name
  end
  found.empty? ? nil : found
end
|
108
|
+
|
109
|
+
#
# Return the tag name encoded in a link's href, or nil when the element has
# no href attribute.
#
# Everything up to the last "/" and everything from the first "?" or "#"
# onwards is removed, and the remainder is URL-unescaped with CGI.unescape.
#
def rel_tag(rexml_element)
  href = rexml_element.attribute('href')
  return nil unless href

  # Non-destructive gsub: the string handed back by the attribute may be the
  # one the document keeps for it (assumption about REXML - confirm), so
  # editing it in place would change the href seen by later readers.
  tag = href.value.gsub(%r!^.*/|[\?\#].*$!, '')
  CGI.unescape(tag)
end
|
118
|
+
|
119
|
+
#
# Return the href values of all rel-license links (a elements with
# rel="license") found at or below the given element. Links without an href
# are skipped. Returns nil rather than an empty list when nothing is found.
#
def rel_licenses(rexml_element)
  hrefs = []
  rexml_element.each_element('descendant-or-self::a[@rel="license"]') do |anchor|
    href = anchor.attribute('href')
    hrefs << href.value if href
  end
  hrefs.empty? ? nil : hrefs
end
|
131
|
+
|
132
|
+
#
# Convert a Ruby symbol to a CSS class name by turning every underscore
# into a hyphen, e.g. :foo_bar => "foo-bar"
#
def sym_to_css_class(sym)
  sym.to_s.tr('_', '-')
end
|
138
|
+
|
139
|
+
#
# Convert a CSS class name to a Ruby symbol by turning every hyphen into an
# underscore, e.g. "foo-bar" => :foo_bar
#
def css_class_to_sym(css_class)
  css_class.tr('-', '_').to_sym
end
|
145
|
+
|
146
|
+
#
# Returns a list of symbols corresponding to an element's classes, or an
# empty list when the element has no class attribute.
#
# The attribute value is split on runs of whitespace, so repeated spaces,
# tabs or newlines between class names do not produce empty entries.
#
def classes_for_element(rexml_element)
  css_class = rexml_element.attribute('class')
  return [] unless css_class

  css_class.value.split.map { |name| css_class_to_sym(name) }
end
|
156
|
+
|
157
|
+
extend self
|
158
|
+
end
|
@@ -0,0 +1,268 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'cgi'
|
3
|
+
require 'uformats'
|
4
|
+
require 'uformats/pluralizer'
|
5
|
+
require 'htmlentities'
|
6
|
+
require 'time'
|
7
|
+
require 'uri'
|
8
|
+
|
9
|
+
module Microformats
|
10
|
+
class BasicNestedFormat
|
11
|
+
|
12
|
+
alias_method :__class__, :class
|
13
|
+
|
14
|
+
class << self
|
15
|
+
|
16
|
+
#
# Define or return a hash describing the structure of the derived microformat,
# using symbols, hashes, and arrays.
#
# E.g.
#
#   structure({
#     :foo => :integer,           # singular integral value
#     :bar => [:string],          # singular or plural string value
#     :baz => {                   # substructure below 'baz'
#       :created_on => :datetime  # 'created-on' is interpreted as a date/time value
#     }
#   })
#
# When a structure is given it is stored, and a reader method is defined for
# every top-level key - plus, for keys whose value is an Array, for the name
# Pluralizer.pluralize returns. Each reader calls
# Pluralizer.singularized_lookup on the instance's @tree. The readers are
# created with define_method rather than by evaluating generated source, so
# key names are never interpreted as code.
#
# Always returns the stored structure (nil if none has been defined).
#
def structure(structure=nil)
  if structure
    @structure = structure
    accessor_names = []
    structure.each do |key, value|
      accessor_names << key
      accessor_names << Pluralizer.pluralize(key) if value.is_a?(Array)
    end
    accessor_names.each do |name|
      lookup_key = name.to_sym
      define_method(lookup_key) { Pluralizer.singularized_lookup(@tree, lookup_key) }
    end
  end
  @structure
end
|
46
|
+
|
47
|
+
#
# Set or return the symbol naming the CSS class that identifies this format.
#
# Without an explicit identifier, the last "::"-separated part of the class
# name is lowercased and used (e.g. HReview => :hreview); that default is
# remembered once computed. Passing ident overrides it.
#
def identifier(ident=nil)
  @identifier_class = ident if ident
  unless @identifier_class
    unqualified_name = self.to_s.split('::').last
    @identifier_class = unqualified_name.downcase.to_sym
  end
  @identifier_class
end
|
56
|
+
|
57
|
+
#
# Yields a new instance for each of the matching microformats within
# the given document.
#
# source is used directly when it responds to each_element; anything else is
# converted with to_s and parsed as a REXML document. Include patterns are
# expanded first, then every element carrying this format's identifier class
# is dup'ed and handed to new together with url.
#
def each(source, url=nil)
  xml_source = source.respond_to?(:each_element) ? source : REXML::Document.new(source.to_s)
  Microformats.expand_include_patterns!(xml_source)
  Microformats.each_element_by_class(xml_source, identifier) do |match|
    yield new(match.dup, url)
  end
end
|
72
|
+
|
73
|
+
#
# Returns a new instance for the first matching microformat in the
# given document, or nil when the document contains none.
#
# (Without the explicit nil the method would hand back whatever each
# returns when it never yields, which callers could mistake for a match.)
#
def first(source, url=nil)
  each(source, url) { |mf| return mf }
  nil
end
|
82
|
+
|
83
|
+
end # class << self
|
84
|
+
|
85
|
+
#
# Build an instance from source and parse it immediately.
#
# source:: used as-is when it responds to each_element; otherwise converted
#          with to_s and parsed as a REXML document.
# url::    remembered in @url; absolute_url resolves links against it.
#
# The class-level structure is parsed into @tree, then post_process! runs.
#
def initialize(source, url=nil)
  @source = source.respond_to?(:each_element) ? source : REXML::Document.new(source.to_s)
  @url = url
  @tree = parse(__class__.structure, @source, MethodHash.new)
  post_process!
end
|
95
|
+
attr_reader :source
|
96
|
+
|
97
|
+
# Called at the end of initialize, after @tree has been built. Does nothing
# here (presumably a hook for derived formats - confirm against subclasses).
def post_process!; end
|
99
|
+
|
100
|
+
#
# Parse source according to structure and return the resulting tree.
#
# structure::    Hash mapping tag symbols to a substructure (a type symbol,
#                a Hash, or a one-element Array for plural values).
# source::       node searched for elements carrying each tag's class.
# default_tree:: tree to fill in; a fresh MethodHash is used when omitted.
#
# A key is only set when at least one element carrying the tag's class is
# found. Plural (Array) entries are stored under the pluralized tag name.
# parse_tag is called once per tag: it already walks every matching element
# itself, so calling it again for each match would only repeat the same
# work - and that repetition compounds through nested structures.
#
def parse(structure, source, default_tree=nil)
  tree = default_tree || MethodHash.new
  structure.each do |tag, substructure|
    next unless Microformats.first_element_by_class(source, tag)

    key = substructure.is_a?(Array) ? Pluralizer.pluralize(tag) : tag
    tree[key] = parse_tag(substructure, source, tag)
  end
  tree
end
|
114
|
+
|
115
|
+
#
# Parse the elements carrying tag's class at or below source.
#
# substructure:: a type symbol (dispatched to process_as_<type>), a Hash
#                (parsed recursively with parse), or a one-element Array
#                wrapping either of those to mark the value as plural.
#
# For a plural substructure a list with one entry per matching element is
# returned. Otherwise the value for the first matching element is returned,
# or nil when nothing matches.
#
def parse_tag(substructure, source, tag)
  is_plural = substructure.is_a?(Array)
  if is_plural
    substructure = substructure[0]
    elements = []
  end

  Microformats.each_element_by_class(source, tag) do |element|
    if substructure.is_a?(Hash)
      this = parse(substructure, element)
      # Special case for values that can be encapsulated or not: when no
      # explicit "value" element was parsed, derive the value from the
      # element itself according to the declared :value type.
      unless this[:value]
        fallback =
          case substructure[:value]
          when :value then process_as_value(element, substructure.keys - [:value])
          when :email then process_value_as_email(element)
          end
        this[:value] = fallback if fallback
      end
    else
      this = __send__("process_as_#{substructure}".to_sym, element)
    end

    return this unless is_plural
    elements << this
  end

  elements
end
|
148
|
+
|
149
|
+
#
# Return the textual value of element.
#
# * abbr elements give their title attribute and img elements their alt
#   attribute (stripped) when that attribute is present. When it is missing
#   the element is treated like any other instead of raising.
# * For everything else the text of the element and its descendants is
#   concatenated, whitespace runs are collapsed to single spaces and HTML
#   entities are decoded.
#
# ignored_classes:: child elements carrying any of these class symbols are
#                   left out of the text.
#
# Only REXML::Text and REXML::Element children contribute; other node kinds
# (comments, processing instructions, ...) are skipped.
#
def process_as_string(element, ignored_classes=[])
  preferred =
    case element.fully_expanded_name
    when 'abbr' then element.attribute('title')
    when 'img'  then element.attribute('alt')
    end
  return preferred.value.strip if preferred

  buffer = ''
  element.children.each do |child|
    case child
    when REXML::Text
      buffer << child.to_s
    when REXML::Element
      child_classes = Microformats.classes_for_element(child)
      buffer << process_as_string(child, ignored_classes) if (child_classes & ignored_classes).empty?
    end
  end
  HTMLEntities.decode_entities(buffer.strip.gsub(/\s+/um, ' '))
end
|
171
|
+
|
172
|
+
#
# Return the element's text as a base-10 Integer, or nil when there is no
# text or it does not consist of digits only.
#
def process_as_integer(element)
  digits = process_as_string(element)
  digits.to_i(10) if digits && digits =~ /^\d+$/
end
|
178
|
+
|
179
|
+
#
# Return the element's text as a Float, or nil when there is no text or it
# is not of the form "digits" or "digits.digits".
#
def process_as_float(element)
  number = process_as_string(element)
  number.to_f if number && number =~ /^\d+(?:\.\d+)?$/
end
|
185
|
+
|
186
|
+
#
# Return the element's content as markup: the string forms of all children
# joined together, with surrounding whitespace stripped.
#
def process_as_xhtml(element)
  markup = element.children.map(&:to_s).join
  markup.strip
end
|
189
|
+
|
190
|
+
#
# Return the element's text parsed as a Time, or nil when there is no text,
# the text contains no run of two or more digits, or Time.parse rejects it.
#
# Partial dates are completed to midnight UTC before parsing:
#
#   YYYY-MM    => YYYY-MM-01T00:00:00Z
#   YYYYMM     => YYYYMM01T000000Z
#   YYYY-MM-DD => YYYY-MM-DDT00:00:00Z
#   YYYYMMDD   => YYYYMMDDT000000Z
#
# Runs of "+" or "-" are collapsed to a single character. The text obtained
# from process_as_string is not modified; a new string is built instead.
#
def process_as_datetime(element)
  text = process_as_string(element)
  return nil unless text && text =~ /\d{2,}/

  completed =
    case text
    when /^\d{4}-\d{2}$/       then "#{text}-01T00:00:00Z"
    when /^\d{6}$/             then "#{text}01T000000Z"
    when /^\d{4}-\d{2}-\d{2}$/ then "#{text}T00:00:00Z"
    when /^\d{8}$/             then "#{text}T000000Z"
    else text
    end
  completed = completed.gsub(/\++/, '+').gsub(/\-+/, '-')

  begin
    Time.parse(completed)
  rescue StandardError
    # Deliberately best-effort: unparseable text yields nil.
    nil
  end
end
|
205
|
+
|
206
|
+
#
# Return the URL of the first element at or below element that has an href
# or src attribute (href wins when both are present), stripped and passed
# through absolute_url. Returns nil when no such element exists.
#
def process_as_url(element)
  element.each_element('descendant-or-self::*[@href or @src]') do |candidate|
    link = candidate.attribute('href') || candidate.attribute('src')
    return absolute_url(link.value.strip) if link
  end
  nil
end
|
216
|
+
|
217
|
+
#
# Return the string value of the first element carrying the "value" class at
# or below element; when there is none, the string value of element itself.
# ignored_classes is passed on to process_as_string in both cases.
#
def process_as_value(element, ignored_classes=[])
  target = element
  Microformats.each_element_by_class(element, :value) do |value_element|
    target = value_element
    break
  end
  process_as_string(target, ignored_classes)
end
|
223
|
+
|
224
|
+
#
# Return an e-mail address for element: the URL found by process_as_url with
# a leading "mailto:" removed, or the element's string value when no URL is
# found.
#
def process_as_email(element)
  link = process_as_url(element)
  link ? link.sub(/^mailto:/, '') : process_as_string(element)
end
|
232
|
+
|
233
|
+
#
# Like process_as_email, but when no URL is found the fallback is
# process_as_value (which looks for a "value" element) rather than the
# element's plain string value.
#
def process_value_as_email(element)
  link = process_as_url(element)
  link ? link.sub(/^mailto:/, '') : process_as_value(element)
end
|
238
|
+
|
239
|
+
#
# Resolve relative_url against the URL given at construction (@url) using
# URI.join and return it as a string. When no base URL is known the argument
# is returned unchanged.
#
def absolute_url(relative_url)
  @url ? URI.join(@url, relative_url).to_s : relative_url
end
|
243
|
+
|
244
|
+
#
# Instead of worrying about calling methods on nil, lookup provides an
# easy way of accessing deeply nested data.
#
# Starting from the parsed tree, each Symbol in list is sent as a method
# call and every other value is used as an index via []. As soon as a step
# gives nil (or false), nil is returned.
#
# E.g.
#
#   object.somethings[1].other              # => error if somethings[1] is nil
#   object.lookup(:somethings, 1, :other)   # => nil if any level is nil
#
def lookup(*list)
  list.inject(@tree) do |node, level|
    found = level.is_a?(Symbol) ? node.__send__(level) : node.__send__(:[], level)
    return nil unless found
    found
  end
end
|
265
|
+
|
266
|
+
end # BasicNestedFormat
|
267
|
+
|
268
|
+
end # Microformats
|