uformats 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES ADDED
@@ -0,0 +1,8 @@
1
+ 1.2.1 (2006-06-14)
2
+
3
+ * Minor changes to work with HTMLEntities 3.0.1+
4
+ * Added gem
5
+
6
+ 1.2 (2006-03-08)
7
+
8
+ * First public release
data/COPYING ADDED
@@ -0,0 +1,4 @@
1
+ Copyright Revieworld Ltd. 2006
2
+
3
+ You may use, copy and redistribute this library under the same terms as Ruby
4
+ itself (see http://www.ruby-lang.org/en/LICENSE.txt).
data/README ADDED
@@ -0,0 +1,36 @@
1
+ uformats
2
+
3
+ ABOUT
4
+
5
+ A library for parsing various microformats under Ruby:
6
+
7
+ * hReview (0.3)
8
+ * hCard
9
+ * hCalendar (vevent only; limited support)
10
+ * rel-tag
11
+ * rel-license
12
+ * include-pattern
13
+
14
+ PREREQUISITES
15
+
16
+ htmlentities >= 3.0.1 (http://htmlentities.rubyforge.org/)
17
+
18
+ If you want to try the sample code, you will also need:
19
+
20
+ tidy (http://rubyforge.org/projects/tidy)
21
+
22
+ You'll also need to install the tidy library for your operating system. For OS
23
+ X, it's available via Darwin Ports (port install tidy). For Windows, get it
24
+ from http://dev.int64.org/tidy.html and rename 'tidy.dll' to 'libtidy.dll'.
25
+ For Linux etc. it should be available via the repositories.
26
+
27
+ INSTALLATION
28
+
29
+ To install (requires root/admin privileges):
30
+
31
+ # ruby setup.rb
32
+
33
+ To test:
34
+
35
+ $ ruby setup.rb test
36
+
@@ -0,0 +1,158 @@
1
+ require 'rexml/document'
2
+ require 'uformats/pluralizer'
3
+ require 'cgi'
4
+
5
+ module Microformats
6
+
7
+ #
8
+ # Provides method access to an internal Hash
9
+ #
10
+ class MethodHash
11
+
12
+ instance_methods.each { |m| undef_method m unless m =~ /^__/ }
13
+
14
+ def initialize(hash={})
15
+ @hash = hash.dup
16
+ end
17
+
18
+ def []=(k,v)
19
+ @hash[k] = v
20
+ end
21
+
22
+ def [](k)
23
+ return @hash[k]
24
+ end
25
+
26
+ def method_missing(sym, *args)
27
+ return Pluralizer.singularized_lookup(self, sym)
28
+ end
29
+
30
+ def each(&blk)
31
+ case blk.arity
32
+ when 1
33
+ @hash.each do |k,v|
34
+ yield [k, v]
35
+ end
36
+ when 2
37
+ @hash.each do |k,v|
38
+ yield k, v
39
+ end
40
+ end
41
+ end
42
+ end # MethodHash
43
+
44
+ #
45
+ # Expand all include-pattern objects in a document
46
+ #
47
+ def expand_include_patterns!(rexml_source)
48
+ each_element_by_class(rexml_source, :include) do |target|
49
+ if (target.attribute('data') && target.attribute('data').value =~ /^#/)
50
+ source_id = target.attribute('data').value.sub(/^#/, '')
51
+ source = rexml_source.elements["//*[@id='#{source_id}']"]
52
+ if source
53
+ replacement = source.deep_clone
54
+ replacement.attributes.delete('id')
55
+ target.replace_with(replacement)
56
+ end
57
+ end
58
+ end
59
+ return rexml_source
60
+ end
61
+
62
+ #
63
+ # Yield each element matching sym at or below the given node
64
+ #
65
+ def each_element_by_class(rexml_source, sym)
66
+ css_class = Microformats.sym_to_css_class(sym)
67
+ rexml_source.each_element("descendant-or-self::*[contains(@class,'#{css_class}')]") do |element|
68
+ if element.attribute('class').value =~ /\b#{css_class}\b/
69
+ yield element
70
+ end
71
+ end
72
+ end
73
+
74
+ #
75
+ # Returns the first element matching sym at or below the given node, or nil if none exists
76
+ #
77
+ def first_element_by_class(rexml_source, sym)
78
+ each_element_by_class(rexml_source, sym) do |element|
79
+ return element
80
+ end
81
+ return nil
82
+ end
83
+
84
+ #
85
+ # Return a list of all rel-tag elements found at or above the given node
86
+ #
87
+ def rel_tags_above_element(rexml_element)
88
+ tags = []
89
+ rexml_element.each_element('ancestor::*/a[@rel="tag"]') do |tagged_element|
90
+ tag = rel_tag(tagged_element)
91
+ tags << tag if tag
92
+ end
93
+ tags.compact!
94
+ return tags.empty? ? nil : tags
95
+ end
96
+
97
+ #
98
+ # Return a list of all rel-tag elements found at or below the given node
99
+ #
100
+ def rel_tags_below_element(rexml_element)
101
+ tags = []
102
+ rexml_element.each_element('descendant-or-self::a[@rel="tag"]') do |tagged_element|
103
+ tag = rel_tag(tagged_element)
104
+ tags << tag if tag
105
+ end
106
+ return tags.empty? ? nil : tags
107
+ end
108
+
109
+ def rel_tag(rexml_element)
110
+ if rexml_element.attribute('href')
111
+ href = rexml_element.attribute('href').value
112
+ href.gsub!(%r!^.*/|[\?\#].*$!, '')
113
+ return CGI.unescape(href)
114
+ else
115
+ return nil
116
+ end
117
+ end
118
+
119
+ #
120
+ # Return a list of all rel-license elements found in the given element
121
+ #
122
+ def rel_licenses(rexml_element)
123
+ licenses = []
124
+ rexml_element.each_element('descendant-or-self::a[@rel="license"]') do |license_element|
125
+ if license_element.attribute('href')
126
+ licenses << license_element.attribute('href').value
127
+ end
128
+ end
129
+ return licenses.empty? ? nil : licenses
130
+ end
131
+
132
+ #
133
+ # Convert a Ruby symbol to a CSS class, e.g. :foo_bar => "foo-bar"
134
+ #
135
+ def sym_to_css_class(sym)
136
+ return sym.to_s.gsub('_', '-')
137
+ end
138
+
139
+ #
140
+ # Convert a css class to a Ruby symbol
141
+ #
142
+ def css_class_to_sym(css_class)
143
+ return css_class.gsub('-', '_').to_sym
144
+ end
145
+
146
+ #
147
+ # Returns a list of symbols corresponding to an element's classes
148
+ #
149
+ def classes_for_element(rexml_element)
150
+ if (css_class = rexml_element.attribute('class'))
151
+ return css_class.value.strip.split(/\s/).map{ |e| css_class_to_sym(e) }
152
+ else
153
+ return []
154
+ end
155
+ end
156
+
157
+ extend self
158
+ end
@@ -0,0 +1,268 @@
1
+ require 'rexml/document'
2
+ require 'cgi'
3
+ require 'uformats'
4
+ require 'uformats/pluralizer'
5
+ require 'htmlentities'
6
+ require 'time'
7
+ require 'uri'
8
+
9
+ module Microformats
10
+ class BasicNestedFormat
11
+
12
+ alias_method :__class__, :class
13
+
14
+ class << self
15
+
16
+ #
17
+ # Define or return a hash describing the structure of the derived microformat,
18
+ # using symbols, hashes, and arrays.
19
+ #
20
+ # E.g.
21
+ #
22
+ # structure({
23
+ # :foo => :integer, # singular integral value
24
+ # :bar => [:string], # singular or plural string value
25
+ # :baz => { # substructure below 'baz'
26
+ # :created_on => :datetime # 'created-on' is interpeted as a date/time value
27
+ # }
28
+ # })
29
+ #
30
+ def structure(structure=nil)
31
+ if structure
32
+ @structure = structure
33
+ legal_keys = []
34
+ structure.each do |key, value|
35
+ legal_keys << key
36
+ if value.is_a?(Array)
37
+ legal_keys << Pluralizer.pluralize(key)
38
+ end
39
+ end
40
+ legal_keys.each do |key|
41
+ class_eval "def #{key}() return Pluralizer.singularized_lookup(@tree, :#{key}) ; end"
42
+ end
43
+ end
44
+ return @structure
45
+ end
46
+
47
+ #
48
+ # By default, the name of the class is converted to lowercase and used as the
49
+ # CSS class identifying this format (e.g. HReview => "hreview"). This method
50
+ # can be used to override this.
51
+ #
52
+ def identifier(ident=nil)
53
+ @identifier_class = ident if ident
54
+ return @identifier_class ||= self.to_s.split('::').last.downcase.to_sym
55
+ end
56
+
57
+ #
58
+ # Yields a new instance for each of the matching microformats within
59
+ # the given document.
60
+ #
61
+ def each(source, url=nil)
62
+ if source.respond_to?(:each_element)
63
+ xml_source = source
64
+ else
65
+ xml_source = REXML::Document.new(source.to_s)
66
+ end
67
+ Microformats.expand_include_patterns!(xml_source)
68
+ Microformats.each_element_by_class(xml_source, identifier) do |element|
69
+ yield new(element.dup, url)
70
+ end
71
+ end
72
+
73
+ #
74
+ # Returns a new instance for the first matching microformat in the
75
+ # given document.
76
+ #
77
+ def first(source, url=nil)
78
+ each(source, url) do |mf|
79
+ return mf
80
+ end
81
+ end
82
+
83
+ end # class << self
84
+
85
+ def initialize(source, url=nil)
86
+ if source.respond_to?(:each_element)
87
+ @source = source
88
+ else
89
+ @source = REXML::Document.new(source.to_s)
90
+ end
91
+ @url = url
92
+ @tree = parse(__class__.structure, @source, MethodHash.new)
93
+ post_process!
94
+ end
95
+ attr_reader :source
96
+
97
+ def post_process!
98
+ end
99
+
100
+ def parse(structure, source, default_tree=nil)
101
+ tree = default_tree || MethodHash.new
102
+ structure.each do |tag, substructure|
103
+ if substructure.is_a?(Array)
104
+ key = Pluralizer.pluralize(tag)
105
+ else
106
+ key = tag
107
+ end
108
+ Microformats.each_element_by_class(source, tag) do |element|
109
+ tree[key] = parse_tag(substructure, source, tag)
110
+ end
111
+ end
112
+ return tree
113
+ end
114
+
115
+ def parse_tag(substructure, source, tag)
116
+ if substructure.is_a? Array
117
+ substructure = substructure[0]
118
+ elements = []
119
+ is_plural = true
120
+ else
121
+ is_plural = false
122
+ end
123
+ Microformats.each_element_by_class(source, tag) do |element|
124
+ if substructure.is_a? Hash
125
+ this = parse(substructure, element)
126
+ # Special case for values that can be encapsulated or not
127
+ unless this[:value]
128
+ ignored_classes = substructure.keys - [:value]
129
+ if substructure[:value] == :value
130
+ v = process_as_value(element, ignored_classes)
131
+ this[:value] ||= v if v
132
+ elsif substructure[:value] == :email
133
+ v = process_value_as_email(element)
134
+ this[:value] ||= v if v
135
+ end
136
+ end
137
+ else
138
+ this = self.__send__("process_as_#{substructure}".to_sym, element)
139
+ end
140
+ if is_plural
141
+ elements << this
142
+ else
143
+ return this
144
+ end
145
+ end
146
+ return elements
147
+ end
148
+
149
+ def process_as_string(element, ignored_classes=[])
150
+ case element.fully_expanded_name
151
+ when 'abbr'
152
+ return element.attribute('title').value.strip
153
+ when 'img'
154
+ return element.attribute('alt').value.strip
155
+ else
156
+ buffer = ''
157
+ element.children.each do |child|
158
+ case child
159
+ when REXML::Text
160
+ buffer << child.to_s
161
+ else
162
+ has_classes = Microformats.classes_for_element(child)
163
+ if (has_classes & ignored_classes).empty?
164
+ buffer << process_as_string(child, ignored_classes)
165
+ end
166
+ end
167
+ end
168
+ return HTMLEntities.decode_entities(buffer.strip.gsub(/\s+/um, ' '))
169
+ end
170
+ end
171
+
172
+ def process_as_integer(element)
173
+ text = process_as_string(element)
174
+ return nil unless text
175
+ return nil unless text =~ /^\d+$/
176
+ return text.to_i(10)
177
+ end
178
+
179
+ def process_as_float(element)
180
+ text = process_as_string(element)
181
+ return nil unless text
182
+ return nil unless text =~ /^\d+(?:\.\d+)?$/
183
+ return text.to_f
184
+ end
185
+
186
+ def process_as_xhtml(element)
187
+ return element.children.map{ |c| c.to_s }.join.strip
188
+ end
189
+
190
+ def process_as_datetime(element)
191
+ text = process_as_string(element)
192
+ return nil unless text && text =~ /\d{2,}/
193
+ text << '-01T00:00:00Z' if text =~ /^\d{4}-\d{2}$/
194
+ text << '01T000000Z' if text =~ /^\d{6}$/
195
+ text << '-T00:00:00Z' if text =~ /^\d{4}-\d{2}-\d{2}$/
196
+ text << 'T000000Z' if text =~ /^\d{8}$/
197
+ text.gsub!(/\++/, '+')
198
+ text.gsub!(/\-+/, '-')
199
+ begin
200
+ return Time.parse(text)
201
+ rescue
202
+ return nil
203
+ end
204
+ end
205
+
206
+ def process_as_url(element)
207
+ element.each_element('descendant-or-self::*[@href or @src]') do |subelement|
208
+ if (href = subelement.attribute('href'))
209
+ return absolute_url(href.value.strip)
210
+ elsif (src = subelement.attribute('src'))
211
+ return absolute_url(src.value.strip)
212
+ end
213
+ end
214
+ return nil
215
+ end
216
+
217
+ def process_as_value(element, ignored_classes=[])
218
+ Microformats.each_element_by_class(element, :value) do |e|
219
+ return process_as_string(e, ignored_classes)
220
+ end
221
+ return process_as_string(element, ignored_classes)
222
+ end
223
+
224
+ def process_as_email(element)
225
+ url = process_as_url(element)
226
+ if url
227
+ return url.sub(/^mailto:/, '')
228
+ else
229
+ return process_as_string(element)
230
+ end
231
+ end
232
+
233
+ def process_value_as_email(element)
234
+ email = process_as_url(element)
235
+ return email.sub(/^mailto:/, '') if email
236
+ return process_as_value(element)
237
+ end
238
+
239
+ def absolute_url(relative_url)
240
+ return relative_url unless @url
241
+ return URI.join(@url, relative_url).to_s
242
+ end
243
+
244
+ #
245
+ # Instead of worrying about calling methods on nil, lookup provides an
246
+ # easy way of accessing deeply nested data.
247
+ #
248
+ # E.g.
249
+ #
250
+ # object.somethings[1].other # => error if somethings[1] is nil
251
+ # object.lookup(:somethings, 1, :other) # => nil if any level is nil
252
+ #
253
+ def lookup(*list)
254
+ tree = @tree
255
+ list.each do |level|
256
+ if level.is_a?(Symbol)
257
+ tree = tree.__send__(level)
258
+ else
259
+ tree = tree.__send__(:[], level)
260
+ end
261
+ return nil unless tree
262
+ end
263
+ return tree
264
+ end
265
+
266
+ end # BasicNestedFormat
267
+
268
+ end # Microformats