uformats 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES ADDED
@@ -0,0 +1,8 @@
1.2.1 (2006-06-14)

  * Minor changes to work with HTMLEntities 3.0.1+
  * Added gem

1.2 (2006-03-08)

  * First public release
data/COPYING ADDED
@@ -0,0 +1,4 @@
Copyright Revieworld Ltd. 2006

You may use, copy and redistribute this library under the same terms as Ruby
itself (see http://www.ruby-lang.org/en/LICENSE.txt).
data/README ADDED
@@ -0,0 +1,36 @@
uformats

ABOUT

A library for parsing various microformats under Ruby:

  * hReview (0.3)
  * hCard
  * hCalendar (vevent only; limited support)
  * rel-tag
  * rel-license
  * include-pattern

PREREQUISITES

  htmlentities >= 3.0.1 (http://htmlentities.rubyforge.org/)

If you want to try the sample code, you will also need:

  tidy (http://rubyforge.org/projects/tidy)

You'll also need to install the tidy library for your operating system. For OS
X, it's available via Darwin Ports (port install tidy). For Windows, get it
from http://dev.int64.org/tidy.html and rename 'tidy.dll' to 'libtidy.dll'.
For Linux and other Unix-like systems, it should be available from your
distribution's package repositories.

INSTALLATION

To install (requires root/admin privileges):

  # ruby setup.rb

To test:

  $ ruby setup.rb test

@@ -0,0 +1,158 @@
1
+ require 'rexml/document'
2
+ require 'uformats/pluralizer'
3
+ require 'cgi'
4
+
5
module Microformats

  #
  # Provides method access to an internal Hash
  #
  class MethodHash

    # Remove (almost) every inherited public method so that any other name
    # reaches method_missing and is treated as a key lookup. Names starting
    # with "__" are kept, as is object_id: Ruby warns that undefining
    # object_id "may cause serious problems".
    instance_methods.each do |m|
      undef_method m unless m.to_s =~ /\A(?:__|object_id\z)/
    end

    #
    # Wraps a shallow copy of hash, so later changes made through this object
    # do not add or remove keys in the caller's Hash.
    #
    def initialize(hash={})
      @hash = hash.dup
    end

    #
    # Store v under key k
    #
    def []=(k,v)
      @hash[k] = v
    end

    #
    # Fetch the value stored under key k (nil if absent)
    #
    def [](k)
      return @hash[k]
    end

    #
    # Any method that is not defined here is resolved by
    # Pluralizer.singularized_lookup; extra arguments are ignored.
    #
    def method_missing(sym, *args)
      return Pluralizer.singularized_lookup(self, sym)
    end

    #
    # Iterate over the stored pairs.
    #
    # A block taking exactly two parameters receives key and value as separate
    # arguments; every other block receives a single [key, value] pair, as
    # with Hash#each. Without a block an Enumerator over the pairs is returned.
    #
    def each(&blk)
      return @hash.each unless blk
      if blk.arity == 2
        @hash.each do |k,v|
          yield k, v
        end
      else
        @hash.each do |k,v|
          yield [k, v]
        end
      end
    end
  end # MethodHash

  #
  # Expand all include-pattern objects in a document
  #
  # Every element with class "include" whose data attribute is a fragment
  # reference ("#some-id") is replaced by a deep copy of the element carrying
  # that id; the copy's own id attribute is removed. References that cannot
  # be resolved are left untouched. Modifies and returns rexml_source.
  #
  def expand_include_patterns!(rexml_source)
    each_element_by_class(rexml_source, :include) do |target|
      data = target.attribute('data')
      next unless data && data.value =~ /\A#/
      source_id = data.value.sub(/\A#/, '')
      # The id comes from the document being parsed, so compare it in Ruby
      # rather than interpolating it into an XPath string literal, where a
      # quote character would break the expression.
      source = REXML::XPath.match(rexml_source, '//*[@id]').find do |candidate|
        candidate.attributes['id'] == source_id
      end
      next unless source
      replacement = source.deep_clone
      replacement.attributes.delete('id')
      target.replace_with(replacement)
    end
    return rexml_source
  end

  #
  # Yield each element matching sym at or below the given node
  #
  # An element matches when one of the whitespace-separated tokens of its
  # class attribute is exactly the CSS class for sym, so :hreview does not
  # match class="hreview-aggregate".
  #
  def each_element_by_class(rexml_source, sym)
    css_class = Microformats.sym_to_css_class(sym)
    rexml_source.each_element('descendant-or-self::*[@class]') do |element|
      if element.attribute('class').value.split.include?(css_class)
        yield element
      end
    end
  end

  #
  # Returns the first element matching sym at or below the given node, or nil if none exists
  #
  def first_element_by_class(rexml_source, sym)
    each_element_by_class(rexml_source, sym) do |element|
      return element
    end
    return nil
  end

  #
  # Return a list of all rel-tag elements found at or above the given node
  #
  # Looks at <a> children of the node's ancestors. Returns an Array of tag
  # strings, or nil when there are none.
  #
  def rel_tags_above_element(rexml_element)
    tags = []
    rexml_element.each_element('ancestor::*/a[@rel]') do |tagged_element|
      next unless rel_includes?(tagged_element, 'tag')
      tag = rel_tag(tagged_element)
      tags << tag if tag
    end
    return tags.empty? ? nil : tags
  end

  #
  # Return a list of all rel-tag elements found at or below the given node
  #
  # Returns an Array of tag strings, or nil when there are none.
  #
  def rel_tags_below_element(rexml_element)
    tags = []
    rexml_element.each_element('descendant-or-self::a[@rel]') do |tagged_element|
      next unless rel_includes?(tagged_element, 'tag')
      tag = rel_tag(tagged_element)
      tags << tag if tag
    end
    return tags.empty? ? nil : tags
  end

  #
  # Extract the tag from a rel-tag link: the last path segment of its href,
  # URL-decoded. Returns nil when the element has no href.
  #
  def rel_tag(rexml_element)
    href = rexml_element.attribute('href')
    return nil unless href
    # Attribute#value hands back REXML's cached string, so work on copies
    # (sub, not gsub!) to leave the document's href intact. Drop the query and
    # fragment first so that a "/" inside them is not taken for a path
    # separator.
    path = href.value.sub(/[?#].*\z/m, '')
    return CGI.unescape(path.sub(%r{\A.*/}m, ''))
  end

  #
  # Return a list of all rel-license elements found in the given element
  #
  # Returns an Array of href strings, or nil when there are none.
  #
  def rel_licenses(rexml_element)
    licenses = []
    rexml_element.each_element('descendant-or-self::a[@rel]') do |license_element|
      next unless rel_includes?(license_element, 'license')
      href = license_element.attribute('href')
      licenses << href.value if href
    end
    return licenses.empty? ? nil : licenses
  end

  #
  # Convert a Ruby symbol to a CSS class, e.g. :foo_bar => "foo-bar"
  #
  def sym_to_css_class(sym)
    return sym.to_s.tr('_', '-')
  end

  #
  # Convert a css class to a Ruby symbol
  #
  def css_class_to_sym(css_class)
    return css_class.tr('-', '_').to_sym
  end

  #
  # Returns a list of symbols corresponding to an element's classes
  #
  # Runs of whitespace between class names count as a single separator.
  # Returns [] for an element without a class attribute.
  #
  def classes_for_element(rexml_element)
    css_class = rexml_element.attribute('class')
    return [] unless css_class
    return css_class.value.split.map{ |e| css_class_to_sym(e) }
  end

  #
  # True if the element's rel attribute, read as a whitespace-separated and
  # case-insensitive list of values, contains value (given in lower case).
  #
  def rel_includes?(rexml_element, value)
    rel = rexml_element.attribute('rel')
    return false unless rel
    return rel.value.downcase.split.include?(value)
  end
  private :rel_includes?

  extend self
end
@@ -0,0 +1,268 @@
1
+ require 'rexml/document'
2
+ require 'cgi'
3
+ require 'uformats'
4
+ require 'uformats/pluralizer'
5
+ require 'htmlentities'
6
+ require 'time'
7
+ require 'uri'
8
+
9
module Microformats
  #
  # Base class for microformat parsers whose data is described by a nested
  # structure of CSS class names. Subclasses declare their layout with
  # +structure+; instances parse a REXML element into a tree of values.
  #
  class BasicNestedFormat

    # Keep a handle on Object#class that subclasses' field readers cannot
    # shadow (a structure may well contain a :class key).
    alias_method :__class__, :class

    class << self

      #
      # Define or return a hash describing the structure of the derived microformat,
      # using symbols, hashes, and arrays.
      #
      # E.g.
      #
      #   structure({
      #     :foo => :integer,           # singular integral value
      #     :bar => [:string],          # singular or plural string value
      #     :baz => {                   # substructure below 'baz'
      #       :created_on => :datetime  # 'created-on' is interpreted as a date/time value
      #     }
      #   })
      #
      # When a structure is given, a reader method is defined for every
      # top-level key, plus a pluralized reader for every key whose value is
      # an Array.
      #
      def structure(structure=nil)
        if structure
          @structure = structure
          legal_keys = []
          structure.each do |key, value|
            legal_keys << key
            legal_keys << Pluralizer.pluralize(key) if value.is_a?(Array)
          end
          legal_keys.each do |key|
            reader = key.to_sym
            define_method(reader) do
              Pluralizer.singularized_lookup(@tree, reader)
            end
          end
        end
        return @structure
      end

      #
      # By default, the name of the class is converted to lowercase and used as the
      # CSS class identifying this format (e.g. HReview => "hreview"). This method
      # can be used to override this.
      #
      def identifier(ident=nil)
        @identifier_class = ident if ident
        return @identifier_class ||= self.to_s.split('::').last.downcase.to_sym
      end

      #
      # Yields a new instance for each of the matching microformats within
      # the given document.
      #
      # source may be anything responding to each_element (used as is) or any
      # other object, whose to_s is parsed as XML. Include patterns are
      # expanded in the document before matching.
      #
      def each(source, url=nil)
        if source.respond_to?(:each_element)
          xml_source = source
        else
          xml_source = REXML::Document.new(source.to_s)
        end
        Microformats.expand_include_patterns!(xml_source)
        Microformats.each_element_by_class(xml_source, identifier) do |element|
          yield new(element.dup, url)
        end
      end

      #
      # Returns a new instance for the first matching microformat in the
      # given document, or nil if there is none.
      #
      def first(source, url=nil)
        each(source, url) do |mf|
          return mf
        end
        return nil
      end

    end # class << self

    #
    # Parse source (a REXML node, or anything whose to_s is XML) according to
    # the class's structure. url, when given, is the base against which
    # relative links are resolved.
    #
    def initialize(source, url=nil)
      if source.respond_to?(:each_element)
        @source = source
      else
        @source = REXML::Document.new(source.to_s)
      end
      @url = url
      @tree = parse(__class__.structure, @source, MethodHash.new)
      post_process!
    end
    attr_reader :source

    #
    # Hook called at the end of initialize; does nothing by default.
    #
    def post_process!
    end

    #
    # Build a tree of values for every key of structure that has at least one
    # matching element at or below source. Keys whose substructure is an
    # Array are stored under their pluralized name.
    #
    def parse(structure, source, default_tree=nil)
      tree = default_tree || MethodHash.new
      structure.each do |tag, substructure|
        # parse_tag scans source for every match itself, so it only needs to
        # be called once per tag, and only if the tag occurs at all.
        next unless Microformats.first_element_by_class(source, tag)
        if substructure.is_a?(Array)
          key = Pluralizer.pluralize(tag)
        else
          key = tag
        end
        tree[key] = parse_tag(substructure, source, tag)
      end
      return tree
    end

    #
    # Parse the elements matching tag at or below source.
    #
    # For an Array substructure, returns an Array with one parsed value per
    # matching element. Otherwise returns the parsed value of the first
    # matching element, or nil when there is none.
    #
    def parse_tag(substructure, source, tag)
      is_plural = substructure.is_a?(Array)
      substructure = substructure[0] if is_plural
      elements = []
      Microformats.each_element_by_class(source, tag) do |element|
        if substructure.is_a? Hash
          this = parse(substructure, element)
          # Special case for values that can be encapsulated or not
          unless this[:value]
            ignored_classes = substructure.keys - [:value]
            v = case substructure[:value]
                when :value then process_as_value(element, ignored_classes)
                when :email then process_value_as_email(element)
                end
            this[:value] = v if v
          end
        else
          this = self.__send__("process_as_#{substructure}".to_sym, element)
        end
        return this unless is_plural
        elements << this
      end
      return is_plural ? elements : nil
    end

    #
    # Text value of an element.
    #
    # For <abbr> the title attribute is used and for <img> the alt attribute,
    # when present. Otherwise the text of the element and its descendants is
    # concatenated (skipping child elements that carry any of ignored_classes),
    # whitespace is collapsed and entities are decoded.
    #
    def process_as_string(element, ignored_classes=[])
      preferred = case element.fully_expanded_name
                  when 'abbr' then 'title'
                  when 'img'  then 'alt'
                  end
      # Fall through to the text content when the attribute is missing rather
      # than calling #value on nil.
      if preferred && (attribute = element.attribute(preferred))
        return attribute.value.strip
      end
      buffer = ''
      element.children.each do |child|
        case child
        when REXML::Text
          buffer << child.to_s
        when REXML::Element
          has_classes = Microformats.classes_for_element(child)
          if (has_classes & ignored_classes).empty?
            buffer << process_as_string(child, ignored_classes)
          end
        end
        # Comments, processing instructions etc. contribute no text.
      end
      return HTMLEntities.decode_entities(buffer.strip.gsub(/\s+/um, ' '))
    end

    #
    # Value as a non-negative Integer, or nil if the text is not all digits.
    #
    def process_as_integer(element)
      text = process_as_string(element)
      return nil unless text && text =~ /\A\d+\z/
      return text.to_i(10)
    end

    #
    # Value as a non-negative Float, or nil if the text is not of the form
    # "digits" or "digits.digits".
    #
    def process_as_float(element)
      text = process_as_string(element)
      return nil unless text && text =~ /\A\d+(?:\.\d+)?\z/
      return text.to_f
    end

    #
    # The element's children serialized back to markup.
    #
    def process_as_xhtml(element)
      return element.children.map{ |c| c.to_s }.join.strip
    end

    #
    # Value as a Time, or nil if it cannot be parsed.
    #
    # Month-only and date-only values (with or without hyphens) are completed
    # to midnight UTC before being handed to Time.parse.
    #
    def process_as_datetime(element)
      text = process_as_string(element)
      return nil unless text && text =~ /\d{2,}/
      text = text.dup
      case text
      when /\A\d{4}-\d{2}\z/       then text << '-01T00:00:00Z'
      when /\A\d{6}\z/             then text << '01T000000Z'
      when /\A\d{4}-\d{2}-\d{2}\z/ then text << 'T00:00:00Z'
      when /\A\d{8}\z/             then text << 'T000000Z'
      end
      # Collapse accidental runs of sign characters
      text.gsub!(/\++/, '+')
      text.gsub!(/\-+/, '-')
      begin
        return Time.parse(text)
      rescue StandardError
        return nil
      end
    end

    #
    # The href (or, failing that, src) of the first element at or below
    # element that has either attribute, made absolute. nil if there is none.
    #
    def process_as_url(element)
      element.each_element('descendant-or-self::*[@href or @src]') do |subelement|
        link = subelement.attribute('href') || subelement.attribute('src')
        return absolute_url(link.value.strip) if link
      end
      return nil
    end

    #
    # Text of the first descendant-or-self element with class "value", or of
    # the element itself when there is no such element.
    #
    def process_as_value(element, ignored_classes=[])
      Microformats.each_element_by_class(element, :value) do |e|
        return process_as_string(e, ignored_classes)
      end
      return process_as_string(element, ignored_classes)
    end

    #
    # E-mail address taken from a link in the element if there is one,
    # otherwise the element's text.
    #
    def process_as_email(element)
      url = process_as_url(element)
      return url ? strip_mailto(url) : process_as_string(element)
    end

    #
    # Like process_as_email, but falls back to process_as_value.
    #
    def process_value_as_email(element)
      url = process_as_url(element)
      return url ? strip_mailto(url) : process_as_value(element)
    end

    #
    # Resolve relative_url against the URL given at construction. Returned
    # unchanged when no base URL was given or when either URL cannot be
    # parsed.
    #
    def absolute_url(relative_url)
      return relative_url unless @url
      begin
        return URI.join(@url, relative_url).to_s
      rescue URI::Error
        return relative_url
      end
    end

    #
    # Instead of worrying about calling methods on nil, lookup provides an
    # easy way of accessing deeply nested data.
    #
    # E.g.
    #
    #   object.somethings[1].other            # => error if somethings[1] is nil
    #   object.lookup(:somethings, 1, :other) # => nil if any level is nil
    #
    def lookup(*list)
      tree = @tree
      list.each do |level|
        if level.is_a?(Symbol)
          tree = tree.__send__(level)
        else
          tree = tree.__send__(:[], level)
        end
        return nil unless tree
      end
      return tree
    end

    #
    # Reduce a mailto: URL to its address part: the scheme (in any letter
    # case) and any "?headers" suffix are removed. Other URLs are returned
    # unchanged.
    #
    def strip_mailto(url)
      return url unless url =~ /\Amailto:/i
      return url.sub(/\Amailto:/i, '').sub(/\?.*\z/m, '')
    end
    private :strip_mailto

  end # BasicNestedFormat

end # Microformats