feedtools 0.2.22 → 0.2.23

@@ -0,0 +1,629 @@
+ #--
+ # Copyright (c) 2005 Robert Aman
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #++
+
+ require 'feed_tools'
+ require 'feed_tools/helpers/xml_helper'
+ require 'rexml/document'
+
+ module FeedTools
+   # Methods for processing, sanitizing, and manipulating HTML content.
+   module HtmlHelper
+     # Escapes all html entities
+     def self.escape_entities(html)
+       return nil if html.nil?
+       escaped_html = CGI.escapeHTML(html)
+       escaped_html.gsub!(/'/, "&apos;")
+       escaped_html.gsub!(/"/, "&quot;")
+       return escaped_html
+     end
+
+     # Unescapes all html entities
+     def self.unescape_entities(html)
+       return nil if html.nil?
+       unescaped_html = html
+       unescaped_html.gsub!(/&#x26;/, "&amp;")
+       unescaped_html.gsub!(/&#38;/, "&amp;")
+       unescaped_html = unescaped_html.gsub(/&#x\d+;/) do |hex|
+         "&#" + hex[3..-2].to_i(16).to_s + ";"
+       end
+       unescaped_html = CGI.unescapeHTML(unescaped_html)
+       unescaped_html.gsub!(/&apos;/, "'")
+       unescaped_html.gsub!(/&quot;/, "\"")
+       return unescaped_html
+     end
+
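For quick reference, here is a round-trip sketch of the two entity helpers above. The result comments are illustrative; the exact entity emitted for the apostrophe depends on the Ruby/CGI version.

    require 'feed_tools'

    escaped = FeedTools::HtmlHelper.escape_entities("Tom & Jerry's \"house\"")
    # => "Tom &amp; Jerry&apos;s &quot;house&quot;"
    FeedTools::HtmlHelper.unescape_entities(escaped)
    # => "Tom & Jerry's \"house\""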
+     # Removes all html tags from the html formatted text, but leaves
+     # escaped entities alone.
+     def self.strip_html_tags(html)
+       return nil if html.nil?
+       stripped_html = html
+       stripped_html.gsub!(/<\/?[^>]+>/, "")
+       return stripped_html
+     end
+
+     # Removes all html tags from the html formatted text and removes
+     # escaped entities.
+     def self.convert_html_to_plain_text(html)
+       return nil if html.nil?
+       stripped_html = html
+       stripped_html = FeedTools::HtmlHelper.strip_html_tags(stripped_html)
+       stripped_html = FeedTools::HtmlHelper.unescape_entities(stripped_html)
+       stripped_html.gsub!(/&#8216;/, "'")
+       stripped_html.gsub!(/&#8217;/, "'")
+       stripped_html.gsub!(/&#8220;/, "\"")
+       stripped_html.gsub!(/&#8221;/, "\"")
+       return stripped_html
+     end
+
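A small usage sketch for the two stripping helpers above; the output comments are indicative only. The .dup calls are there because both helpers modify their argument in place via gsub!.

    html = "<p><b>Ben &amp; Jerry's</b></p>"
    FeedTools::HtmlHelper.strip_html_tags(html.dup)
    # => "Ben &amp; Jerry's"
    FeedTools::HtmlHelper.convert_html_to_plain_text(html.dup)
    # => "Ben & Jerry's"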
+     # Returns true if the html tidy module can be used.
+     #
+     # Obviously, you need the tidy gem installed in order to run with html
+     # tidy features turned on.
+     #
+     # This method does a fairly complicated, and probably unnecessarily
+     # desperate search for the libtidy library. If you want this thing to
+     # execute fast, the best thing to do is to set Tidy.path ahead of time.
+     # If Tidy.path is set, this method doesn't do much. If it's not set,
+     # it will do its darnedest to find the libtidy library. If you set
+     # the LIBTIDYPATH environment variable to the libtidy library, it should
+     # be able to find it.
+     #
+     # Once the library is located, this method will run much faster.
+     def self.tidy_enabled?
+       # This is an override variable to keep tidy from being used even if it
+       # is available.
+       if FeedTools.configurations[:tidy_enabled] == false
+         return false
+       end
+       if @tidy_enabled.nil? || @tidy_enabled == false
+         @tidy_enabled = false
+         begin
+           require 'tidy'
+           if Tidy.path.nil?
+             # *Shrug*, just brute force it, I guess. There's a lot of places
+             # this thing might be hiding in, depending on platform and general
+             # sanity of the person who installed the thing. Most of these are
+             # probably unlikely, but it's not like checking unlikely locations
+             # hurts. Much. Especially if you actually find it.
+             libtidy_locations = [
+               '/usr/local/lib/libtidy.dylib',
+               '/opt/local/lib/libtidy.dylib',
+               '/usr/lib/libtidy.dylib',
+               '/usr/local/lib/tidylib.dylib',
+               '/opt/local/lib/tidylib.dylib',
+               '/usr/lib/tidylib.dylib',
+               '/usr/local/lib/tidy.dylib',
+               '/opt/local/lib/tidy.dylib',
+               '/usr/lib/tidy.dylib',
+               '/usr/local/lib/libtidy.so',
+               '/opt/local/lib/libtidy.so',
+               '/usr/lib/libtidy.so',
+               '/usr/local/lib/tidylib.so',
+               '/opt/local/lib/tidylib.so',
+               '/usr/lib/tidylib.so',
+               '/usr/local/lib/tidy.so',
+               '/opt/local/lib/tidy.so',
+               '/usr/lib/tidy.so',
+               'C:\Program Files\Tidy\tidy.dll',
+               'C:\Tidy\tidy.dll',
+               'C:\Ruby\bin\tidy.dll',
+               'C:\Ruby\tidy.dll',
+               '/usr/local/lib',
+               '/opt/local/lib',
+               '/usr/lib'
+             ]
+             # We just made this thing up, but if someone sets it, we'll
+             # go ahead and check it
+             unless ENV['LIBTIDYPATH'].nil?
+               libtidy_locations =
+                 libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
+             end
+             for path in libtidy_locations
+               if File.exists? path
+                 if File.ftype(path) == "file"
+                   Tidy.path = path
+                   @tidy_enabled = true
+                   break
+                 elsif File.ftype(path) == "directory"
+                   # Ok, now perhaps we're getting a bit more desperate
+                   lib_paths =
+                     `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
+                   # If there's more than one, grab the first one and
+                   # hope for the best, and if it doesn't work, then blame the
+                   # user for not specifying more accurately.
+                   tidy_path = lib_paths.split("\n").first
+                   unless tidy_path.nil?
+                     Tidy.path = tidy_path
+                     @tidy_enabled = true
+                     break
+                   end
+                 end
+               end
+             end
+             # Still couldn't find it.
+             unless @tidy_enabled
+               @tidy_enabled = false
+             end
+           else
+             @tidy_enabled = true
+           end
+         rescue LoadError
+           # Tidy not installed, disable features that rely on tidy.
+           @tidy_enabled = false
+         end
+       end
+       return @tidy_enabled
+     end
+
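Because the search above probes a long list of locations (and may shell out to find), callers who already know where libtidy lives can short-circuit it. A hedged sketch of the configuration points the code checks; the paths are examples, not defaults, and the last line assumes the configurations hash is writable.

    require 'tidy'
    Tidy.path = '/usr/lib/libtidy.so'            # example path, adjust for your system
    # or let FeedTools discover the library through the environment variable it checks:
    ENV['LIBTIDYPATH'] = '/opt/local/lib/libtidy.dylib'
    # tidy support can also be switched off entirely:
    FeedTools.configurations[:tidy_enabled] = false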
+     # Tidies up the html
+     def self.tidy_html(html, options = {})
+       return nil if html.nil?
+
+       FeedTools::GenericHelper.validate_options([ :input_encoding,
+         :output_encoding ],
+         options.keys)
+       options = { :input_encoding => "utf-8",
+         :output_encoding => "utf-8" }.merge(options)
+
+       if FeedTools::HtmlHelper.tidy_enabled?
+         is_fragment = true
+         html.gsub!(/&lt;!'/, "&amp;lt;!'")
+         if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
+             (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
+           is_fragment = false
+         end
+         if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
+           is_fragment = false
+         end
+
+         # Tidy sucks?
+         # TODO: find the correct set of tidy options to set so
+         # that *ugly* hacks like this aren't necessary.
+         html = html.gsub(/\302\240/, "\240")
+
+         tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
+           tidy.options.output_xml = true
+           tidy.options.markup = true
+           tidy.options.indent = true
+           tidy.options.wrap = 0
+           tidy.options.logical_emphasis = true
+           tidy.options.input_encoding = options[:input_encoding]
+           tidy.options.output_encoding = options[:output_encoding]
+           tidy.options.doctype = "omit"
+           xml = tidy.clean(html)
+           xml
+         end
+         if is_fragment
+           # Tidy sticks <html>...<body>[our html]</body>...</html> in.
+           # We don't want this.
+           tidy_html.strip!
+           tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
+           tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
+           tidy_html.gsub!("\t", " ")
+           tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
+           tidy_html.strip!
+         end
+       else
+         tidy_html = html
+       end
+       if tidy_html.blank? && !html.blank?
+         tidy_html = html.strip
+       end
+       return tidy_html
+     end
+
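An illustrative call to tidy_html; the exact markup returned varies with the installed libtidy version, so the result comment is only indicative.

    fragment = "<p>Unclosed <b>bold"
    FeedTools::HtmlHelper.tidy_html(fragment,
      :input_encoding => "utf-8", :output_encoding => "utf-8")
    # => "<p>Unclosed <b>bold</b></p>" when tidy is available,
    #    or the input string unchanged when it is not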
+     # Unindents a text selection by a specified number of spaces.
+     def self.unindent(text, spaces)
+       lines = text.split("\n")
+       buffer = ""
+       for line in lines
+         for index in 0...spaces
+           if line[0...1] == " "
+             line = line[1..-1]
+           else
+             break
+           end
+         end
+         buffer << line << "\n"
+       end
+       return buffer
+     end
+
+     # Removes all dangerous html tags from the html formatted text.
+     # If mode is set to :escape, dangerous and unknown elements will
+     # be escaped. If mode is set to :strip, dangerous and unknown
+     # elements and all children will be removed entirely.
+     # Dangerous or unknown attributes are always removed.
+     def self.sanitize_html(html, mode=:strip)
+       return nil if html.nil?
+
+       # Lists borrowed from Mark Pilgrim's feedparser
+       acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
+         'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
+         'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
+         'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
+         'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
+         'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
+         'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
+         'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
+         'u', 'ul', 'var']
+
+       acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+         'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
+         'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
+         'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
+         'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
+         'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
+         'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
+         'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+         'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+         'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+         'type', 'usemap', 'valign', 'value', 'vspace', 'width']
+
+       # Replace with appropriate named entities
+       html.gsub!(/&#x26;/, "&amp;")
+       html.gsub!(/&#38;/, "&amp;")
+       html.gsub!(/&lt;!'/, "&amp;lt;!'")
+
+       # Hackity hack. But it works, and it seems plenty fast enough.
+       html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
+
+       sanitize_node = lambda do |html_node|
+         if html_node.respond_to? :children
+           for child in html_node.children
+             if child.kind_of? REXML::Element
+               unless acceptable_elements.include? child.name.downcase
+                 if mode == :strip
+                   html_node.delete_element(child)
+                 else
+                   new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
+                   html_node.insert_after(child, new_child)
+                   html_node.delete_element(child)
+                 end
+               end
+               for attribute in child.attributes.keys
+                 if !(attribute =~ /^xmlns/)
+                   unless acceptable_attributes.include? attribute.downcase
+                     child.delete_attribute(attribute)
+                   end
+                 end
+               end
+             end
+             sanitize_node.call(child)
+           end
+         end
+         html_node
+       end
+       sanitize_node.call(html_doc.root)
+       html = html_doc.root.inner_xml
+       return html
+     end
+
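A short sketch of the two sanitization modes described above; the results are indicative.

    dirty = '<p onclick="steal()">Hi<script>alert(1)</script></p>'
    FeedTools::HtmlHelper.sanitize_html(dirty, :strip)
    # => "<p>Hi</p>"   (unknown/dangerous elements and attributes removed)
    # With :escape, the <script> element would instead be re-inserted as
    # escaped text rather than being dropped.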
+     # Returns true if the type string provided indicates that something is
+     # xml or xhtml content.
+     def self.xml_type?(type)
+       if [
+         "xml",
+         "xhtml",
+         "application/xhtml+xml"
+       ].include?(type)
+         return true
+       elsif type != nil && type[-3..-1] == "xml"
+         return true
+       else
+         return false
+       end
+     end
+
+     # Returns true if the type string provided indicates that something is
+     # plain text content.
+     def self.text_type?(type)
+       return [
+         "text",
+         "text/plain"
+       ].include?(type)
+     end
+
+     # Returns true if the type string provided indicates that something is
+     # html or xhtml content.
+     def self.html_type?(type)
+       return [
+         "html",
+         "xhtml",
+         "text/html",
+         "application/xhtml+xml"
+       ].include?(type)
+     end
+
+     # Returns true if the type string provided indicates that something is
+     # only html (not xhtml) content.
+     def self.only_html_type?(type)
+       return [
+         "html",
+         "text/html"
+       ].include?(type)
+     end
+
+     # can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
+
+     # Resolves all relative uris in a block of html.
+     def self.resolve_relative_uris(html, base_uri_sources=[])
+       relative_uri_attributes = [
+         ["a", "href"],
+         ["applet", "codebase"],
+         ["area", "href"],
+         ["blockquote", "cite"],
+         ["body", "background"],
+         ["del", "cite"],
+         ["form", "action"],
+         ["frame", "longdesc"],
+         ["frame", "src"],
+         ["iframe", "longdesc"],
+         ["iframe", "src"],
+         ["head", "profile"],
+         ["img", "longdesc"],
+         ["img", "src"],
+         ["img", "usemap"],
+         ["input", "src"],
+         ["input", "usemap"],
+         ["ins", "cite"],
+         ["link", "href"],
+         ["object", "classid"],
+         ["object", "codebase"],
+         ["object", "data"],
+         ["object", "usemap"],
+         ["q", "cite"],
+         ["script", "src"]
+       ]
+       html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
+
+       resolve_node = lambda do |html_node|
+         if html_node.respond_to? :children
+           for child in html_node.children
+             if child.kind_of? REXML::Element
+               for element_attribute_pair in relative_uri_attributes
+                 if child.name.downcase == element_attribute_pair[0]
+                   attribute = child.attribute(element_attribute_pair[1])
+                   if attribute != nil
+                     href = attribute.value
+                     href = FeedTools::UriHelper.resolve_relative_uri(
+                       href, [child.base_uri] | base_uri_sources)
+                     child.attribute(
+                       element_attribute_pair[1]).instance_variable_set(
+                         "@value", href)
+                   end
+                 end
+               end
+             end
+             resolve_node.call(child)
+           end
+         end
+         html_node
+       end
+       resolve_node.call(html_doc.root)
+       html = html_doc.root.inner_xml
+       return html
+     end
+
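For the resolver above, a hedged usage sketch; the exact serialization of the surrounding markup may differ slightly.

    html = '<p><a href="/about">About</a> <img src="logo.png"/></p>'
    FeedTools::HtmlHelper.resolve_relative_uris(html, ["http://example.com/blog/"])
    # the anchor href should come back as "http://example.com/about" and the
    # img src as "http://example.com/blog/logo.png"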
+     # Returns a string containing normalized xhtml from within a REXML node.
+     def self.extract_xhtml(rexml_node)
+       rexml_node_dup = rexml_node.deep_clone
+       normalize_namespaced_xhtml = lambda do |node, node_dup|
+         if node.kind_of? REXML::Element
+           node_namespace = node.namespace
+           # Massive hack, relies on REXML not changing
+           for index in 0...node.attributes.values.size
+             attribute = node.attributes.values[index]
+             attribute_dup = node_dup.attributes.values[index]
+             if attribute.namespace == FEED_TOOLS_NAMESPACES['xhtml']
+               attribute_dup.instance_variable_set(
+                 "@expanded_name", attribute.name)
+             end
+             if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
+               if attribute.name == 'xmlns'
+                 node_dup.attributes.delete('xmlns')
+               end
+             end
+           end
+           if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
+             node_dup.instance_variable_set("@expanded_name", node.name)
+           end
+           if !node_namespace.blank? && node.prefix.blank?
+             if node.namespace != FEED_TOOLS_NAMESPACES['xhtml']
+               node_dup.add_namespace(node_namespace)
+             end
+           end
+         end
+         for index in 0...node.children.size
+           child = node.children[index]
+           child_dup = node_dup.children[index]
+           if child.kind_of? REXML::Element
+             normalize_namespaced_xhtml.call(child, child_dup)
+           end
+         end
+       end
+       normalize_namespaced_xhtml.call(rexml_node, rexml_node_dup)
+       buffer = ""
+       rexml_node_dup.each_child do |child|
+         if child.kind_of? REXML::Comment
+           buffer << "<!--" + child.to_s + "-->"
+         else
+           buffer << child.to_s
+         end
+       end
+       return buffer.strip
+     end
+
+     # Given a REXML node, returns its content, normalized as HTML.
+     def self.process_text_construct(content_node, feed_type, feed_version,
+         base_uri_sources=[])
+       if content_node.nil?
+         return nil
+       end
+
+       content = nil
+       root_node_name = nil
+       type = FeedTools::XmlHelper.try_xpaths(content_node, "@type",
+         :select_result_value => true)
+       mode = FeedTools::XmlHelper.try_xpaths(content_node, "@mode",
+         :select_result_value => true)
+       encoding = FeedTools::XmlHelper.try_xpaths(content_node, "@encoding",
+         :select_result_value => true)
+
+       if type.nil?
+         atom_namespaces = [
+           FEED_TOOLS_NAMESPACES['atom10'],
+           FEED_TOOLS_NAMESPACES['atom03']
+         ]
+         if ((atom_namespaces.include?(content_node.namespace) ||
+             atom_namespaces.include?(content_node.root.namespace)) ||
+             feed_type == "atom")
+           type = "text"
+         end
+       end
+
+       # Note that we're checking for misuse of type, mode and encoding here
+       if content_node.cdatas.size > 0
+         content = content_node.cdatas.first.to_s.strip
+       elsif type == "base64" || mode == "base64" ||
+           encoding == "base64"
+         content = Base64.decode64(content_node.inner_xml.strip)
+       elsif type == "xhtml" || mode == "xhtml" ||
+           type == "xml" || mode == "xml" ||
+           type == "application/xhtml+xml" ||
+           content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
+         content = FeedTools::HtmlHelper.extract_xhtml(content_node)
+       elsif type == "escaped" || mode == "escaped"
+         content = FeedTools::HtmlHelper.unescape_entities(
+           content_node.inner_xml.strip)
+       elsif type == "text" || mode == "text" ||
+           type == "text/plain" || mode == "text/plain"
+         content = FeedTools::HtmlHelper.unescape_entities(
+           content_node.inner_xml.strip)
+       else
+         content = content_node.inner_xml.strip
+         repair_entities = true
+       end
+       if type == "text" || mode == "text" ||
+           type == "text/plain" || mode == "text/plain"
+         content = FeedTools::HtmlHelper.escape_entities(content)
+       end
+       unless content.nil?
+         if FeedTools.configurations[:sanitization_enabled]
+           content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
+         end
+         content = FeedTools::HtmlHelper.resolve_relative_uris(content,
+           [content_node.base_uri] | base_uri_sources)
+         if repair_entities
+           content = FeedTools::HtmlHelper.unescape_entities(content)
+         end
+         content = FeedTools::HtmlHelper.tidy_html(content)
+       end
+       if FeedTools.configurations[:tab_spaces] != nil
+         spaces = FeedTools.configurations[:tab_spaces].to_i
+         content.gsub!("\t", " " * spaces) unless content.blank?
+       end
+       content.strip unless content.blank?
+       content = nil if content.blank?
+       return content
+     end
+
+     # Strips semantically empty div wrapper elements
+     def self.strip_wrapper_element(xhtml)
+       return nil if xhtml.nil?
+       return xhtml if xhtml.blank?
+       begin
+         doc = REXML::Document.new(xhtml.to_s.strip)
+         if doc.children.size == 1
+           child = doc.children[0]
+           if child.name.downcase == "div"
+             return child.inner_xml.strip
+           end
+         end
+         return xhtml.to_s.strip
+       rescue Exception
+         return xhtml.to_s.strip
+       end
+     end
+
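A minimal sketch of the wrapper-stripping helper above (inner_xml is the REXML extension FeedTools adds elsewhere in the library):

    FeedTools::HtmlHelper.strip_wrapper_element("<div><p>Hello</p></div>")
    # => "<p>Hello</p>"
    FeedTools::HtmlHelper.strip_wrapper_element("<p>Hello</p>")
    # => "<p>Hello</p>"   (no wrapping div, returned unchanged)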
+     # Given a block of html, locates feed links with a given mime type.
+     def self.extract_link_by_mime_type(html, mime_type)
+       require 'feed_tools/vendor/htree'
+       require 'feed_tools/helpers/xml_helper'
+
+       # This is technically very, very wrong. But it saves oodles of
+       # clock cycles, and probably works 99.999% of the time.
+       html_document = HTree.parse_xml(
+         FeedTools::HtmlHelper.tidy_html(html.gsub(/<body>(.|\n)*<\/body>/, ""))).to_rexml
+       html_node = nil
+       head_node = nil
+       link_nodes = []
+       for node in html_document.children
+         next unless node.kind_of?(REXML::Element)
+         if node.name.downcase == "html" &&
+             node.children.size > 0
+           html_node = node
+           break
+         end
+       end
+       return nil if html_node.nil?
+       for node in html_node.children
+         next unless node.kind_of?(REXML::Element)
+         if node.name.downcase == "head"
+           head_node = node
+           break
+         end
+         if node.name.downcase == "link"
+           link_nodes << node
+         end
+       end
+       return nil if html_node.nil? && link_nodes.empty?
+       if !head_node.nil?
+         link_nodes = []
+         for node in head_node.children
+           next unless node.kind_of?(REXML::Element)
+           if node.name.downcase == "link"
+             link_nodes << node
+           end
+         end
+       end
+       find_link_nodes = lambda do |links|
+         for link in links
+           next unless link.kind_of?(REXML::Element)
+           if link.attributes['type'].to_s.strip.downcase ==
+               mime_type.downcase &&
+               link.attributes['rel'].to_s.strip.downcase == "alternate"
+             href = link.attributes['href']
+             return href unless href.blank?
+           end
+         end
+         for link in links
+           next unless link.kind_of?(REXML::Element)
+           find_link_nodes.call(link.children)
+         end
+       end
+       find_link_nodes.call(link_nodes)
+       return nil
+     end
+   end
+ end
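Finally, a usage sketch of the autodiscovery helper above; the page markup is a made-up example. Note that the return inside the find_link_nodes lambda exits only the lambda, so as written the located href may not propagate out of the enclosing method; the sketch shows the intended call pattern.

    page = <<-HTML
      <html>
        <head>
          <title>Example</title>
          <link rel="alternate" type="application/atom+xml"
                href="http://example.com/feed.atom"/>
        </head>
        <body><p>Hello</p></body>
      </html>
    HTML
    FeedTools::HtmlHelper.extract_link_by_mime_type(page, "application/atom+xml")
    # intended to locate the alternate <link> and yield its href,
    # "http://example.com/feed.atom"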