feedtools 0.2.22 → 0.2.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,629 @@
1
+ #--
2
+ # Copyright (c) 2005 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ require 'feed_tools'
25
+ require 'feed_tools/helpers/xml_helper'
26
+ require 'rexml/document'
27
+
28
module FeedTools
  # Helpers for escaping, cleaning, sanitizing and normalizing the HTML and
  # XHTML content found in feeds.
  #
  # Several methods rely on extensions that are not defined in this file
  # (<tt>blank?</tt>, <tt>inner_xml</tt>, <tt>base_uri</tt>, <tt>cdatas</tt>,
  # <tt>HTree</tt>, <tt>FEED_TOOLS_NAMESPACES</tt>); they are presumably
  # supplied by the rest of FeedTools -- confirm before using this module
  # stand-alone.
  module HtmlHelper
    # Elements that survive sanitization.
    # Lists borrowed from Mark Pilgrim's feedparser
    ACCEPTABLE_ELEMENTS = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
      'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
      'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
      'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
      'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
      'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
      'u', 'ul', 'var'].freeze

    # Attributes that survive sanitization (xmlns declarations are always
    # kept, see sanitize_html).
    ACCEPTABLE_ATTRIBUTES = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
      'type', 'usemap', 'valign', 'value', 'vspace', 'width'].freeze

    # [element name, attribute name] pairs whose values may hold a relative
    # uri that resolve_relative_uris has to resolve.
    RELATIVE_URI_ATTRIBUTES = [
      ["a", "href"],
      ["applet", "codebase"],
      ["area", "href"],
      ["blockquote", "cite"],
      ["body", "background"],
      ["del", "cite"],
      ["form", "action"],
      ["frame", "longdesc"],
      ["frame", "src"],
      ["iframe", "longdesc"],
      ["iframe", "src"],
      ["head", "profile"],
      ["img", "longdesc"],
      ["img", "src"],
      ["img", "usemap"],
      ["input", "src"],
      ["input", "usemap"],
      ["ins", "cite"],
      ["link", "href"],
      ["object", "classid"],
      ["object", "codebase"],
      ["object", "data"],
      ["object", "usemap"],
      ["q", "cite"],
      ["script", "src"]
    ].freeze

    # *Shrug*, just brute force it, I guess.  There's a lot of places
    # libtidy might be hiding in, depending on platform and general
    # sanity of the person who installed the thing.  Most of these are
    # probably unlikely, but it's not like checking unlikely locations
    # hurts.  Much.  Especially if you actually find it.
    # Entries are either library files or directories to search.
    LIBTIDY_LOCATIONS = [
      '/usr/local/lib/libtidy.dylib',
      '/opt/local/lib/libtidy.dylib',
      '/usr/lib/libtidy.dylib',
      '/usr/local/lib/tidylib.dylib',
      '/opt/local/lib/tidylib.dylib',
      '/usr/lib/tidylib.dylib',
      '/usr/local/lib/tidy.dylib',
      '/opt/local/lib/tidy.dylib',
      '/usr/lib/tidy.dylib',
      '/usr/local/lib/libtidy.so',
      '/opt/local/lib/libtidy.so',
      '/usr/lib/libtidy.so',
      '/usr/local/lib/tidylib.so',
      '/opt/local/lib/tidylib.so',
      '/usr/lib/tidylib.so',
      '/usr/local/lib/tidy.so',
      '/opt/local/lib/tidy.so',
      '/usr/lib/tidy.so',
      'C:\Program Files\Tidy\tidy.dll',
      'C:\Tidy\tidy.dll',
      'C:\Ruby\bin\tidy.dll',
      'C:\Ruby\tidy.dll',
      '/usr/local/lib',
      '/opt/local/lib',
      '/usr/lib'
    ].freeze

    # Escapes all html entities.
    #
    # Returns nil for nil input, otherwise a new string in which the
    # characters handled by CGI.escapeHTML are escaped and any remaining
    # literal quote characters are written as &apos; / &quot;.  The argument
    # is never modified.
    def self.escape_entities(html)
      return nil if html.nil?
      escaped_html = CGI.escapeHTML(html)
      # CGI.escapeHTML implementations differ in which quote characters they
      # escape; these substitutions catch whatever is still left unescaped.
      escaped_html = escaped_html.gsub(/'/, "&apos;")
      escaped_html = escaped_html.gsub(/"/, "&quot;")
      return escaped_html
    end

    # Unescapes all html entities.
    #
    # Returns nil for nil input, otherwise a new string.  The argument is
    # never modified, so frozen strings are accepted.
    def self.unescape_entities(html)
      return nil if html.nil?
      # Numeric ampersand references are turned into the named entity first.
      unescaped_html = html.gsub(/&#x26;/, "&amp;")
      unescaped_html = unescaped_html.gsub(/&#38;/, "&amp;")
      # Rewrite hexadecimal character references as decimal ones.  The
      # pattern accepts the digits a-f as well; a decimal-only pattern would
      # skip references such as &#xA0;.
      unescaped_html = unescaped_html.gsub(/&#x[0-9a-f]+;/i) do |hex|
        "&#" + hex[3..-2].to_i(16).to_s + ";"
      end
      unescaped_html = CGI.unescapeHTML(unescaped_html)
      # Quote entities that the CGI implementation may have left untouched.
      unescaped_html = unescaped_html.gsub(/&apos;/, "'")
      unescaped_html = unescaped_html.gsub(/&quot;/, "\"")
      return unescaped_html
    end

    # Removes all html tags from the html formatted text, but leaves
    # escaped entities alone.
    #
    # Returns nil for nil input, otherwise a new string; the argument is
    # never modified.
    def self.strip_html_tags(html)
      return nil if html.nil?
      return html.gsub(/<\/?[^>]+>/, "")
    end

    # Removes all html tags from the html formatted text and removes
    # escaped entities.
    #
    # Tags are stripped first, then entities are unescaped.  Numeric
    # references for curly quotes that are still present afterwards are
    # replaced with their plain ASCII counterparts.
    def self.convert_html_to_plain_text(html)
      return nil if html.nil?
      plain_text = FeedTools::HtmlHelper.strip_html_tags(html)
      plain_text = FeedTools::HtmlHelper.unescape_entities(plain_text)
      plain_text = plain_text.gsub(/&#8216;/, "'")
      plain_text = plain_text.gsub(/&#8217;/, "'")
      plain_text = plain_text.gsub(/&#8220;/, "\"")
      plain_text = plain_text.gsub(/&#8221;/, "\"")
      return plain_text
    end

    # Returns true if the html tidy module can be used.
    #
    # Obviously, you need the tidy gem installed in order to run with html
    # tidy features turned on.
    #
    # This method does a fairly complicated, and probably unnecessarily
    # desperate search for the libtidy library.  If you want this thing to
    # execute fast, the best thing to do is to set Tidy.path ahead of time.
    # If Tidy.path is set, this method doesn't do much.  If it's not set,
    # it will do its darnedest to find the libtidy library.  If you set
    # the LIBTIDYPATH environment variable to the libtidy library (or to a
    # directory containing it), that location is checked before any of the
    # built-in guesses.
    #
    # A positive result is remembered; a negative result is not, so the
    # search is repeated on every call until the library is located.
    def self.tidy_enabled?
      # This is an override variable to keep tidy from being used even if it
      # is available.
      return false if FeedTools.configurations[:tidy_enabled] == false
      return true if @tidy_enabled

      @tidy_enabled = false
      begin
        require 'tidy'
        if Tidy.path.nil?
          tidy_path = locate_libtidy
          unless tidy_path.nil?
            Tidy.path = tidy_path
            @tidy_enabled = true
          end
        else
          @tidy_enabled = true
        end
      rescue LoadError
        # Tidy not installed, disable features that rely on tidy.
        @tidy_enabled = false
      end
      return @tidy_enabled
    end

    # Searches LIBTIDYPATH (when set) and then LIBTIDY_LOCATIONS for the
    # libtidy library.  Returns the path of the first match or nil.
    def self.locate_libtidy
      candidates = LIBTIDY_LOCATIONS.dup
      # We just made this thing up, but if someone sets it, it is the most
      # specific hint available, so it is checked first.
      candidates.unshift(ENV['LIBTIDYPATH']) unless ENV['LIBTIDYPATH'].nil?
      candidates.each do |path|
        # File.file? / File.directory? follow symbolic links, so a library
        # that is installed as a symlink is still recognized.
        if File.file?(path)
          return path
        elsif File.directory?(path)
          # Ok, now perhaps we're getting a bit more desperate.
          # The directory is searched without going through a shell, so
          # paths containing spaces or shell metacharacters are harmless.
          matches = Dir.glob(File.join(path, "**", "*tidy*")).sort
          # If there's more than one, grab the first one and
          # hope for the best, and if it doesn't work, then blame the
          # user for not specifying more accurately.
          found = matches.find do |candidate|
            candidate =~ /\.(so|dylib)\z/ && File.file?(candidate)
          end
          return found unless found.nil?
        end
      end
      return nil
    end
    private_class_method :locate_libtidy

    # Tidys up the html.
    #
    # The only accepted options are :input_encoding and :output_encoding
    # (both default to "utf-8"); the keys are checked with
    # FeedTools::GenericHelper.validate_options.
    #
    # When tidy is not available the input is returned unchanged.  When tidy
    # produces blank output for non-blank input, the stripped input is
    # returned instead.  The argument itself is never modified.
    def self.tidy_html(html, options = {})
      return nil if html.nil?

      FeedTools::GenericHelper.validate_options([ :input_encoding,
                                                  :output_encoding ],
                                                options.keys)
      options = { :input_encoding => "utf-8",
                  :output_encoding => "utf-8" }.merge(options)

      if FeedTools::HtmlHelper.tidy_enabled?
        # From here on +html+ refers to copies of the caller's string.
        html = html.gsub(/&lt;!'/, "&amp;lt;!'")
        stripped_input = html.strip

        # Anything that looks like a whole document (or carries an xml
        # declaration) is not treated as a fragment.
        is_fragment = true
        if (stripped_input =~ /<html>.*<body>/m) != nil ||
            (stripped_input =~ /<\/body>.*<\/html>$/m) != nil
          is_fragment = false
        end
        if (stripped_input =~ /<\?xml.*\?>/m) != nil
          is_fragment = false
        end

        # Tidy sucks?
        # TODO: find the correct set of tidy options to set so
        # that *ugly* hacks like this aren't necessary.
        html = html.gsub("\302\240", "\240")

        tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
          tidy.options.output_xml = true
          tidy.options.markup = true
          tidy.options.indent = true
          tidy.options.wrap = 0
          tidy.options.logical_emphasis = true
          tidy.options.input_encoding = options[:input_encoding]
          tidy.options.output_encoding = options[:output_encoding]
          tidy.options.doctype = "omit"
          tidy.clean(html)
        end
        if is_fragment && !tidy_html.nil?
          # Tidy sticks <html>...<body>[our html]</body>...</html> in.
          # We don't want this.
          tidy_html = tidy_html.strip
          tidy_html = tidy_html.gsub(/^<html>.*<body>/m, "")
          tidy_html = tidy_html.gsub(/<\/body>.*<\/html>$/m, "")
          tidy_html = tidy_html.gsub("\t", " ")
          tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
          tidy_html = tidy_html.strip
        end
      else
        tidy_html = html
      end
      if tidy_html.blank? && !html.blank?
        tidy_html = html.strip
      end
      return tidy_html
    end

    # Unindents a text selection by a specified number of spaces.
    #
    # At most +spaces+ leading space characters are removed from every line.
    # Every line of the result, including the last one, is terminated with a
    # newline.  Returns nil for nil input.
    def self.unindent(text, spaces)
      return nil if text.nil?
      limit = [spaces.to_i, 0].max
      buffer = ""
      text.split("\n").each do |line|
        leading = line[/\A */].length
        buffer << line[[leading, limit].min..-1] << "\n"
      end
      return buffer
    end

    # Removes all dangerous html tags from the html formatted text.
    # If mode is set to :escape, dangerous and unknown elements will
    # be escaped.  If mode is set to :strip, dangerous and unknown
    # elements and all children will be removed entirely.
    # Dangerous or unknown attributes are always removed.
    #
    # Elements are checked against ACCEPTABLE_ELEMENTS and attributes against
    # ACCEPTABLE_ATTRIBUTES; attributes whose name starts with "xmlns" are
    # always kept.  Any mode other than :strip behaves like :escape.
    # Attribute values (for example uri schemes) are not inspected here.
    # The argument is never modified.
    def self.sanitize_html(html, mode=:strip)
      return nil if html.nil?

      # Replace with appropriate named entities
      html = html.gsub(/&#x26;/, "&amp;")
      html = html.gsub(/&#38;/, "&amp;")
      html = html.gsub(/&lt;!'/, "&amp;lt;!'")

      # Hackity hack.  But it works, and it seems plenty fast enough.
      html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml

      sanitize_node = lambda do |html_node|
        if html_node.respond_to? :children
          # Walk a snapshot of the child list, because rejected elements are
          # removed from the parent while it is being traversed.
          html_node.children.dup.each do |child|
            if child.kind_of? REXML::Element
              unless ACCEPTABLE_ELEMENTS.include? child.name.downcase
                if mode == :strip
                  html_node.delete_element(child)
                else
                  new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
                  html_node.insert_after(child, new_child)
                  html_node.delete_element(child)
                end
                # The element is gone from the tree; there is nothing left
                # to clean inside it.
                next
              end
              child.attributes.keys.each do |attribute|
                next if attribute =~ /^xmlns/
                unless ACCEPTABLE_ATTRIBUTES.include? attribute.downcase
                  child.delete_attribute(attribute)
                end
              end
            end
            sanitize_node.call(child)
          end
        end
        html_node
      end
      sanitize_node.call(html_doc.root)
      return html_doc.root.inner_xml
    end

    # Returns true if the type string provided indicates that something is
    # xml or xhtml content, i.e. it is "xml", "xhtml" or any string ending
    # in "xml" (such as "application/xhtml+xml").
    def self.xml_type?(type)
      return false if type.nil?
      return true if ["xml", "xhtml", "application/xhtml+xml"].include?(type)
      return type[-3..-1] == "xml"
    end

    # Returns true if the type string provided indicates that something is
    # plain text content.
    def self.text_type?(type)
      return ["text", "text/plain"].include?(type)
    end

    # Returns true if the type string provided indicates that something is
    # html or xhtml content.
    def self.html_type?(type)
      return [
        "html",
        "xhtml",
        "text/html",
        "application/xhtml+xml"
      ].include?(type)
    end

    # Returns true if the type string provided indicates that something is
    # only html (not xhtml) content.
    def self.only_html_type?(type)
      return ["html", "text/html"].include?(type)
    end

    # can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']

    # Resolves all relative uris in a block of html.
    #
    # Every attribute listed in RELATIVE_URI_ATTRIBUTES is passed through
    # FeedTools::UriHelper.resolve_relative_uri, using the element's own
    # base uri followed by +base_uri_sources+.  Returns the rewritten markup,
    # or nil for nil input.
    def self.resolve_relative_uris(html, base_uri_sources=[])
      return nil if html.nil?
      html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml

      resolve_node = lambda do |html_node|
        if html_node.respond_to? :children
          html_node.children.each do |child|
            if child.kind_of? REXML::Element
              element_name = child.name.downcase
              RELATIVE_URI_ATTRIBUTES.each do |pair|
                next unless element_name == pair[0]
                attribute = child.attribute(pair[1])
                next if attribute.nil?
                href = FeedTools::UriHelper.resolve_relative_uri(
                  attribute.value, [child.base_uri] | base_uri_sources)
                # The resolved uri is written straight into the attribute
                # object instead of going through REXML's setters.
                attribute.instance_variable_set("@value", href)
              end
            end
            resolve_node.call(child)
          end
        end
        html_node
      end
      resolve_node.call(html_doc.root)
      return html_doc.root.inner_xml
    end

    # Returns a string containing normalized xhtml from within a REXML node.
    #
    # The node is deep-cloned and the clone is rewritten so that elements and
    # attributes in the xhtml namespace are emitted without a prefix and
    # without a redundant xmlns declaration.  The result is the stripped
    # concatenation of the clone's children.
    def self.extract_xhtml(rexml_node)
      rexml_node_dup = rexml_node.deep_clone
      xhtml_namespace = FEED_TOOLS_NAMESPACES['xhtml']
      normalize_namespaced_xhtml = lambda do |node, node_dup|
        if node.kind_of? REXML::Element
          node_namespace = node.namespace
          # Massive hack, relies on REXML not changing.
          # Attributes of the original and of the clone are paired by
          # position.  Both lists are captured up front so that deleting
          # 'xmlns' from the clone cannot shift the pairing of the
          # attributes that follow it.
          attributes = node.attributes.values
          attributes_dup = node_dup.attributes.values
          attributes.each_with_index do |attribute, index|
            attribute_dup = attributes_dup[index]
            if attribute.namespace == xhtml_namespace && !attribute_dup.nil?
              attribute_dup.instance_variable_set(
                "@expanded_name", attribute.name)
            end
            if node_namespace == xhtml_namespace && attribute.name == 'xmlns'
              node_dup.attributes.delete('xmlns')
            end
          end
          if node_namespace == xhtml_namespace
            node_dup.instance_variable_set("@expanded_name", node.name)
          end
          if !node_namespace.blank? && node.prefix.blank?
            if node.namespace != xhtml_namespace
              node_dup.add_namespace(node_namespace)
            end
          end
        end
        children_dup = node_dup.children
        node.children.each_with_index do |child, index|
          if child.kind_of? REXML::Element
            normalize_namespaced_xhtml.call(child, children_dup[index])
          end
        end
      end
      normalize_namespaced_xhtml.call(rexml_node, rexml_node_dup)
      buffer = ""
      rexml_node_dup.each_child do |child|
        if child.kind_of? REXML::Comment
          buffer << "<!--" + child.to_s + "-->"
        else
          buffer << child.to_s
        end
      end
      return buffer.strip
    end

    # Given a REXML node, returns its content, normalized as HTML.
    #
    # The node's type, mode and encoding attributes decide how the content
    # is decoded (CDATA, base64, xhtml/xml, escaped, plain text or as-is).
    # The result is then optionally sanitized (when the
    # :sanitization_enabled configuration is set), has its relative uris
    # resolved, is tidied and stripped.  Returns nil when +content_node+ is
    # nil or when the resulting content is blank.
    #
    # +feed_type+ is only consulted to default the type to "text" for atom
    # feeds; +feed_version+ is accepted but not used by this method.
    def self.process_text_construct(content_node, feed_type, feed_version,
        base_uri_sources=[])
      return nil if content_node.nil?

      content = nil
      repair_entities = false
      type = FeedTools::XmlHelper.try_xpaths(content_node, "@type",
        :select_result_value => true)
      mode = FeedTools::XmlHelper.try_xpaths(content_node, "@mode",
        :select_result_value => true)
      encoding = FeedTools::XmlHelper.try_xpaths(content_node, "@encoding",
        :select_result_value => true)

      if type.nil?
        atom_namespaces = [
          FEED_TOOLS_NAMESPACES['atom10'],
          FEED_TOOLS_NAMESPACES['atom03']
        ]
        if atom_namespaces.include?(content_node.namespace) ||
            atom_namespaces.include?(content_node.root.namespace) ||
            feed_type == "atom"
          type = "text"
        end
      end

      plain_text = FeedTools::HtmlHelper.text_type?(type) ||
        FeedTools::HtmlHelper.text_type?(mode)

      # Note that we're checking for misuse of type, mode and encoding here
      if content_node.cdatas.size > 0
        content = content_node.cdatas.first.to_s.strip
      elsif type == "base64" || mode == "base64" ||
          encoding == "base64"
        content = Base64.decode64(content_node.inner_xml.strip)
      elsif type == "xhtml" || mode == "xhtml" ||
          type == "xml" || mode == "xml" ||
          type == "application/xhtml+xml" ||
          content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
        content = FeedTools::HtmlHelper.extract_xhtml(content_node)
      elsif type == "escaped" || mode == "escaped" || plain_text
        content = FeedTools::HtmlHelper.unescape_entities(
          content_node.inner_xml.strip)
      else
        content = content_node.inner_xml.strip
        repair_entities = true
      end
      if plain_text
        # Plain text has to be escaped again to be usable as html.
        content = FeedTools::HtmlHelper.escape_entities(content)
      end
      unless content.nil?
        if FeedTools.configurations[:sanitization_enabled]
          content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
        end
        content = FeedTools::HtmlHelper.resolve_relative_uris(content,
          [content_node.base_uri] | base_uri_sources)
        if repair_entities
          content = FeedTools::HtmlHelper.unescape_entities(content)
        end
        content = FeedTools::HtmlHelper.tidy_html(content)
      end
      tab_spaces = FeedTools.configurations[:tab_spaces]
      if tab_spaces != nil && !content.blank?
        content = content.gsub("\t", " " * tab_spaces.to_i)
      end
      # The stripped value has to be assigned; a bare +content.strip+ would
      # throw its result away.
      content = content.strip unless content.blank?
      content = nil if content.blank?
      return content
    end

    # Strips semantically empty div wrapper elements.
    #
    # When +xhtml+ parses as a document consisting of a single div element,
    # the div's inner xml is returned (stripped).  In every other case,
    # including parse failures, the stripped input is returned.  nil and
    # blank input are returned unchanged.
    def self.strip_wrapper_element(xhtml)
      return nil if xhtml.nil?
      return xhtml if xhtml.blank?
      stripped_xhtml = xhtml.to_s.strip
      begin
        doc = REXML::Document.new(stripped_xhtml)
        if doc.children.size == 1
          child = doc.children[0]
          if child.kind_of?(REXML::Element) && child.name.downcase == "div"
            return child.inner_xml.strip
          end
        end
        return stripped_xhtml
      rescue StandardError
        # Unparseable markup is handed back as-is.  Only StandardError is
        # rescued so that interrupts and exit requests still propagate.
        return stripped_xhtml
      end
    end

    # Given a block of html, locates feed links with a given mime type.
    #
    # Returns the href of the first <link> element whose type matches
    # +mime_type+ (case-insensitively) and whose rel is "alternate", or nil
    # when there is no such link.  Links are looked up inside <head> when
    # the document has one, otherwise among the <link> elements that are
    # direct children of <html>.
    def self.extract_link_by_mime_type(html, mime_type)
      return nil if html.nil?
      require 'feed_tools/vendor/htree'
      require 'feed_tools/helpers/xml_helper'

      # This is technically very, very wrong.  But it saves oodles of
      # clock cycles, and probably works 99.999% of the time.
      html_document = HTree.parse_xml(
        FeedTools::HtmlHelper.tidy_html(
          html.gsub(/<body>.*<\/body>/m, ""))).to_rexml

      html_node = html_document.children.find do |node|
        node.kind_of?(REXML::Element) &&
          node.name.downcase == "html" &&
          node.children.size > 0
      end
      return nil if html_node.nil?

      head_node = nil
      link_nodes = []
      html_node.children.each do |node|
        next unless node.kind_of?(REXML::Element)
        if node.name.downcase == "head"
          head_node = node
          break
        end
        link_nodes << node if node.name.downcase == "link"
      end
      unless head_node.nil?
        link_nodes = head_node.children.select do |node|
          node.kind_of?(REXML::Element) && node.name.downcase == "link"
        end
      end
      return nil if link_nodes.empty?

      return find_alternate_link(link_nodes, mime_type.downcase)
    end

    # Searches +links+ for an alternate link of the given (already
    # downcased) mime type.  All nodes on the current level are examined
    # before their children are searched recursively; unclosed <link> tags
    # can end up nested inside each other after parsing.  Returns the href
    # of the first match, or nil.
    #
    # This is a regular method rather than a lambda so that a match found at
    # any depth is actually handed back to the caller.
    def self.find_alternate_link(links, mime_type)
      elements = links.select { |link| link.kind_of?(REXML::Element) }
      elements.each do |link|
        if link.attributes['type'].to_s.strip.downcase == mime_type &&
            link.attributes['rel'].to_s.strip.downcase == "alternate"
          href = link.attributes['href']
          return href unless href.blank?
        end
      end
      elements.each do |link|
        href = find_alternate_link(link.children, mime_type)
        return href unless href.nil?
      end
      return nil
    end
    private_class_method :find_alternate_link
  end
end