feedtools 0.2.27 → 0.2.28
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/lib/feed_tools/feed.rb +8 -2
- data/lib/feed_tools/helpers/html_helper.rb +54 -35
- data/lib/feed_tools/version.rb +1 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
data/lib/feed_tools/feed.rb
CHANGED
@@ -224,6 +224,7 @@ module FeedTools
|
|
224
224
|
"Autodiscovery loop detected: #{autodiscovered_url}"
|
225
225
|
end
|
226
226
|
self.feed_data = nil
|
227
|
+
|
227
228
|
self.href = autodiscovered_url
|
228
229
|
if FeedTools.feed_cache.nil?
|
229
230
|
self.cache_object = nil
|
@@ -714,8 +715,13 @@ module FeedTools
|
|
714
715
|
begin
|
715
716
|
@xml_document = REXML::Document.new(self.feed_data_utf_8)
|
716
717
|
rescue Exception
|
717
|
-
# Something failed, attempt to repair the xml with htree.
|
718
|
-
|
718
|
+
# Something failed, attempt to repair the xml with html5lib.
|
719
|
+
begin
|
720
|
+
@xml_document = HTML5::XMLParser.parse(self.feed_data_utf_8)
|
721
|
+
rescue Exception
|
722
|
+
# Failed again, give up.
|
723
|
+
return nil
|
724
|
+
end
|
719
725
|
end
|
720
726
|
end
|
721
727
|
end
|
@@ -561,47 +561,66 @@ module FeedTools
|
|
561
561
|
|
562
562
|
# Given a block of html, locates feed links with a given mime type.
|
563
563
|
def self.extract_link_by_mime_type(html, mime_type)
|
564
|
-
require 'feed_tools/vendor/htree'
|
565
564
|
require 'feed_tools/helpers/xml_helper'
|
566
565
|
|
566
|
+
# HACK: Prevent the parser from freaking out if it sees this:
|
567
|
+
html = html.gsub(/<!'/, "<!'")
|
568
|
+
|
567
569
|
# This is technically very, very wrong. But it saves oodles of
|
568
570
|
# clock cycles, and probably works 99.999% of the time.
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
571
|
+
html.gsub!(/<body.*?>(.|\n)*?<\/body>/, "<body></body>")
|
572
|
+
html.gsub!(/<script.*?>(.|\n)*?<\/script>/, "")
|
573
|
+
html.gsub!(/<noscript.*?>(.|\n)*?<\/noscript>/, "")
|
574
|
+
html.gsub!(/<!--(.|\n)*?-->/, "")
|
575
|
+
|
576
|
+
html = FeedTools::HtmlHelper.tidy_html(html)
|
577
|
+
|
578
|
+
document = HTML5::HTMLParser.parse(html)
|
579
|
+
|
574
580
|
link_nodes = []
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
link_nodes << node
|
581
|
+
get_link_nodes = lambda do |root_node|
|
582
|
+
html_node = nil
|
583
|
+
head_node = nil
|
584
|
+
return nil if !root_node.respond_to?(:children)
|
585
|
+
if root_node.name.downcase == "html" &&
|
586
|
+
root_node.children.size > 0
|
587
|
+
html_node = root_node
|
588
|
+
else
|
589
|
+
for node in fragment_node.children
|
590
|
+
next unless node.kind_of?(REXML::Element)
|
591
|
+
if node.name.downcase == "html" &&
|
592
|
+
node.children.size > 0
|
593
|
+
html_node = node
|
594
|
+
break
|
595
|
+
end
|
596
|
+
end
|
592
597
|
end
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
598
|
+
if html_node != nil
|
599
|
+
for node in html_node.children
|
600
|
+
next unless node.kind_of?(REXML::Element)
|
601
|
+
if node.name.downcase == "head"
|
602
|
+
head_node = node
|
603
|
+
break
|
604
|
+
end
|
605
|
+
if node.name.downcase == "link"
|
606
|
+
link_nodes << node
|
607
|
+
end
|
608
|
+
end
|
609
|
+
if html_node != nil || !link_nodes.empty?
|
610
|
+
if head_node != nil
|
611
|
+
link_nodes = []
|
612
|
+
for node in head_node.children
|
613
|
+
next unless node.kind_of?(REXML::Element)
|
614
|
+
if node.name.downcase == "link"
|
615
|
+
link_nodes << node
|
616
|
+
end
|
617
|
+
end
|
618
|
+
end
|
601
619
|
end
|
602
620
|
end
|
603
621
|
end
|
604
|
-
|
622
|
+
get_link_nodes.call(document.root)
|
623
|
+
process_link_nodes = lambda do |links|
|
605
624
|
for link in links
|
606
625
|
next unless link.kind_of?(REXML::Element)
|
607
626
|
if link.attributes['type'].to_s.strip.downcase ==
|
@@ -613,11 +632,11 @@ module FeedTools
|
|
613
632
|
end
|
614
633
|
for link in links
|
615
634
|
next unless link.kind_of?(REXML::Element)
|
616
|
-
|
635
|
+
process_link_nodes.call(link.children)
|
617
636
|
end
|
618
637
|
end
|
619
|
-
|
638
|
+
process_link_nodes.call(link_nodes)
|
620
639
|
return nil
|
621
640
|
end
|
622
641
|
end
|
623
|
-
end
|
642
|
+
end
|
data/lib/feed_tools/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedtools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.27
|
4
|
+
version: 0.2.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bob Aman
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-01
|
12
|
+
date: 2008-02-01 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|