feedtools 0.2.27 → 0.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/lib/feed_tools/feed.rb +8 -2
- data/lib/feed_tools/helpers/html_helper.rb +54 -35
- data/lib/feed_tools/version.rb +1 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
data/lib/feed_tools/feed.rb
CHANGED
@@ -224,6 +224,7 @@ module FeedTools
|
|
224
224
|
"Autodiscovery loop detected: #{autodiscovered_url}"
|
225
225
|
end
|
226
226
|
self.feed_data = nil
|
227
|
+
|
227
228
|
self.href = autodiscovered_url
|
228
229
|
if FeedTools.feed_cache.nil?
|
229
230
|
self.cache_object = nil
|
@@ -714,8 +715,13 @@ module FeedTools
|
|
714
715
|
begin
|
715
716
|
@xml_document = REXML::Document.new(self.feed_data_utf_8)
|
716
717
|
rescue Exception
|
717
|
-
# Something failed, attempt to repair the xml with htree.
|
718
|
-
|
718
|
+
# Something failed, attempt to repair the xml with html5lib.
|
719
|
+
begin
|
720
|
+
@xml_document = HTML5::XMLParser.parse(self.feed_data_utf_8)
|
721
|
+
rescue Exception
|
722
|
+
# Failed again, give up.
|
723
|
+
return nil
|
724
|
+
end
|
719
725
|
end
|
720
726
|
end
|
721
727
|
end
|
@@ -561,47 +561,66 @@ module FeedTools
|
|
561
561
|
|
562
562
|
# Given a block of html, locates feed links with a given mime type.
|
563
563
|
def self.extract_link_by_mime_type(html, mime_type)
|
564
|
-
require 'feed_tools/vendor/htree'
|
565
564
|
require 'feed_tools/helpers/xml_helper'
|
566
565
|
|
566
|
+
# HACK: Prevent the parser from freaking out if it sees this:
|
567
|
+
html = html.gsub(/<!'/, "&lt;!'")
|
568
|
+
|
567
569
|
# This is technically very, very wrong. But it saves oodles of
|
568
570
|
# clock cycles, and probably works 99.999% of the time.
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
571
|
+
html.gsub!(/<body.*?>(.|\n)*?<\/body>/, "<body></body>")
|
572
|
+
html.gsub!(/<script.*?>(.|\n)*?<\/script>/, "")
|
573
|
+
html.gsub!(/<noscript.*?>(.|\n)*?<\/noscript>/, "")
|
574
|
+
html.gsub!(/<!--(.|\n)*?-->/, "")
|
575
|
+
|
576
|
+
html = FeedTools::HtmlHelper.tidy_html(html)
|
577
|
+
|
578
|
+
document = HTML5::HTMLParser.parse(html)
|
579
|
+
|
574
580
|
link_nodes = []
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
link_nodes << node
|
581
|
+
get_link_nodes = lambda do |root_node|
|
582
|
+
html_node = nil
|
583
|
+
head_node = nil
|
584
|
+
return nil if !root_node.respond_to?(:children)
|
585
|
+
if root_node.name.downcase == "html" &&
|
586
|
+
root_node.children.size > 0
|
587
|
+
html_node = root_node
|
588
|
+
else
|
589
|
+
for node in fragment_node.children
|
590
|
+
next unless node.kind_of?(REXML::Element)
|
591
|
+
if node.name.downcase == "html" &&
|
592
|
+
node.children.size > 0
|
593
|
+
html_node = node
|
594
|
+
break
|
595
|
+
end
|
596
|
+
end
|
592
597
|
end
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
598
|
+
if html_node != nil
|
599
|
+
for node in html_node.children
|
600
|
+
next unless node.kind_of?(REXML::Element)
|
601
|
+
if node.name.downcase == "head"
|
602
|
+
head_node = node
|
603
|
+
break
|
604
|
+
end
|
605
|
+
if node.name.downcase == "link"
|
606
|
+
link_nodes << node
|
607
|
+
end
|
608
|
+
end
|
609
|
+
if html_node != nil || !link_nodes.empty?
|
610
|
+
if head_node != nil
|
611
|
+
link_nodes = []
|
612
|
+
for node in head_node.children
|
613
|
+
next unless node.kind_of?(REXML::Element)
|
614
|
+
if node.name.downcase == "link"
|
615
|
+
link_nodes << node
|
616
|
+
end
|
617
|
+
end
|
618
|
+
end
|
601
619
|
end
|
602
620
|
end
|
603
621
|
end
|
604
|
-
|
622
|
+
get_link_nodes.call(document.root)
|
623
|
+
process_link_nodes = lambda do |links|
|
605
624
|
for link in links
|
606
625
|
next unless link.kind_of?(REXML::Element)
|
607
626
|
if link.attributes['type'].to_s.strip.downcase ==
|
@@ -613,11 +632,11 @@ module FeedTools
|
|
613
632
|
end
|
614
633
|
for link in links
|
615
634
|
next unless link.kind_of?(REXML::Element)
|
616
|
-
|
635
|
+
process_link_nodes.call(link.children)
|
617
636
|
end
|
618
637
|
end
|
619
|
-
|
638
|
+
process_link_nodes.call(link_nodes)
|
620
639
|
return nil
|
621
640
|
end
|
622
641
|
end
|
623
|
-
end
|
642
|
+
end
|
data/lib/feed_tools/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedtools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.27
|
4
|
+
version: 0.2.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bob Aman
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-01
|
12
|
+
date: 2008-02-01 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|