feedtools 0.2.27 → 0.2.28

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
1
+ === FeedTools 0.2.28
2
+ * fixed major bug with autodiscovery
1
3
  === FeedTools 0.2.27
2
4
  * now uses a new URI class
3
5
  * full support for IRIs
@@ -224,6 +224,7 @@ module FeedTools
224
224
  "Autodiscovery loop detected: #{autodiscovered_url}"
225
225
  end
226
226
  self.feed_data = nil
227
+
227
228
  self.href = autodiscovered_url
228
229
  if FeedTools.feed_cache.nil?
229
230
  self.cache_object = nil
@@ -714,8 +715,13 @@ module FeedTools
714
715
  begin
715
716
  @xml_document = REXML::Document.new(self.feed_data_utf_8)
716
717
  rescue Exception
717
- # Something failed, attempt to repair the xml with htree.
718
- @xml_document = HTree.parse(self.feed_data_utf_8).to_rexml
718
+ # Something failed, attempt to repair the xml with html5lib.
719
+ begin
720
+ @xml_document = HTML5::XMLParser.parse(self.feed_data_utf_8)
721
+ rescue Exception
722
+ # Failed again, give up.
723
+ return nil
724
+ end
719
725
  end
720
726
  end
721
727
  end
@@ -561,47 +561,66 @@ module FeedTools
561
561
 
562
562
  # Given a block of html, locates feed links with a given mime type.
563
563
  def self.extract_link_by_mime_type(html, mime_type)
564
- require 'feed_tools/vendor/htree'
565
564
  require 'feed_tools/helpers/xml_helper'
566
565
 
566
+ # HACK: Prevent the parser from freaking out if it sees this:
567
+ html = html.gsub(/<!'/, "&lt;!'")
568
+
567
569
  # This is technically very, very wrong. But it saves oodles of
568
570
  # clock cycles, and probably works 99.999% of the time.
569
- html_document = HTree.parse_xml(
570
- FeedTools::HtmlHelper.tidy_html(
571
- html.gsub(/<body.*?>(.|\n)*<\/body>/, "<body>-</body>"))).to_rexml
572
- html_node = nil
573
- head_node = nil
571
+ html.gsub!(/<body.*?>(.|\n)*?<\/body>/, "<body></body>")
572
+ html.gsub!(/<script.*?>(.|\n)*?<\/script>/, "")
573
+ html.gsub!(/<noscript.*?>(.|\n)*?<\/noscript>/, "")
574
+ html.gsub!(/<!--(.|\n)*?-->/, "")
575
+
576
+ html = FeedTools::HtmlHelper.tidy_html(html)
577
+
578
+ document = HTML5::HTMLParser.parse(html)
579
+
574
580
  link_nodes = []
575
- for node in html_document.children
576
- next unless node.kind_of?(REXML::Element)
577
- if node.name.downcase == "html" &&
578
- node.children.size > 0
579
- html_node = node
580
- break
581
- end
582
- end
583
- return nil if html_node.nil?
584
- for node in html_node.children
585
- next unless node.kind_of?(REXML::Element)
586
- if node.name.downcase == "head"
587
- head_node = node
588
- break
589
- end
590
- if node.name.downcase == "link"
591
- link_nodes << node
581
+ get_link_nodes = lambda do |root_node|
582
+ html_node = nil
583
+ head_node = nil
584
+ return nil if !root_node.respond_to?(:children)
585
+ if root_node.name.downcase == "html" &&
586
+ root_node.children.size > 0
587
+ html_node = root_node
588
+ else
589
+ for node in fragment_node.children
590
+ next unless node.kind_of?(REXML::Element)
591
+ if node.name.downcase == "html" &&
592
+ node.children.size > 0
593
+ html_node = node
594
+ break
595
+ end
596
+ end
592
597
  end
593
- end
594
- return nil if html_node.nil? && link_nodes.empty?
595
- if !head_node.nil?
596
- link_nodes = []
597
- for node in head_node.children
598
- next unless node.kind_of?(REXML::Element)
599
- if node.name.downcase == "link"
600
- link_nodes << node
598
+ if html_node != nil
599
+ for node in html_node.children
600
+ next unless node.kind_of?(REXML::Element)
601
+ if node.name.downcase == "head"
602
+ head_node = node
603
+ break
604
+ end
605
+ if node.name.downcase == "link"
606
+ link_nodes << node
607
+ end
608
+ end
609
+ if html_node != nil || !link_nodes.empty?
610
+ if head_node != nil
611
+ link_nodes = []
612
+ for node in head_node.children
613
+ next unless node.kind_of?(REXML::Element)
614
+ if node.name.downcase == "link"
615
+ link_nodes << node
616
+ end
617
+ end
618
+ end
601
619
  end
602
620
  end
603
621
  end
604
- find_link_nodes = lambda do |links|
622
+ get_link_nodes.call(document.root)
623
+ process_link_nodes = lambda do |links|
605
624
  for link in links
606
625
  next unless link.kind_of?(REXML::Element)
607
626
  if link.attributes['type'].to_s.strip.downcase ==
@@ -613,11 +632,11 @@ module FeedTools
613
632
  end
614
633
  for link in links
615
634
  next unless link.kind_of?(REXML::Element)
616
- find_link_nodes.call(link.children)
635
+ process_link_nodes.call(link.children)
617
636
  end
618
637
  end
619
- find_link_nodes.call(link_nodes)
638
+ process_link_nodes.call(link_nodes)
620
639
  return nil
621
640
  end
622
641
  end
623
- end
642
+ end
@@ -2,7 +2,7 @@ module FeedTools
2
2
  module FEED_TOOLS_VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 2
5
- TINY = 27
5
+ TINY = 28
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedtools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.27
4
+ version: 0.2.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bob Aman
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-01-31 00:00:00 -05:00
12
+ date: 2008-02-01 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency