feedtools 0.2.27 → 0.2.28

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
1
+ === FeedTools 0.2.28
2
+ * fixed major bug with autodiscovery
1
3
  === FeedTools 0.2.27
2
4
  * now uses a new URI class
3
5
  * full support for IRIs
@@ -224,6 +224,7 @@ module FeedTools
224
224
  "Autodiscovery loop detected: #{autodiscovered_url}"
225
225
  end
226
226
  self.feed_data = nil
227
+
227
228
  self.href = autodiscovered_url
228
229
  if FeedTools.feed_cache.nil?
229
230
  self.cache_object = nil
@@ -714,8 +715,13 @@ module FeedTools
714
715
  begin
715
716
  @xml_document = REXML::Document.new(self.feed_data_utf_8)
716
717
  rescue Exception
717
- # Something failed, attempt to repair the xml with htree.
718
- @xml_document = HTree.parse(self.feed_data_utf_8).to_rexml
718
+ # Something failed, attempt to repair the xml with html5lib.
719
+ begin
720
+ @xml_document = HTML5::XMLParser.parse(self.feed_data_utf_8)
721
+ rescue Exception
722
+ # Failed again, give up.
723
+ return nil
724
+ end
719
725
  end
720
726
  end
721
727
  end
@@ -561,47 +561,66 @@ module FeedTools
561
561
 
562
562
  # Given a block of html, locates feed links with a given mime type.
563
563
  def self.extract_link_by_mime_type(html, mime_type)
564
- require 'feed_tools/vendor/htree'
565
564
  require 'feed_tools/helpers/xml_helper'
566
565
 
566
+ # HACK: Prevent the parser from freaking out if it sees this:
567
+ html = html.gsub(/<!'/, "&lt;!'")
568
+
567
569
  # This is technically very, very wrong. But it saves oodles of
568
570
  # clock cycles, and probably works 99.999% of the time.
569
- html_document = HTree.parse_xml(
570
- FeedTools::HtmlHelper.tidy_html(
571
- html.gsub(/<body.*?>(.|\n)*<\/body>/, "<body>-</body>"))).to_rexml
572
- html_node = nil
573
- head_node = nil
571
+ html.gsub!(/<body.*?>(.|\n)*?<\/body>/, "<body></body>")
572
+ html.gsub!(/<script.*?>(.|\n)*?<\/script>/, "")
573
+ html.gsub!(/<noscript.*?>(.|\n)*?<\/noscript>/, "")
574
+ html.gsub!(/<!--(.|\n)*?-->/, "")
575
+
576
+ html = FeedTools::HtmlHelper.tidy_html(html)
577
+
578
+ document = HTML5::HTMLParser.parse(html)
579
+
574
580
  link_nodes = []
575
- for node in html_document.children
576
- next unless node.kind_of?(REXML::Element)
577
- if node.name.downcase == "html" &&
578
- node.children.size > 0
579
- html_node = node
580
- break
581
- end
582
- end
583
- return nil if html_node.nil?
584
- for node in html_node.children
585
- next unless node.kind_of?(REXML::Element)
586
- if node.name.downcase == "head"
587
- head_node = node
588
- break
589
- end
590
- if node.name.downcase == "link"
591
- link_nodes << node
581
+ get_link_nodes = lambda do |root_node|
582
+ html_node = nil
583
+ head_node = nil
584
+ return nil if !root_node.respond_to?(:children)
585
+ if root_node.name.downcase == "html" &&
586
+ root_node.children.size > 0
587
+ html_node = root_node
588
+ else
589
+ for node in fragment_node.children
590
+ next unless node.kind_of?(REXML::Element)
591
+ if node.name.downcase == "html" &&
592
+ node.children.size > 0
593
+ html_node = node
594
+ break
595
+ end
596
+ end
592
597
  end
593
- end
594
- return nil if html_node.nil? && link_nodes.empty?
595
- if !head_node.nil?
596
- link_nodes = []
597
- for node in head_node.children
598
- next unless node.kind_of?(REXML::Element)
599
- if node.name.downcase == "link"
600
- link_nodes << node
598
+ if html_node != nil
599
+ for node in html_node.children
600
+ next unless node.kind_of?(REXML::Element)
601
+ if node.name.downcase == "head"
602
+ head_node = node
603
+ break
604
+ end
605
+ if node.name.downcase == "link"
606
+ link_nodes << node
607
+ end
608
+ end
609
+ if html_node != nil || !link_nodes.empty?
610
+ if head_node != nil
611
+ link_nodes = []
612
+ for node in head_node.children
613
+ next unless node.kind_of?(REXML::Element)
614
+ if node.name.downcase == "link"
615
+ link_nodes << node
616
+ end
617
+ end
618
+ end
601
619
  end
602
620
  end
603
621
  end
604
- find_link_nodes = lambda do |links|
622
+ get_link_nodes.call(document.root)
623
+ process_link_nodes = lambda do |links|
605
624
  for link in links
606
625
  next unless link.kind_of?(REXML::Element)
607
626
  if link.attributes['type'].to_s.strip.downcase ==
@@ -613,11 +632,11 @@ module FeedTools
613
632
  end
614
633
  for link in links
615
634
  next unless link.kind_of?(REXML::Element)
616
- find_link_nodes.call(link.children)
635
+ process_link_nodes.call(link.children)
617
636
  end
618
637
  end
619
- find_link_nodes.call(link_nodes)
638
+ process_link_nodes.call(link_nodes)
620
639
  return nil
621
640
  end
622
641
  end
623
- end
642
+ end
@@ -2,7 +2,7 @@ module FeedTools
2
2
  module FEED_TOOLS_VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 2
5
- TINY = 27
5
+ TINY = 28
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedtools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.27
4
+ version: 0.2.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bob Aman
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-01-31 00:00:00 -05:00
12
+ date: 2008-02-01 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency