RubyGems - truffle-hog - Versions diffs - 0.0.1 → 0.0.2 - Mend

truffle-hog 0.0.1 → 0.0.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

data/lib/truffle-hog.rb CHANGED

@@ -1,12 +1,9 @@
 module TruffleHog
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
   def self.parse_feed_urls(html, favor = :all)
-    rss_links  = []
-    atom_links = []
-    rss_links  = (scan_for_tag(html, "a", "rss") + scan_for_tag(html, "link", "rss")).flatten.uniq
-    atom_links = (scan_for_tag(html, "a", "atom") + scan_for_tag(html, "link", "atom")).flatten.uniq
+    rss_links  = scan_for_tag(html, "rss")
+    atom_links = scan_for_tag(html, "atom")
     case favor
     when :all
@@ -18,10 +15,24 @@ module TruffleHog
     end
   end
-  def self.scan_for_tag(html, tag, type)
-    href_first = html.scan(/<#{tag}.*href\=['"](.*?)['"].*type\=['"]application\/#{type}\+xml['"].*?>/)
-    return href_first unless href_first.empty?
-    html.scan(/<#{tag}.*type\=['"]application\/#{type}\+xml['"].*href=['"](.*?)['"].*?>/)
+  def self.scan_for_tag(html, type)
+    urls(html, "link", type) + urls(html, "a", type)
+  end
+  def self.urls(html, tag, type)
+    tags = html.scan(/(<#{tag}.*?>)/).flatten
+    feed_tags = collect(tags, type)
+    feed_tags.map do |tag|
+      url = tag.match(/.*href=['"](.*?)['"].*/)[1]
+      url =~ /^http.*/ ? url : nil
+    end.compact
+  end
+  def self.collect(tags, type)
+    tags.collect {|t| t if feed?(t, type)}.compact
+  end
+  def self.feed?(html, type)
+    html =~ /.*type=['"]application\/#{type}\+xml['"].*/
   end
 end

data/spec/truffle-hog_spec.rb CHANGED

@@ -6,10 +6,7 @@ describe "parsing html" do
   end
   it "parses all feed urls" do
-    TruffleHog.parse_feed_urls(@html).should == ["http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss",
-                                                 "http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml",
-                                                 "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom",
-                                                 "http://www.pauldix.net/in_head/atom.xml"]
+    TruffleHog.parse_feed_urls(@html).should == ["http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss", "http://www.pauldix.net/in_head/atom.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom"]
   end
   it "parses rss feeds from the link tags in head" do
@@ -41,4 +38,12 @@ describe "parsing html" do
   it "returns atom feeds if rss is favored, but none are found"
   it "returns rss feeds if atom is favored, but none are found"
+  describe "odd regressions" do
+    it "doesn't go into an infinite loop on this input" do
+      input = File.read("#{File.dirname(__FILE__)}/infinite.html")
+      feed_urls = TruffleHog.parse_feed_urls(input)
+      feed_urls.should == ["http://feeds.feedburner.com/cryptload"]
+    end
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: truffle-hog
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Paul Dix