truffle-hog 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,9 @@
1
1
  module TruffleHog
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
 
4
4
  def self.parse_feed_urls(html, favor = :all)
5
- rss_links = []
6
- atom_links = []
7
-
8
- rss_links = (scan_for_tag(html, "a", "rss") + scan_for_tag(html, "link", "rss")).flatten.uniq
9
- atom_links = (scan_for_tag(html, "a", "atom") + scan_for_tag(html, "link", "atom")).flatten.uniq
5
+ rss_links = scan_for_tag(html, "rss")
6
+ atom_links = scan_for_tag(html, "atom")
10
7
 
11
8
  case favor
12
9
  when :all
@@ -18,10 +15,24 @@ module TruffleHog
18
15
  end
19
16
  end
20
17
 
21
- def self.scan_for_tag(html, tag, type)
22
- href_first = html.scan(/<#{tag}.*href\=['"](.*?)['"].*type\=['"]application\/#{type}\+xml['"].*?>/)
23
- return href_first unless href_first.empty?
24
-
25
- html.scan(/<#{tag}.*type\=['"]application\/#{type}\+xml['"].*href=['"](.*?)['"].*?>/)
18
+ def self.scan_for_tag(html, type)
19
+ urls(html, "link", type) + urls(html, "a", type)
20
+ end
21
+
22
+ def self.urls(html, tag, type)
23
+ tags = html.scan(/(<#{tag}.*?>)/).flatten
24
+ feed_tags = collect(tags, type)
25
+ feed_tags.map do |tag|
26
+ url = tag.match(/.*href=['"](.*?)['"].*/)[1]
27
+ url =~ /^http.*/ ? url : nil
28
+ end.compact
29
+ end
30
+
31
+ def self.collect(tags, type)
32
+ tags.collect {|t| t if feed?(t, type)}.compact
33
+ end
34
+
35
+ def self.feed?(html, type)
36
+ html =~ /.*type=['"]application\/#{type}\+xml['"].*/
26
37
  end
27
38
  end
@@ -6,10 +6,7 @@ describe "parsing html" do
6
6
  end
7
7
 
8
8
  it "parses all feed urls" do
9
- TruffleHog.parse_feed_urls(@html).should == ["http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss",
10
- "http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml",
11
- "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom",
12
- "http://www.pauldix.net/in_head/atom.xml"]
9
+ TruffleHog.parse_feed_urls(@html).should == ["http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss", "http://www.pauldix.net/in_head/atom.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom"]
13
10
  end
14
11
 
15
12
  it "parses rss feeds from the link tags in head" do
@@ -41,4 +38,12 @@ describe "parsing html" do
41
38
 
42
39
  it "returns atom feeds if rss is favored, but none are found"
43
40
  it "returns rss feeds if atom is favored, but none are found"
41
+
42
+ describe "odd regressions" do
43
+ it "doesn't go into an infinite loop on this input" do
44
+ input = File.read("#{File.dirname(__FILE__)}/infinite.html")
45
+ feed_urls = TruffleHog.parse_feed_urls(input)
46
+ feed_urls.should == ["http://feeds.feedburner.com/cryptload"]
47
+ end
48
+ end
44
49
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: truffle-hog
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix