truffle-hog 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/truffle-hog.rb +22 -11
- data/spec/truffle-hog_spec.rb +9 -4
- metadata +1 -1
data/lib/truffle-hog.rb
CHANGED
@@ -1,12 +1,9 @@
|
|
1
1
|
module TruffleHog
|
2
|
-
VERSION = "0.0.1"
|
2
|
+
VERSION = "0.0.2"
|
3
3
|
|
4
4
|
def self.parse_feed_urls(html, favor = :all)
|
5
|
-
rss_links =
|
6
|
-
atom_links =
|
7
|
-
|
8
|
-
rss_links = (scan_for_tag(html, "a", "rss") + scan_for_tag(html, "link", "rss")).flatten.uniq
|
9
|
-
atom_links = (scan_for_tag(html, "a", "atom") + scan_for_tag(html, "link", "atom")).flatten.uniq
|
5
|
+
rss_links = scan_for_tag(html, "rss")
|
6
|
+
atom_links = scan_for_tag(html, "atom")
|
10
7
|
|
11
8
|
case favor
|
12
9
|
when :all
|
@@ -18,10 +15,24 @@ module TruffleHog
|
|
18
15
|
end
|
19
16
|
end
|
20
17
|
|
21
|
-
def self.scan_for_tag(html,
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
18
|
+
def self.scan_for_tag(html, type)
|
19
|
+
urls(html, "link", type) + urls(html, "a", type)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.urls(html, tag, type)
|
23
|
+
tags = html.scan(/(<#{tag}.*?>)/).flatten
|
24
|
+
feed_tags = collect(tags, type)
|
25
|
+
feed_tags.map do |tag|
|
26
|
+
url = tag.match(/.*href=['"](.*?)['"].*/)[1]
|
27
|
+
url =~ /^http.*/ ? url : nil
|
28
|
+
end.compact
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.collect(tags, type)
|
32
|
+
tags.collect {|t| t if feed?(t, type)}.compact
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.feed?(html, type)
|
36
|
+
html =~ /.*type=['"]application\/#{type}\+xml['"].*/
|
26
37
|
end
|
27
38
|
end
|
data/spec/truffle-hog_spec.rb
CHANGED
@@ -6,10 +6,7 @@ describe "parsing html" do
|
|
6
6
|
end
|
7
7
|
|
8
8
|
it "parses all feed urls" do
|
9
|
-
TruffleHog.parse_feed_urls(@html).should == ["http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss",
|
10
|
-
"http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml",
|
11
|
-
"http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom",
|
12
|
-
"http://www.pauldix.net/in_head/atom.xml"]
|
9
|
+
TruffleHog.parse_feed_urls(@html).should == ["http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss", "http://www.pauldix.net/in_head/atom.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom"]
|
13
10
|
end
|
14
11
|
|
15
12
|
it "parses rss feeds from the link tags in head" do
|
@@ -41,4 +38,12 @@ describe "parsing html" do
|
|
41
38
|
|
42
39
|
it "returns atom feeds if rss is favored, but none are found"
|
43
40
|
it "returns rss feeds if atom is favored, but none are found"
|
41
|
+
|
42
|
+
describe "odd regressions" do
|
43
|
+
it "doesn't go into an infinite loop on this input" do
|
44
|
+
input = File.read("#{File.dirname(__FILE__)}/infinite.html")
|
45
|
+
feed_urls = TruffleHog.parse_feed_urls(input)
|
46
|
+
feed_urls.should == ["http://feeds.feedburner.com/cryptload"]
|
47
|
+
end
|
48
|
+
end
|
44
49
|
end
|