truffle-hog 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/truffle-hog.rb +22 -11
- data/spec/truffle-hog_spec.rb +9 -4
- metadata +1 -1
data/lib/truffle-hog.rb
CHANGED
@@ -1,12 +1,9 @@
|
|
1
1
|
module TruffleHog
|
2
|
-
VERSION = "0.0.1"
|
2
|
+
VERSION = "0.0.2"
|
3
3
|
|
4
4
|
def self.parse_feed_urls(html, favor = :all)
|
5
|
-
rss_links =
|
6
|
-
atom_links =
|
7
|
-
|
8
|
-
rss_links = (scan_for_tag(html, "a", "rss") + scan_for_tag(html, "link", "rss")).flatten.uniq
|
9
|
-
atom_links = (scan_for_tag(html, "a", "atom") + scan_for_tag(html, "link", "atom")).flatten.uniq
|
5
|
+
rss_links = scan_for_tag(html, "rss")
|
6
|
+
atom_links = scan_for_tag(html, "atom")
|
10
7
|
|
11
8
|
case favor
|
12
9
|
when :all
|
@@ -18,10 +15,24 @@ module TruffleHog
|
|
18
15
|
end
|
19
16
|
end
|
20
17
|
|
21
|
-
def self.scan_for_tag(html,
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
18
|
+
def self.scan_for_tag(html, type)
|
19
|
+
urls(html, "link", type) + urls(html, "a", type)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.urls(html, tag, type)
|
23
|
+
tags = html.scan(/(<#{tag}.*?>)/).flatten
|
24
|
+
feed_tags = collect(tags, type)
|
25
|
+
feed_tags.map do |tag|
|
26
|
+
url = tag.match(/.*href=['"](.*?)['"].*/)[1]
|
27
|
+
url =~ /^http.*/ ? url : nil
|
28
|
+
end.compact
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.collect(tags, type)
|
32
|
+
tags.collect {|t| t if feed?(t, type)}.compact
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.feed?(html, type)
|
36
|
+
html =~ /.*type=['"]application\/#{type}\+xml['"].*/
|
26
37
|
end
|
27
38
|
end
|
data/spec/truffle-hog_spec.rb
CHANGED
@@ -6,10 +6,7 @@ describe "parsing html" do
|
|
6
6
|
end
|
7
7
|
|
8
8
|
it "parses all feed urls" do
|
9
|
-
TruffleHog.parse_feed_urls(@html).should == ["http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss",
|
10
|
-
"http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml",
|
11
|
-
"http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom",
|
12
|
-
"http://www.pauldix.net/in_head/atom.xml"]
|
9
|
+
TruffleHog.parse_feed_urls(@html).should == ["http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss", "http://www.pauldix.net/in_head/atom.xml", "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom"]
|
13
10
|
end
|
14
11
|
|
15
12
|
it "parses rss feeds from the link tags in head" do
|
@@ -41,4 +38,12 @@ describe "parsing html" do
|
|
41
38
|
|
42
39
|
it "returns atom feeds if rss is favored, but none are found"
|
43
40
|
it "returns rss feeds if atom is favored, but none are found"
|
41
|
+
|
42
|
+
describe "odd regressions" do
|
43
|
+
it "doesn't go into an infinite loop on this input" do
|
44
|
+
input = File.read("#{File.dirname(__FILE__)}/infinite.html")
|
45
|
+
feed_urls = TruffleHog.parse_feed_urls(input)
|
46
|
+
feed_urls.should == ["http://feeds.feedburner.com/cryptload"]
|
47
|
+
end
|
48
|
+
end
|
44
49
|
end
|