feed_detector 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +26 -2
- data/lib/feed_detector/version.rb +1 -1
- data/lib/feed_detector.rb +31 -38
- data/test/feed_detector_test.rb +4 -4
- metadata +1 -1
data/README.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
# FeedDetector
|
2
2
|
|
3
|
-
|
3
|
+
Detecting RSS feeds: when you use a proper browser like Mozilla Firefox, you will see a syndication icon every time you visit a website that has RSS feeds.
|
4
|
+
|
5
|
+
The browser does this by reading the `<link>` tags in the page's HTML that advertise a feed (for example, `type="application/atom+xml"`).
|
6
|
+
|
7
|
+
After a quick search I couldn’t find any code to do this in my own project, so I wrote a little piece of code for it, with a Ruby on Rails integration test.
|
4
8
|
|
5
9
|
## Installation
|
6
10
|
|
@@ -18,7 +22,27 @@ Or install it yourself as:
|
|
18
22
|
|
19
23
|
## Usage
|
20
24
|
|
21
|
-
|
25
|
+
FeedDetector.fetch_feed_urls('http://www.rubycorner.com')
|
26
|
+
=> ["http://www.rubycorner.com/feeds/updated/atom10", "http://www.rubycorner.com/feeds/updated/rss20"]
|
27
|
+
|
28
|
+
FeedDetector.fetch_feed_urls('http://blog.dominiek.com/')
|
29
|
+
=> ["http://blog.dominiek.com/feed/atom.xml"]
|
30
|
+
|
31
|
+
FeedDetector.fetch_feed_urls('http://blog.dominiek.com/feed/atom.xml')
|
32
|
+
=> ["http://blog.dominiek.com/feed/atom.xml"]
|
33
|
+
|
34
|
+
FeedDetector.fetch_feed_urls('http://www.flickr.com/photos/dominiekterheide/', :rss)
|
35
|
+
=> ["http://api.flickr.com/services/feeds/photos_public.gne?id=71386598@N00&lang=en-us&format=rss_200"]
|
36
|
+
|
37
|
+
Alternatively, you can parse HTML you already have with:
|
38
|
+
|
39
|
+
FeedDetector.get_feed_paths(html_data)
|
40
|
+
|
41
|
+
See the integration test for more examples.
|
42
|
+
|
43
|
+
## TODO
|
44
|
+
* Decouple parsing and retrieving data.
|
45
|
+
* Tests should not depend on data retrieved from the web taking a particular form.
|
22
46
|
|
23
47
|
## Contributing
|
24
48
|
|
data/lib/feed_detector.rb
CHANGED
@@ -1,35 +1,16 @@
|
|
1
1
|
require "feed_detector/version"
|
2
2
|
require "net/http"
|
3
3
|
require "uri"
|
4
|
+
require "nokogiri"
|
4
5
|
|
5
6
|
module FeedDetector
|
6
7
|
|
7
|
-
#
|
8
|
+
# returns all feed urls from one url
|
8
9
|
# for example: http://blog.dominiek.com/ => http://blog.dominiek.com/feed/atom.xml
|
9
10
|
# only_detect can force detection of :rss or :atom
|
10
|
-
# if nil is returned
|
11
|
-
def self.url_from_string(url)
|
12
|
-
if url =~ /^http:\/\//
|
13
|
-
url
|
14
|
-
else
|
15
|
-
"http://#{url}"
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
## converts relative urls to absolute urls
|
20
|
-
def self.to_absolute_url(page_url,feed_url)
|
21
|
-
if feed_url =~ /^http:\/\// ## if its absolute
|
22
|
-
feed_url
|
23
|
-
elsif feed_url =~ /^\// ## relative to the host root ## '/some_dir_from_root/feed.xml'
|
24
|
-
"http://#{URI.parse(page_url).host.to_s + feed_url}"
|
25
|
-
else ## relative to the page path ## 'feed.xml'
|
26
|
-
feed_path = page_url.scan(/^(http:\/\/[^\/]+)((?:\/[^\/]+)+(?=\/))?\/?(?:[^\/]+)?$/i).to_s
|
27
|
-
feed_path +'/'+ feed_url
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
11
|
+
# if nil is returned, no feeds were detected - perhaps because page_url is itself the feed url
|
31
12
|
def self.fetch_feed_urls(page_url, only_detect=nil)
|
32
|
-
retries = 3
|
13
|
+
retries = 3 # default retries
|
33
14
|
html = ""
|
34
15
|
begin
|
35
16
|
response = Net::HTTP.get_response(URI.parse(page_url)) # sends get request
|
@@ -47,30 +28,42 @@ module FeedDetector
|
|
47
28
|
feed_urls.map { |feed_url| self.to_absolute_url(page_url, feed_url) }
|
48
29
|
end
|
49
30
|
|
50
|
-
##
|
51
31
|
# get the feed href from an HTML document
|
52
32
|
# for example:
|
53
33
|
# ...
|
54
34
|
# <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />
|
55
|
-
# ...
|
56
35
|
# => /feed/atom.xml
|
57
36
|
# only_detect can force detection of :rss or :atom
|
58
37
|
def self.get_feed_paths(html, only_detect=nil)
|
59
|
-
matches =[]
|
38
|
+
matches = []
|
60
39
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
matches
|
70
|
-
# matches |= rss_feed
|
40
|
+
# parse html with nokogiri to find all link tags (see self.get_link_href)
|
41
|
+
doc = Nokogiri::HTML.parse(html)
|
42
|
+
|
43
|
+
unless only_detect
|
44
|
+
["rss", "atom"].each do |type|
|
45
|
+
matches << get_link_href(doc, type)
|
46
|
+
end
|
47
|
+
else
|
48
|
+
matches << get_link_href(doc, only_detect.to_s)
|
71
49
|
end
|
72
50
|
|
73
|
-
|
74
|
-
|
51
|
+
matches.flatten
|
52
|
+
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
56
|
+
# find all link tags of the given type and return their href attributes
|
57
|
+
# types: rss or atom
|
58
|
+
def self.get_link_href doc, type
|
59
|
+
doc.css('link[type="application/'+type+'+xml"]').map { |link| link['href'] }
|
75
60
|
end
|
61
|
+
|
62
|
+
# converts relative urls to absolute urls
|
63
|
+
def self.to_absolute_url(page_url,feed_url)
|
64
|
+
return feed_url if feed_url =~ /^http:\/\// # already absolute
|
65
|
+
File.join(page_url, feed_url)
|
66
|
+
end
|
67
|
+
|
68
|
+
|
76
69
|
end
|
data/test/feed_detector_test.rb
CHANGED
@@ -7,7 +7,7 @@ class FeedDetectorTest < Test::Unit::TestCase
|
|
7
7
|
@body = []
|
8
8
|
|
9
9
|
@wordpress_atom_url = 'http://bettysteger.com/feed/' # the link says it's RSS, the XML is really ATOM
|
10
|
-
@wordpress_single_feed_page_url = 'http://
|
10
|
+
@wordpress_single_feed_page_url = 'http://showmaniac.org'
|
11
11
|
@wordpress_several_feed_page_url = 'http://bettysteger.com/'
|
12
12
|
|
13
13
|
@blogger_atom_url = 'http://ethandraws.blogspot.com/feeds/posts/default'
|
@@ -80,12 +80,12 @@ class FeedDetectorTest < Test::Unit::TestCase
|
|
80
80
|
|
81
81
|
def test_fetch_feed_urls
|
82
82
|
# page containing a single feed pointer
|
83
|
-
result = ["http://
|
83
|
+
result = ["http://showmaniac.org/feed"]
|
84
84
|
feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url)
|
85
85
|
assert_equal(result, feed_paths)
|
86
|
-
feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :rss)
|
87
|
-
assert_equal(result, feed_paths)
|
88
86
|
feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :atom)
|
87
|
+
assert_equal(result, feed_paths)
|
88
|
+
feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :rss)
|
89
89
|
assert_equal([], feed_paths)
|
90
90
|
|
91
91
|
# page containing several feed pointers
|