feed_detector 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,6 +1,10 @@
1
1
  # FeedDetector
2
2
 
3
- TODO: Write a gem description
3
+ Detecting RSS feeds: When you use a proper browser like Mozilla Firefox, you will see a syndication icon every time you visit a website that has RSS feeds.
4
+
5
+ It does this by reading certain HTML tags.
6
+
7
+ After a quick search I couldn’t find any code to do this in my own project, so I wrote a little piece of code for it, together with a Ruby on Rails integration test.
4
8
 
5
9
  ## Installation
6
10
 
@@ -18,7 +22,27 @@ Or install it yourself as:
18
22
 
19
23
  ## Usage
20
24
 
21
- TODO: Write usage instructions here
25
+ FeedDetector.fetch_feed_urls('http://www.rubycorner.com')
26
+ => ["http://www.rubycorner.com/feeds/updated/atom10", "http://www.rubycorner.com/feeds/updated/rss20"]
27
+
28
+ FeedDetector.fetch_feed_urls('http://blog.dominiek.com/')
29
+ => ["http://blog.dominiek.com/feed/atom.xml"]
30
+
31
+ FeedDetector.fetch_feed_urls('http://blog.dominiek.com/feed/atom.xml')
32
+ => ["http://blog.dominiek.com/feed/atom.xml"]
33
+
34
+ FeedDetector.fetch_feed_urls('http://www.flickr.com/photos/dominiekterheide/', :rss)
35
+ => ["http://api.flickr.com/services/feeds/photos_public.gne?id=71386598@N00&lang=en-us&format=rss_200"]
36
+
37
+ Alternatively, you can parse HTML with:
38
+
39
+ FeedDetector.get_feed_paths(html_data)
40
+
41
+ See the integration test for more examples.
42
+
43
+ ## TODO
44
+ * Decouple parsing and retrieving data.
45
+ * Tests should not depend on data retrieved from the web taking a particular form.
22
46
 
23
47
  ## Contributing
24
48
 
@@ -1,3 +1,3 @@
1
1
  module FeedDetector
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/feed_detector.rb CHANGED
@@ -1,35 +1,16 @@
1
1
  require "feed_detector/version"
2
2
  require "net/http"
3
3
  require "uri"
4
+ require "nokogiri"
4
5
 
5
6
  module FeedDetector
6
7
 
7
- # return the feed url for a url
8
+ # returns all feed urls from one url
8
9
  # for example: http://blog.dominiek.com/ => http://blog.dominiek.com/feed/atom.xml
9
10
  # only_detect can force detection of :rss or :atom
10
- # if nil is returned the has no discernible feed url -- perhaps because it's the feed url
11
- def self.url_from_string(url)
12
- if url =~ /^http:\/\//
13
- url
14
- else
15
- "http://#{url}"
16
- end
17
- end
18
-
19
- ## converts relative urls to absolute urls
20
- def self.to_absolute_url(page_url,feed_url)
21
- if feed_url =~ /^http:\/\// ## if its absolute
22
- feed_url
23
- elsif feed_url =~ /^\// ## relative to the host root ## '/some_dir_from_root/feed.xml'
24
- "http://#{URI.parse(page_url).host.to_s + feed_url}"
25
- else ## relative to the page path ## 'feed.xml'
26
- feed_path = page_url.scan(/^(http:\/\/[^\/]+)((?:\/[^\/]+)+(?=\/))?\/?(?:[^\/]+)?$/i).to_s
27
- feed_path +'/'+ feed_url
28
- end
29
- end
30
-
11
+ # if nil is returned no feeds were detected - perhaps because it's the feed url
31
12
  def self.fetch_feed_urls(page_url, only_detect=nil)
32
- retries = 3 ## default retries
13
+ retries = 3 # default retries
33
14
  html = ""
34
15
  begin
35
16
  response = Net::HTTP.get_response(URI.parse(page_url)) # sends get request
@@ -47,30 +28,42 @@ module FeedDetector
47
28
  feed_urls.map { |feed_url| self.to_absolute_url(page_url, feed_url) }
48
29
  end
49
30
 
50
- ##
51
31
  # get the feed href from an HTML document
52
32
  # for example:
53
33
  # ...
54
34
  # <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />
55
- # ...
56
35
  # => /feed/atom.xml
57
36
  # only_detect can force detection of :rss or :atom
58
37
  def self.get_feed_paths(html, only_detect=nil)
59
- matches =[]
38
+ matches = []
60
39
 
61
- unless only_detect && only_detect != :atom
62
- matches |= html.scan(/<link.*href=['"]*([^\s'"]+)['"]*.*application\/atom\+xml.*>/)
63
- matches |= html.scan(/<link.*application\/atom\+xml.*href=['"]*([^\s'"]+)['"]*.*>/)
64
- #matches |= atom_feed
65
- end
66
-
67
- unless only_detect && only_detect != :rss
68
- matches |= html.scan(/<link.*href=['"]*([^\s'"]+)['"]*.*application\/rss\+xml.*>/)
69
- matches |= html.scan(/<link.*application\/rss\+xml.*href=['"]*([^\s'"]+)['"]*.*>/)
70
- # matches |= rss_feed
40
+ # parse html with nokogiri to find all link tags (see self.get_link_href)
41
+ doc = Nokogiri::HTML.parse(html)
42
+
43
+ unless only_detect
44
+ ["rss", "atom"].each do |type|
45
+ matches << get_link_href(doc, type)
46
+ end
47
+ else
48
+ matches << get_link_href(doc, only_detect.to_s)
71
49
  end
72
50
 
73
- flattened_matches = matches.flatten
74
- flattened_matches
51
+ matches.flatten
52
+ end
53
+
54
+ private
55
+
56
+ # finds all link tags of the given type and returns their href attributes
57
+ # types: rss or atom
58
+ def self.get_link_href doc, type
59
+ doc.css('link[type="application/'+type+'+xml"]').map { |link| link['href'] }
75
60
  end
61
+
62
+ # converts relative urls to absolute urls
63
+ def self.to_absolute_url(page_url,feed_url)
64
+ return feed_url if feed_url =~ /^http:\/\// # already absolute
65
+ File.join(page_url, feed_url)
66
+ end
67
+
68
+
76
69
  end
@@ -7,7 +7,7 @@ class FeedDetectorTest < Test::Unit::TestCase
7
7
  @body = []
8
8
 
9
9
  @wordpress_atom_url = 'http://bettysteger.com/feed/' # the link says it's RSS, the XML is really ATOM
10
- @wordpress_single_feed_page_url = 'http://9gag.com/'
10
+ @wordpress_single_feed_page_url = 'http://showmaniac.org'
11
11
  @wordpress_several_feed_page_url = 'http://bettysteger.com/'
12
12
 
13
13
  @blogger_atom_url = 'http://ethandraws.blogspot.com/feeds/posts/default'
@@ -80,12 +80,12 @@ class FeedDetectorTest < Test::Unit::TestCase
80
80
 
81
81
  def test_fetch_feed_urls
82
82
  # page containing a single feed pointer
83
- result = ["http://9gag.com/rss/site/feed.rss"]
83
+ result = ["http://showmaniac.org/feed"]
84
84
  feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url)
85
85
  assert_equal(result, feed_paths)
86
- feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :rss)
87
- assert_equal(result, feed_paths)
88
86
  feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :atom)
87
+ assert_equal(result, feed_paths)
88
+ feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :rss)
89
89
  assert_equal([], feed_paths)
90
90
 
91
91
  # page containing several feed pointers
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feed_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: