RubyGems - feed_detector - Versions diffs - 0.0.1 → 0.0.2 - Mend

feed_detector 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

data/README.md CHANGED Viewed

@@ -1,6 +1,10 @@
 # FeedDetector
-TODO: Write a gem description
+Detecting RSS feeds: When you use a proper browser like Mozilla Firefox, you will see a syndication icon every time you visit a website that has RSS feeds.
+It does this by reading certain HTML tags.
+After a quick search I couldn’t find any code to do this in my own project, so I wrote a small piece of code for it, along with a Ruby on Rails integration test.
 ## Installation
@@ -18,7 +22,27 @@ Or install it yourself as:
 ## Usage
-TODO: Write usage instructions here
+    FeedDetector.fetch_feed_urls('http://www.rubycorner.com')
+    => ["http://www.rubycorner.com/feeds/updated/atom10", "http://www.rubycorner.com/feeds/updated/rss20"]
+    FeedDetector.fetch_feed_urls('http://blog.dominiek.com/')
+     => ["http://blog.dominiek.com/feed/atom.xml"]
+    FeedDetector.fetch_feed_urls('http://blog.dominiek.com/feed/atom.xml')
+    => ["http://blog.dominiek.com/feed/atom.xml"]
+    FeedDetector.fetch_feed_urls('http://www.flickr.com/photos/dominiekterheide/', :rss)
+    => ["http://api.flickr.com/services/feeds/photos_public.gne?id=71386598@N00&lang=en-us&format=rss_200"]
+Alternatively, you can parse HTML you have already retrieved with:
+    FeedDetector.get_feed_paths(html_data)
+See the integration test for more examples.
+## TODO
+* Decouple parsing and retrieving data.
+* Tests should not depend on data retrieved from the web taking a particular form.
 ## Contributing

data/lib/feed_detector/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module FeedDetector
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end

data/lib/feed_detector.rb CHANGED Viewed

@@ -1,35 +1,16 @@
 require "feed_detector/version"
 require "net/http"
 require "uri"
+require "nokogiri"
 module FeedDetector
-  # return the feed url for a url
+  # returns all feed urls from one url
   # for example: http://blog.dominiek.com/ => http://blog.dominiek.com/feed/atom.xml
   # only_detect can force detection of :rss or :atom
-  # if nil is returned the has no discernible feed url -- perhaps because it's the feed url
-  def self.url_from_string(url)
-    if url =~ /^http:\/\//
-      url
-    else
-      "http://#{url}"
-    end
-  end
-  ## converts relative urls to absolute urls
-  def self.to_absolute_url(page_url,feed_url)
-    if feed_url =~ /^http:\/\// ## if its absolute
-      feed_url
-    elsif feed_url =~ /^\//  ## relative to the host root ## '/some_dir_from_root/feed.xml'
-      "http://#{URI.parse(page_url).host.to_s + feed_url}"
-    else  ## relative to the page path ## 'feed.xml'
-      feed_path = page_url.scan(/^(http:\/\/[^\/]+)((?:\/[^\/]+)+(?=\/))?\/?(?:[^\/]+)?$/i).to_s
-      feed_path +'/'+ feed_url
-    end
-  end
+  # if nil is returned no feeds were detected - perhaps because it's the feed url
   def self.fetch_feed_urls(page_url, only_detect=nil)
-    retries = 3 ## default retries
+    retries = 3 # default retries
     html = ""
     begin
       response = Net::HTTP.get_response(URI.parse(page_url)) # sends get request
@@ -47,30 +28,42 @@ module FeedDetector
     feed_urls.map { |feed_url| self.to_absolute_url(page_url, feed_url) }
   end
-  ##
   # get the feed href from an HTML document
   # for example:
   # ...
   # <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />
-  # ...
   # => /feed/atom.xml
   # only_detect can force detection of :rss or :atom
   def self.get_feed_paths(html, only_detect=nil)
-    matches =[]
+    matches = []
-    unless only_detect && only_detect != :atom
-      matches |= html.scan(/<link.*href=['"]*([^\s'"]+)['"]*.*application\/atom\+xml.*>/)
-      matches |= html.scan(/<link.*application\/atom\+xml.*href=['"]*([^\s'"]+)['"]*.*>/)
-      #matches |=  atom_feed
-    end
-    unless only_detect && only_detect != :rss
-      matches |= html.scan(/<link.*href=['"]*([^\s'"]+)['"]*.*application\/rss\+xml.*>/)
-      matches |= html.scan(/<link.*application\/rss\+xml.*href=['"]*([^\s'"]+)['"]*.*>/)
-    #  matches |= rss_feed
+    # parse html with nokogiri to find all link tags (see self.get_link_href)
+    doc = Nokogiri::HTML.parse(html)
+    unless only_detect
+      ["rss", "atom"].each do |type|
+        matches << get_link_href(doc, type)
+      end
+    else
+      matches << get_link_href(doc, only_detect.to_s)
     end
-    flattened_matches = matches.flatten
-    flattened_matches
+    matches.flatten
+  end
+  private
+  # finding all link tags and get the attribute href
+  # types: rss or atom
+  def self.get_link_href doc, type
+    doc.css('link[type="application/'+type+'+xml"]').map { |link| link['href'] }
   end
+  # converts relative urls to absolute urls
+  def self.to_absolute_url(page_url,feed_url)
+    return feed_url if feed_url =~ /^http:\/\// # already absolute
+    File.join(page_url, feed_url)
+  end
 end

data/test/feed_detector_test.rb CHANGED Viewed

@@ -7,7 +7,7 @@ class FeedDetectorTest < Test::Unit::TestCase
     @body = []
     @wordpress_atom_url = 'http://bettysteger.com/feed/' # the link says it's RSS, the XML is really ATOM
-    @wordpress_single_feed_page_url = 'http://9gag.com/'
+    @wordpress_single_feed_page_url = 'http://showmaniac.org'
     @wordpress_several_feed_page_url = 'http://bettysteger.com/'
     @blogger_atom_url = 'http://ethandraws.blogspot.com/feeds/posts/default'
@@ -80,12 +80,12 @@ class FeedDetectorTest < Test::Unit::TestCase
   def test_fetch_feed_urls
     # page containing a single feed pointer
-    result = ["http://9gag.com/rss/site/feed.rss"]
+    result = ["http://showmaniac.org/feed"]
     feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url)
     assert_equal(result, feed_paths)
-    feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :rss)
-    assert_equal(result, feed_paths)
     feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :atom)
+    assert_equal(result, feed_paths)
+    feed_paths = FeedDetector.fetch_feed_urls(@wordpress_single_feed_page_url, :rss)
     assert_equal([], feed_paths)
     # page containing several feed pointers

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: feed_detector
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors: