feed_detector 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in feed_detector.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Bettina Steger
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # FeedDetector
2
+
3
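+ A Ruby gem based on code from Dominiek's post (http://synaptify.com/?p=93) on detecting feeds: it finds the RSS and Atom feed URLs that a web page advertises in its `<link>` tags.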
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'feed_detector'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install feed_detector
18
+
19
+ ## Usage
20
+
21
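+ Call `FeedDetector.fetch_feed_urls(page_url)` to download a page and get back an array of the feed URLs it advertises. Pass `:rss` or `:atom` as a second argument to restrict the result to one format. To inspect HTML you already have, use `FeedDetector.get_feed_paths(html)`.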
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,17 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/feed_detector/version', __FILE__)
3
+
4
# Packaging metadata for the feed_detector gem.
#
# The file list is taken from git, so the gem must be built from inside a
# git checkout of the project.
Gem::Specification.new do |gem|
  gem.name          = "feed_detector"
  gem.version       = FeedDetector::VERSION
  gem.authors       = ["Bettina Steger"]
  gem.email         = ["bettina_steger@gmx.at"]
  gem.description   = %q{A ruby gem based on code from Dominiek's post (http://synaptify.com/?p=93) on detecting feeds. }
  gem.summary       = %q{A ruby gem based on code from Dominiek's post (http://synaptify.com/?p=93) on detecting feeds. }
  # NOTE(review): no project URL is recorded anywhere in the package; fill this in.
  gem.homepage      = ""
  # The package ships a LICENSE file containing the MIT License.
  gem.license       = "MIT"

  # `git ls-files` prints one path per line. Split on the newline explicitly:
  # the previous separator, $\ (the *output* record separator), is nil by
  # default, which makes String#split break on any whitespace and therefore
  # mangles file names that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,3 @@
1
# Namespace of the feed_detector gem. This file only defines the release
# number so that the gemspec can load it without loading the library itself.
module FeedDetector
  # Version string of this release of the gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,76 @@
1
+ require "feed_detector/version"
2
+ require "net/http"
3
+ require "uri"
4
+
5
# Discovers the RSS / Atom feeds a web page advertises through
# <link ... type="application/rss+xml|atom+xml" href="..."> tags.
module FeedDetector
  # Matches URLs that already carry an http or https scheme.
  ABSOLUTE_URL_PATTERN = %r{\Ahttps?://}i

  # Matches one complete <link ...> tag. [^>] also matches newlines, so a tag
  # wrapped over several lines is found, and several tags on one line are
  # returned separately.
  LINK_TAG_PATTERN = /<link\b[^>]*>/i

  # Captures the value of the href attribute: double-quoted, single-quoted or
  # unquoted. The lookbehind keeps attributes such as data-href from matching.
  HREF_PATTERN = /(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i

  # MIME type that marks a <link> tag as pointing to each feed format.
  # Insertion order (atom, then rss) is the order results are reported in.
  FEED_TYPE_PATTERNS = {
    :atom => %r{application/atom\+xml}i,
    :rss  => %r{application/rss\+xml}i
  }.freeze

  # Ensures a URL string carries a scheme.
  #
  # @param url [String] a URL with or without a leading http:// or https://
  # @return [String] +url+ unchanged when it already starts with http:// or
  #   https://, otherwise +url+ prefixed with "http://"
  def self.url_from_string(url)
    url =~ ABSOLUTE_URL_PATTERN ? url : "http://#{url}"
  end

  # Converts a feed href found on a page into an absolute URL.
  #
  # An href that already starts with http:// or https:// is returned
  # untouched. Anything else ('/feed.xml', 'feed.xml', '//host/feed.xml') is
  # resolved against +page_url+ with URI.join, which keeps the page's scheme
  # and port.
  #
  # @param page_url [String] absolute URL of the page the href was found on
  # @param feed_url [String] href value taken from the page
  # @return [String] absolute feed URL
  # @raise [URI::InvalidURIError] if either argument cannot be parsed as a URI
  def self.to_absolute_url(page_url, feed_url)
    return feed_url if feed_url =~ ABSOLUTE_URL_PATTERN

    URI.join(page_url, feed_url).to_s
  end

  # Downloads a page and returns the absolute URLs of the feeds it advertises.
  # For example: http://blog.dominiek.com/ => ["http://blog.dominiek.com/feed/atom.xml"]
  #
  # The request is attempted up to three times, pausing 0.42 seconds between
  # attempts; the error of the last failed attempt is re-raised. Redirects
  # are not followed: the body of the first response is what gets scanned.
  #
  # @param page_url [String] absolute URL of the page to inspect
  # @param only_detect [Symbol, nil] :rss or :atom to restrict the result to
  #   one feed format; nil returns both
  # @return [Array<String>] absolute feed URLs, empty when none are advertised
  # @raise [URI::InvalidURIError] if +page_url+ cannot be parsed
  # @raise [StandardError] whatever the last failed request attempt raised
  def self.fetch_feed_urls(page_url, only_detect=nil)
    # Parsed outside the retry loop: a malformed URL will not become valid on
    # a second attempt, so it should fail immediately.
    uri = URI.parse(page_url)
    attempts_left = 3

    begin
      # A response may have no body at all; to_s turns that nil into "".
      html = Net::HTTP.get_response(uri).body.to_s
    rescue StandardError
      attempts_left -= 1
      raise if attempts_left <= 0

      sleep 0.42
      retry
    end

    get_feed_paths(html, only_detect).map { |feed_url| to_absolute_url(page_url, feed_url) }
  end

  # Extracts the feed hrefs from an HTML document.
  # For example:
  #   ...
  #   <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />
  #   ...
  #   => ["/feed/atom.xml"]
  #
  # Every <link> tag whose text mentions the Atom or RSS MIME type counts,
  # whatever its rel attribute is. Hrefs are returned exactly as written in
  # the document (possibly relative), without duplicates, Atom feeds first
  # and then RSS feeds, each group in document order.
  #
  # @param html [String, nil] HTML source; nil is treated as an empty document
  # @param only_detect [Symbol, nil] :rss or :atom to restrict the result to
  #   one feed format; nil returns both
  # @return [Array<String>] feed hrefs, empty when none are found
  def self.get_feed_paths(html, only_detect=nil)
    link_tags = html.to_s.scan(LINK_TAG_PATTERN)
    wanted_kinds = FEED_TYPE_PATTERNS.keys.select { |kind| !only_detect || only_detect == kind }

    hrefs = wanted_kinds.map do |kind|
      matching_tags = link_tags.select { |tag| tag =~ FEED_TYPE_PATTERNS[kind] }
      matching_tags.map { |tag| href_from(tag) }
    end

    hrefs.flatten.compact.uniq
  end

  # Returns the href value of a single <link> tag, or nil when the tag has no
  # href or an empty one.
  def self.href_from(link_tag)
    match = HREF_PATTERN.match(link_tag)
    return nil unless match

    href = (match[1] || match[2] || match[3]).strip
    href.empty? ? nil : href
  end
  private_class_method :href_from
end
@@ -0,0 +1,144 @@
1
+ require 'test/unit'
2
+ $LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib')
3
+ require 'feed_detector'
4
+
5
# Tests for FeedDetector.
#
# test_get_feed_path and test_get_feed_paths_with_only_detect run against
# HTML built in memory by the private helpers at the bottom of the class.
# test_fetch_feed_urls calls FeedDetector.fetch_feed_urls on real sites, so
# it needs network access and depends on what those sites currently serve.
class FeedDetectorTest < Test::Unit::TestCase
  def setup
    @body = []

    # the link says it's RSS, the XML is really ATOM
    @wordpress_atom_url = 'http://bettysteger.com/feed/'
    @wordpress_single_feed_page_url = 'http://9gag.com/'
    @wordpress_several_feed_page_url = 'http://bettysteger.com/'

    @blogger_atom_url = 'http://ethandraws.blogspot.com/feeds/posts/default'
    @blogger_other_atom_url = 'http://www.blogger.com/feeds/21351008/posts/default'
    @blogger_rss_url = 'http://ethandraws.blogspot.com/feeds/posts/default?alt=rss'
    @blogger_page_url = 'http://ethandraws.blogspot.com/'
  end

  def test_get_feed_path
    # page containing no feeds
    assert_equal([], detect(nil_feed_html))

    # page containing a single atom feed
    assert_equal(['/feed/atom.xml'], detect(single_atom_feed_html))

    # page containing a single rss feed
    assert_equal(['/feed/blog.xml'], detect(single_rss_feed_html))

    # page containing several feeds
    expected = [
      "/feed/blog.xml",
      "http://ethandraws.blogspot.com/feeds/posts/default",
      "http://ethandraws.blogspot.com/feeds/posts/default?alt=rss",
      "http://giftedslacker.com/feed/",
      "http://www.blogger.com/feeds/21351008/posts/default"
    ]
    assert_equal(expected, detect(multi_feed_html).sort)
  end

  def test_get_feed_paths_with_only_detect
    # page containing no feeds w/rss
    assert_equal([], detect(nil_feed_html, :rss))

    # page containing no feeds w/atom
    assert_equal([], detect(nil_feed_html, :atom))

    # page containing a single feed w/rss
    assert_equal(['/feed/blog.xml'], detect(single_rss_feed_html, :rss))

    # page containing a single feed w/atom
    assert_equal(['/feed/atom.xml'], detect(single_atom_feed_html, :atom))

    # page containing several feeds w/rss
    expected_rss = [
      "/feed/blog.xml",
      "http://ethandraws.blogspot.com/feeds/posts/default?alt=rss",
      "http://giftedslacker.com/feed/"
    ]
    assert_equal(expected_rss, detect(multi_feed_html(:rss), :rss).sort)

    # page containing several feeds w/atom
    expected_atom = [
      "/feed/atom.xml",
      "http://ethandraws.blogspot.com/feeds/posts/default",
      "http://www.blogger.com/feeds/21351008/posts/default"
    ]
    assert_equal(expected_atom, detect(multi_feed_html(:atom), :atom).sort)
  end

  def test_fetch_feed_urls
    # page containing a single feed pointer
    single_page = @wordpress_single_feed_page_url
    single_feed = ["http://9gag.com/rss/site/feed.rss"]
    assert_equal(single_feed, FeedDetector.fetch_feed_urls(single_page))
    assert_equal(single_feed, FeedDetector.fetch_feed_urls(single_page, :rss))
    assert_equal([], FeedDetector.fetch_feed_urls(single_page, :atom))

    # page containing several feed pointers
    several_feeds = [@wordpress_atom_url, "http://bettysteger.com/comments/feed/"]
    assert_equal(several_feeds, FeedDetector.fetch_feed_urls(@wordpress_several_feed_page_url))
    # TODO: the :rss / :atom variants for the several-feed page are not asserted
  end

  #TODO: add tests for malformed urls

  private

  # Wraps +lines+ in a page and returns what FeedDetector.get_feed_paths
  # finds in it.
  def detect(lines, only_detect=nil)
    page = make_html(lines).join("\n")
    FeedDetector.get_feed_paths(page, only_detect)
  end

  # Four absolute feed links (two atom, two rss) plus one relative link whose
  # format is chosen by +spec+: rss for :rss, atom for anything else.
  def multi_feed_html(spec=:rss)
    relative_link =
      if spec == :rss
        ' <link href="/feed/blog.xml" rel="alternate" type="application/rss+xml" />'
      else
        ' <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />'
      end

    [
      ' <link rel="alternate" type="application/atom+xml" title="Ethan Draws - Atom" href="http://ethandraws.blogspot.com/feeds/posts/default" />',
      ' <link rel="service.post" type="application/atom+xml" title="Ethan Draws - Atom" href="http://www.blogger.com/feeds/21351008/posts/default" />',
      ' <link rel="alternate" type="application/rss+xml" title="Ethan Draws - RSS" href="http://ethandraws.blogspot.com/feeds/posts/default?alt=rss" />',
      ' <link rel="alternate" type="application/rss+xml" title="Gifted Slacker RSS Feed" href="http://giftedslacker.com/feed/" />',
      relative_link
    ]
  end

  # A single relative atom feed link.
  def single_atom_feed_html
    [' <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />']
  end

  # A single relative rss feed link.
  def single_rss_feed_html
    [' <link href="/feed/blog.xml" rel="alternate" type="application/rss+xml" />']
  end

  # No feed links at all.
  def nil_feed_html
    []
  end

  # Builds the lines of a minimal page whose <head> holds a stylesheet link
  # followed by +lines+. The result is also kept in @body.
  def make_html(lines)
    @body = [
      ' <html>',
      ' <head>',
      ' <link href="/super.css" rel="alternate" type="text/css"/>'
    ]
    @body.concat(lines)
    @body.push(' </head>', ' </html>')
  end
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feed_detector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Bettina Steger
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-27 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! 'A ruby gem based on code from Dominiek''s post (http://synaptify.com/?p=93)
15
+ on detecting feeds. '
16
+ email:
17
+ - bettina_steger@gmx.at
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - .gitignore
23
+ - Gemfile
24
+ - LICENSE
25
+ - README.md
26
+ - Rakefile
27
+ - feed_detector.gemspec
28
+ - lib/feed_detector.rb
29
+ - lib/feed_detector/version.rb
30
+ - test/feed_detector_test.rb
31
+ homepage: ''
32
+ licenses: []
33
+ post_install_message:
34
+ rdoc_options: []
35
+ require_paths:
36
+ - lib
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 1.8.11
52
+ signing_key:
53
+ specification_version: 3
54
+ summary: A ruby gem based on code from Dominiek's post (http://synaptify.com/?p=93)
55
+ on detecting feeds.
56
+ test_files:
57
+ - test/feed_detector_test.rb
58
+ has_rdoc: