feedisco 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +11 -0
- data/Gemfile.lock +31 -0
- data/LICENSE +20 -0
- data/README.md +129 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/feedisco.gemspec +66 -0
- data/lib/feedisco/checks.rb +50 -0
- data/lib/feedisco/discovery.rb +57 -0
- data/lib/feedisco/utilities.rb +46 -0
- data/lib/feedisco/version.rb +6 -0
- data/lib/feedisco.rb +50 -0
- data/script/console +22 -0
- data/spec/fixtures/alternate.html +11 -0
- data/spec/fixtures/no_link.html +12 -0
- data/spec/fixtures/one_link_in_body.html +12 -0
- data/spec/fixtures/several_links.html +15 -0
- data/spec/lib/checks_spec.rb +36 -0
- data/spec/lib/discovery_spec.rb +117 -0
- data/spec/lib/utilities_spec.rb +41 -0
- data/spec/spec_helper.rb +22 -0
- metadata +120 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.7.5)
|
12
|
+
nokogiri (1.5.5)
|
13
|
+
rake (0.9.2.2)
|
14
|
+
rdoc (3.12)
|
15
|
+
json (~> 1.4)
|
16
|
+
rspec (2.11.0)
|
17
|
+
rspec-core (~> 2.11.0)
|
18
|
+
rspec-expectations (~> 2.11.0)
|
19
|
+
rspec-mocks (~> 2.11.0)
|
20
|
+
rspec-core (2.11.1)
|
21
|
+
rspec-expectations (2.11.3)
|
22
|
+
diff-lcs (~> 1.1.3)
|
23
|
+
rspec-mocks (2.11.3)
|
24
|
+
|
25
|
+
PLATFORMS
|
26
|
+
ruby
|
27
|
+
|
28
|
+
DEPENDENCIES
|
29
|
+
jeweler
|
30
|
+
nokogiri
|
31
|
+
rspec
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Romain Champourlier <romain@softr.li>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
# Feedisco
|
2
|
+
|
3
|
+
### Summary
|
4
|
+
|
5
|
+
Feedisco is a small and lightweight library focused on RSS/Atom feed discovery. It is intended to do little, but to do it well!
|
6
|
+
|
7
|
+
### Use case(s)
|
8
|
+
|
9
|
+
* I want to **find the URLs of the RSS/Atom feeds for my users' blogs**
|
10
|
+
* I want to **check that a user-supplied URL is truly an RSS/Atom feed URL**
|
11
|
+
* I want to let my users **choose from a list of discovered RSS/Atom feeds** after they enter their website URL
|
12
|
+
* Probably more...
|
13
|
+
|
14
|
+
### The methods
|
15
|
+
|
16
|
+
```
|
17
|
+
Feedisco.find(url)
|
18
|
+
Feedisco.feed?(url)
|
19
|
+
```
|
20
|
+
|
21
|
+
### How to start
|
22
|
+
|
23
|
+
**Install it with Rubygems**
|
24
|
+
|
25
|
+
```
|
26
|
+
$ gem install feedisco
|
27
|
+
$ irb
|
28
|
+
|
29
|
+
irb(main):001:0> require 'rubygems'
|
30
|
+
=> true
|
31
|
+
irb(main):002:0> require 'feedisco'
|
32
|
+
=> true
|
33
|
+
```
|
34
|
+
|
35
|
+
**...or clone it from GitHub**
|
36
|
+
|
37
|
+
```
|
38
|
+
$ git clone https://github.com/rchampourlier/feedisco
|
39
|
+
$ cd feedisco
|
40
|
+
$ bundle install
|
41
|
+
$ bundle exec script/console
|
42
|
+
```
|
43
|
+
|
44
|
+
_**Nota Bene:**_
|
45
|
+
|
46
|
+
* _`script/console` is a small script available in the repo that loads an IRB/Pry console with Feedisco preloaded, so it's ready to use! (feel like `rails console`)_
|
47
|
+
|
48
|
+
### Examples
|
49
|
+
|
50
|
+
```
|
51
|
+
irb(main):001:0> Feedisco.find('rchampourlier.com')
|
52
|
+
=> ["http://feeds.rchampourlier.com/rchampourlier"]
|
53
|
+
|
54
|
+
irb(main):002:0> Feedisco.find('google.com')
|
55
|
+
=> []
|
56
|
+
|
57
|
+
irb(main):003:0> Feedisco.feed?('google.com')
|
58
|
+
=> false
|
59
|
+
|
60
|
+
irb(main):007:0> Feedisco.feed?('feeds.rchampourlier.com/rchampourlier')
|
61
|
+
=> true
|
62
|
+
|
63
|
+
irb(main):010:0> puts Feedisco.find('http://edition.cnn.com/services/rss/')
|
64
|
+
http://rss.cnn.com/rss/edition.rss
|
65
|
+
http://rss.cnn.com/rss/edition_asia.rss
|
66
|
+
http://rss.cnn.com/rss/edition_europe.rss
|
67
|
+
http://rss.cnn.com/rss/edition_us.rss
|
68
|
+
http://rss.cnn.com/rss/edition_world.rss
|
69
|
+
http://rss.cnn.com/rss/edition_africa.rss
|
70
|
+
http://rss.cnn.com/rss/edition_americas.rss
|
71
|
+
[and a lot more...]
|
72
|
+
```
|
73
|
+
|
74
|
+
### Installation
|
75
|
+
|
76
|
+
Add this to your Gemfile (the gem will not be published to Rubygems until it is good enough, so tell me when you think it should be!):
|
77
|
+
|
78
|
+
```
|
79
|
+
gem 'feedisco', :git => 'https://github.com/rchampourlier/feedisco', :ref => 'master'
|
80
|
+
```
|
81
|
+
|
82
|
+
Or just clone it and do whatever you want:
|
83
|
+
|
84
|
+
```
|
85
|
+
$ git clone http://github.com/rchampourlier/feedisco
|
86
|
+
```
|
87
|
+
|
88
|
+
|
89
|
+
### Why should you use it?
|
90
|
+
|
91
|
+
* Well, because you need to discover feed URLs from a given URL, and nothing more!
|
92
|
+
* Because you want it to be simple, with clean code you can correct, complete, update, and test if you need to.
|
93
|
+
* Because you just need to add a line to your Gemfile to use it.
|
94
|
+
* Because it should still follow modern feed filename conventions (like the ones used by WordPress blogs, Blogger, etc.) - that's the part taken from [Feedbag](https://github.com/damog/feedbag)!
|
95
|
+
|
96
|
+
### Why did I build it?
|
97
|
+
|
98
|
+
* Because I wanted a **simple and lightweight** library to discover RSS/Atom feeds.
|
99
|
+
* Because [Feedbag](https://github.com/damog/feedbag) was using Hpricot, and I prefer Nokogiri.
|
100
|
+
* Because I wanted something with a few more tests than Feedbag.
|
101
|
+
|
102
|
+
### Bugs, issues, contributions
|
103
|
+
|
104
|
+
You can use the [GitHub repo](https://github.com/rchampourlier/feedisco) for all this, so don't hesitate!
|
105
|
+
|
106
|
+
Contribute by forking, tweaking, and sending pull requests, but please add the appropriate tests!
|
107
|
+
|
108
|
+
### History
|
109
|
+
|
110
|
+
#### `0.1.1`
|
111
|
+
|
112
|
+
* Completed `Rakefile`
|
113
|
+
* Changed a method from private to public, specs are green again
|
114
|
+
|
115
|
+
#### `0.1.0`
|
116
|
+
|
117
|
+
* Initial release
|
118
|
+
|
119
|
+
### Author
|
120
|
+
|
121
|
+
[Romain Champourlier](http://softr.li)
|
122
|
+
|
123
|
+
### Copyright
|
124
|
+
|
125
|
+
This is free software. See [LICENSE](https://github.com/rchampourlier/feedisco/blob/master/LICENSE) for more information.
|
126
|
+
|
127
|
+
### Thanks
|
128
|
+
|
129
|
+
[David Moreno](http://damog.net/) for [Feedbag](https://github.com/damog/feedbag), which I used before writing Feedisco. Feedisco is heavily inspired by Feedbag, even though I rewrote almost everything to make it (to my mind) more idiomatic Ruby, more readable, and in particular better tested.
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8

# Build, test and documentation tasks for the feedisco gem.

require 'rubygems'
require 'bundler'

# Make the bundled gems loadable; bail out with Bundler's own status code
# (and a hint) when some of them are missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "feedisco"
  gem.summary = "A simple feed discovery library"
  gem.description = "Feedisco is a small and lightweight library focused on RSS/Atom feed discovery. It is intended to do little, but to do it well!"
  gem.email = "romain@softr.li"
  gem.homepage = "http://github.com/rchampourlier/feedisco"
  gem.authors = ["Romain Champourlier"]
  gem.license = "MIT"
  gem.add_dependency 'nokogiri'
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs every spec under spec/.
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with coverage enabled.
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # VERSION ends with a newline; strip it so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  # The title names this gem (it previously carried the unrelated name "oro").
  rdoc.title = "feedisco #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
data/feedisco.gemspec
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "feedisco"
|
8
|
+
s.version = "0.1.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Romain Champourlier"]
|
12
|
+
s.date = "2012-09-28"
|
13
|
+
s.description = "Feedisco is a small and lightweight library focused on RSS/Atom feed discovery. It is intended to do little, but to do it well!"
|
14
|
+
s.email = "romain@softr.li"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"LICENSE",
|
23
|
+
"README.md",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"feedisco.gemspec",
|
27
|
+
"lib/feedisco.rb",
|
28
|
+
"lib/feedisco/checks.rb",
|
29
|
+
"lib/feedisco/discovery.rb",
|
30
|
+
"lib/feedisco/utilities.rb",
|
31
|
+
"lib/feedisco/version.rb",
|
32
|
+
"script/console",
|
33
|
+
"spec/fixtures/alternate.html",
|
34
|
+
"spec/fixtures/no_link.html",
|
35
|
+
"spec/fixtures/one_link_in_body.html",
|
36
|
+
"spec/fixtures/several_links.html",
|
37
|
+
"spec/lib/checks_spec.rb",
|
38
|
+
"spec/lib/discovery_spec.rb",
|
39
|
+
"spec/lib/utilities_spec.rb",
|
40
|
+
"spec/spec_helper.rb"
|
41
|
+
]
|
42
|
+
s.homepage = "http://github.com/rchampourlier/feedisco"
|
43
|
+
s.licenses = ["MIT"]
|
44
|
+
s.require_paths = ["lib"]
|
45
|
+
s.rubygems_version = "1.8.23"
|
46
|
+
s.summary = "A simple feed discovery library"
|
47
|
+
|
48
|
+
if s.respond_to? :specification_version then
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
53
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
54
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
55
|
+
else
|
56
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
57
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
58
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
59
|
+
end
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
62
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
63
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedisco
  # Predicates telling whether a URL is (or merely looks like) an RSS/Atom
  # feed. The module is extended onto Feedisco together with
  # Feedisco::Utilities, which supplies harmonize_url.
  module Checks
    # Pattern used by looks_like_feed?, compiled once instead of on every call.
    FEED_URL_PATTERN = %r{(\.(rdf|xml|rss)$|feed=(rss|atom)(&(.)+)?$|(atom|feed)/?$)}i

    # Check if the specified URL is a feed URL. The check is performed by
    # opening the URL and comparing its content type against
    # Feedisco.feed_content_types.
    #
    # @param url [String] the URL to open
    # @return [Boolean] true when the content type is a feed content type
    # @raise [ArgumentError] when the URL's scheme is not supported
    def feed?(url)
      feed_content_type?(url)
    end

    # Determines if the specified URL looks like a feed. We consider it does if:
    # - it ends with a 'feed-suffix': .rdf, .xml, .rss
    # - it contains a 'feed=rss' or 'feed=atom' query param (we don't check
    #   that it really is a query param, as long as it is in the URL)
    # - it ends with 'atom' or 'feed' (with or without the '/' at the end)
    #
    # @param url [String, nil] the URL (or href) to inspect; nil never matches
    # @return [Boolean]
    def looks_like_feed?(url)
      !url.nil? && !(url =~ FEED_URL_PATTERN).nil?
    end

    private

    # Open the specified URL and check its content type. Returns true if the
    # content type is a feed content type (in Feedisco.feed_content_types).
    #
    # You can pass a URL (a String) or an already opened resource. Any object
    # responding to #content_type is accepted: open-uri hands back a StringIO
    # for small bodies and a Tempfile for large ones, so checking for the
    # Tempfile class alone rejected small pages.
    #
    # @param url_or_file [String, #content_type]
    # @return [Boolean]
    # @raise [ArgumentError] when the argument is neither of the above, or
    #   when the URL's scheme is not supported
    def feed_content_type?(url_or_file)
      opened = false

      if url_or_file.is_a?(String)
        harmonized_url = harmonize_url(url_or_file)
        if harmonized_url.nil?
          raise ArgumentError, "url's protocol must be 'http(s)' or 'feed' (#{url_or_file})"
        end

        file = open(harmonized_url)
        opened = true
      elsif url_or_file.respond_to?(:content_type)
        file = url_or_file
      else
        raise ArgumentError, 'argument must be a String (url) or a Tempfile created with `open(url)`'
      end

      begin
        # Retrieve page content type
        content_type = file.content_type.to_s.downcase

        # Fall back to the raw header when only the generic type is reported;
        # the header may be absent, hence the to_s guard.
        if content_type == 'application/octet-stream' && file.respond_to?(:meta)
          raw_header = file.meta['content-type'].to_s
          content_type = raw_header.gsub(/;.*$/, '').strip.downcase unless raw_header.empty?
        end
      ensure
        # Only close what we opened ourselves, even if reading the type failed.
        file.close if opened
      end

      # Check if the content-type indicates RSS/Atom feed (in Feedisco.feed_content_types)
      Feedisco.feed_content_types.include?(content_type)
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'feedisco/utilities'
|
2
|
+
require 'feedisco/checks'
|
3
|
+
|
4
|
+
module Feedisco
  extend Checks
  extend Utilities

  # Feed discovery entry point, extended onto Feedisco.
  module Discovery

    # Find RSS/Atom feed URLs by looking around the specified URL.
    #
    # If the URL itself is served with a feed content type, it is the only
    # result. Otherwise the page is parsed and feeds are collected in this
    # order: <link> elements advertising a feed, then <a> elements that look
    # like feeds on the page's own site, then the remaining feed-looking <a>
    # elements (external domains). Each URL is returned at most once.
    #
    # @param url [String] the page or feed URL ('http(s)', 'feed' or no scheme)
    # @param args [Hash] currently unused, kept for interface compatibility
    # @return [Array<String>] absolute feed URLs, possibly empty
    # @raise [ArgumentError] if url is nil or its scheme is not supported
    # @raise [URI::InvalidURIError] if url cannot be parsed
    def find(url, args = {})
      raise ArgumentError, "url can't be nil!" if url.nil?

      harmonized_url = harmonize_url(url)

      raise ArgumentError, "url's protocol must be 'http(s)' or 'feed' (#{url})" if harmonized_url.nil?

      feeds = []

      # Open the URL to check the content-type or crawl for feed links
      open(harmonized_url) do |file|
        if feed_content_type?(file)
          # Add the url to feeds if it shows a feed content type
          feeds << harmonized_url
        else
          # Else, parse the page to search for links
          doc = Nokogiri::HTML(file.read)

          # Check <link> elements
          append_feed_urls(feeds, feed_link_hrefs(doc), harmonized_url)

          # Check <a> elements: same-site feeds first, then every other
          # discovered feed (even on external domains), so the latter come
          # after in the feeds array.
          page_host = URI.parse(harmonized_url).host
          anchor_hrefs = doc.css('a').map { |a| a[:href] }.select { |href| looks_like_feed?(href) }
          same_site, external = anchor_hrefs.partition do |href|
            # Plain substring test: the host is not treated as a regexp.
            href =~ %r{\A/} || href.include?("#{page_host}/")
          end

          append_feed_urls(feeds, same_site, harmonized_url)
          append_feed_urls(feeds, external, harmonized_url)
        end
      end

      feeds
    end

    private

    # Hrefs of the <link> elements that advertise a feed: rel is 'alternate'
    # or 'service.feed' and type is one of Feedisco.feed_content_types.
    # Links without href or type are ignored instead of raising.
    def feed_link_hrefs(doc)
      doc.css('link').select do |link|
        link[:href] &&
          link[:rel] =~ %r{(alternate|service.feed)}i &&
          Feedisco.feed_content_types.include?(link[:type].to_s.downcase.strip)
      end.map { |link| link[:href] }
    end

    # Complete each href against page_url and append it to feeds unless the
    # completed URL is already there. Malformed hrefs are skipped so that one
    # broken link does not abort the whole discovery.
    def append_feed_urls(feeds, hrefs, page_url)
      hrefs.each do |href|
        begin
          feed_url = complete_extracted_url(href, page_url)
        rescue URI::InvalidURIError
          next
        end

        feeds << feed_url unless feeds.include?(feed_url)
      end

      feeds
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Feedisco
  # URL helpers shared by the checks and the discovery code.
  module Utilities

    # Check the specified URL protocol. To be considered as a valid feed URL,
    # its scheme must be 'feed', 'http' or 'https', or be missing. If so, the
    # URL is returned with an 'http' or 'https' protocol (replacing missing
    # and 'feed' ones). Else, it returns nil.
    #
    # Both 'feed://host/path' and 'feed:http(s)://host/path' forms are
    # understood; a protocol-relative '//host/path' is treated as http.
    #
    # @param url [String]
    # @return [String, nil] the harmonized URL, or nil for unsupported schemes
    # @raise [URI::InvalidURIError] if url cannot be parsed
    def harmonize_url(url)
      url_uri = URI.parse(url)

      case url_uri.scheme
      when nil
        "http://#{url.sub(%r{\A//}, '')}"

      when 'feed'
        remainder = url.sub(%r{\Afeed:(//)?}i, '')
        # 'feed:https://…' already wraps a full URL: keep its own protocol.
        remainder =~ %r{\Ahttps?://}i ? remainder : "http://#{remainder}"

      when 'http', 'https'
        # Exact match: schemes that merely contain 'http' are rejected.
        url

      else
        nil
      end
    end

    # Complete extracted_url with page_url:
    # - if extracted_url is absolute, it is returned as is;
    # - if it starts with '//', only the page's protocol is added;
    # - if it starts with '/', it is resolved from the root of the page's
    #   site (protocol, host and non-default port of page_url);
    # - otherwise it is appended below page_url's path.
    #
    # @param extracted_url [String] an href found in the page
    # @param page_url [String] the URL of the page, expected to be absolute
    # @return [String] an absolute URL
    # @raise [ArgumentError] if both URLs are relative
    # @raise [URI::InvalidURIError] if either URL cannot be parsed
    def complete_extracted_url(extracted_url, page_url)
      extracted_uri = URI.parse(extracted_url)
      page_uri = URI.parse(page_url)

      return extracted_url if extracted_uri.absolute?

      raise ArgumentError, 'page_url must be absolute if extracted_url isn\'t!' unless page_uri.absolute?

      # Keep an explicit port such as :8080, but not the scheme's default one.
      authority = page_uri.host.to_s
      if page_uri.port && page_uri.port != page_uri.default_port
        authority = "#{authority}:#{page_uri.port}"
      end
      site_root = "#{page_uri.scheme}://#{authority}"

      if extracted_url =~ %r{\A//}
        # Protocol-relative, it already names its own host
        "#{page_uri.scheme}:#{extracted_url}"
      elsif extracted_url =~ %r{\A/}
        # Starts with '/', root of page_url's domain
        "#{site_root}#{extracted_url}"
      else
        # Drop a trailing '/' from the page path to avoid producing '//'.
        "#{site_root}#{page_uri.path.sub(%r{/\z}, '')}/#{extracted_url}"
      end
    end
  end
end
|
data/lib/feedisco.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Feedisco
|
4
|
+
#
|
5
|
+
# Built from Feedbag
|
6
|
+
# - replaced Hpricot by Nokogiri
|
7
|
+
# - improved discovery to check on /rss and /atom URIs
|
8
|
+
# - removed the global variables
|
9
|
+
#
|
10
|
+
# Copyright Axiombox (c) 2008
|
11
|
+
# David Moreno <david@axiombox.com> (c) 2008
|
12
|
+
#
|
13
|
+
# This program is free software: you can redistribute it and/or modify
|
14
|
+
# it under the terms of the GNU General Public License as published by
|
15
|
+
# the Free Software Foundation, either version 3 of the License, or
|
16
|
+
# (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This program is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
21
|
+
# GNU General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU General Public License
|
24
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
25
|
+
|
26
|
+
require "rubygems"
|
27
|
+
require "nokogiri"
|
28
|
+
require "open-uri"
|
29
|
+
require "net/http"
|
30
|
+
|
31
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
32
|
+
|
33
|
+
module Feedisco
  # Content types considered to denote an RSS/Atom feed. Built once and
  # frozen: the list is consulted for every candidate link during discovery.
  FEED_CONTENT_TYPES = [
    'application/x.atom+xml',
    'application/atom+xml',
    'application/xml',
    'text/xml',
    'application/rss+xml',
    'application/rdf+xml'
  ].freeze

  # @return [Array<String>] the (frozen) list of feed content types
  def self.feed_content_types
    FEED_CONTENT_TYPES
  end
end
|
45
|
+
|
46
|
+
require 'feedisco/discovery'
|
47
|
+
|
48
|
+
Feedisco.extend Feedisco::Discovery
|
49
|
+
|
50
|
+
$LOAD_PATH.shift
|
data/script/console
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This small script runs an IRB console ready for Feedisco fun!
|
4
|
+
# Be sure to run it within `bundle exec` if you're using Bundler!
|
5
|
+
|
6
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "feedisco.rb"))
|
7
|
+
|
8
|
+
# prevent STDOUT & STDERR to be reopened (apps do this to be able to log under Passenger)
|
9
|
+
def STDOUT.reopen(*args); end
|
10
|
+
def STDERR.reopen(*args); end
|
11
|
+
|
12
|
+
begin
|
13
|
+
require "pry"
|
14
|
+
Interpreter = Pry
|
15
|
+
rescue LoadError
|
16
|
+
require "irb"
|
17
|
+
require "irb/completion"
|
18
|
+
Interpreter = IRB
|
19
|
+
end
|
20
|
+
|
21
|
+
# START
|
22
|
+
Interpreter.start
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<html>
|
2
|
+
|
3
|
+
<head>
|
4
|
+
<link rel="shortcut icon" href="http://example.com/shortcut.png">
|
5
|
+
<link rel="alternate" type="application/nope" href="http://example.com/page.nope">
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<a href="http://example.com/feed.nope">feed.nope</a>
|
10
|
+
</body>
|
11
|
+
|
12
|
+
</html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<html>
|
2
|
+
|
3
|
+
<head>
|
4
|
+
<link rel="shortcut icon" href="http://example.com/shortcut.png">
|
5
|
+
<link rel="alternate" type="application/nope" href="http://example.com/page.nope">
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<a href="http://example.com/feed.rss">feed.rss</a>
|
10
|
+
</body>
|
11
|
+
|
12
|
+
</html>
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<html>
|
2
|
+
|
3
|
+
<head>
|
4
|
+
<link rel="shortcut icon" href="http://example.com/shortcut.png">
|
5
|
+
<link rel="alternate" type="application/xml" href="http://example.com/feed.xml">
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<a href="http://another.domain.com/feed.rss">another domain's feed.rss</a>
|
10
|
+
<a href="http://example.com/feed.rss">feed.rss</a>
|
11
|
+
<a href="http://another.domain.com/feed.rss">another domain's feed.rss</a>
|
12
|
+
<a href="http://example.com/feed.rss">feed.rss</a>
|
13
|
+
</body>
|
14
|
+
|
15
|
+
</html>
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'feedisco'
|
2
|
+
|
3
|
+
describe "Feedisco::Checks" do
|
4
|
+
|
5
|
+
describe 'looks_like_feed?' do
|
6
|
+
|
7
|
+
%w(feed.rdf feed.xml feed.rss feed?feed=atom feed?feed=rss feed/atom feed/feed feed/atom/ feed/feed/).each do |url|
|
8
|
+
it "should return true for '#{url}' " do
|
9
|
+
Feedisco.looks_like_feed?(url).should be_true
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
%w(feed.txt feed?feed=atomic feed=none example.com/atomic).each do |url|
|
14
|
+
it "should return false for '#{url}' " do
|
15
|
+
Feedisco.looks_like_feed?(url).should_not be_true
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe 'feed? (using real websites, may break if they change)' do
|
21
|
+
|
22
|
+
['http://feeds.rchampourlier.com/rchampourlier',
|
23
|
+
'http://rss.cnn.com/rss/cnn_topstories.rss'].each do |url|
|
24
|
+
it "should return true for '#{url}'" do
|
25
|
+
Feedisco.feed?(url).should be_true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
['http://rchampourlier.com',
|
30
|
+
'http://www.cnn.com/services/rss/'].each do |url|
|
31
|
+
it "should return false for '#{url}'" do
|
32
|
+
Feedisco.feed?(url).should_not be_true
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'feedisco'
|
2
|
+
|
3
|
+
describe "Feedisco::Discovery" do
|
4
|
+
|
5
|
+
describe "find" do
|
6
|
+
|
7
|
+
it 'should raise an ArgumentError if specified url is nil' do
|
8
|
+
expect {
|
9
|
+
Feedisco.find(nil)
|
10
|
+
}.to raise_error(ArgumentError)
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should return an array with only the specified URL if it has a feed content type' do
|
14
|
+
file = stub(:class => stub(:to_s => 'Tempfile'), :content_type => 'application/xml')
|
15
|
+
|
16
|
+
Feedisco.should_receive(:open).and_yield(file)
|
17
|
+
Feedisco.find('example.com/feed.xml').should == ['http://example.com/feed.xml']
|
18
|
+
end
|
19
|
+
|
20
|
+
context 'from fixtures' do
|
21
|
+
|
22
|
+
it 'should return an empty array' do
|
23
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'no_link.html'))
|
24
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
25
|
+
|
26
|
+
Feedisco.should_receive(:open).and_yield(file)
|
27
|
+
Feedisco.find('example.com').should == []
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should include the alternate link' do
|
31
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'alternate.html'))
|
32
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
33
|
+
|
34
|
+
Feedisco.should_receive(:open).and_yield(file)
|
35
|
+
Feedisco.find('example.com').should include('http://example.com/feed.xml')
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should include a <a> link in the body' do
|
39
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'one_link_in_body.html'))
|
40
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
41
|
+
|
42
|
+
Feedisco.should_receive(:open).and_yield(file)
|
43
|
+
Feedisco.find('example.com').should include("http://example.com/feed.rss")
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should include link to feeds on the URL\'s domain' do
|
47
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
48
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
49
|
+
|
50
|
+
Feedisco.should_receive(:open).and_yield(file)
|
51
|
+
Feedisco.find('example.com').should include("http://example.com/feed.rss")
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'should have the alternate link as the first of the returned feed' do
|
55
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
56
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
57
|
+
|
58
|
+
Feedisco.should_receive(:open).and_yield(file)
|
59
|
+
Feedisco.find('example.com').first.should == "http://example.com/feed.xml"
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should include link to feeds on other domains' do
|
63
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
64
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
65
|
+
|
66
|
+
Feedisco.should_receive(:open).and_yield(file)
|
67
|
+
Feedisco.find('example.com').should include "http://another.domain.com/feed.rss"
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'should have links to feeds on other domains after links to feeds on the same domain' do
|
71
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
72
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
73
|
+
|
74
|
+
Feedisco.should_receive(:open).and_yield(file)
|
75
|
+
feeds = Feedisco.find('example.com')
|
76
|
+
feeds.index('http://example.com/feed.rss').should < feeds.index('http://another.domain.com/feed.rss')
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should include each link only once' do
|
80
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
81
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
82
|
+
|
83
|
+
Feedisco.should_receive(:open).and_yield(file)
|
84
|
+
feeds = Feedisco.find('example.com')
|
85
|
+
feeds.select { |f| f == 'http://example.com/feed.rss' }.count.should == 1
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
context 'from real websites (this may change so some examples may break)' do
|
90
|
+
|
91
|
+
it "should return the URL for a feed URL" do
|
92
|
+
Feedisco.find('http://feeds.rchampourlier.com/rchampourlier').should == ['http://feeds.rchampourlier.com/rchampourlier']
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should return an empty array for 'www.google.com'" do
|
96
|
+
Feedisco.find("www.google.com").should == []
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should return 'http://feeds.rchampourlier.com/rchampourlier' for 'rchampourlier.com'" do
|
100
|
+
Feedisco.find("www.rchampourlier.com").should == ['http://feeds.rchampourlier.com/rchampourlier']
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should raise an URI::InvalidURIError for an invalid URL" do
|
104
|
+
expect {
|
105
|
+
Feedisco.find("not url")
|
106
|
+
}.to raise_error(URI::InvalidURIError)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "should raise an ArgumentError if the specified url's scheme is not 'http(s)' or 'feed'" do
|
110
|
+
expect {
|
111
|
+
Feedisco.find('ftp://rchampourlier.com')
|
112
|
+
}.to raise_error(ArgumentError)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'feedisco'
|
2
|
+
|
3
|
+
describe "Feedisco::Utilities" do
|
4
|
+
|
5
|
+
describe 'harmonize_url' do
|
6
|
+
|
7
|
+
{ 'example.com' => 'http://example.com',
|
8
|
+
'example.com/page' => 'http://example.com/page',
|
9
|
+
'feed://example.com' => 'http://example.com',
|
10
|
+
'http://example.com' => 'http://example.com',
|
11
|
+
'https://example.com' => 'https://example.com',
|
12
|
+
'ftp://example.com' => nil
|
13
|
+
}.each do |url, expected|
|
14
|
+
|
15
|
+
it "should return '#{expected || :nil}' for '#{url}'" do
|
16
|
+
Feedisco.harmonize_url(url).should == expected
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe 'complete_extracted_url' do
|
22
|
+
|
23
|
+
it 'should return the extracted url if it is absolute' do
|
24
|
+
Feedisco.complete_extracted_url('http://example.com/page', 'http://example.com/another').should == 'http://example.com/page'
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should add the scheme and host if the extracted url is relative from the root path' do
|
28
|
+
Feedisco.complete_extracted_url('/page', 'http://example.com').should == 'http://example.com/page'
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should add the extracted_url\'s path if it\'s relative from the page\'s page' do
|
32
|
+
Feedisco.complete_extracted_url('page', 'http://example.com/root').should == 'http://example.com/root/page'
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should raise ArgumentError if both extracted and page urls are relative' do
|
36
|
+
expect {
|
37
|
+
Feedisco.complete_extracted_url('/relative', '/relative/too')
|
38
|
+
}.to raise_error(ArgumentError)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
$LOAD_PATH << "." unless $LOAD_PATH.include?(".")
require 'logger'

##
# Bundler and its require
#
# Loads Bundler, refuses versions up to 0.9.5, sets up the load paths of the
# bundled gems and finally requires them.
begin
  require "rubygems"
  require "bundler"

  if Gem::Version.new(Bundler::VERSION) <= Gem::Version.new("0.9.5")
    # The two sentences used to be concatenated without a separating space.
    raise RuntimeError, "Your bundler version is too old. " +
      "Run `gem install bundler` to upgrade."
  end

  # Set up load paths for all bundled gems
  Bundler.setup
rescue Bundler::GemNotFound
  raise RuntimeError, "Bundler couldn't find some gems. " +
    "Did you run \`bundle install\`?"
end
Bundler.require
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feedisco
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Romain Champourlier
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: jeweler
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: nokogiri
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Feedisco is a small and lightweight library focused on RSS/Atom feed
|
63
|
+
discovery. It is intended to do little, but to do it well!
|
64
|
+
email: romain@softr.li
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files:
|
68
|
+
- LICENSE
|
69
|
+
- README.md
|
70
|
+
files:
|
71
|
+
- Gemfile
|
72
|
+
- Gemfile.lock
|
73
|
+
- LICENSE
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- VERSION
|
77
|
+
- feedisco.gemspec
|
78
|
+
- lib/feedisco.rb
|
79
|
+
- lib/feedisco/checks.rb
|
80
|
+
- lib/feedisco/discovery.rb
|
81
|
+
- lib/feedisco/utilities.rb
|
82
|
+
- lib/feedisco/version.rb
|
83
|
+
- script/console
|
84
|
+
- spec/fixtures/alternate.html
|
85
|
+
- spec/fixtures/no_link.html
|
86
|
+
- spec/fixtures/one_link_in_body.html
|
87
|
+
- spec/fixtures/several_links.html
|
88
|
+
- spec/lib/checks_spec.rb
|
89
|
+
- spec/lib/discovery_spec.rb
|
90
|
+
- spec/lib/utilities_spec.rb
|
91
|
+
- spec/spec_helper.rb
|
92
|
+
homepage: http://github.com/rchampourlier/feedisco
|
93
|
+
licenses:
|
94
|
+
- MIT
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options: []
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ! '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
segments:
|
106
|
+
- 0
|
107
|
+
hash: -595904197689466947
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ! '>='
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: '0'
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.23
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: A simple feed discovery library
|
120
|
+
test_files: []
|