feedisco 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +11 -0
- data/Gemfile.lock +31 -0
- data/LICENSE +20 -0
- data/README.md +129 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/feedisco.gemspec +66 -0
- data/lib/feedisco/checks.rb +50 -0
- data/lib/feedisco/discovery.rb +57 -0
- data/lib/feedisco/utilities.rb +46 -0
- data/lib/feedisco/version.rb +6 -0
- data/lib/feedisco.rb +50 -0
- data/script/console +22 -0
- data/spec/fixtures/alternate.html +11 -0
- data/spec/fixtures/no_link.html +12 -0
- data/spec/fixtures/one_link_in_body.html +12 -0
- data/spec/fixtures/several_links.html +15 -0
- data/spec/lib/checks_spec.rb +36 -0
- data/spec/lib/discovery_spec.rb +117 -0
- data/spec/lib/utilities_spec.rb +41 -0
- data/spec/spec_helper.rb +22 -0
- metadata +120 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.7.5)
|
12
|
+
nokogiri (1.5.5)
|
13
|
+
rake (0.9.2.2)
|
14
|
+
rdoc (3.12)
|
15
|
+
json (~> 1.4)
|
16
|
+
rspec (2.11.0)
|
17
|
+
rspec-core (~> 2.11.0)
|
18
|
+
rspec-expectations (~> 2.11.0)
|
19
|
+
rspec-mocks (~> 2.11.0)
|
20
|
+
rspec-core (2.11.1)
|
21
|
+
rspec-expectations (2.11.3)
|
22
|
+
diff-lcs (~> 1.1.3)
|
23
|
+
rspec-mocks (2.11.3)
|
24
|
+
|
25
|
+
PLATFORMS
|
26
|
+
ruby
|
27
|
+
|
28
|
+
DEPENDENCIES
|
29
|
+
jeweler
|
30
|
+
nokogiri
|
31
|
+
rspec
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Romain Champourlier <romain@softr.li>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
# Feedisco
|
2
|
+
|
3
|
+
### Summary
|
4
|
+
|
5
|
+
Feedisco is a small and lightweight library focused on RSS/Atom feed discovery. It is intended to do little, but to do it well!
|
6
|
+
|
7
|
+
### Use case(s)
|
8
|
+
|
9
|
+
* I want to **find the URLs of a RSS/Atom feeds for my users' blogs**
|
10
|
+
* I want to **check that an user-supplied URL is truly a RSS/Atom feed URL**
|
11
|
+
* I want to let my user **choose from a list of discovered RSS/Atom feeds** when he entered his website URL
|
12
|
+
* Probably more...
|
13
|
+
|
14
|
+
### The methods
|
15
|
+
|
16
|
+
```
|
17
|
+
Feedisco.find(url)
|
18
|
+
Feedisco.feed?(url)
|
19
|
+
```
|
20
|
+
|
21
|
+
### How to start
|
22
|
+
|
23
|
+
**Install it with Rubygems**
|
24
|
+
|
25
|
+
```
|
26
|
+
$ gem install feedisco
|
27
|
+
$ irb
|
28
|
+
|
29
|
+
irb(main):001:0> require 'rubygems'
|
30
|
+
=> true
|
31
|
+
irb(main):001:0> require 'feedisco'
|
32
|
+
=> true
|
33
|
+
```
|
34
|
+
|
35
|
+
**...or clone it from GitHub**
|
36
|
+
|
37
|
+
```
|
38
|
+
$ git clone https://github.com/rchampourlier/feedisco
|
39
|
+
$ cd feedisco
|
40
|
+
$ bundle install
|
41
|
+
$ bundle exec script/console
|
42
|
+
```
|
43
|
+
|
44
|
+
_**Nota Bene:**_
|
45
|
+
|
46
|
+
* _`script/console` is a small script available in the repo that loads an IRB/Pry console with Feedisco preloaded, so it's ready to use! (feel like `rails console`)_
|
47
|
+
|
48
|
+
### Examples
|
49
|
+
|
50
|
+
```
|
51
|
+
irb(main):001:0> Feedisco.find('rchampourlier.com')
|
52
|
+
=> ["http://feeds.rchampourlier.com/rchampourlier"]
|
53
|
+
|
54
|
+
irb(main):002:0> Feedisco.find('google.com')
|
55
|
+
=> []
|
56
|
+
|
57
|
+
irb(main):003:0> Feedisco.feed?('google.com')
|
58
|
+
=> false
|
59
|
+
|
60
|
+
irb(main):007:0> Feedisco.feed?('feeds.rchampourlier.com/rchampourlier')
|
61
|
+
=> true
|
62
|
+
|
63
|
+
irb(main):010:0> puts Feedisco.find('http://edition.cnn.com/services/rss/')
|
64
|
+
http://rss.cnn.com/rss/edition.rss
|
65
|
+
http://rss.cnn.com/rss/edition_asia.rss
|
66
|
+
http://rss.cnn.com/rss/edition_europe.rss
|
67
|
+
http://rss.cnn.com/rss/edition_us.rss
|
68
|
+
http://rss.cnn.com/rss/edition_world.rss
|
69
|
+
http://rss.cnn.com/rss/edition_africa.rss
|
70
|
+
http://rss.cnn.com/rss/edition_americas.rss
|
71
|
+
[and a lot more...]
|
72
|
+
```
|
73
|
+
|
74
|
+
### Installation
|
75
|
+
|
76
|
+
Add this to your Gemfile (the gem is not published to Rubygems until it is good enough, so tell me when you think it should!):
|
77
|
+
|
78
|
+
```
|
79
|
+
gem 'feedisco', :git => 'https://github.com/rchampourlier/feedisco', :ref => 'master'
|
80
|
+
```
|
81
|
+
|
82
|
+
Or just clone it and do whatever you want:
|
83
|
+
|
84
|
+
```
|
85
|
+
$ git clone http://github.com/rchampourlier/feedisco
|
86
|
+
```
|
87
|
+
|
88
|
+
|
89
|
+
### Why should you use it?
|
90
|
+
|
91
|
+
* Well, because you need to discover feeds URL from a given URL, and nothing more!
|
92
|
+
* Because you wan't it to be simple, with clean-code you can correct, complete, update, test, if you need it.
|
93
|
+
* Because you just need to add a line to your Gemfile to use it.
|
94
|
+
* Because it should still follow modern feed filename conventions (like those ones used by WordPress blogs, or Blogger, etc) - that's the part taken from [Feedbag](https://github.com/damog/feedbag)!
|
95
|
+
|
96
|
+
### Why did I build it?
|
97
|
+
|
98
|
+
* Because I wanted a **simple and lightweight** library to discover RSS/Atom feeds.
|
99
|
+
* Because [Feedbag](https://github.com/damog/feedbag) was using Hpricot, and I was more Nokogiri.
|
100
|
+
* Because I wanted something with a little more tests than Feedbag.
|
101
|
+
|
102
|
+
### Bugs, issues, contributions
|
103
|
+
|
104
|
+
You can use the [GitHub repo](https://github.com/rchampourlier/feedisco) for all this, so don't hesitate!
|
105
|
+
|
106
|
+
Contribute by forking, tweaking, and sending pull requests, but please add the appropriate tests!
|
107
|
+
|
108
|
+
### History
|
109
|
+
|
110
|
+
#### `0.1.1`
|
111
|
+
|
112
|
+
* Completed `Rakefile`
|
113
|
+
* Changed a method from private to public, specs are green again
|
114
|
+
|
115
|
+
#### `0.1.0`
|
116
|
+
|
117
|
+
* Initial release
|
118
|
+
|
119
|
+
### Author
|
120
|
+
|
121
|
+
[Romain Champourlier](http://softr.li)
|
122
|
+
|
123
|
+
### Copyright
|
124
|
+
|
125
|
+
This is free software. See [LICENSE](http://github.com/rchampourlier/feedisco/master/LICENSE) for more information.
|
126
|
+
|
127
|
+
### Thanks
|
128
|
+
|
129
|
+
[David Moreno](http://damog.net/) for [Feedbag](https://github.com/damog/feedbag) which I used before writing Feedisco. Plus Feedisco is more than inspired from Feedbag, even if I almost rewrote everything to make (to my sense) more Ruby-way, readable, and in particular more tested.
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
|
6
|
+
begin
|
7
|
+
Bundler.setup(:default, :development)
|
8
|
+
rescue Bundler::BundlerError => e
|
9
|
+
$stderr.puts e.message
|
10
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
11
|
+
exit e.status_code
|
12
|
+
end
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "feedisco"
|
18
|
+
gem.summary = "A simple feed discovery library"
|
19
|
+
gem.description = "Feedisco is a small and lightweight library focused on RSS/Atom feed discovery. It is intended to do little, but to do it well!"
|
20
|
+
gem.email = "romain@softr.li"
|
21
|
+
gem.homepage = "http://github.com/rchampourlier/feedisco"
|
22
|
+
gem.authors = ["Romain Champourlier"]
|
23
|
+
gem.license = "MIT"
|
24
|
+
gem.add_dependency 'nokogiri'
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
require 'rspec/core'
|
29
|
+
require 'rspec/core/rake_task'
|
30
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
+
end
|
33
|
+
|
34
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
35
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
36
|
+
spec.rcov = true
|
37
|
+
end
|
38
|
+
|
39
|
+
task :default => :spec
|
40
|
+
|
41
|
+
require 'rdoc/task'
|
42
|
+
Rake::RDocTask.new do |rdoc|
|
43
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
44
|
+
|
45
|
+
rdoc.rdoc_dir = 'rdoc'
|
46
|
+
rdoc.title = "oro #{version}"
|
47
|
+
rdoc.rdoc_files.include('README*')
|
48
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
49
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
data/feedisco.gemspec
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "feedisco"
|
8
|
+
s.version = "0.1.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Romain Champourlier"]
|
12
|
+
s.date = "2012-09-28"
|
13
|
+
s.description = "Feedisco is a small and lightweight library focused on RSS/Atom feed discovery. It is intended to do little, but to do it well!"
|
14
|
+
s.email = "romain@softr.li"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"LICENSE",
|
23
|
+
"README.md",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"feedisco.gemspec",
|
27
|
+
"lib/feedisco.rb",
|
28
|
+
"lib/feedisco/checks.rb",
|
29
|
+
"lib/feedisco/discovery.rb",
|
30
|
+
"lib/feedisco/utilities.rb",
|
31
|
+
"lib/feedisco/version.rb",
|
32
|
+
"script/console",
|
33
|
+
"spec/fixtures/alternate.html",
|
34
|
+
"spec/fixtures/no_link.html",
|
35
|
+
"spec/fixtures/one_link_in_body.html",
|
36
|
+
"spec/fixtures/several_links.html",
|
37
|
+
"spec/lib/checks_spec.rb",
|
38
|
+
"spec/lib/discovery_spec.rb",
|
39
|
+
"spec/lib/utilities_spec.rb",
|
40
|
+
"spec/spec_helper.rb"
|
41
|
+
]
|
42
|
+
s.homepage = "http://github.com/rchampourlier/feedisco"
|
43
|
+
s.licenses = ["MIT"]
|
44
|
+
s.require_paths = ["lib"]
|
45
|
+
s.rubygems_version = "1.8.23"
|
46
|
+
s.summary = "A simple feed discovery library"
|
47
|
+
|
48
|
+
if s.respond_to? :specification_version then
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
53
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
54
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
55
|
+
else
|
56
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
57
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
58
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
59
|
+
end
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
62
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
63
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedisco::Checks
|
2
|
+
|
3
|
+
# Check if the specified URL is a feed URL. The check is performed by opening the
|
4
|
+
# URL and checking the content type. If it matches a content type within
|
5
|
+
# Feedisco.feed_content_types, the URL is considered as a feed and the method returns
|
6
|
+
# true.
|
7
|
+
def feed?(url)
|
8
|
+
feed_content_type?(url)
|
9
|
+
end
|
10
|
+
|
11
|
+
# Determines if the specified URL looks like a feed. We consider it does if:
|
12
|
+
# - it ends with a 'feed-suffix': .rdf, .xml, .rss
|
13
|
+
# - it contains a 'feed=rss' or 'feed=atom' query param (well, we don't check
|
14
|
+
# if it is really a query param, as long as it is in the URL)
|
15
|
+
# - it ends with 'atom' or 'feed' (with or without the '/' at the end)
|
16
|
+
def looks_like_feed?(url)
|
17
|
+
(url =~ %r{(\.(rdf|xml|rss)$|feed=(rss|atom)(&(.)+)?$|(atom|feed)/?$)}i) != nil
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# Open the specified URL and check its content type. Returns true if the content type
|
23
|
+
# is a feed content type (in Feedisco.feed_content_types)
|
24
|
+
#
|
25
|
+
# You can pass an URL (a string) or a file (open(...), a Tempfile instance) to the method.
|
26
|
+
def feed_content_type?(url_or_file)
|
27
|
+
opened = false
|
28
|
+
|
29
|
+
if url_or_file.is_a? String
|
30
|
+
harmonized_url = harmonize_url(url_or_file)
|
31
|
+
file = open(harmonized_url)
|
32
|
+
opened = true
|
33
|
+
|
34
|
+
elsif url_or_file.class.to_s == 'Tempfile'
|
35
|
+
file = url_or_file
|
36
|
+
|
37
|
+
else raise ArgumentError.new('argument must be a String (url) or a Tempfile created with `open(url)`')
|
38
|
+
end
|
39
|
+
|
40
|
+
# Retrieve page content type
|
41
|
+
content_type = file.content_type.downcase
|
42
|
+
if content_type == "application/octet-stream"
|
43
|
+
content_type = file.meta["content-type"].gsub(/;.*$/, '')
|
44
|
+
end
|
45
|
+
file.close if opened
|
46
|
+
|
47
|
+
# Check if the content-type indicates RSS/Atom feed (in Feedisco.feed_content_types)
|
48
|
+
Feedisco.feed_content_types.include?(content_type)
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'feedisco/utilities'
|
2
|
+
require 'feedisco/checks'
|
3
|
+
|
4
|
+
module Feedisco
|
5
|
+
extend Checks
|
6
|
+
extend Utilities
|
7
|
+
|
8
|
+
module Discovery
|
9
|
+
|
10
|
+
# Find RSS/Atom feed URLs by looking around the specified URL.
|
11
|
+
def find(url, args = {})
|
12
|
+
raise ArgumentError.new("url can't be nil!") if url.nil?
|
13
|
+
|
14
|
+
harmonized_url = harmonize_url(url)
|
15
|
+
|
16
|
+
raise ArgumentError.new("url's protocol must be 'http(s)' or 'feed' (#{url})") if harmonized_url.nil?
|
17
|
+
|
18
|
+
feeds = []
|
19
|
+
|
20
|
+
# Open the URL to check the content-type or crawl for feed links
|
21
|
+
open(harmonized_url) do |file|
|
22
|
+
|
23
|
+
if feed_content_type?(file)
|
24
|
+
# Add the url to feeds if it shows a feed content type
|
25
|
+
feeds << harmonized_url
|
26
|
+
|
27
|
+
else
|
28
|
+
# Else, parse the page to search for links
|
29
|
+
doc = Nokogiri::HTML(file.read)
|
30
|
+
|
31
|
+
# Check <link> elements
|
32
|
+
doc.css('link').each do |link|
|
33
|
+
feeds << complete_extracted_url(link[:href], harmonized_url) if link[:rel] =~ %r{(alternate|service.feed)}i && Feedisco.feed_content_types.include?(link[:type].downcase.strip)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Check <a> elements
|
37
|
+
doc.css('a').each do |a|
|
38
|
+
if (looks_like_feed?(a[:href]) &&
|
39
|
+
(a[:href] =~ %r{\A/} || a[:href] =~ %r{#{URI.parse(harmonized_url).host}/}) &&
|
40
|
+
!feeds.include?(a[:href]))
|
41
|
+
|
42
|
+
feeds << complete_extracted_url(a[:href], harmonized_url)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# Check <a> elements again, less restrictively, so we add all discovered feeds even the ones
|
47
|
+
# on external domains, but the will come after in the feeds array.
|
48
|
+
doc.css('a').each do |a|
|
49
|
+
feeds << complete_extracted_url(a[:href], harmonized_url) if (looks_like_feed?(a[:href]) && !feeds.include?(a[:href]))
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
feeds
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Feedisco::Utilities
|
2
|
+
|
3
|
+
# Check the specified URL protocol. To be considered as a valid feed URL, it must
|
4
|
+
# match either 'feed', 'http', or 'https', or be nil. If it does, the url is returned
|
5
|
+
# with 'http' or 'https' protocol (replacing nil and 'feed' ones). Else, it returns nil.
|
6
|
+
def harmonize_url(url)
|
7
|
+
url_uri = URI.parse(url)
|
8
|
+
|
9
|
+
case url_uri.scheme
|
10
|
+
when nil
|
11
|
+
"http://#{url}"
|
12
|
+
|
13
|
+
when 'feed'
|
14
|
+
url.sub(%r{feed://}, 'http://')
|
15
|
+
|
16
|
+
when %r{http(s)?}
|
17
|
+
url
|
18
|
+
|
19
|
+
else
|
20
|
+
nil
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Complete extracted_url with page_url:
|
25
|
+
# - if extracted_url is relative, completes it with the protocol,
|
26
|
+
# host and path from page_url (page_url is expected to be absolute!)
|
27
|
+
# - just returns extracted_url if it is absolute.
|
28
|
+
def complete_extracted_url(extracted_url, page_url)
|
29
|
+
extracted_uri = URI.parse(extracted_url)
|
30
|
+
page_uri = URI.parse(page_url)
|
31
|
+
|
32
|
+
if extracted_uri.absolute?
|
33
|
+
extracted_url
|
34
|
+
|
35
|
+
else
|
36
|
+
raise ArgumentError.new('page_url must be absolute if extracted_url isn\'t!') unless page_uri.absolute?
|
37
|
+
|
38
|
+
if extracted_url =~ %r{\A/}
|
39
|
+
# Starts with '/', root of page_url's domain
|
40
|
+
"#{page_uri.scheme}://#{page_uri.host}#{extracted_url}"
|
41
|
+
else
|
42
|
+
"#{page_uri.scheme}://#{page_uri.host}#{page_uri.path}/#{extracted_url}"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/feedisco.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Feedisco
|
4
|
+
#
|
5
|
+
# Built from Feedbag
|
6
|
+
# - replaced Hpricot by Nokogiri
|
7
|
+
# - improved discovery to check on /rss and /atom URIs
|
8
|
+
# - removed the global variables
|
9
|
+
#
|
10
|
+
# Copyright Axiombox (c) 2008
|
11
|
+
# David Moreno <david@axiombox.com> (c) 2008
|
12
|
+
#
|
13
|
+
# This program is free software: you can redistribute it and/or modify
|
14
|
+
# it under the terms of the GNU General Public License as published by
|
15
|
+
# the Free Software Foundation, either version 3 of the License, or
|
16
|
+
# (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This program is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
21
|
+
# GNU General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU General Public License
|
24
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
25
|
+
|
26
|
+
require "rubygems"
|
27
|
+
require "nokogiri"
|
28
|
+
require "open-uri"
|
29
|
+
require "net/http"
|
30
|
+
|
31
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
32
|
+
|
33
|
+
module Feedisco
|
34
|
+
def self.feed_content_types
|
35
|
+
[
|
36
|
+
'application/x.atom+xml',
|
37
|
+
'application/atom+xml',
|
38
|
+
'application/xml',
|
39
|
+
'text/xml',
|
40
|
+
'application/rss+xml',
|
41
|
+
'application/rdf+xml',
|
42
|
+
]
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
require 'feedisco/discovery'
|
47
|
+
|
48
|
+
Feedisco.extend Feedisco::Discovery
|
49
|
+
|
50
|
+
$LOAD_PATH.shift
|
data/script/console
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This small script runs an IRB console ready for Feedisco fun!
|
4
|
+
# Be sure to run within a `bundle exec` if your using Bundler!
|
5
|
+
|
6
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "feedisco.rb"))
|
7
|
+
|
8
|
+
# prevent STDOUT & STDERR to be reopened (apps do this to be able to log under Passenger)
|
9
|
+
def STDOUT.reopen(*args); end
|
10
|
+
def STDERR.reopen(*args); end
|
11
|
+
|
12
|
+
begin
|
13
|
+
require "pry"
|
14
|
+
Interpreter = Pry
|
15
|
+
rescue LoadError
|
16
|
+
require "irb"
|
17
|
+
require "irb/completion"
|
18
|
+
Interpreter = IRB
|
19
|
+
end
|
20
|
+
|
21
|
+
# START
|
22
|
+
Interpreter.start
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<html>
|
2
|
+
|
3
|
+
<head>
|
4
|
+
<link rel="shortcut icon" href="http://example.com/shortcut.png">
|
5
|
+
<link rel="alternate" type="application/nope" href="http://example.com/page.nope">
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<a href="http://example.com/feed.nope">feed.nope</a>
|
10
|
+
</body>
|
11
|
+
|
12
|
+
</html>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<html>
|
2
|
+
|
3
|
+
<head>
|
4
|
+
<link rel="shortcut icon" href="http://example.com/shortcut.png">
|
5
|
+
<link rel="alternate" type="application/nope" href="http://example.com/page.nope">
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<a href="http://example.com/feed.rss">feed.rss</a>
|
10
|
+
</body>
|
11
|
+
|
12
|
+
</html>
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<html>
|
2
|
+
|
3
|
+
<head>
|
4
|
+
<link rel="shortcut icon" href="http://example.com/shortcut.png">
|
5
|
+
<link rel="alternate" type="application/xml" href="http://example.com/feed.xml">
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<a href="http://another.domain.com/feed.rss">another domain's feed.rss</a>
|
10
|
+
<a href="http://example.com/feed.rss">feed.rss</a>
|
11
|
+
<a href="http://another.domain.com/feed.rss">another domain's feed.rss</a>
|
12
|
+
<a href="http://example.com/feed.rss">feed.rss</a>
|
13
|
+
</body>
|
14
|
+
|
15
|
+
</html>
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'feedisco'
|
2
|
+
|
3
|
+
describe "Feedisco::Checks" do
|
4
|
+
|
5
|
+
describe 'looks_like_feed?' do
|
6
|
+
|
7
|
+
%w(feed.rdf feed.xml feed.rss feed?feed=atom feed?feed=rss feed/atom feed/feed feed/atom/ feed/feed/).each do |url|
|
8
|
+
it "should return true for '#{url}' " do
|
9
|
+
Feedisco.looks_like_feed?(url).should be_true
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
%w(feed.txt feed?feed=atomic feed=none example.com/atomic).each do |url|
|
14
|
+
it "should return false for '#{url}' " do
|
15
|
+
Feedisco.looks_like_feed?(url).should_not be_true
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe 'feed? (using real websites, may break if they change)' do
|
21
|
+
|
22
|
+
['http://feeds.rchampourlier.com/rchampourlier',
|
23
|
+
'http://rss.cnn.com/rss/cnn_topstories.rss'].each do |url|
|
24
|
+
it "should return true for '#{url}'" do
|
25
|
+
Feedisco.feed?(url).should be_true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
['http://rchampourlier.com',
|
30
|
+
'http://www.cnn.com/services/rss/'].each do |url|
|
31
|
+
it "should return false for '#{url}'" do
|
32
|
+
Feedisco.feed?(url).should_not be_true
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'feedisco'
|
2
|
+
|
3
|
+
describe "Feedisco::Discovery" do
|
4
|
+
|
5
|
+
describe "find" do
|
6
|
+
|
7
|
+
it 'should raise an ArgumentError if specified url is nil' do
|
8
|
+
expect {
|
9
|
+
Feedisco.find(nil)
|
10
|
+
}.to raise_error(ArgumentError)
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should return an array with only the specified URL if it has a feed content type' do
|
14
|
+
file = stub(:class => stub(:to_s => 'Tempfile'), :content_type => 'application/xml')
|
15
|
+
|
16
|
+
Feedisco.should_receive(:open).and_yield(file)
|
17
|
+
Feedisco.find('example.com/feed.xml').should == ['http://example.com/feed.xml']
|
18
|
+
end
|
19
|
+
|
20
|
+
context 'from fixtures' do
|
21
|
+
|
22
|
+
it 'should return an empty array' do
|
23
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'no_link.html'))
|
24
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
25
|
+
|
26
|
+
Feedisco.should_receive(:open).and_yield(file)
|
27
|
+
Feedisco.find('example.com').should == []
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should include the alternate link' do
|
31
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'alternate.html'))
|
32
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
33
|
+
|
34
|
+
Feedisco.should_receive(:open).and_yield(file)
|
35
|
+
Feedisco.find('example.com').should include('http://example.com/feed.xml')
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should include a <a> link in the body' do
|
39
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'one_link_in_body.html'))
|
40
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
41
|
+
|
42
|
+
Feedisco.should_receive(:open).and_yield(file)
|
43
|
+
Feedisco.find('example.com').should include("http://example.com/feed.rss")
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should include link to feeds on the URL\'s domain' do
|
47
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
48
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
49
|
+
|
50
|
+
Feedisco.should_receive(:open).and_yield(file)
|
51
|
+
Feedisco.find('example.com').should include("http://example.com/feed.rss")
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'should have the alternate link as the first of the returned feed' do
|
55
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
56
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
57
|
+
|
58
|
+
Feedisco.should_receive(:open).and_yield(file)
|
59
|
+
Feedisco.find('example.com').first.should == "http://example.com/feed.xml"
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should include link to feeds on other domains' do
|
63
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
64
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
65
|
+
|
66
|
+
Feedisco.should_receive(:open).and_yield(file)
|
67
|
+
Feedisco.find('example.com').should include "http://another.domain.com/feed.rss"
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'should have links to feeds on other domains after links to feeds on the same domain' do
|
71
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
72
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
73
|
+
|
74
|
+
Feedisco.should_receive(:open).and_yield(file)
|
75
|
+
feeds = Feedisco.find('example.com')
|
76
|
+
feeds.index('http://example.com/feed.rss').should < feeds.index('http://another.domain.com/feed.rss')
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should include each link only once' do
|
80
|
+
file = File.open(File.join(File.dirname(__FILE__), '..', 'fixtures', 'several_links.html'))
|
81
|
+
file.stub!(:class => stub(:to_s => 'Tempfile'), :content_type => 'text/html')
|
82
|
+
|
83
|
+
Feedisco.should_receive(:open).and_yield(file)
|
84
|
+
feeds = Feedisco.find('example.com')
|
85
|
+
feeds.select { |f| f == 'http://example.com/feed.rss' }.count.should == 1
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
context 'from real websites (this may change so some examples may break)' do
|
90
|
+
|
91
|
+
it "should return the URL for a feed URL" do
|
92
|
+
Feedisco.find('http://feeds.rchampourlier.com/rchampourlier').should == ['http://feeds.rchampourlier.com/rchampourlier']
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should return an empty array for 'www.google.com'" do
|
96
|
+
Feedisco.find("www.google.com").should == []
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should return 'http://feeds.rchampourlier.com/rchampourlier' for 'rchampourlier.com'" do
|
100
|
+
Feedisco.find("www.rchampourlier.com").should == ['http://feeds.rchampourlier.com/rchampourlier']
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should raise an URI::InvalidURIError for an invalid URL" do
|
104
|
+
expect {
|
105
|
+
Feedisco.find("not url")
|
106
|
+
}.to raise_error(URI::InvalidURIError)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "should raise an ArgumentError if the specified url's scheme is not 'http(s)' or 'feed'" do
|
110
|
+
expect {
|
111
|
+
Feedisco.find('ftp://rchampourlier.com')
|
112
|
+
}.to raise_error(ArgumentError)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'feedisco'
|
2
|
+
|
3
|
+
describe "Feedisco::Utilities" do
|
4
|
+
|
5
|
+
describe 'harmonize_url' do
|
6
|
+
|
7
|
+
{ 'example.com' => 'http://example.com',
|
8
|
+
'example.com/page' => 'http://example.com/page',
|
9
|
+
'feed://example.com' => 'http://example.com',
|
10
|
+
'http://example.com' => 'http://example.com',
|
11
|
+
'https://example.com' => 'https://example.com',
|
12
|
+
'ftp://example.com' => nil
|
13
|
+
}.each do |url, expected|
|
14
|
+
|
15
|
+
it "should return '#{expected || :nil}' for '#{url}'" do
|
16
|
+
Feedisco.harmonize_url(url).should == expected
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe 'complete_extracted_url' do
|
22
|
+
|
23
|
+
it 'should return the extracted url if it is absolute' do
|
24
|
+
Feedisco.complete_extracted_url('http://example.com/page', 'http://example.com/another').should == 'http://example.com/page'
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should add the scheme and host if the extracted url is relative from the root path' do
|
28
|
+
Feedisco.complete_extracted_url('/page', 'http://example.com').should == 'http://example.com/page'
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should add the extracted_url\'s path if it\'s relative from the page\'s page' do
|
32
|
+
Feedisco.complete_extracted_url('page', 'http://example.com/root').should == 'http://example.com/root/page'
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should raise ArgumentError if both extracted and page urls are relative' do
|
36
|
+
expect {
|
37
|
+
Feedisco.complete_extracted_url('/relative', '/relative/too')
|
38
|
+
}.to raise_error(ArgumentError)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
$LOAD_PATH << "." unless $LOAD_PATH.include?(".")
|
2
|
+
require 'logger'
|
3
|
+
|
4
|
+
##
|
5
|
+
# Bundler and its require
|
6
|
+
#
|
7
|
+
begin
|
8
|
+
require "rubygems"
|
9
|
+
require "bundler"
|
10
|
+
|
11
|
+
if Gem::Version.new(Bundler::VERSION) <= Gem::Version.new("0.9.5")
|
12
|
+
raise RuntimeError, "Your bundler version is too old." +
|
13
|
+
"Run `gem install bundler` to upgrade."
|
14
|
+
end
|
15
|
+
|
16
|
+
# Set up load paths for all bundled gems
|
17
|
+
Bundler.setup
|
18
|
+
rescue Bundler::GemNotFound
|
19
|
+
raise RuntimeError, "Bundler couldn't find some gems." +
|
20
|
+
"Did you run \`bundle install\`?"
|
21
|
+
end
|
22
|
+
Bundler.require
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feedisco
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Romain Champourlier
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: jeweler
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: nokogiri
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Feedisco is a small and lightweight library focused on RSS/Atom feed
|
63
|
+
discovery. It is intended to do little, but to do it well!
|
64
|
+
email: romain@softr.li
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files:
|
68
|
+
- LICENSE
|
69
|
+
- README.md
|
70
|
+
files:
|
71
|
+
- Gemfile
|
72
|
+
- Gemfile.lock
|
73
|
+
- LICENSE
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- VERSION
|
77
|
+
- feedisco.gemspec
|
78
|
+
- lib/feedisco.rb
|
79
|
+
- lib/feedisco/checks.rb
|
80
|
+
- lib/feedisco/discovery.rb
|
81
|
+
- lib/feedisco/utilities.rb
|
82
|
+
- lib/feedisco/version.rb
|
83
|
+
- script/console
|
84
|
+
- spec/fixtures/alternate.html
|
85
|
+
- spec/fixtures/no_link.html
|
86
|
+
- spec/fixtures/one_link_in_body.html
|
87
|
+
- spec/fixtures/several_links.html
|
88
|
+
- spec/lib/checks_spec.rb
|
89
|
+
- spec/lib/discovery_spec.rb
|
90
|
+
- spec/lib/utilities_spec.rb
|
91
|
+
- spec/spec_helper.rb
|
92
|
+
homepage: http://github.com/rchampourlier/feedisco
|
93
|
+
licenses:
|
94
|
+
- MIT
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options: []
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ! '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
segments:
|
106
|
+
- 0
|
107
|
+
hash: -595904197689466947
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ! '>='
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: '0'
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.23
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: A simple feed discovery library
|
120
|
+
test_files: []
|