feed_searcher 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/README.md +18 -0
- data/Rakefile +4 -0
- data/feed_searcher.gemspec +3 -0
- data/lib/feed_searcher/page.rb +71 -3
- data/lib/feed_searcher/version.rb +1 -1
- data/spec/feed_searcher_spec.rb +102 -41
- data/spec/fixtures/example.atom +15 -0
- data/spec/fixtures/example.html +18 -0
- data/spec/fixtures/example.rss +12 -0
- data/spec/spec_helper.rb +3 -0
- metadata +57 -2
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# FeedSearcher
|
2
2
|
Search RSS feed URLs from the given URL.
|
3
3
|
|
4
|
+
[![Build Status](https://travis-ci.org/fastladder/feed_searcher.png?branch=master)](https://travis-ci.org/fastladder/feed_searcher)
|
5
|
+
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
```
|
@@ -14,3 +16,19 @@ require "feed_searcher"
|
|
14
16
|
FeedSearcher.search("https://github.com/r7kamura/feed_searcher")
|
15
17
|
#=> ["https://github.com/r7kamura/feed_searcher/commits/master.atom"]
|
16
18
|
```
|
19
|
+
|
20
|
+
|
21
|
+
## Internal
|
22
|
+
Let me explain how FeedSearcher works along its execution sequence.
|
23
|
+
|
24
|
+
1. Fetches the HTML source of the given URL
|
25
|
+
2. Finds link elements (represented as XPath format)
|
26
|
+
3. Extracts URLs from the elements via their `href` attributes
|
27
|
+
4. Includes the given URL if its resource itself is a feed
|
28
|
+
5. Converts relative paths to absolute URLs
|
29
|
+
|
30
|
+
FeedSearcher finds link elements matching the following XPath patterns.
|
31
|
+
|
32
|
+
* //link[@rel='alternate'][@type='application/atom+xml']
|
33
|
+
* //link[@rel='alternate'][@type='application/rdf+xml']
|
34
|
+
* //link[@rel='alternate'][@type='application/rss+xml']
|
data/Rakefile
CHANGED
data/feed_searcher.gemspec
CHANGED
@@ -20,8 +20,11 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_dependency "mechanize", ">= 1.0.0"
|
22
22
|
spec.add_dependency "nokogiri"
|
23
|
+
spec.add_development_dependency "activesupport"
|
23
24
|
spec.add_development_dependency "bundler", "~> 1.3"
|
25
|
+
spec.add_development_dependency "pry"
|
24
26
|
spec.add_development_dependency "rake"
|
25
27
|
spec.add_development_dependency "rspec", ">= 2.13.0"
|
28
|
+
spec.add_development_dependency "simplecov"
|
26
29
|
spec.add_development_dependency "webmock"
|
27
30
|
end
|
data/lib/feed_searcher/page.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
class FeedSearcher
|
2
2
|
class Page
|
3
|
+
EXTENSIONS = %w[
|
4
|
+
atom
|
5
|
+
rdf
|
6
|
+
rss
|
7
|
+
]
|
8
|
+
|
3
9
|
MIME_TYPES = %w[
|
4
10
|
application/atom+xml
|
5
11
|
application/rdf+xml
|
@@ -13,12 +19,54 @@ class FeedSearcher
|
|
13
19
|
end
|
14
20
|
|
15
21
|
def feed_urls
|
16
|
-
|
22
|
+
urls = []
|
23
|
+
urls << url if (has_feed_mime_type? || has_feed_extension?) && xml?
|
24
|
+
urls += links.map {|link| link["href"] }
|
17
25
|
end
|
18
26
|
|
19
27
|
private
|
20
28
|
|
21
|
-
def
|
29
|
+
def has_xml_declaration?
|
30
|
+
!!body.index("<?xml")
|
31
|
+
end
|
32
|
+
|
33
|
+
def has_feed_mime_type?
|
34
|
+
MIME_TYPES.include?(mime_type)
|
35
|
+
end
|
36
|
+
|
37
|
+
def has_feed_extension?
|
38
|
+
EXTENSIONS.include?(extension)
|
39
|
+
end
|
40
|
+
|
41
|
+
def parsable_as_xml?
|
42
|
+
!!xml
|
43
|
+
end
|
44
|
+
|
45
|
+
def xml?
|
46
|
+
has_xml_declaration? && parsable_as_xml?
|
47
|
+
end
|
48
|
+
|
49
|
+
def url
|
50
|
+
page.uri.to_s
|
51
|
+
end
|
52
|
+
|
53
|
+
def content_type
|
54
|
+
page.response["content-type"]
|
55
|
+
end
|
56
|
+
|
57
|
+
def mime_type
|
58
|
+
content_type.sub(/;.*\z/, "") if content_type
|
59
|
+
end
|
60
|
+
|
61
|
+
def extension
|
62
|
+
File.extname(page.uri.path).sub(/^\./, "")
|
63
|
+
end
|
64
|
+
|
65
|
+
def body
|
66
|
+
page.body
|
67
|
+
end
|
68
|
+
|
69
|
+
def links
|
22
70
|
root.xpath("//link[@rel='alternate' and (#{types_query})]")
|
23
71
|
end
|
24
72
|
|
@@ -27,7 +75,27 @@ class FeedSearcher
|
|
27
75
|
end
|
28
76
|
|
29
77
|
def root
|
30
|
-
|
78
|
+
xml || html
|
79
|
+
end
|
80
|
+
|
81
|
+
def xml
|
82
|
+
if @xml.nil?
|
83
|
+
@xml = parse_xml
|
84
|
+
else
|
85
|
+
@xml
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def html
|
90
|
+
Nokogiri.HTML(body)
|
91
|
+
end
|
92
|
+
|
93
|
+
def parse_xml
|
94
|
+
Nokogiri.XML(body) do |config|
|
95
|
+
config.options = Nokogiri::XML::ParseOptions::STRICT | Nokogiri::XML::ParseOptions::NOENT
|
96
|
+
end
|
97
|
+
rescue
|
98
|
+
false
|
31
99
|
end
|
32
100
|
end
|
33
101
|
end
|
data/spec/feed_searcher_spec.rb
CHANGED
@@ -1,50 +1,111 @@
|
|
1
1
|
require "spec_helper"
|
2
|
+
require "active_support/core_ext/string/strip"
|
2
3
|
|
3
4
|
describe FeedSearcher do
|
4
5
|
describe ".search" do
|
5
|
-
|
6
|
-
|
7
|
-
:
|
8
|
-
|
9
|
-
|
10
|
-
<
|
11
|
-
<
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
body
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
6
|
+
context "when the specified resource is HTML" do
|
7
|
+
before do
|
8
|
+
stub_request(:get, "http://example.com/").to_return(
|
9
|
+
:body => <<-EOS.strip_heredoc
|
10
|
+
<!DOCTYPE HTML>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<meta charset="UTF-8">
|
14
|
+
<link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
|
15
|
+
<link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
|
16
|
+
<link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
|
17
|
+
<link href="http://example.com/4" rel="alternate" type="application/xml" />
|
18
|
+
<link href="http://example.com/5" rel="resource" type="application/rss+xml" />
|
19
|
+
<link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
|
20
|
+
<link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
|
21
|
+
<link href="/8" rel="alternate" type="application/rss+xml" />
|
22
|
+
</head>
|
23
|
+
<body>
|
24
|
+
body
|
25
|
+
</body>
|
26
|
+
</html>
|
27
|
+
EOS
|
28
|
+
)
|
29
|
+
end
|
30
|
+
|
31
|
+
# This example verifies the following specifications.
|
32
|
+
#
|
33
|
+
# * it recognizes application/atom+xml
|
34
|
+
# * it recognizes application/rdf+xml
|
35
|
+
# * it recognizes application/rss+xml
|
36
|
+
# * it does not recognize application/xml
|
37
|
+
# * it keeps subdomain
|
38
|
+
# * it keeps other domain
|
39
|
+
# * it converts relative path to absolute url
|
40
|
+
#
|
41
|
+
it "returns feed URLs from link elements in the specified resource" do
|
42
|
+
FeedSearcher.search("http://example.com/").should == %w[
|
43
|
+
http://example.com/1
|
44
|
+
http://example.com/2
|
45
|
+
http://example.com/3
|
46
|
+
http://www.example.com/6
|
47
|
+
http://other-example.com/7
|
48
|
+
http://example.com/8
|
49
|
+
]
|
50
|
+
end
|
27
51
|
end
|
28
52
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
53
|
+
context "when the specified resource has feed MIME type and be parsable as XML" do
|
54
|
+
before do
|
55
|
+
stub_request(:get, "http://example.com/").to_return(
|
56
|
+
:headers => { "Content-Type" => "application/rss+xml; charset=UTF-8" },
|
57
|
+
:body => <<-EOS.strip_heredoc
|
58
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
59
|
+
<rss>
|
60
|
+
<channel>
|
61
|
+
<title>title</title>
|
62
|
+
<link>http://exmple.com/</link>
|
63
|
+
<item>
|
64
|
+
<title>item title</title>
|
65
|
+
<link>http://example.com/item</link>
|
66
|
+
<description>item description</description>
|
67
|
+
</item>
|
68
|
+
</channel>
|
69
|
+
</rss>
|
70
|
+
EOS
|
71
|
+
)
|
72
|
+
end
|
73
|
+
|
74
|
+
it "returns itself as a feed url" do
|
75
|
+
FeedSearcher.search("http://example.com/").should == %w[
|
76
|
+
http://example.com/
|
77
|
+
]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context "when the specified resource has feed extension and be parsable as XML" do
|
82
|
+
before do
|
83
|
+
stub_request(:get, "http://example.com/feed.atom").to_return(
|
84
|
+
:body => <<-EOS.strip_heredoc
|
85
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
86
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
87
|
+
<title>title</title>
|
88
|
+
<link rel="self" href="http://example.com/1"/>
|
89
|
+
<link rel="alternate" href="http://example.com/"/>
|
90
|
+
<entry>
|
91
|
+
<title>item title</title>
|
92
|
+
<link rel="alternate" href="http://example.com/"/>
|
93
|
+
<content type="html">
|
94
|
+
<div xmlns="http://www.w3.org/1999/xhtml">
|
95
|
+
<p>item content</p>
|
96
|
+
</div>
|
97
|
+
</content>
|
98
|
+
</entry>
|
99
|
+
</feed>
|
100
|
+
EOS
|
101
|
+
)
|
102
|
+
end
|
103
|
+
|
104
|
+
it "returns itself as a feed url" do
|
105
|
+
FeedSearcher.search("http://example.com/feed.atom").should == %w[
|
106
|
+
http://example.com/feed.atom
|
107
|
+
]
|
108
|
+
end
|
48
109
|
end
|
49
110
|
end
|
50
111
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
3
|
+
<title>title</title>
|
4
|
+
<link rel="self" href="http://example.com/1"/>
|
5
|
+
<link rel="alternate" href="http://example.com/"/>
|
6
|
+
<entry>
|
7
|
+
<title>item title</title>
|
8
|
+
<link rel="alternate" href="http://example.com/"/>
|
9
|
+
<content type="html">
|
10
|
+
<div xmlns="http://www.w3.org/1999/xhtml">
|
11
|
+
<p>item content</p>
|
12
|
+
</div>
|
13
|
+
</content>
|
14
|
+
</entry>
|
15
|
+
</feed>
|
@@ -0,0 +1,18 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
<link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
|
6
|
+
<link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
|
7
|
+
<link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
|
8
|
+
<link href="http://example.com/4" rel="alternate" type="application/xml" />
|
9
|
+
<link href="http://example.com/5" rel="resource" type="application/rss+xml" />
|
10
|
+
<link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
|
11
|
+
<link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
|
12
|
+
<link href="/8" rel="alternate" type="application/rss+xml" />
|
13
|
+
</head>
|
14
|
+
<body>
|
15
|
+
body
|
16
|
+
</body>
|
17
|
+
</html>
|
18
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<rss>
|
3
|
+
<channel>
|
4
|
+
<title>title</title>
|
5
|
+
<link>http://exmple.com/</link>
|
6
|
+
<item>
|
7
|
+
<title>item title</title>
|
8
|
+
<link>http://example.com/item</link>
|
9
|
+
<description>item description</description>
|
10
|
+
</item>
|
11
|
+
</channel>
|
12
|
+
</rss>
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feed_searcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -43,6 +43,22 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: activesupport
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
46
62
|
- !ruby/object:Gem::Dependency
|
47
63
|
name: bundler
|
48
64
|
requirement: !ruby/object:Gem::Requirement
|
@@ -59,6 +75,22 @@ dependencies:
|
|
59
75
|
- - ~>
|
60
76
|
- !ruby/object:Gem::Version
|
61
77
|
version: '1.3'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: pry
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
62
94
|
- !ruby/object:Gem::Dependency
|
63
95
|
name: rake
|
64
96
|
requirement: !ruby/object:Gem::Requirement
|
@@ -91,6 +123,22 @@ dependencies:
|
|
91
123
|
- - ! '>='
|
92
124
|
- !ruby/object:Gem::Version
|
93
125
|
version: 2.13.0
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: simplecov
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
94
142
|
- !ruby/object:Gem::Dependency
|
95
143
|
name: webmock
|
96
144
|
requirement: !ruby/object:Gem::Requirement
|
@@ -115,6 +163,7 @@ extensions: []
|
|
115
163
|
extra_rdoc_files: []
|
116
164
|
files:
|
117
165
|
- .gitignore
|
166
|
+
- .travis.yml
|
118
167
|
- Gemfile
|
119
168
|
- LICENSE.txt
|
120
169
|
- README.md
|
@@ -125,6 +174,9 @@ files:
|
|
125
174
|
- lib/feed_searcher/page.rb
|
126
175
|
- lib/feed_searcher/version.rb
|
127
176
|
- spec/feed_searcher_spec.rb
|
177
|
+
- spec/fixtures/example.atom
|
178
|
+
- spec/fixtures/example.html
|
179
|
+
- spec/fixtures/example.rss
|
128
180
|
- spec/spec_helper.rb
|
129
181
|
homepage: https://github.com/r7kamura/feed_searcher
|
130
182
|
licenses:
|
@@ -153,5 +205,8 @@ specification_version: 3
|
|
153
205
|
summary: Search RSS feed URLs from the given URL
|
154
206
|
test_files:
|
155
207
|
- spec/feed_searcher_spec.rb
|
208
|
+
- spec/fixtures/example.atom
|
209
|
+
- spec/fixtures/example.html
|
210
|
+
- spec/fixtures/example.rss
|
156
211
|
- spec/spec_helper.rb
|
157
212
|
has_rdoc:
|