feed_searcher 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/README.md +18 -0
- data/Rakefile +4 -0
- data/feed_searcher.gemspec +3 -0
- data/lib/feed_searcher/page.rb +71 -3
- data/lib/feed_searcher/version.rb +1 -1
- data/spec/feed_searcher_spec.rb +102 -41
- data/spec/fixtures/example.atom +15 -0
- data/spec/fixtures/example.html +18 -0
- data/spec/fixtures/example.rss +12 -0
- data/spec/spec_helper.rb +3 -0
- metadata +57 -2
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# FeedSearcher
|
2
2
|
Search RSS feed URLs from the given URL.
|
3
3
|
|
4
|
+
[Build Status](https://travis-ci.org/fastladder/feed_searcher)
|
5
|
+
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
```
|
@@ -14,3 +16,19 @@ require "feed_searcher"
|
|
14
16
|
FeedSearcher.search("https://github.com/r7kamura/feed_searcher")
|
15
17
|
#=> ["https://github.com/r7kamura/feed_searcher/commits/master.atom"]
|
16
18
|
```
|
19
|
+
|
20
|
+
|
21
|
+
## Internal
|
22
|
+
This section explains how FeedSearcher works, following its execution sequence.
|
23
|
+
|
24
|
+
1. Fetches the HTML source of the given URL
|
25
|
+
2. Finds link elements (represented as XPath format)
|
26
|
+
3. Extracts URLs from the elements via their `href` attributes
|
27
|
+
4. Includes the given URL if its resource itself is a feed
|
28
|
+
5. Converts relative paths to absolute paths
|
29
|
+
|
30
|
+
FeedSearcher finds link elements matching the following XPath patterns.
|
31
|
+
|
32
|
+
* //link[@rel='alternate'][@type='application/atom+xml']
|
33
|
+
* //link[@rel='alternate'][@type='application/rdf+xml']
|
34
|
+
* //link[@rel='alternate'][@type='application/rss+xml']
|
data/Rakefile
CHANGED
data/feed_searcher.gemspec
CHANGED
@@ -20,8 +20,11 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_dependency "mechanize", ">= 1.0.0"
|
22
22
|
spec.add_dependency "nokogiri"
|
23
|
+
spec.add_development_dependency "activesupport"
|
23
24
|
spec.add_development_dependency "bundler", "~> 1.3"
|
25
|
+
spec.add_development_dependency "pry"
|
24
26
|
spec.add_development_dependency "rake"
|
25
27
|
spec.add_development_dependency "rspec", ">= 2.13.0"
|
28
|
+
spec.add_development_dependency "simplecov"
|
26
29
|
spec.add_development_dependency "webmock"
|
27
30
|
end
|
data/lib/feed_searcher/page.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
class FeedSearcher
|
2
2
|
class Page
|
3
|
+
EXTENSIONS = %w[
|
4
|
+
atom
|
5
|
+
rdf
|
6
|
+
rss
|
7
|
+
]
|
8
|
+
|
3
9
|
MIME_TYPES = %w[
|
4
10
|
application/atom+xml
|
5
11
|
application/rdf+xml
|
@@ -13,12 +19,54 @@ class FeedSearcher
|
|
13
19
|
end
|
14
20
|
|
15
21
|
def feed_urls
|
16
|
-
|
22
|
+
urls = []
|
23
|
+
urls << url if (has_feed_mime_type? || has_feed_extension?) && xml?
|
24
|
+
urls += links.map {|link| link["href"] }
|
17
25
|
end
|
18
26
|
|
19
27
|
private
|
20
28
|
|
21
|
-
def
|
29
|
+
def has_xml_declaration?
|
30
|
+
!!body.index("<?xml")
|
31
|
+
end
|
32
|
+
|
33
|
+
def has_feed_mime_type?
|
34
|
+
MIME_TYPES.include?(mime_type)
|
35
|
+
end
|
36
|
+
|
37
|
+
def has_feed_extension?
|
38
|
+
EXTENSIONS.include?(extension)
|
39
|
+
end
|
40
|
+
|
41
|
+
def parsable_as_xml?
|
42
|
+
!!xml
|
43
|
+
end
|
44
|
+
|
45
|
+
def xml?
|
46
|
+
has_xml_declaration? && parsable_as_xml?
|
47
|
+
end
|
48
|
+
|
49
|
+
def url
|
50
|
+
page.uri.to_s
|
51
|
+
end
|
52
|
+
|
53
|
+
def content_type
|
54
|
+
page.response["content-type"]
|
55
|
+
end
|
56
|
+
|
57
|
+
def mime_type
|
58
|
+
content_type.sub(/;.*\z/, "") if content_type
|
59
|
+
end
|
60
|
+
|
61
|
+
def extension
|
62
|
+
File.extname(page.uri.path).sub(/^\./, "")
|
63
|
+
end
|
64
|
+
|
65
|
+
def body
|
66
|
+
page.body
|
67
|
+
end
|
68
|
+
|
69
|
+
def links
|
22
70
|
root.xpath("//link[@rel='alternate' and (#{types_query})]")
|
23
71
|
end
|
24
72
|
|
@@ -27,7 +75,27 @@ class FeedSearcher
|
|
27
75
|
end
|
28
76
|
|
29
77
|
def root
|
30
|
-
|
78
|
+
xml || html
|
79
|
+
end
|
80
|
+
|
81
|
+
def xml
|
82
|
+
if @xml.nil?
|
83
|
+
@xml = parse_xml
|
84
|
+
else
|
85
|
+
@xml
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def html
|
90
|
+
Nokogiri.HTML(body)
|
91
|
+
end
|
92
|
+
|
93
|
+
def parse_xml
|
94
|
+
Nokogiri.XML(body) do |config|
|
95
|
+
config.options = Nokogiri::XML::ParseOptions::STRICT | Nokogiri::XML::ParseOptions::NOENT
|
96
|
+
end
|
97
|
+
rescue
|
98
|
+
false
|
31
99
|
end
|
32
100
|
end
|
33
101
|
end
|
data/spec/feed_searcher_spec.rb
CHANGED
@@ -1,50 +1,111 @@
|
|
1
1
|
require "spec_helper"
|
2
|
+
require "active_support/core_ext/string/strip"
|
2
3
|
|
3
4
|
describe FeedSearcher do
|
4
5
|
describe ".search" do
|
5
|
-
|
6
|
-
|
7
|
-
:
|
8
|
-
|
9
|
-
|
10
|
-
<
|
11
|
-
<
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
body
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
6
|
+
context "when the specified resource is HTML" do
|
7
|
+
before do
|
8
|
+
stub_request(:get, "http://example.com/").to_return(
|
9
|
+
:body => <<-EOS.strip_heredoc
|
10
|
+
<!DOCTYPE HTML>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<meta charset="UTF-8">
|
14
|
+
<link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
|
15
|
+
<link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
|
16
|
+
<link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
|
17
|
+
<link href="http://example.com/4" rel="alternate" type="application/xml" />
|
18
|
+
<link href="http://example.com/5" rel="resource" type="application/rss+xml" />
|
19
|
+
<link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
|
20
|
+
<link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
|
21
|
+
<link href="/8" rel="alternate" type="application/rss+xml" />
|
22
|
+
</head>
|
23
|
+
<body>
|
24
|
+
body
|
25
|
+
</body>
|
26
|
+
</html>
|
27
|
+
EOS
|
28
|
+
)
|
29
|
+
end
|
30
|
+
|
31
|
+
# This example verifies the following specifications.
|
32
|
+
#
|
33
|
+
# * it recognizes application/atom+xml
|
34
|
+
# * it recognizes application/rdf+xml
|
35
|
+
# * it recognizes application/rss+xml
|
36
|
+
# * it does not recognize application/xml
|
37
|
+
# * it keeps subdomain
|
38
|
+
# * it keeps other domain
|
39
|
+
# * it converts relative path to absolute url
|
40
|
+
#
|
41
|
+
it "returns feed URLs from link elements in the specified resource" do
|
42
|
+
FeedSearcher.search("http://example.com/").should == %w[
|
43
|
+
http://example.com/1
|
44
|
+
http://example.com/2
|
45
|
+
http://example.com/3
|
46
|
+
http://www.example.com/6
|
47
|
+
http://other-example.com/7
|
48
|
+
http://example.com/8
|
49
|
+
]
|
50
|
+
end
|
27
51
|
end
|
28
52
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
53
|
+
context "when the specified resource has feed MIME type and be parsable as XML" do
|
54
|
+
before do
|
55
|
+
stub_request(:get, "http://example.com/").to_return(
|
56
|
+
:headers => { "Content-Type" => "application/rss+xml; charset=UTF-8" },
|
57
|
+
:body => <<-EOS.strip_heredoc
|
58
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
59
|
+
<rss>
|
60
|
+
<channel>
|
61
|
+
<title>title</title>
|
62
|
+
<link>http://exmple.com/</link>
|
63
|
+
<item>
|
64
|
+
<title>item title</title>
|
65
|
+
<link>http://example.com/item</link>
|
66
|
+
<description>item description</description>
|
67
|
+
</item>
|
68
|
+
</channel>
|
69
|
+
</rss>
|
70
|
+
EOS
|
71
|
+
)
|
72
|
+
end
|
73
|
+
|
74
|
+
it "returns itself as a feed url" do
|
75
|
+
FeedSearcher.search("http://example.com/").should == %w[
|
76
|
+
http://example.com/
|
77
|
+
]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context "when the specified resource has feed extension and be parsable as XML" do
|
82
|
+
before do
|
83
|
+
stub_request(:get, "http://example.com/feed.atom").to_return(
|
84
|
+
:body => <<-EOS.strip_heredoc
|
85
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
86
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
87
|
+
<title>title</title>
|
88
|
+
<link rel="self" href="http://example.com/1"/>
|
89
|
+
<link rel="alternate" href="http://example.com/"/>
|
90
|
+
<entry>
|
91
|
+
<title>item title</title>
|
92
|
+
<link rel="alternate" href="http://example.com/"/>
|
93
|
+
<content type="html">
|
94
|
+
<div xmlns="http://www.w3.org/1999/xhtml">
|
95
|
+
<p>item content</p>
|
96
|
+
</div>
|
97
|
+
</content>
|
98
|
+
</entry>
|
99
|
+
</feed>
|
100
|
+
EOS
|
101
|
+
)
|
102
|
+
end
|
103
|
+
|
104
|
+
it "returns itself as a feed url" do
|
105
|
+
FeedSearcher.search("http://example.com/feed.atom").should == %w[
|
106
|
+
http://example.com/feed.atom
|
107
|
+
]
|
108
|
+
end
|
48
109
|
end
|
49
110
|
end
|
50
111
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
3
|
+
<title>title</title>
|
4
|
+
<link rel="self" href="http://example.com/1"/>
|
5
|
+
<link rel="alternate" href="http://example.com/"/>
|
6
|
+
<entry>
|
7
|
+
<title>item title</title>
|
8
|
+
<link rel="alternate" href="http://example.com/"/>
|
9
|
+
<content type="html">
|
10
|
+
<div xmlns="http://www.w3.org/1999/xhtml">
|
11
|
+
<p>item content</p>
|
12
|
+
</div>
|
13
|
+
</content>
|
14
|
+
</entry>
|
15
|
+
</feed>
|
@@ -0,0 +1,18 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
<link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
|
6
|
+
<link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
|
7
|
+
<link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
|
8
|
+
<link href="http://example.com/4" rel="alternate" type="application/xml" />
|
9
|
+
<link href="http://example.com/5" rel="resource" type="application/rss+xml" />
|
10
|
+
<link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
|
11
|
+
<link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
|
12
|
+
<link href="/8" rel="alternate" type="application/rss+xml" />
|
13
|
+
</head>
|
14
|
+
<body>
|
15
|
+
body
|
16
|
+
</body>
|
17
|
+
</html>
|
18
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<rss>
|
3
|
+
<channel>
|
4
|
+
<title>title</title>
|
5
|
+
<link>http://exmple.com/</link>
|
6
|
+
<item>
|
7
|
+
<title>item title</title>
|
8
|
+
<link>http://example.com/item</link>
|
9
|
+
<description>item description</description>
|
10
|
+
</item>
|
11
|
+
</channel>
|
12
|
+
</rss>
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feed_searcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -43,6 +43,22 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: activesupport
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
46
62
|
- !ruby/object:Gem::Dependency
|
47
63
|
name: bundler
|
48
64
|
requirement: !ruby/object:Gem::Requirement
|
@@ -59,6 +75,22 @@ dependencies:
|
|
59
75
|
- - ~>
|
60
76
|
- !ruby/object:Gem::Version
|
61
77
|
version: '1.3'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: pry
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
62
94
|
- !ruby/object:Gem::Dependency
|
63
95
|
name: rake
|
64
96
|
requirement: !ruby/object:Gem::Requirement
|
@@ -91,6 +123,22 @@ dependencies:
|
|
91
123
|
- - ! '>='
|
92
124
|
- !ruby/object:Gem::Version
|
93
125
|
version: 2.13.0
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: simplecov
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
94
142
|
- !ruby/object:Gem::Dependency
|
95
143
|
name: webmock
|
96
144
|
requirement: !ruby/object:Gem::Requirement
|
@@ -115,6 +163,7 @@ extensions: []
|
|
115
163
|
extra_rdoc_files: []
|
116
164
|
files:
|
117
165
|
- .gitignore
|
166
|
+
- .travis.yml
|
118
167
|
- Gemfile
|
119
168
|
- LICENSE.txt
|
120
169
|
- README.md
|
@@ -125,6 +174,9 @@ files:
|
|
125
174
|
- lib/feed_searcher/page.rb
|
126
175
|
- lib/feed_searcher/version.rb
|
127
176
|
- spec/feed_searcher_spec.rb
|
177
|
+
- spec/fixtures/example.atom
|
178
|
+
- spec/fixtures/example.html
|
179
|
+
- spec/fixtures/example.rss
|
128
180
|
- spec/spec_helper.rb
|
129
181
|
homepage: https://github.com/r7kamura/feed_searcher
|
130
182
|
licenses:
|
@@ -153,5 +205,8 @@ specification_version: 3
|
|
153
205
|
summary: Search RSS feed URLs from the given URL
|
154
206
|
test_files:
|
155
207
|
- spec/feed_searcher_spec.rb
|
208
|
+
- spec/fixtures/example.atom
|
209
|
+
- spec/fixtures/example.html
|
210
|
+
- spec/fixtures/example.rss
|
156
211
|
- spec/spec_helper.rb
|
157
212
|
has_rdoc:
|