feed_searcher 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - 2.0.0
4
+ branches:
5
+ only:
6
+ - master
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # FeedSearcher
2
2
  Search RSS feed URLs from the given URL.
3
3
 
4
+ [![Build Status](https://travis-ci.org/fastladder/feed_searcher.png?branch=master)](https://travis-ci.org/fastladder/feed_searcher)
5
+
4
6
 
5
7
  ## Installation
6
8
  ```
@@ -14,3 +16,19 @@ require "feed_searcher"
14
16
  FeedSearcher.search("https://github.com/r7kamura/feed_searcher")
15
17
  #=> ["https://github.com/r7kamura/feed_searcher/commits/master.atom"]
16
18
  ```
19
+
20
+
21
+ ## Internal
22
+ This section explains how FeedSearcher works, following its execution sequence.
23
+
24
+ 1. Fetches the HTML source of the given URL
25
+ 2. Finds link elements (selected by the XPath patterns listed below)
26
+ 3. Extracts URLs from the elements via its `href` attribute
27
+ 4. Includes the given URL if its resource itself is a feed
28
+ 5. Converts relative paths to absolute URLs
29
+
30
+ FeedSearcher finds link elements matching the following XPath patterns.
31
+
32
+ * //link[@rel='alternate'][@type='application/atom+xml']
33
+ * //link[@rel='alternate'][@type='application/rdf+xml']
34
+ * //link[@rel='alternate'][@type='application/rss+xml']
data/Rakefile CHANGED
@@ -1 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task :default => :spec
@@ -20,8 +20,11 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_dependency "mechanize", ">= 1.0.0"
22
22
  spec.add_dependency "nokogiri"
23
+ spec.add_development_dependency "activesupport"
23
24
  spec.add_development_dependency "bundler", "~> 1.3"
25
+ spec.add_development_dependency "pry"
24
26
  spec.add_development_dependency "rake"
25
27
  spec.add_development_dependency "rspec", ">= 2.13.0"
28
+ spec.add_development_dependency "simplecov"
26
29
  spec.add_development_dependency "webmock"
27
30
  end
@@ -1,5 +1,11 @@
1
1
  class FeedSearcher
2
2
  class Page
3
+ EXTENSIONS = %w[
4
+ atom
5
+ rdf
6
+ rss
7
+ ]
8
+
3
9
  MIME_TYPES = %w[
4
10
  application/atom+xml
5
11
  application/rdf+xml
@@ -13,12 +19,54 @@ class FeedSearcher
13
19
  end
14
20
 
15
21
  def feed_urls
16
- feed_attributes.map {|attribute| attribute["href"] }
22
+ urls = []
23
+ urls << url if (has_feed_mime_type? || has_feed_extension?) && xml?
24
+ urls += links.map {|link| link["href"] }
17
25
  end
18
26
 
19
27
  private
20
28
 
21
- def feed_attributes
29
+ def has_xml_declaration?
30
+ !!body.index("<?xml")
31
+ end
32
+
33
+ def has_feed_mime_type?
34
+ MIME_TYPES.include?(mime_type)
35
+ end
36
+
37
+ def has_feed_extension?
38
+ EXTENSIONS.include?(extension)
39
+ end
40
+
41
+ def parsable_as_xml?
42
+ !!xml
43
+ end
44
+
45
+ def xml?
46
+ has_xml_declaration? && parsable_as_xml?
47
+ end
48
+
49
+ def url
50
+ page.uri.to_s
51
+ end
52
+
53
+ def content_type
54
+ page.response["content-type"]
55
+ end
56
+
57
+ def mime_type
58
+ content_type.sub(/;.*\z/, "") if content_type
59
+ end
60
+
61
+ def extension
62
+ File.extname(page.uri.path).sub(/^\./, "")
63
+ end
64
+
65
+ def body
66
+ page.body
67
+ end
68
+
69
+ def links
22
70
  root.xpath("//link[@rel='alternate' and (#{types_query})]")
23
71
  end
24
72
 
@@ -27,7 +75,27 @@ class FeedSearcher
27
75
  end
28
76
 
29
77
  def root
30
- Nokogiri.HTML(page.body)
78
+ xml || html
79
+ end
80
+
81
+ def xml
82
+ if @xml.nil?
83
+ @xml = parse_xml
84
+ else
85
+ @xml
86
+ end
87
+ end
88
+
89
+ def html
90
+ Nokogiri.HTML(body)
91
+ end
92
+
93
+ def parse_xml
94
+ Nokogiri.XML(body) do |config|
95
+ config.options = Nokogiri::XML::ParseOptions::STRICT | Nokogiri::XML::ParseOptions::NOENT
96
+ end
97
+ rescue
98
+ false
31
99
  end
32
100
  end
33
101
  end
@@ -1,3 +1,3 @@
1
1
  class FeedSearcher
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -1,50 +1,111 @@
1
1
  require "spec_helper"
2
+ require "active_support/core_ext/string/strip"
2
3
 
3
4
  describe FeedSearcher do
4
5
  describe ".search" do
5
- before do
6
- stub_request(:get, "http://example.com/").to_return(
7
- :body => <<-EOF
8
- <!DOCTYPE HTML>
9
- <html>
10
- <head>
11
- <meta charset="UTF-8">
12
- <link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
13
- <link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
14
- <link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
15
- <link href="http://example.com/4" rel="alternate" type="application/xml" />
16
- <link href="http://example.com/5" rel="resource" type="application/rss+xml" />
17
- <link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
18
- <link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
19
- <link href="/8" rel="alternate" type="application/rss+xml" />
20
- </head>
21
- <body>
22
- body
23
- </body>
24
- </html>
25
- EOF
26
- )
6
+ context "when the specified resource is HTML" do
7
+ before do
8
+ stub_request(:get, "http://example.com/").to_return(
9
+ :body => <<-EOS.strip_heredoc
10
+ <!DOCTYPE HTML>
11
+ <html>
12
+ <head>
13
+ <meta charset="UTF-8">
14
+ <link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
15
+ <link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
16
+ <link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
17
+ <link href="http://example.com/4" rel="alternate" type="application/xml" />
18
+ <link href="http://example.com/5" rel="resource" type="application/rss+xml" />
19
+ <link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
20
+ <link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
21
+ <link href="/8" rel="alternate" type="application/rss+xml" />
22
+ </head>
23
+ <body>
24
+ body
25
+ </body>
26
+ </html>
27
+ EOS
28
+ )
29
+ end
30
+
31
+ # This example verifies the following specifications.
32
+ #
33
+ # * it recognizes application/atom+xml
34
+ # * it recognizes application/rdf+xml
35
+ # * it recognizes application/rss+xml
36
+ # * it does not recognize application/xml
37
+ # * it keeps subdomain
38
+ # * it keeps other domain
39
+ # * it converts relative path to absolute url
40
+ #
41
+ it "returns feed URLs from link elements in the specified resource" do
42
+ FeedSearcher.search("http://example.com/").should == %w[
43
+ http://example.com/1
44
+ http://example.com/2
45
+ http://example.com/3
46
+ http://www.example.com/6
47
+ http://other-example.com/7
48
+ http://example.com/8
49
+ ]
50
+ end
27
51
  end
28
52
 
29
- # This example makes sure the following specifications.
30
- #
31
- # * it recognizes application/atom+xml
32
- # * it recognizes application/rdf+xml
33
- # * it recognizes application/rss+xml
34
- # * it does not recognizes application/xml
35
- # * it keeps subdomain
36
- # * it keeps other domain
37
- # * it converts absolute url
38
- #
39
- it "returns feed URLs from given URL" do
40
- FeedSearcher.search("http://example.com/").should == %w[
41
- http://example.com/1
42
- http://example.com/2
43
- http://example.com/3
44
- http://www.example.com/6
45
- http://other-example.com/7
46
- http://example.com/8
47
- ]
53
+ context "when the specified resource has feed MIME type and be parsable as XML" do
54
+ before do
55
+ stub_request(:get, "http://example.com/").to_return(
56
+ :headers => { "Content-Type" => "application/rss+xml; charset=UTF-8" },
57
+ :body => <<-EOS.strip_heredoc
58
+ <?xml version="1.0" encoding="UTF-8"?>
59
+ <rss>
60
+ <channel>
61
+ <title>title</title>
62
+ <link>http://exmple.com/</link>
63
+ <item>
64
+ <title>item title</title>
65
+ <link>http://example.com/item</link>
66
+ <description>item description</description>
67
+ </item>
68
+ </channel>
69
+ </rss>
70
+ EOS
71
+ )
72
+ end
73
+
74
+ it "returns itself as a feed url" do
75
+ FeedSearcher.search("http://example.com/").should == %w[
76
+ http://example.com/
77
+ ]
78
+ end
79
+ end
80
+
81
+ context "when the specified resource has feed extension and be parsable as XML" do
82
+ before do
83
+ stub_request(:get, "http://example.com/feed.atom").to_return(
84
+ :body => <<-EOS.strip_heredoc
85
+ <?xml version="1.0" encoding="UTF-8"?>
86
+ <feed xmlns="http://www.w3.org/2005/Atom">
87
+ <title>title</title>
88
+ <link rel="self" href="http://example.com/1"/>
89
+ <link rel="alternate" href="http://example.com/"/>
90
+ <entry>
91
+ <title>item title</title>
92
+ <link rel="alternate" href="http://example.com/"/>
93
+ <content type="html">
94
+ <div xmlns="http://www.w3.org/1999/xhtml">
95
+ <p>item content</p>
96
+ </div>
97
+ </content>
98
+ </entry>
99
+ </feed>
100
+ EOS
101
+ )
102
+ end
103
+
104
+ it "returns itself as a feed url" do
105
+ FeedSearcher.search("http://example.com/feed.atom").should == %w[
106
+ http://example.com/feed.atom
107
+ ]
108
+ end
48
109
  end
49
110
  end
50
111
  end
@@ -0,0 +1,15 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <feed xmlns="http://www.w3.org/2005/Atom">
3
+ <title>title</title>
4
+ <link rel="self" href="http://example.com/1"/>
5
+ <link rel="alternate" href="http://example.com/"/>
6
+ <entry>
7
+ <title>item title</title>
8
+ <link rel="alternate" href="http://example.com/"/>
9
+ <content type="html">
10
+ <div xmlns="http://www.w3.org/1999/xhtml">
11
+ <p>item content</p>
12
+ </div>
13
+ </content>
14
+ </entry>
15
+ </feed>
@@ -0,0 +1,18 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <link href="http://example.com/1" rel="alternate" type="application/atom+xml" />
6
+ <link href="http://example.com/2" rel="alternate" type="application/rdf+xml" />
7
+ <link href="http://example.com/3" rel="alternate" type="application/rss+xml" />
8
+ <link href="http://example.com/4" rel="alternate" type="application/xml" />
9
+ <link href="http://example.com/5" rel="resource" type="application/rss+xml" />
10
+ <link href="http://www.example.com/6" rel="alternate" type="application/rss+xml" />
11
+ <link href="http://other-example.com/7" rel="alternate" type="application/rss+xml" />
12
+ <link href="/8" rel="alternate" type="application/rss+xml" />
13
+ </head>
14
+ <body>
15
+ body
16
+ </body>
17
+ </html>
18
+
@@ -0,0 +1,12 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <rss>
3
+ <channel>
4
+ <title>title</title>
5
+ <link>http://exmple.com/</link>
6
+ <item>
7
+ <title>item title</title>
8
+ <link>http://example.com/item</link>
9
+ <description>item description</description>
10
+ </item>
11
+ </channel>
12
+ </rss>
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require "simplecov"
2
+ SimpleCov.start
3
+
1
4
  $LOAD_PATH.unshift File.expand_path("../../lib/feed_searcher", __FILE__)
2
5
  require "feed_searcher"
3
6
  require "webmock/rspec"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feed_searcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-17 00:00:00.000000000 Z
12
+ date: 2013-03-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -43,6 +43,22 @@ dependencies:
43
43
  - - ! '>='
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: activesupport
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
46
62
  - !ruby/object:Gem::Dependency
47
63
  name: bundler
48
64
  requirement: !ruby/object:Gem::Requirement
@@ -59,6 +75,22 @@ dependencies:
59
75
  - - ~>
60
76
  - !ruby/object:Gem::Version
61
77
  version: '1.3'
78
+ - !ruby/object:Gem::Dependency
79
+ name: pry
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
62
94
  - !ruby/object:Gem::Dependency
63
95
  name: rake
64
96
  requirement: !ruby/object:Gem::Requirement
@@ -91,6 +123,22 @@ dependencies:
91
123
  - - ! '>='
92
124
  - !ruby/object:Gem::Version
93
125
  version: 2.13.0
126
+ - !ruby/object:Gem::Dependency
127
+ name: simplecov
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :development
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
94
142
  - !ruby/object:Gem::Dependency
95
143
  name: webmock
96
144
  requirement: !ruby/object:Gem::Requirement
@@ -115,6 +163,7 @@ extensions: []
115
163
  extra_rdoc_files: []
116
164
  files:
117
165
  - .gitignore
166
+ - .travis.yml
118
167
  - Gemfile
119
168
  - LICENSE.txt
120
169
  - README.md
@@ -125,6 +174,9 @@ files:
125
174
  - lib/feed_searcher/page.rb
126
175
  - lib/feed_searcher/version.rb
127
176
  - spec/feed_searcher_spec.rb
177
+ - spec/fixtures/example.atom
178
+ - spec/fixtures/example.html
179
+ - spec/fixtures/example.rss
128
180
  - spec/spec_helper.rb
129
181
  homepage: https://github.com/r7kamura/feed_searcher
130
182
  licenses:
@@ -153,5 +205,8 @@ specification_version: 3
153
205
  summary: Search RSS feed URLs from the given URL
154
206
  test_files:
155
207
  - spec/feed_searcher_spec.rb
208
+ - spec/fixtures/example.atom
209
+ - spec/fixtures/example.html
210
+ - spec/fixtures/example.rss
156
211
  - spec/spec_helper.rb
157
212
  has_rdoc: