spix_parser 1.5.4 → 1.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,9 @@
1
# Put this file's directory on the load path so that the relative
# autoload paths registered below can be resolved.
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))

module Spix
  autoload(:FeedDiscovery, 'feed_discovery')

  # Namespace of the feed discovery tool. Its classes are registered for
  # autoloading, so each file is only read the first time its constant is
  # referenced.
  module FeedDiscovery
    autoload(:Base, 'feed_discovery/base')
    autoload(:Feed, 'feed_discovery/feed')
    autoload(:Document, 'feed_discovery/document')
  end
end
@@ -0,0 +1,74 @@
1
module Spix
  module FeedDiscovery
    # Collects the feeds that can be discovered from the document at a URI.
    #
    # Feeds are gathered from the document's anchors (only when the document
    # is HTML), from its <link> elements, and from the document itself when
    # it is a feed.
    class Base
      # HTTP "User-Agent" header to send to servers when downloading feeds.
      USER_AGENT = "SpixParser"

      # The parsed URI this discovery was created for.
      attr_reader :uri

      # The FeedDiscovery::Document built for the URI; internal use only.
      attr_reader :document
      attr_writer :document
      private :document, :document=

      # uri - String address of the page or feed to inspect. A
      #       FeedDiscovery::Document is built for it immediately.
      def initialize(uri)
        self.uri = uri
        self.document = FeedDiscovery::Document.new(uri)
      end

      # Stores the address as a parsed URI object.
      #
      # Raises whatever URI.parse raises for an invalid address.
      def uri=(value)
        @uri = URI.parse(value)
      end

      # Runs the discovery and returns the Array of Feed objects found.
      #
      # The collection is rebuilt from scratch on every call, so calling
      # #list twice does not return every feed twice.
      def list
        @items = []
        extract_feeds_from_anchors if html?
        extract_feeds_from_links
        include_it_self if feed?
        items
      end

      def html?
        document.html?
      end
      private :html?

      # Adds the feeds linked by anchors: both the ones the document points
      # to directly and the ones found one level deeper, in the pages
      # reached through its non-feed anchors.
      #
      # The two lists are concatenated *before* iterating. Without the
      # parentheses `.each` binds only to the nested list and the direct
      # feed anchors are silently dropped.
      def extract_feeds_from_anchors
        direct = document.feed_uris_from_anchors
        nested = document.generic_uris_from_anchors.map { |page_uri|
          FeedDiscovery::Document.new(page_uri).feed_uris_from_anchors
        }.flatten

        (direct + nested).each { |feed_uri| add_feed feed_uri }
      end
      private :extract_feeds_from_anchors

      # Adds the feeds advertised by the document's <link> elements.
      def extract_feeds_from_links
        document.feed_uris_from_links.each { |feed_uri| add_feed feed_uri }
      end
      private :extract_feeds_from_links

      # Adds the inspected URI itself as a feed.
      def include_it_self
        add_feed uri.to_s
      end

      def feed?
        document.feed?
      end
      private :feed?

      # The feeds collected so far (empty until #list has run).
      def items
        @items ||= []
      end

      # Appends a Feed for the given address, using the document's favicon.
      def add_feed(feed_uri)
        items << Feed.new(feed_uri, document.favicon)
      end
      private :add_feed

    end
  end
end
@@ -0,0 +1,104 @@
1
module Spix
  module FeedDiscovery
    # The document found at a URI, parsed with Nokogiri::XML, together with
    # the queries the discovery needs: which addresses it links to, whether
    # it is HTML or a feed, and which favicon belongs to it.
    class Document

      # uri - String address of the document. The content is downloaded and
      #       parsed right away.
      def initialize(uri)
        @uri = URI.parse(uri)
        @document = Nokogiri::XML(content)
      end

      # Downloads the raw content, identifying this client through the
      # "User-Agent" header.
      #
      # The constant lives in Base, so it must be qualified here: a bare
      # USER_AGENT is not visible from this class. The header is also
      # passed on the `read` path, which previously sent none at all.
      def content
        req_headers = { "User-Agent" => Base::USER_AGENT }
        if @uri.respond_to?(:read)
          @uri.read(req_headers)
        else
          # NOTE(review): only reached for URIs that do not respond to
          # `read`; confirm this fallback is still wanted.
          open(@uri.to_s, req_headers).read
        end
      end
      private :content

      # Absolute addresses (Strings) of the anchors whose target answers a
      # HEAD request with an RSS/Atom content type.
      def feed_uris_from_anchors
        absolute_uris_for classified_anchors.first
      end

      # Absolute addresses (Strings) of all the other anchors.
      def generic_uris_from_anchors
        absolute_uris_for classified_anchors.last
      end

      # Absolute addresses (Strings) of the <link> elements typed as Atom
      # or RSS.
      def feed_uris_from_links
        absolute_uris_for @document.search(
          "link[@type='application/atom+xml']",
          "link[@type='application/rss+xml']"
        )
      end

      # Splits the anchors into [feed anchors, generic anchors].
      #
      # Every anchor costs one HEAD request, so the result is memoized and
      # shared by both public anchor methods. Anchors without an href have
      # nothing to probe and are ignored.
      def classified_anchors
        @classified_anchors ||= @document.search("a").select { |anchor|
          address_from anchor
        }.partition { |anchor|
          rss_or_atom_content_type_in? anchor
        }
      end
      private :classified_anchors

      # Resolves the href of each node against the document URI; nodes
      # without an href are skipped.
      def absolute_uris_for(nodes)
        nodes.map { |node| address_from node }.compact.map { |href|
          @uri.merge(href).to_s
        }
      end
      private :absolute_uris_for

      # Truthy when the anchor's target reports a content type containing
      # "rss" or "atom" in response to a HEAD request.
      def rss_or_atom_content_type_in?(anchor)
        req, path = request_and_path_using address_from(anchor)
        resp = req.request_head path
        resp['content-type'] =~ /rss|atom/
      end
      private :rss_or_atom_content_type_in?

      # Returns the (unopened) Net::HTTP connection and the request path
      # for an address, resolved against the document URI.
      #
      # `request_uri` always yields a usable path ("/" when the address has
      # none) and does not depend on the port, which the previous
      # subtraction of "scheme://host" did.
      def request_and_path_using(address)
        uri = @uri.merge URI.parse(address)
        req = Net::HTTP.new uri.host, uri.port
        return req, uri.request_uri
      end
      private :request_and_path_using

      def address_from(node)
        node.get_attribute("href")
      end
      private :address_from

      # True when the root element is <html>.
      def html?
        root_name == "html"
      end

      # True when the root element is <rss> or <feed>.
      def feed?
        %w[rss feed].include? root_name
      end

      # Name of the root element, or nil when the content has no root.
      def root_name
        root = @document.root
        root && root.name
      end
      private :root_name

      # The favicon for this document: the shortcut <link> elements of the
      # document itself, else those of the site's front page, else the
      # conventional "/favicon.ico" address.
      #
      # NOTE(review): the first two sources yield link nodes while the
      # fallback yields a String; confirm what callers expect.
      def favicon
        @favicon ||= shortcuts_in_document ||
                     shortcuts_from(base_path) ||
                     base_path.merge('favicon.ico').to_s
      end

      def shortcuts_in_document
        shortcuts = @document.search('link[@rel*=shortcut]')
        shortcuts.any? ? shortcuts : nil
      end
      private :shortcuts_in_document

      # Shortcut <link> elements of the page at base_path, or nil when
      # there are none or the page cannot be fetched. Returning nil for an
      # empty result matters: an empty node set is truthy and would stop
      # #favicon from ever reaching its "/favicon.ico" fallback.
      def shortcuts_from(base_path)
        doc = Nokogiri::HTML Net::HTTP.get(base_path)
        shortcuts = doc.search('link[@rel*=shortcut]')
        shortcuts.any? ? shortcuts : nil

      rescue StandardError => error
        # The favicon is optional, so a failed lookup is reported rather
        # than raised.
        message = "error opening favicon: #{error}"
        if respond_to?(:logger, true)
          logger.warn message
        else
          warn message
        end
        nil
      end
      private :shortcuts_from

      # The site root (scheme, host and port) of the document URI.
      def base_path
        URI.parse "#{@uri.scheme}://#{@uri.host}:#{@uri.port}"
      end
      private :base_path

    end
  end
end
@@ -0,0 +1,30 @@
1
module Spix
  module FeedDiscovery
    # A discovered feed: its address, the favicon to show for it and the
    # title read from the feed itself.
    class Feed < Struct.new(:url, :favicon, :title)

      # url     - String address of the feed.
      # favicon - favicon to associate with the feed, stored as given.
      #
      # The feed is downloaded right away to read its title.
      def initialize(url, favicon)
        self.url = url
        self.favicon = favicon
        self.title = get_title
      end

      # Text of the first <title> element, or nil when the feed has none
      # (previously this raised NoMethodError on nil).
      def get_title
        title_node = content.search('title').first
        title_node && title_node.content
      end
      private :get_title

      # Downloads the feed with a GET request and parses it as XML.
      #
      # `request_uri` gives the path plus query ("/" when the address has
      # no path) regardless of the port in the address.
      def content
        feed_uri = uri
        req = Net::HTTP.new feed_uri.host, feed_uri.port
        resp = req.request_get feed_uri.request_uri
        Nokogiri::XML(resp.body)
      end
      private :content

      def uri
        URI.parse url
      end
      private :uri
    end
  end
end
@@ -1,94 +1,20 @@
1
- gem "feedzirra", ">=0.0.24"
2
1
  require "feedzirra"
3
2
  require "nokogiri"
4
3
  require "uri"
5
4
  require "open-uri"
6
5
 
7
6
  module Spix
8
- class FeedDiscovery
7
+ module FeedDiscovery
8
+ extend self
9
9
 
10
- # HTTP "User-Agent" header to send to servers when downloading feeds.
11
- USER_AGENT = "SpixParser"
12
-
13
- def self.feed?(uri)
10
+ def feed? uri
14
11
  Spix::Parser.parse(uri, :mode => :fetch) ? true : false
15
12
  end
16
13
 
17
- def self.list(uri)
18
-
19
- content = self.read(uri)
20
-
21
- doc = Nokogiri::HTML(content)
22
-
23
- # get page title
24
- title = doc.search('title')[0].content
25
-
26
- items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
27
- url_object = URI::parse(uri).normalize
28
-
29
- href = link.get_attribute(:href).to_s
30
-
31
- feed_url_object = URI::parse(href)
32
-
33
- if feed_url_object.relative?
34
-
35
- # there's 2 types of relative URIs
36
- # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
37
- # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
38
- if feed_url_object.path.match(/^\//)
39
- # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
40
- path = nil
41
- else
42
- # when the feed_url_object is relative and do not starts with a "/" we should use the domain path
43
-
44
- if url_object.path.match(/\/$/)
45
- # when the url_object ends with a "/" we should use it
46
- path = url_object.path
47
- else
48
- # when the url_object do not ends with a "/" we should add it
49
- path = url_object.path + "/"
50
- end
51
- end
52
-
53
- href = "#{url_object.scheme}://" +
54
- "#{url_object.host}" +
55
- "#{path}" +
56
- "#{url_object.query}" +
57
- href
58
- end
59
-
60
- item = {
61
- :title => link.get_attribute(:title) || title,
62
- :url => href
63
- }
64
-
65
- end
66
-
67
- if items.size == 0
68
- # if there's no item found at the given URI, maybe it's a feed URI
69
- if self.feed?(uri)
70
- items = [
71
- {
72
- :title => title,
73
- :url => uri
74
- }
75
- ]
76
- end
77
- end
78
-
79
- items
80
- rescue
81
- nil
14
+ def list uri
15
+ page = Base.new uri
16
+ page.list
82
17
  end
83
18
 
84
- def self.read(uri)
85
- if uri.respond_to?(:read)
86
- content = uri.read
87
- else
88
- req_headers = {}
89
- req_headers["User-Agent"] = USER_AGENT
90
- content = open(uri, req_headers).read
91
- end
92
- end
93
19
  end
94
- end
20
+ end
@@ -3,8 +3,8 @@ module Spix
3
3
  module Parser
4
4
  module Version
5
5
  MAJOR = 1
6
- MINOR = 5
7
- TINY = 4
6
+ MINOR = 6
7
+ TINY = 1
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
data/lib/spix_parser.rb CHANGED
@@ -29,7 +29,7 @@ require "spix_parser/custom_parsers/atom"
29
29
  require "spix_parser/custom_parsers/rss_entry"
30
30
  require "spix_parser/custom_parsers/rss"
31
31
 
32
- require "spix_parser/tools/feed_discovery"
32
+ require "spix_parser/tools/discovery"
33
33
 
34
34
  if RUBY_VERSION < '1.9'
35
35
  $KCODE='u'
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Spix::FeedDiscovery::Document. The document content is always
# stubbed with a fixture, and the HEAD requests used to classify anchors
# are answered by FakeWeb.
describe Spix::FeedDiscovery::Document do

  before :all do
    @rss_uri  = 'http://myfeed.com/rss_list.html'
    @content  = load_fixture("rss_list.html")
    @document = Nokogiri::XML(@content)
  end

  before :each do
    Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
    FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
    FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html',  :content_type => 'text/html'           )
  end

  let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }

  describe '#feed_uris_from_anchors' do
    it 'should return only uris from anchors' do
      document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
    end
  end

  describe '#feed_uris_from_links' do
    it 'should return only uris from links' do
      document.feed_uris_from_links.should eql expected_uris_inside('link.has_feed')
    end
  end

  # The group is named after the method it actually exercises.
  describe '#generic_uris_from_anchors' do
    it 'should return only ordinary uris from anchors' do
      document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
    end
  end

  describe '#html?' do
    it 'should return true if is a html document' do
      content = load_fixture('rss_list.html')
      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
      document.html?.should eql true
    end

    it 'should return false if is a rss/feed document' do
      content = load_fixture('feed.rss')
      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
      document.html?.should eql false
    end
  end

  describe '#feed?' do
    it 'should return true if a feed document' do
      content = load_fixture('feed.rss')
      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
      document.feed?.should eql true
    end

    # This example loads the HTML fixture, so the description says so.
    it 'should return false if a html document' do
      content = load_fixture('rss_list.html')
      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
      document.feed?.should eql false
    end
  end
end

# Absolute addresses of the fixture nodes matching the selector, resolved
# against the document URI used by the examples above.
def expected_uris_inside to_search
  @document.search(to_search).map { |node| URI.parse(@rss_uri).merge(node.get_attribute('href')).to_s }
end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Spix::FeedDiscovery::Feed. The feed download is answered by
# FakeWeb with the 'feed.rss' fixture.
describe Spix::FeedDiscovery::Feed do

  context 'given a specific uri' do

    before :all do
      @feed_uri    = "http://myfeed.com/feed.rss"
      @favicon_uri = "http://myfeed.com/images/favicon.ico"
      @document    = Nokogiri::XML load_fixture 'feed.rss'
      FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
    end

    let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }

    it 'should set the favicon' do
      feed.favicon.should == @favicon_uri
    end

    it 'should set the url' do
      feed.url.should == @feed_uri
    end

    # The expected title is the first <title> element of the fixture.
    it 'should set title' do
      feed.title.should == @document.search('title').first.content
    end

  end
end
@@ -2,13 +2,11 @@ require 'spec_helper'
2
2
 
3
3
  describe Spix::FeedDiscovery, "#list" do
4
4
 
5
- before(:all) do
6
- @domain_url = "http://sitewithfeed.com"
7
- end
8
-
9
5
  describe "when the feed have an absolute URI" do
10
6
  it "should return the feed url" do
11
- FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
7
+ fake_requests_for :path_inside_content => '/html4-002.html',
8
+ :resource_path => @domain_url,
9
+ :content => load_fixture("absolute_uri.html")
12
10
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
13
11
  end
14
12
  end
@@ -16,30 +14,37 @@ describe Spix::FeedDiscovery, "#list" do
16
14
  describe "when the feed have a relative URI" do
17
15
  describe "which is relative to a path" do
18
16
  it "should return the feed url when the URI is at the top domain" do
19
- FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
17
+ fake_requests_for :path_inside_content => '/html4-003.html',
18
+ :resource_path => @domain_url,
19
+ :content => load_fixture("relative_uri.html")
20
20
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
21
21
  end
22
22
 
23
23
  it "should return the feed url when the URI is inside a path" do
24
- @path_url = "/foo/bar"
24
+ @path_url = "/foo/bar/"
25
25
  @feed_url = @domain_url + @path_url
26
-
27
- FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
28
- Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
26
+ fake_requests_for :path_inside_content => 'html4-003.html',
27
+ :resource_path => @feed_url,
28
+ :content => load_fixture('relative_uri.html')
29
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "html4-002.xml"
29
30
  end
30
31
  end
31
32
 
32
33
  describe "which is relative to the top domain" do
33
34
  it "should return the feed url when the URI is at the top domain" do
34
- FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
35
+ fake_requests_for :path_inside_content => '/html4-004.html',
36
+ :resource_path => @domain_url,
37
+ :content => load_fixture("relative_uri_top_domain.html")
35
38
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
36
39
  end
37
40
 
38
41
  it "should return the feed url when the URI is inside a path" do
39
- @path_url = "/foo/bar"
42
+ @path_url = "/foo/bar/"
40
43
  @feed_url = @domain_url + @path_url
41
44
 
42
- FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
45
+ fake_requests_for :path_inside_content => 'html4-004.html',
46
+ :resource_path => @feed_url,
47
+ :content => load_fixture("relative_uri_top_domain.html")
43
48
  Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
44
49
  end
45
50
  end
@@ -68,5 +73,43 @@ describe Spix::FeedDiscovery, "#list" do
68
73
 
69
74
  Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
70
75
  end
76
+
77
+ it "should return the title of feed" do
78
+ feed_xml = load_fixture("feed_without_self_link.xml")
79
+
80
+ FakeWeb.register_uri(:get, @feed_url, :body => feed_xml)
81
+
82
+ # feedzirra doesn't work with fakeweb
83
+ feed = Feedzirra::Feed.parse(feed_xml)
84
+ Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
85
+
86
+ Spix::FeedDiscovery.list(@feed_url).first[:title].should == feed.feed_title
87
+ end
88
+
89
+ it "should return the title of feed when enclosed in CDATA" do
90
+ feed_xml = load_fixture("feed_with_content_in_cdata.xml")
91
+
92
+ FakeWeb.register_uri(:get, @feed_url, :body => feed_xml)
93
+
94
+ # feedzirra doesn't work with fakeweb
95
+ feed = Feedzirra::Feed.parse(feed_xml)
96
+ Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
97
+
98
+ Spix::FeedDiscovery.list(@feed_url).first[:title].should_not be_empty
99
+ end
71
100
  end
101
+
102
+ before(:all) do
103
+ @domain_url = "http://sitewithfeed.com"
104
+ end
105
+
106
+ end
107
+
108
# Registers the FakeWeb responses a discovery run over a fixture page needs.
#
# options - Hash with:
#           :resource_path       - String address of the page under test.
#           :content             - String body served for that page.
#           :path_inside_content - String path of a page linked from the
#                                  content, appended to :resource_path.
#
# Three responses are registered: a GET for the page itself, and a HEAD
# (answering with an HTML content type) plus a GET for the linked page.
# FakeWeb treats option names it does not know as response headers, so the
# content type and body must be passed as :content_type and :body for them
# to take effect. The options hash is read, not modified.
def fake_requests_for(options = {})
  content       = options[:content]
  resource_path = options[:resource_path]
  linked_page   = resource_path + options[:path_inside_content]

  FakeWeb.register_uri(:get,  resource_path, :body => content)
  FakeWeb.register_uri(:head, linked_page,   :content_type => 'text/html')
  FakeWeb.register_uri(:get,  linked_page,   :body => content)
end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Spix::FeedDiscoveryList: it is expected to subclass Array and
# to expose an `invalids` accessor that starts out empty.
describe Spix::FeedDiscoveryList do

  let(:feed_discovery_list) do
    Spix::FeedDiscoveryList.new
  end

  it "should inherit from array" do
    parent_class = feed_discovery_list.class.superclass
    parent_class.should == Array
  end

  describe "#invalids" do
    it "should return an empty array from invalids accessor method" do
      invalid_entries = feed_discovery_list.invalids
      invalid_entries.should == []
    end
  end

end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.5.4
5
+ version: 1.6.1
6
6
  platform: ruby
7
7
  authors:
8
8
  - Marcelo Eden
@@ -13,7 +13,7 @@ autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
15
 
16
- date: 2011-05-12 00:00:00 -03:00
16
+ date: 2011-05-31 00:00:00 -03:00
17
17
  default_executable:
18
18
  dependencies:
19
19
  - !ruby/object:Gem::Dependency
@@ -100,6 +100,10 @@ files:
100
100
  - lib/spix_parser/custom_parsers/rss_entry.rb
101
101
  - lib/spix_parser/datetime.rb
102
102
  - lib/spix_parser/parser.rb
103
+ - lib/spix_parser/tools/discovery.rb
104
+ - lib/spix_parser/tools/feed_discovery/base.rb
105
+ - lib/spix_parser/tools/feed_discovery/document.rb
106
+ - lib/spix_parser/tools/feed_discovery/feed.rb
103
107
  - lib/spix_parser/tools/feed_discovery.rb
104
108
  - lib/spix_parser/tools/redirect_follower.rb
105
109
  - lib/spix_parser/version.rb
@@ -110,7 +114,10 @@ files:
110
114
  - lib/spix_parser.rb
111
115
  - spec/parser_spec.rb
112
116
  - spec/spix_parser/parser_spec.rb
117
+ - spec/spix_parser/tools/feed_discovery/document_spec.rb
118
+ - spec/spix_parser/tools/feed_discovery/feed_spec.rb
113
119
  - spec/spix_parser/tools/feed_discovery_spec.rb
120
+ - spec/spix_parser/tools/feed_list_spec.rb
114
121
  - spec/spix_parser/utils_spec.rb
115
122
  has_rdoc: true
116
123
  homepage: http://github.com/busk/spix_parser
@@ -143,5 +150,8 @@ summary: FeedParser for Spix
143
150
  test_files:
144
151
  - spec/parser_spec.rb
145
152
  - spec/spix_parser/parser_spec.rb
153
+ - spec/spix_parser/tools/feed_discovery/document_spec.rb
154
+ - spec/spix_parser/tools/feed_discovery/feed_spec.rb
146
155
  - spec/spix_parser/tools/feed_discovery_spec.rb
156
+ - spec/spix_parser/tools/feed_list_spec.rb
147
157
  - spec/spix_parser/utils_spec.rb