spix_parser 1.6.1 → 1.6.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -27,11 +27,8 @@ module Spix
27
27
  end
28
28
  private :document
29
29
 
30
- def list
31
- extract_feeds_from_anchors if html?
32
- extract_feeds_from_links
33
- include_it_self if feed?
34
- items
30
+ def list &block
31
+ @document.feed_uris &block
35
32
  end
36
33
 
37
34
  def html?
@@ -39,24 +36,7 @@ module Spix
39
36
  end
40
37
  private :html?
41
38
 
42
- def extract_feeds_from_anchors
43
- document.feed_uris_from_anchors +
44
- document.generic_uris_from_anchors.map { |uri|
45
- FeedDiscovery::Document.new(uri).feed_uris_from_anchors
46
- }.flatten.each { |uri|
47
- items << Feed.new(uri, document.favicon)
48
- }
49
- end
50
- private :extract_feeds_from_anchors
51
-
52
- def extract_feeds_from_links
53
- document.feed_uris_from_links.each { |uri|
54
- items << Feed.new(uri, document.favicon)
55
- }
56
- end
57
- private :extract_feeds_from_links
58
-
59
- def include_it_self
39
+ def include_it_self &block
60
40
  items << Feed.new(uri.to_s, document.favicon)
61
41
  end
62
42
 
@@ -16,53 +16,81 @@ module Spix
16
16
  open(@uri.to_s, req_headers).read
17
17
  end
18
18
  end
19
- private :content
19
+ protected :content
20
20
 
21
- def feed_uris_from_anchors
22
- @document.search("a").select { |anchor|
23
- rss_or_atom_content_type_in? anchor
24
- }.map { |node|
25
- uri = @uri.merge node.get_attribute 'href'
26
- uri.to_s
27
- }
28
- end
29
-
30
- def generic_uris_from_anchors
31
- @document.search("a").select { |anchor|
32
- not rss_or_atom_content_type_in? anchor
33
- }.map { |node|
34
- uri = @uri.merge node.get_attribute 'href'
35
- uri.to_s
36
- }
21
+ def feed_uris &block
22
+ items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
23
+ items << feed_unsing_address(@uri, &block) if feed?
24
+ items
37
25
  end
38
26
 
39
- def feed_uris_from_links
27
+ def feed_uris_from_links &block
40
28
  @document.search(
41
29
  "link[@type='application/atom+xml']",
42
30
  "link[@type='application/rss+xml']"
43
- ).map { |node|
44
- uri = @uri.merge node.get_attribute 'href'
45
- uri.to_s
31
+ ).map { |node| feed_from node, &block }
32
+ end
33
+ private :feed_uris_from_links
34
+
35
+ def feed_uris_from_anchors &block
36
+ @document.search('a').select { |node|
37
+ valid_url_in? node
38
+ }.select { |node|
39
+ rss_or_atom_content_type_in? node
40
+ }.map { |node|
41
+ feed_from node, &block
46
42
  }
47
43
  end
44
+ private :feed_uris_from_anchors
45
+
46
+ def feed_from node, &block
47
+ uri = @uri.merge node.get_attribute 'href'
48
+ feed_unsing_address uri, &block
49
+ end
50
+ private :feed_from
51
+
52
+ def feed_unsing_address uri, &block
53
+ begin
54
+ Feed.new(uri, favicon).tap do |feed|
55
+ block.call feed if block_given?
56
+ end
57
+ rescue => error
58
+ error.tap do |e|
59
+ block.call e if block_given?
60
+ end
61
+ end
62
+ end
63
+ private :feed_unsing_address
48
64
 
65
+ def valid_url_in? anchor
66
+ uri = address_from anchor
67
+ uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
68
+ end
69
+ private :valid_url_in?
70
+
49
71
  def rss_or_atom_content_type_in? anchor
50
72
  req, path = request_and_path_using address_from anchor
51
73
  resp = req.request_head path
52
74
  resp['content-type'] =~ /rss|atom/
75
+ rescue
76
+ true
53
77
  end
54
78
  private :rss_or_atom_content_type_in?
55
79
 
56
- def request_and_path_using address
57
- uri = @uri.merge URI.parse address
80
+ def request_and_path_using uri
58
81
  req = Net::HTTP.new uri.host, uri.port
59
- path = uri - uri.select(:scheme, :host).join("://")
60
- return req, path.to_s
82
+ return req, path_from(uri) || uri.to_s
61
83
  end
62
84
  private :request_and_path_using
63
85
 
86
+ def path_from uri
87
+ path = uri - uri.select(:scheme, :host).join("://")
88
+ path.to_s unless path.to_s.blank?
89
+ end
90
+ private :path_from
91
+
64
92
  def address_from node
65
- node.get_attribute("href")
93
+ @uri.merge URI.parse node.get_attribute("href")
66
94
  end
67
95
  private :address_from
68
96
 
@@ -75,30 +103,70 @@ module Spix
75
103
  end
76
104
 
77
105
  def favicon
78
- shortcuts_in_document or shortcuts_from(base_path) or base_path.merge('favicon.ico').to_s
106
+ shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
79
107
  end
80
108
 
81
- def shortcuts_in_document
82
- shortcuts = @document.search('link[@rel*=shortcut]')
83
- shortcuts.any? ? shortcuts : nil
109
+ def shortcut_in_document
110
+ shortcuts = find_shortcut_in @document
111
+ shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
84
112
  end
85
- private :shortcuts_in_document
113
+ private :shortcut_in_document
114
+
115
+ def shortcut_from_original_page
116
+ if feed?
117
+ if node = @document.search('link').first
118
+ path = URI.parse node.content.strip
119
+ shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
120
+ end
121
+ end
122
+ end
123
+ private :shortcut_from_original_page
86
124
 
87
- def shortcuts_from base_path
88
- doc = Nokogiri::HTML Net::HTTP.get base_path
89
- doc.search('link[@rel*=shortcut]')
125
+ def shortcut_from base_uri
126
+ doc = get base_uri
127
+ shortcuts = find_shortcut_in doc
128
+ shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
90
129
 
91
130
  rescue Net::HTTPError, Net::HTTPFatalError
92
131
  logger.warn "error opening favicon: #{$!}"
93
132
  nil
94
133
  end
95
- private :shortcuts_from
134
+ private :shortcut_from
135
+
136
+ def find_shortcut_in doc
137
+ doc.xpath(
138
+ '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
139
+ '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
140
+ ).map { |node| node.get_attribute "href" }
141
+ end
96
142
 
97
- def base_path
98
- URI.parse @uri.select(:scheme, :host).join("://")
143
+ def default_favico_if_exist
144
+ http = Net::HTTP.new base_uri.host, base_uri.port
145
+ resp = http.request_head 'favicon.ico'
146
+ base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
147
+ rescue
148
+ nil
149
+ end
150
+ private :default_favico_if_exist
151
+
152
+ def get uri
153
+ resp = Net::HTTP.get_response uri
154
+ doc = Nokogiri::HTML(resp.body)
155
+ if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
156
+ path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
157
+ get URI.parse path
158
+ else
159
+ doc
160
+ end
161
+ rescue
162
+ Nokogiri::HTML('')
99
163
  end
100
- private :base_path
101
164
 
165
+ def base_uri
166
+ @base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
167
+ end
168
+ private :base_uri
169
+
102
170
  end
103
171
  end
104
172
  end
@@ -1,15 +1,26 @@
1
1
  module Spix
2
2
  module FeedDiscovery
3
- class Feed < Struct.new(:url, :favicon, :title)
3
+ class Feed < Hash
4
4
 
5
5
  def initialize url, favicon
6
- self.url = url
6
+ self.url = url.to_s
7
7
  self.favicon = favicon
8
8
  self.title = get_title
9
9
  end
10
10
 
11
+ %w[url favicon title].each do |attr|
12
+ define_method attr do
13
+ self[attr.to_sym]
14
+ end
15
+
16
+ define_method "#{attr}=" do |value|
17
+ self[attr.to_sym] = value
18
+ end
19
+ end
20
+
11
21
  def get_title
12
- content.search('title').first.content
22
+ node = content.search('title').first
23
+ node.content if node
13
24
  end
14
25
  private :get_title
15
26
 
@@ -22,9 +33,12 @@ module Spix
22
33
  private :content
23
34
 
24
35
  def uri
25
- URI.parse url
36
+ @uri ||= URI.parse url
26
37
  end
27
38
  private :uri
39
+
40
+ def to_hash
41
+ end
28
42
  end
29
43
  end
30
44
  end
@@ -11,9 +11,11 @@ module Spix
11
11
  Spix::Parser.parse(uri, :mode => :fetch) ? true : false
12
12
  end
13
13
 
14
- def list uri
14
+ def list uri, &block
15
15
  page = Base.new uri
16
- page.list
16
+ page.list &block
17
+ rescue => error
18
+ [error]
17
19
  end
18
20
 
19
21
  end
@@ -4,7 +4,7 @@ module Spix
4
4
  module Version
5
5
  MAJOR = 1
6
6
  MINOR = 6
7
- TINY = 1
7
+ TINY = 4
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
@@ -4,22 +4,27 @@ describe Spix::FeedDiscovery::Document do
4
4
 
5
5
  let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
6
6
 
7
- describe '#feed_uris_from_anchors' do
8
- it 'should return only uris from anchors' do
9
- document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
10
- end
11
- end
7
+ describe '#feed_uris' do
12
8
 
13
- describe '#feed_uris_from_links' do
14
- it 'should return only uris from links' do
15
- document.feed_uris_from_links.should eql expected_uris_inside('link.has_feed')
16
- end
17
- end
9
+ context 'when the uri exists' do
10
+
11
+ it 'should return only feed uris' do
12
+ document.should_receive(:feed_uris).and_return expected_feeds
13
+ document.feed_uris
14
+ end
15
+
16
+ it 'should yield feed with correct uri content' do
17
+ document.feed_uris do |feed|
18
+ expected_feeds.should include(feed)
19
+ end
20
+ end
21
+
22
+ before :each do
23
+ stub_requests
24
+ end
18
25
 
19
- describe '#generic_uris' do
20
- it 'should return only ordinary uris from anchors' do
21
- document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
22
26
  end
27
+
23
28
  end
24
29
 
25
30
  describe '#html?' do
@@ -34,6 +39,11 @@ describe Spix::FeedDiscovery::Document do
34
39
  Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
35
40
  document.html?.should eql false
36
41
  end
42
+
43
+ before :each do
44
+ stub_requests
45
+ end
46
+
37
47
  end
38
48
 
39
49
  describe '#feed?' do
@@ -48,21 +58,32 @@ describe Spix::FeedDiscovery::Document do
48
58
  Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
49
59
  document.feed?.should eql false
50
60
  end
61
+
62
+ before :each do
63
+ stub_requests
64
+ end
51
65
  end
52
66
 
53
67
  before :all do
54
- @rss_uri = 'http://myfeed.com/rss_list.html'
68
+ @domain = 'http://diveintomark.org'
69
+ @rss_uri = @domain + '/rss_list.html'
55
70
  @content = load_fixture("rss_list.html")
56
71
  @document = Nokogiri::XML(@content)
57
72
  end
58
73
 
59
- before :each do
60
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
61
- FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
62
- FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
63
- end
64
74
  end
65
75
 
66
- def expected_uris_inside to_search
67
- @document.search(to_search).map { |node| URI.parse(@rss_uri).merge(node.get_attribute('href')).to_s }
76
+ def stub_requests
77
+ Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
78
+ %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].each { |path|
79
+ FakeWeb.register_uri(:head, @domain + '/' + path, :content_type => 'application/atom+xml')
80
+ FakeWeb.register_uri(:get, @domain + '/' + path, :body => load_fixture(path))
81
+ }
82
+ FakeWeb.register_uri(:head, @domain + '/generic.html', :content_type => 'text/html' )
83
+ end
84
+
85
+ def expected_feeds
86
+ @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
87
+ Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
88
+ }
68
89
  end
@@ -4,7 +4,8 @@ describe Spix::FeedDiscovery, "#list" do
4
4
 
5
5
  describe "when the feed have an absolute URI" do
6
6
  it "should return the feed url" do
7
- fake_requests_for :path_inside_content => '/html4-002.html',
7
+ fake_requests_for :ignore => ['/html4-002.html'],
8
+ :accept => ['/tests/client/autodiscovery/html4-001.xml'],
8
9
  :resource_path => @domain_url,
9
10
  :content => load_fixture("absolute_uri.html")
10
11
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
@@ -14,7 +15,8 @@ describe Spix::FeedDiscovery, "#list" do
14
15
  describe "when the feed have a relative URI" do
15
16
  describe "which is relative to a path" do
16
17
  it "should return the feed url when the URI is at the top domain" do
17
- fake_requests_for :path_inside_content => '/html4-003.html',
18
+ fake_requests_for :ignore => ['/html4-003.html'],
19
+ :accept => ['/html4-002.xml'],
18
20
  :resource_path => @domain_url,
19
21
  :content => load_fixture("relative_uri.html")
20
22
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
@@ -23,7 +25,8 @@ describe Spix::FeedDiscovery, "#list" do
23
25
  it "should return the feed url when the URI is inside a path" do
24
26
  @path_url = "/foo/bar/"
25
27
  @feed_url = @domain_url + @path_url
26
- fake_requests_for :path_inside_content => 'html4-003.html',
28
+ fake_requests_for :ignore => ['html4-003.html'],
29
+ :accept => ['html4-002.xml'],
27
30
  :resource_path => @feed_url,
28
31
  :content => load_fixture('relative_uri.html')
29
32
  Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "html4-002.xml"
@@ -32,7 +35,8 @@ describe Spix::FeedDiscovery, "#list" do
32
35
 
33
36
  describe "which is relative to the top domain" do
34
37
  it "should return the feed url when the URI is at the top domain" do
35
- fake_requests_for :path_inside_content => '/html4-004.html',
38
+ fake_requests_for :ignore => ['/html4-004.html'],
39
+ :accept => ['/html4-003.xml'],
36
40
  :resource_path => @domain_url,
37
41
  :content => load_fixture("relative_uri_top_domain.html")
38
42
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -42,7 +46,8 @@ describe Spix::FeedDiscovery, "#list" do
42
46
  @path_url = "/foo/bar/"
43
47
  @feed_url = @domain_url + @path_url
44
48
 
45
- fake_requests_for :path_inside_content => 'html4-004.html',
49
+ fake_requests_for :ignore => ['/html4-004.html'],
50
+ :accept => ['/html4-003.xml'],
46
51
  :resource_path => @feed_url,
47
52
  :content => load_fixture("relative_uri_top_domain.html")
48
53
  Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -100,16 +105,27 @@ describe Spix::FeedDiscovery, "#list" do
100
105
  end
101
106
 
102
107
  before(:all) do
103
- @domain_url = "http://sitewithfeed.com"
108
+ @domain_url = "http://diveintomark.org"
104
109
  end
105
110
 
106
111
  end
107
112
 
108
113
  def fake_requests_for options = {}
114
+
109
115
  content = options.delete(:content)
110
- path_inside_content = options.delete(:path_inside_content)
116
+ ignore = options.delete(:ignore)
117
+ accept = options.delete(:accept)
111
118
  resource_path = options.delete(:resource_path)
119
+
112
120
  FakeWeb.register_uri(:get, resource_path, :body => content)
113
- FakeWeb.register_uri(:head, resource_path + path_inside_content, :content => 'text/html')
114
- FakeWeb.register_uri(:get, resource_path + path_inside_content, :content => content)
121
+
122
+ ignore.each do |path|
123
+ FakeWeb.register_uri(:head, resource_path + path, :content => 'text/html')
124
+ end
125
+
126
+ accept.each do |path|
127
+ FakeWeb.register_uri(:head, resource_path + path, :content => 'application/atom+xml')
128
+ FakeWeb.register_uri(:get, resource_path + path, :body => load_fixture('feed.rss'))
129
+ end
130
+
115
131
  end
metadata CHANGED
@@ -2,9 +2,10 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.6.1
5
+ version: 1.6.4
6
6
  platform: ruby
7
7
  authors:
8
+ - Marcio Lopes de Faria
8
9
  - Marcelo Eden
9
10
  - Fabio Mont'Alegre
10
11
  - "Lucas H\xC3\xBAngaro"
@@ -13,7 +14,7 @@ autorequire:
13
14
  bindir: bin
14
15
  cert_chain: []
15
16
 
16
- date: 2011-05-31 00:00:00 -03:00
17
+ date: 2011-06-03 00:00:00 -03:00
17
18
  default_executable:
18
19
  dependencies:
19
20
  - !ruby/object:Gem::Dependency
@@ -117,7 +118,6 @@ files:
117
118
  - spec/spix_parser/tools/feed_discovery/document_spec.rb
118
119
  - spec/spix_parser/tools/feed_discovery/feed_spec.rb
119
120
  - spec/spix_parser/tools/feed_discovery_spec.rb
120
- - spec/spix_parser/tools/feed_list_spec.rb
121
121
  - spec/spix_parser/utils_spec.rb
122
122
  has_rdoc: true
123
123
  homepage: http://github.com/busk/spix_parser
@@ -153,5 +153,4 @@ test_files:
153
153
  - spec/spix_parser/tools/feed_discovery/document_spec.rb
154
154
  - spec/spix_parser/tools/feed_discovery/feed_spec.rb
155
155
  - spec/spix_parser/tools/feed_discovery_spec.rb
156
- - spec/spix_parser/tools/feed_list_spec.rb
157
156
  - spec/spix_parser/utils_spec.rb
@@ -1,17 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Spix::FeedDiscoveryList do
4
-
5
- let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
6
-
7
- it "should inherit from array" do
8
- feed_discovery_list.class.superclass.should == Array
9
- end
10
-
11
- describe "#invalids" do
12
- it "should return an empty array from invalids accessor method" do
13
- feed_discovery_list.invalids.should == []
14
- end
15
- end
16
-
17
- end