spix_parser 1.6.1 → 1.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,11 +27,8 @@ module Spix
27
27
  end
28
28
  private :document
29
29
 
30
- def list
31
- extract_feeds_from_anchors if html?
32
- extract_feeds_from_links
33
- include_it_self if feed?
34
- items
30
+ def list &block
31
+ @document.feed_uris &block
35
32
  end
36
33
 
37
34
  def html?
@@ -39,24 +36,7 @@ module Spix
39
36
  end
40
37
  private :html?
41
38
 
42
- def extract_feeds_from_anchors
43
- document.feed_uris_from_anchors +
44
- document.generic_uris_from_anchors.map { |uri|
45
- FeedDiscovery::Document.new(uri).feed_uris_from_anchors
46
- }.flatten.each { |uri|
47
- items << Feed.new(uri, document.favicon)
48
- }
49
- end
50
- private :extract_feeds_from_anchors
51
-
52
- def extract_feeds_from_links
53
- document.feed_uris_from_links.each { |uri|
54
- items << Feed.new(uri, document.favicon)
55
- }
56
- end
57
- private :extract_feeds_from_links
58
-
59
- def include_it_self
39
+ def include_it_self &block
60
40
  items << Feed.new(uri.to_s, document.favicon)
61
41
  end
62
42
 
@@ -16,53 +16,81 @@ module Spix
16
16
  open(@uri.to_s, req_headers).read
17
17
  end
18
18
  end
19
- private :content
19
+ protected :content
20
20
 
21
- def feed_uris_from_anchors
22
- @document.search("a").select { |anchor|
23
- rss_or_atom_content_type_in? anchor
24
- }.map { |node|
25
- uri = @uri.merge node.get_attribute 'href'
26
- uri.to_s
27
- }
28
- end
29
-
30
- def generic_uris_from_anchors
31
- @document.search("a").select { |anchor|
32
- not rss_or_atom_content_type_in? anchor
33
- }.map { |node|
34
- uri = @uri.merge node.get_attribute 'href'
35
- uri.to_s
36
- }
21
+ def feed_uris &block
22
+ items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
23
+ items << feed_unsing_address(@uri, &block) if feed?
24
+ items
37
25
  end
38
26
 
39
- def feed_uris_from_links
27
+ def feed_uris_from_links &block
40
28
  @document.search(
41
29
  "link[@type='application/atom+xml']",
42
30
  "link[@type='application/rss+xml']"
43
- ).map { |node|
44
- uri = @uri.merge node.get_attribute 'href'
45
- uri.to_s
31
+ ).map { |node| feed_from node, &block }
32
+ end
33
+ private :feed_uris_from_links
34
+
35
+ def feed_uris_from_anchors &block
36
+ @document.search('a').select { |node|
37
+ valid_url_in? node
38
+ }.select { |node|
39
+ rss_or_atom_content_type_in? node
40
+ }.map { |node|
41
+ feed_from node, &block
46
42
  }
47
43
  end
44
+ private :feed_uris_from_anchors
45
+
46
+ def feed_from node, &block
47
+ uri = @uri.merge node.get_attribute 'href'
48
+ feed_unsing_address uri, &block
49
+ end
50
+ private :feed_from
51
+
52
+ def feed_unsing_address uri, &block
53
+ begin
54
+ Feed.new(uri, favicon).tap do |feed|
55
+ block.call feed if block_given?
56
+ end
57
+ rescue => error
58
+ error.tap do |e|
59
+ block.call e if block_given?
60
+ end
61
+ end
62
+ end
63
+ private :feed_unsing_address
48
64
 
65
+ def valid_url_in? anchor
66
+ uri = address_from anchor
67
+ uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
68
+ end
69
+ private :valid_url_in?
70
+
49
71
  def rss_or_atom_content_type_in? anchor
50
72
  req, path = request_and_path_using address_from anchor
51
73
  resp = req.request_head path
52
74
  resp['content-type'] =~ /rss|atom/
75
+ rescue
76
+ true
53
77
  end
54
78
  private :rss_or_atom_content_type_in?
55
79
 
56
- def request_and_path_using address
57
- uri = @uri.merge URI.parse address
80
+ def request_and_path_using uri
58
81
  req = Net::HTTP.new uri.host, uri.port
59
- path = uri - uri.select(:scheme, :host).join("://")
60
- return req, path.to_s
82
+ return req, path_from(uri) || uri.to_s
61
83
  end
62
84
  private :request_and_path_using
63
85
 
86
+ def path_from uri
87
+ path = uri - uri.select(:scheme, :host).join("://")
88
+ path.to_s unless path.to_s.blank?
89
+ end
90
+ private :path_from
91
+
64
92
  def address_from node
65
- node.get_attribute("href")
93
+ @uri.merge URI.parse node.get_attribute("href")
66
94
  end
67
95
  private :address_from
68
96
 
@@ -75,30 +103,70 @@ module Spix
75
103
  end
76
104
 
77
105
  def favicon
78
- shortcuts_in_document or shortcuts_from(base_path) or base_path.merge('favicon.ico').to_s
106
+ shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
79
107
  end
80
108
 
81
- def shortcuts_in_document
82
- shortcuts = @document.search('link[@rel*=shortcut]')
83
- shortcuts.any? ? shortcuts : nil
109
+ def shortcut_in_document
110
+ shortcuts = find_shortcut_in @document
111
+ shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
84
112
  end
85
- private :shortcuts_in_document
113
+ private :shortcut_in_document
114
+
115
+ def shortcut_from_original_page
116
+ if feed?
117
+ if node = @document.search('link').first
118
+ path = URI.parse node.content.strip
119
+ shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
120
+ end
121
+ end
122
+ end
123
+ private :shortcut_from_original_page
86
124
 
87
- def shortcuts_from base_path
88
- doc = Nokogiri::HTML Net::HTTP.get base_path
89
- doc.search('link[@rel*=shortcut]')
125
+ def shortcut_from base_uri
126
+ doc = get base_uri
127
+ shortcuts = find_shortcut_in doc
128
+ shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
90
129
 
91
130
  rescue Net::HTTPError, Net::HTTPFatalError
92
131
  logger.warn "error opening favicon: #{$!}"
93
132
  nil
94
133
  end
95
- private :shortcuts_from
134
+ private :shortcut_from
135
+
136
+ def find_shortcut_in doc
137
+ doc.xpath(
138
+ '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
139
+ '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
140
+ ).map { |node| node.get_attribute "href" }
141
+ end
96
142
 
97
- def base_path
98
- URI.parse @uri.select(:scheme, :host).join("://")
143
+ def default_favico_if_exist
144
+ http = Net::HTTP.new base_uri.host, base_uri.port
145
+ resp = http.request_head 'favicon.ico'
146
+ base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
147
+ rescue
148
+ nil
149
+ end
150
+ private :default_favico_if_exist
151
+
152
+ def get uri
153
+ resp = Net::HTTP.get_response uri
154
+ doc = Nokogiri::HTML(resp.body)
155
+ if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
156
+ path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
157
+ get URI.parse path
158
+ else
159
+ doc
160
+ end
161
+ rescue
162
+ Nokogiri::HTML('')
99
163
  end
100
- private :base_path
101
164
 
165
+ def base_uri
166
+ @base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
167
+ end
168
+ private :base_uri
169
+
102
170
  end
103
171
  end
104
172
  end
@@ -1,15 +1,26 @@
1
1
  module Spix
2
2
  module FeedDiscovery
3
- class Feed < Struct.new(:url, :favicon, :title)
3
+ class Feed < Hash
4
4
 
5
5
  def initialize url, favicon
6
- self.url = url
6
+ self.url = url.to_s
7
7
  self.favicon = favicon
8
8
  self.title = get_title
9
9
  end
10
10
 
11
+ %w[url favicon title].each do |attr|
12
+ define_method attr do
13
+ self[attr.to_sym]
14
+ end
15
+
16
+ define_method "#{attr}=" do |value|
17
+ self[attr.to_sym] = value
18
+ end
19
+ end
20
+
11
21
  def get_title
12
- content.search('title').first.content
22
+ node = content.search('title').first
23
+ node.content if node
13
24
  end
14
25
  private :get_title
15
26
 
@@ -22,9 +33,12 @@ module Spix
22
33
  private :content
23
34
 
24
35
  def uri
25
- URI.parse url
36
+ @uri ||= URI.parse url
26
37
  end
27
38
  private :uri
39
+
40
+ def to_hash
41
+ end
28
42
  end
29
43
  end
30
44
  end
@@ -11,9 +11,11 @@ module Spix
11
11
  Spix::Parser.parse(uri, :mode => :fetch) ? true : false
12
12
  end
13
13
 
14
- def list uri
14
+ def list uri, &block
15
15
  page = Base.new uri
16
- page.list
16
+ page.list &block
17
+ rescue => error
18
+ [error]
17
19
  end
18
20
 
19
21
  end
@@ -4,7 +4,7 @@ module Spix
4
4
  module Version
5
5
  MAJOR = 1
6
6
  MINOR = 6
7
- TINY = 1
7
+ TINY = 4
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
@@ -4,22 +4,27 @@ describe Spix::FeedDiscovery::Document do
4
4
 
5
5
  let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
6
6
 
7
- describe '#feed_uris_from_anchors' do
8
- it 'should return only uris from anchors' do
9
- document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
10
- end
11
- end
7
+ describe '#feed_uris' do
12
8
 
13
- describe '#feed_uris_from_links' do
14
- it 'should return only uris from links' do
15
- document.feed_uris_from_links.should eql expected_uris_inside('link.has_feed')
16
- end
17
- end
9
+ context 'when the uri exists' do
10
+
11
+ it 'should return only feed uris' do
12
+ document.should_receive(:feed_uris).and_return expected_feeds
13
+ document.feed_uris
14
+ end
15
+
16
+ it 'should yield feed with correct uri content' do
17
+ document.feed_uris do |feed|
18
+ expected_feeds.should include(feed)
19
+ end
20
+ end
21
+
22
+ before :each do
23
+ stub_requests
24
+ end
18
25
 
19
- describe '#generic_uris' do
20
- it 'should return only ordinary uris from anchors' do
21
- document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
22
26
  end
27
+
23
28
  end
24
29
 
25
30
  describe '#html?' do
@@ -34,6 +39,11 @@ describe Spix::FeedDiscovery::Document do
34
39
  Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
35
40
  document.html?.should eql false
36
41
  end
42
+
43
+ before :each do
44
+ stub_requests
45
+ end
46
+
37
47
  end
38
48
 
39
49
  describe '#feed?' do
@@ -48,21 +58,32 @@ describe Spix::FeedDiscovery::Document do
48
58
  Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
49
59
  document.feed?.should eql false
50
60
  end
61
+
62
+ before :each do
63
+ stub_requests
64
+ end
51
65
  end
52
66
 
53
67
  before :all do
54
- @rss_uri = 'http://myfeed.com/rss_list.html'
68
+ @domain = 'http://diveintomark.org'
69
+ @rss_uri = @domain + '/rss_list.html'
55
70
  @content = load_fixture("rss_list.html")
56
71
  @document = Nokogiri::XML(@content)
57
72
  end
58
73
 
59
- before :each do
60
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
61
- FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
62
- FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
63
- end
64
74
  end
65
75
 
66
- def expected_uris_inside to_search
67
- @document.search(to_search).map { |node| URI.parse(@rss_uri).merge(node.get_attribute('href')).to_s }
76
+ def stub_requests
77
+ Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
78
+ %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].each { |path|
79
+ FakeWeb.register_uri(:head, @domain + '/' + path, :content_type => 'application/atom+xml')
80
+ FakeWeb.register_uri(:get, @domain + '/' + path, :body => load_fixture(path))
81
+ }
82
+ FakeWeb.register_uri(:head, @domain + '/generic.html', :content_type => 'text/html' )
83
+ end
84
+
85
+ def expected_feeds
86
+ @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
87
+ Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
88
+ }
68
89
  end
@@ -4,7 +4,8 @@ describe Spix::FeedDiscovery, "#list" do
4
4
 
5
5
  describe "when the feed have an absolute URI" do
6
6
  it "should return the feed url" do
7
- fake_requests_for :path_inside_content => '/html4-002.html',
7
+ fake_requests_for :ignore => ['/html4-002.html'],
8
+ :accept => ['/tests/client/autodiscovery/html4-001.xml'],
8
9
  :resource_path => @domain_url,
9
10
  :content => load_fixture("absolute_uri.html")
10
11
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
@@ -14,7 +15,8 @@ describe Spix::FeedDiscovery, "#list" do
14
15
  describe "when the feed have a relative URI" do
15
16
  describe "which is relative to a path" do
16
17
  it "should return the feed url when the URI is at the top domain" do
17
- fake_requests_for :path_inside_content => '/html4-003.html',
18
+ fake_requests_for :ignore => ['/html4-003.html'],
19
+ :accept => ['/html4-002.xml'],
18
20
  :resource_path => @domain_url,
19
21
  :content => load_fixture("relative_uri.html")
20
22
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
@@ -23,7 +25,8 @@ describe Spix::FeedDiscovery, "#list" do
23
25
  it "should return the feed url when the URI is inside a path" do
24
26
  @path_url = "/foo/bar/"
25
27
  @feed_url = @domain_url + @path_url
26
- fake_requests_for :path_inside_content => 'html4-003.html',
28
+ fake_requests_for :ignore => ['html4-003.html'],
29
+ :accept => ['html4-002.xml'],
27
30
  :resource_path => @feed_url,
28
31
  :content => load_fixture('relative_uri.html')
29
32
  Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "html4-002.xml"
@@ -32,7 +35,8 @@ describe Spix::FeedDiscovery, "#list" do
32
35
 
33
36
  describe "which is relative to the top domain" do
34
37
  it "should return the feed url when the URI is at the top domain" do
35
- fake_requests_for :path_inside_content => '/html4-004.html',
38
+ fake_requests_for :ignore => ['/html4-004.html'],
39
+ :accept => ['/html4-003.xml'],
36
40
  :resource_path => @domain_url,
37
41
  :content => load_fixture("relative_uri_top_domain.html")
38
42
  Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -42,7 +46,8 @@ describe Spix::FeedDiscovery, "#list" do
42
46
  @path_url = "/foo/bar/"
43
47
  @feed_url = @domain_url + @path_url
44
48
 
45
- fake_requests_for :path_inside_content => 'html4-004.html',
49
+ fake_requests_for :ignore => ['/html4-004.html'],
50
+ :accept => ['/html4-003.xml'],
46
51
  :resource_path => @feed_url,
47
52
  :content => load_fixture("relative_uri_top_domain.html")
48
53
  Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -100,16 +105,27 @@ describe Spix::FeedDiscovery, "#list" do
100
105
  end
101
106
 
102
107
  before(:all) do
103
- @domain_url = "http://sitewithfeed.com"
108
+ @domain_url = "http://diveintomark.org"
104
109
  end
105
110
 
106
111
  end
107
112
 
108
113
  def fake_requests_for options = {}
114
+
109
115
  content = options.delete(:content)
110
- path_inside_content = options.delete(:path_inside_content)
116
+ ignore = options.delete(:ignore)
117
+ accept = options.delete(:accept)
111
118
  resource_path = options.delete(:resource_path)
119
+
112
120
  FakeWeb.register_uri(:get, resource_path, :body => content)
113
- FakeWeb.register_uri(:head, resource_path + path_inside_content, :content => 'text/html')
114
- FakeWeb.register_uri(:get, resource_path + path_inside_content, :content => content)
121
+
122
+ ignore.each do |path|
123
+ FakeWeb.register_uri(:head, resource_path + path, :content => 'text/html')
124
+ end
125
+
126
+ accept.each do |path|
127
+ FakeWeb.register_uri(:head, resource_path + path, :content => 'application/atom+xml')
128
+ FakeWeb.register_uri(:get, resource_path + path, :body => load_fixture('feed.rss'))
129
+ end
130
+
115
131
  end
metadata CHANGED
@@ -2,9 +2,10 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.6.1
5
+ version: 1.6.4
6
6
  platform: ruby
7
7
  authors:
8
+ - Marcio Lopes de Faria
8
9
  - Marcelo Eden
9
10
  - Fabio Mont'Alegre
10
11
  - "Lucas H\xC3\xBAngaro"
@@ -13,7 +14,7 @@ autorequire:
13
14
  bindir: bin
14
15
  cert_chain: []
15
16
 
16
- date: 2011-05-31 00:00:00 -03:00
17
+ date: 2011-06-03 00:00:00 -03:00
17
18
  default_executable:
18
19
  dependencies:
19
20
  - !ruby/object:Gem::Dependency
@@ -117,7 +118,6 @@ files:
117
118
  - spec/spix_parser/tools/feed_discovery/document_spec.rb
118
119
  - spec/spix_parser/tools/feed_discovery/feed_spec.rb
119
120
  - spec/spix_parser/tools/feed_discovery_spec.rb
120
- - spec/spix_parser/tools/feed_list_spec.rb
121
121
  - spec/spix_parser/utils_spec.rb
122
122
  has_rdoc: true
123
123
  homepage: http://github.com/busk/spix_parser
@@ -153,5 +153,4 @@ test_files:
153
153
  - spec/spix_parser/tools/feed_discovery/document_spec.rb
154
154
  - spec/spix_parser/tools/feed_discovery/feed_spec.rb
155
155
  - spec/spix_parser/tools/feed_discovery_spec.rb
156
- - spec/spix_parser/tools/feed_list_spec.rb
157
156
  - spec/spix_parser/utils_spec.rb
@@ -1,17 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Spix::FeedDiscoveryList do
4
-
5
- let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
6
-
7
- it "should inherit from array" do
8
- feed_discovery_list.class.superclass.should == Array
9
- end
10
-
11
- describe "#invalids" do
12
- it "should return an empty array from invalids accessor method" do
13
- feed_discovery_list.invalids.should == []
14
- end
15
- end
16
-
17
- end