spix_parser 1.6.6 → 1.6.7

This diff shows the changes between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -14,8 +14,11 @@ module Spix
14
14
  def list uri, &block
15
15
  page = Base.new uri
16
16
  page.list &block
17
- rescue => error
18
- [error]
17
+ end
18
+
19
+ def list_uris_finded_in uri
20
+ page = Base.new uri
21
+ page.list_uris
19
22
  end
20
23
 
21
24
  end
@@ -28,6 +28,10 @@ module Spix
28
28
  private :document
29
29
 
30
30
  def list &block
31
+ @document.feeds &block
32
+ end
33
+
34
+ def list_uris &block
31
35
  @document.feed_uris &block
32
36
  end
33
37
 
@@ -2,86 +2,146 @@ module Spix
2
2
  module FeedDiscovery
3
3
  class Document
4
4
 
5
- def initialize uri
6
- @uri = URI.parse uri
7
- @document = Nokogiri::XML(content)
5
+ def initialize uri_name
6
+ @uri = parse_uri uri_name
8
7
  end
9
8
 
10
- def content
11
- if @uri.respond_to?(:read)
12
- @uri.read
9
+ def load_content
10
+ @document ||= content_from @uri.to_s
11
+ end
12
+
13
+ def content_from uri_name, limit=10
14
+ raise ArgumentError, 'HTTP redirect too deep' if limit == 0
15
+ uri = parse_uri uri_name
16
+ connection, path = connection_and_path_using uri
17
+ response = connection.request_get path
18
+ content = Nokogiri::XML response.body
19
+ if response.kind_of? Net::HTTPRedirection
20
+ content_from response['location'], limit - 1
21
+ elsif meta_refresh = content.search('//meta[contains(translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "refresh")]').first
22
+ content_from meta_refresh.get_attribute('content')[/http:\/\/.*/], limit - 1;
13
23
  else
14
- req_headers = {}
15
- req_headers["User-Agent"] = USER_AGENT
16
- open(@uri.to_s, req_headers).read
24
+ content
17
25
  end
18
26
  end
19
- protected :content
27
+ protected :content_from
20
28
 
21
29
  def feed_uris &block
22
- items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
30
+ items = []
31
+ load_content
32
+ items = uris_from_links(&block) + uris_from_anchors(&block)
33
+ items << Feed.new(@uri).tap { |uri|
34
+ block.call uri if block_given?
35
+ } if feed?
36
+ rescue => error
37
+ items << feed_exception_from(error)
38
+ ensure
39
+ return items
40
+ end
41
+
42
+ def feeds &block
43
+ items = []
44
+ load_content
45
+ items = feeds_from_links(&block) + feeds_from_anchors(&block)
23
46
  items << feed_unsing_address(@uri, &block) if feed?
24
- items
47
+ rescue => error
48
+ items << feed_exception_from(error)
49
+ ensure
50
+ return items
51
+ end
52
+
53
+ def feed_exception_from error, &block
54
+ Feed.new(@uri).tap { |item|
55
+ item.exceptions = [error.to_s]
56
+ block.call item if block_given?
57
+ }
25
58
  end
26
59
 
27
- def feed_uris_from_links &block
60
+ def feeds_from_links &block
61
+ from_links.map { |node| feed_from node, &block }
62
+ end
63
+ private :feeds_from_links
64
+
65
+ def uris_from_links &block
66
+ from_links.map { |node|
67
+ Feed.new(node.get_attribute('href')).tap do |item|
68
+ block.call item if block_given?
69
+ end
70
+ }
71
+ end
72
+ private :uris_from_links
73
+
74
+ def from_links
28
75
  @document.search(
29
76
  "link[@type='application/atom+xml']",
30
77
  "link[@type='application/rss+xml']"
31
- ).map { |node| feed_from node, &block }
78
+ )
32
79
  end
33
- private :feed_uris_from_links
80
+ private :from_links
34
81
 
35
- def feed_uris_from_anchors &block
82
+ def feeds_from_anchors &block
83
+ from_anchors.map { |node|
84
+ feed_from node, &block
85
+ }
86
+ end
87
+ private :feeds_from_anchors
88
+
89
+ def uris_from_anchors &block
90
+ from_anchors.map { |node|
91
+ Feed.new(node.get_attribute('href')).tap do |item|
92
+ block.call item if block_given?
93
+ end
94
+ }
95
+ end
96
+ private :uris_from_anchors
97
+
98
+ def from_anchors
36
99
  @document.search('a').select { |node|
37
100
  valid_url_in? node
38
101
  }.select { |node|
39
102
  rss_or_atom_content_type_in? node
40
- }.map { |node|
41
- feed_from node, &block
42
- }
103
+ }
43
104
  end
44
- private :feed_uris_from_anchors
105
+ private :from_anchors
45
106
 
46
107
  def feed_from node, &block
47
- uri = @uri.merge node.get_attribute 'href'
108
+ uri = @uri.merge node.get_attribute("href").to_s
48
109
  feed_unsing_address uri, &block
49
110
  end
50
111
  private :feed_from
51
112
 
52
113
  def feed_unsing_address uri, &block
53
114
  begin
54
- Feed.new(uri).tap do |feed|
115
+ Feed.new(uri) { |feed|
116
+ feed.set_title
117
+ feed.set_favicon
118
+ }.tap { |feed|
55
119
  block.call feed if block_given?
56
- end
57
- rescue => error
58
- error.tap do |e|
59
- block.call e if block_given?
60
- end
120
+ }
61
121
  end
62
122
  end
63
123
  private :feed_unsing_address
64
124
 
65
125
  def valid_url_in? anchor
66
126
  uri = address_from anchor
67
- uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
127
+ uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host) and uri.to_s.present?
68
128
  end
69
129
  private :valid_url_in?
70
130
 
71
131
  def rss_or_atom_content_type_in? anchor
72
- req, path = request_and_path_using address_from anchor
73
- resp = req.request_head path
74
- resp['content-type'] =~ /rss|atom/
132
+ connection, path = connection_and_path_using address_from anchor
133
+ response = connection.request_head path
134
+ response['content-type'] =~ /rss|atom/
75
135
  rescue
76
136
  true
77
137
  end
78
138
  private :rss_or_atom_content_type_in?
79
139
 
80
- def request_and_path_using uri
81
- req = Net::HTTP.new uri.host, uri.port
82
- return req, path_from(uri) || uri.to_s
140
+ def connection_and_path_using uri
141
+ connection = Net::HTTP.new uri.host, uri.port
142
+ return connection, path_from(uri) || uri.to_s
83
143
  end
84
- private :request_and_path_using
144
+ private :connection_and_path_using
85
145
 
86
146
  def path_from uri
87
147
  path = uri - uri.select(:scheme, :host).join("://")
@@ -90,7 +150,7 @@ module Spix
90
150
  private :path_from
91
151
 
92
152
  def address_from node
93
- @uri.merge URI.parse node.get_attribute("href")
153
+ @uri.merge parse_uri node.get_attribute("href").to_s
94
154
  end
95
155
  private :address_from
96
156
 
@@ -101,6 +161,10 @@ module Spix
101
161
  def feed?
102
162
  %w[rss feed].include? @document.root.name
103
163
  end
164
+
165
+ def parse_uri path
166
+ URI.parse URI.encode path.to_s
167
+ end
104
168
 
105
169
  end
106
170
  end
@@ -4,11 +4,14 @@ module Spix
4
4
 
5
5
  def initialize url
6
6
  self.url = url.to_s
7
- self.favicon = get_favicon
8
- self.title = get_title
7
+ self.similars = []
8
+ self.exceptions = []
9
+ yield self if block_given?
10
+ rescue => error
11
+ self.errors = [error]
9
12
  end
10
13
 
11
- %w[url favicon title].each do |attr|
14
+ %w[url favicon title exceptions similars].each do |attr|
12
15
  define_method attr do
13
16
  self[attr.to_sym]
14
17
  end
@@ -18,21 +21,20 @@ module Spix
18
21
  end
19
22
  end
20
23
 
21
- def get_title
24
+ def set_title
22
25
  node = content.search('title').first
23
- node.content if node
26
+ self.title = node.content if node
24
27
  end
25
- private :get_title
26
28
 
27
- def get_favicon
29
+ def set_favicon
28
30
  if node = content.search('link').first
29
- path = URI.parse node.content.strip
30
- shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
31
+ path = parse_uri node.content.strip
32
+ self.favicon = shortcut_from parse_uri path.select(:scheme, :host).join("://") rescue nil
31
33
  end
32
34
  end
33
35
 
34
36
  def shortcut_from base_uri
35
- doc = get base_uri
37
+ doc = fetch_html base_uri
36
38
  shortcuts = find_shortcut_in doc
37
39
  shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
38
40
 
@@ -49,21 +51,31 @@ module Spix
49
51
  ).map { |node| node.get_attribute "href" }
50
52
  end
51
53
 
52
- def get uri
54
+ def fetch uri, limit = 10
55
+ raise ArgumentError, 'HTTP redirect too deep' if limit == 0
53
56
  resp = Net::HTTP.get_response uri
54
- doc = Nokogiri::HTML(resp.body)
55
57
  if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
56
58
  path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
57
- get URI.parse path
59
+ from_redirect = parse_uri path
60
+ self.url = from_redirect.to_s
61
+ fetch from_redirect, limit - 1
58
62
  else
59
- doc
63
+ resp.body
60
64
  end
61
65
  rescue
62
- Nokogiri::HTML('')
66
+ String.new
67
+ end
68
+
69
+ def fetch_xml uri
70
+ Nokogiri::XML fetch uri
71
+ end
72
+
73
+ def fetch_html uri
74
+ Nokogiri::HTML fetch uri
63
75
  end
64
76
 
65
77
  def base_uri
66
- @base_uri ||= URI.parse uri.select(:scheme, :host).join("://")
78
+ @base_uri ||= parse_uri uri.select(:scheme, :host).join("://")
67
79
  end
68
80
  private :base_uri
69
81
 
@@ -73,20 +85,19 @@ module Spix
73
85
  private :content
74
86
 
75
87
  def load_content
76
- req = Net::HTTP.new uri.host, uri.port
77
- path = uri - uri.select(:scheme, :host).join("://")
78
- resp = req.request_get path.to_s
79
- Nokogiri::XML(resp.body)
88
+ fetch_xml uri
80
89
  end
81
90
  private :load_content
82
91
 
83
92
  def uri
84
- @uri ||= URI.parse url
93
+ @uri ||= parse_uri url
85
94
  end
86
95
  private :uri
87
96
 
88
- def to_hash
97
+ def parse_uri path
98
+ URI.parse URI.encode path
89
99
  end
100
+
90
101
  end
91
102
  end
92
103
  end
@@ -4,7 +4,7 @@ module Spix
4
4
  module Version
5
5
  MAJOR = 1
6
6
  MINOR = 6
7
- TINY = 6
7
+ TINY = 7
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
@@ -4,17 +4,17 @@ describe Spix::FeedDiscovery::Document do
4
4
 
5
5
  let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
6
6
 
7
- describe '#feed_uris' do
7
+ describe '#feed' do
8
8
 
9
9
  context 'when the uri exists' do
10
10
 
11
11
  it 'should return only feed uris' do
12
- document.should_receive(:feed_uris).and_return expected_feeds
13
- document.feed_uris
12
+ document.should_receive(:feeds).and_return expected_feeds
13
+ document.feeds
14
14
  end
15
15
 
16
16
  it 'should yield feed with correct uri content' do
17
- document.feed_uris do |feed|
17
+ document.feeds do |feed|
18
18
  expected_feeds.should include(feed)
19
19
  end
20
20
  end
@@ -29,14 +29,16 @@ describe Spix::FeedDiscovery::Document do
29
29
 
30
30
  describe '#html?' do
31
31
  it 'should return true if is a html document' do
32
- content = load_fixture('rss_list.html')
33
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
32
+ content = Nokogiri::XML load_fixture('rss_list.html')
33
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
34
+ document.load_content
34
35
  document.html?.should eql true
35
36
  end
36
37
 
37
38
  it 'should return false if is a rss/feed document' do
38
- content = load_fixture('feed.rss')
39
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
39
+ content = Nokogiri::XML load_fixture('feed.rss')
40
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
41
+ document.load_content
40
42
  document.html?.should eql false
41
43
  end
42
44
 
@@ -48,14 +50,16 @@ describe Spix::FeedDiscovery::Document do
48
50
 
49
51
  describe '#feed?' do
50
52
  it 'should return true if a feed document' do
51
- content = load_fixture('feed.rss')
52
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
53
+ content = Nokogiri::XML load_fixture('feed.rss')
54
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
55
+ document.load_content
53
56
  document.feed?.should eql true
54
57
  end
55
58
 
56
59
  it 'should return false if hot a html document' do
57
- content = load_fixture('rss_list.html')
58
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
60
+ content = Nokogiri::XML load_fixture('rss_list.html')
61
+ document.load_content
62
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
59
63
  document.feed?.should eql false
60
64
  end
61
65
 
@@ -84,6 +88,6 @@ end
84
88
 
85
89
  def expected_feeds
86
90
  @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
87
- Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
91
+ Spix::FeedDiscovery::Feed.new(@domain + '/' + path)
88
92
  }
89
93
  end
@@ -4,23 +4,20 @@ describe Spix::FeedDiscovery::Feed do
4
4
 
5
5
  context 'given an expecific uri' do
6
6
 
7
- let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }
8
-
9
- it 'should set the favicon' do
10
- feed.favicon.should == @favicon_uri
11
- end
12
-
13
7
  it 'should set the url' do
8
+ feed = described_class.new @feed_uri
14
9
  feed.url.should == @feed_uri
15
10
  end
16
11
 
17
12
  it 'should set title' do
13
+ feed = described_class.new @feed_uri do |f|
14
+ f.set_title
15
+ end
18
16
  feed.title.should == @document.search('title').first.content
19
17
  end
20
18
 
21
19
  before :all do
22
20
  @feed_uri = "http://myfeed.com/feed.rss"
23
- @favicon_uri = "http://myfeed.com/images/favicon.ico"
24
21
  @document = Nokogiri::XML load_fixture 'feed.rss'
25
22
  FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
26
23
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.6.6
5
+ version: 1.6.7
6
6
  platform: ruby
7
7
  authors:
8
8
  - Marcio Lopes de Faria
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-06-03 00:00:00 -03:00
17
+ date: 2011-06-06 00:00:00 -03:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency