spix_parser 1.6.6 → 1.6.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,8 +14,11 @@ module Spix
14
14
  def list uri, &block
15
15
  page = Base.new uri
16
16
  page.list &block
17
- rescue => error
18
- [error]
17
+ end
18
+
19
+ def list_uris_finded_in uri
20
+ page = Base.new uri
21
+ page.list_uris
19
22
  end
20
23
 
21
24
  end
@@ -28,6 +28,10 @@ module Spix
28
28
  private :document
29
29
 
30
30
  def list &block
31
+ @document.feeds &block
32
+ end
33
+
34
+ def list_uris &block
31
35
  @document.feed_uris &block
32
36
  end
33
37
 
@@ -2,86 +2,146 @@ module Spix
2
2
  module FeedDiscovery
3
3
  class Document
4
4
 
5
- def initialize uri
6
- @uri = URI.parse uri
7
- @document = Nokogiri::XML(content)
5
+ def initialize uri_name
6
+ @uri = parse_uri uri_name
8
7
  end
9
8
 
10
- def content
11
- if @uri.respond_to?(:read)
12
- @uri.read
9
+ def load_content
10
+ @document ||= content_from @uri.to_s
11
+ end
12
+
13
+ def content_from uri_name, limit=10
14
+ raise ArgumentError, 'HTTP redirect too deep' if limit == 0
15
+ uri = parse_uri uri_name
16
+ connection, path = connection_and_path_using uri
17
+ response = connection.request_get path
18
+ content = Nokogiri::XML response.body
19
+ if response.kind_of? Net::HTTPRedirection
20
+ content_from response['location'], limit - 1
21
+ elsif meta_refresh = content.search('//meta[contains(translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "refresh")]').first
22
+ content_from meta_refresh.get_attribute('content')[/http:\/\/.*/], limit - 1;
13
23
  else
14
- req_headers = {}
15
- req_headers["User-Agent"] = USER_AGENT
16
- open(@uri.to_s, req_headers).read
24
+ content
17
25
  end
18
26
  end
19
- protected :content
27
+ protected :content_from
20
28
 
21
29
  def feed_uris &block
22
- items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
30
+ items = []
31
+ load_content
32
+ items = uris_from_links(&block) + uris_from_anchors(&block)
33
+ items << Feed.new(@uri).tap { |uri|
34
+ block.call uri if block_given?
35
+ } if feed?
36
+ rescue => error
37
+ items << feed_exception_from(error)
38
+ ensure
39
+ return items
40
+ end
41
+
42
+ def feeds &block
43
+ items = []
44
+ load_content
45
+ items = feeds_from_links(&block) + feeds_from_anchors(&block)
23
46
  items << feed_unsing_address(@uri, &block) if feed?
24
- items
47
+ rescue => error
48
+ items << feed_exception_from(error)
49
+ ensure
50
+ return items
51
+ end
52
+
53
+ def feed_exception_from error, &block
54
+ Feed.new(@uri).tap { |item|
55
+ item.exceptions = [error.to_s]
56
+ block.call item if block_given?
57
+ }
25
58
  end
26
59
 
27
- def feed_uris_from_links &block
60
+ def feeds_from_links &block
61
+ from_links.map { |node| feed_from node, &block }
62
+ end
63
+ private :feeds_from_links
64
+
65
+ def uris_from_links &block
66
+ from_links.map { |node|
67
+ Feed.new(node.get_attribute('href')).tap do |item|
68
+ block.call item if block_given?
69
+ end
70
+ }
71
+ end
72
+ private :uris_from_links
73
+
74
+ def from_links
28
75
  @document.search(
29
76
  "link[@type='application/atom+xml']",
30
77
  "link[@type='application/rss+xml']"
31
- ).map { |node| feed_from node, &block }
78
+ )
32
79
  end
33
- private :feed_uris_from_links
80
+ private :from_links
34
81
 
35
- def feed_uris_from_anchors &block
82
+ def feeds_from_anchors &block
83
+ from_anchors.map { |node|
84
+ feed_from node, &block
85
+ }
86
+ end
87
+ private :feeds_from_anchors
88
+
89
+ def uris_from_anchors &block
90
+ from_anchors.map { |node|
91
+ Feed.new(node.get_attribute('href')).tap do |item|
92
+ block.call item if block_given?
93
+ end
94
+ }
95
+ end
96
+ private :uris_from_anchors
97
+
98
+ def from_anchors
36
99
  @document.search('a').select { |node|
37
100
  valid_url_in? node
38
101
  }.select { |node|
39
102
  rss_or_atom_content_type_in? node
40
- }.map { |node|
41
- feed_from node, &block
42
- }
103
+ }
43
104
  end
44
- private :feed_uris_from_anchors
105
+ private :from_anchors
45
106
 
46
107
  def feed_from node, &block
47
- uri = @uri.merge node.get_attribute 'href'
108
+ uri = @uri.merge node.get_attribute("href").to_s
48
109
  feed_unsing_address uri, &block
49
110
  end
50
111
  private :feed_from
51
112
 
52
113
  def feed_unsing_address uri, &block
53
114
  begin
54
- Feed.new(uri).tap do |feed|
115
+ Feed.new(uri) { |feed|
116
+ feed.set_title
117
+ feed.set_favicon
118
+ }.tap { |feed|
55
119
  block.call feed if block_given?
56
- end
57
- rescue => error
58
- error.tap do |e|
59
- block.call e if block_given?
60
- end
120
+ }
61
121
  end
62
122
  end
63
123
  private :feed_unsing_address
64
124
 
65
125
  def valid_url_in? anchor
66
126
  uri = address_from anchor
67
- uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
127
+ uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host) and uri.to_s.present?
68
128
  end
69
129
  private :valid_url_in?
70
130
 
71
131
  def rss_or_atom_content_type_in? anchor
72
- req, path = request_and_path_using address_from anchor
73
- resp = req.request_head path
74
- resp['content-type'] =~ /rss|atom/
132
+ connection, path = connection_and_path_using address_from anchor
133
+ response = connection.request_head path
134
+ response['content-type'] =~ /rss|atom/
75
135
  rescue
76
136
  true
77
137
  end
78
138
  private :rss_or_atom_content_type_in?
79
139
 
80
- def request_and_path_using uri
81
- req = Net::HTTP.new uri.host, uri.port
82
- return req, path_from(uri) || uri.to_s
140
+ def connection_and_path_using uri
141
+ connection = Net::HTTP.new uri.host, uri.port
142
+ return connection, path_from(uri) || uri.to_s
83
143
  end
84
- private :request_and_path_using
144
+ private :connection_and_path_using
85
145
 
86
146
  def path_from uri
87
147
  path = uri - uri.select(:scheme, :host).join("://")
@@ -90,7 +150,7 @@ module Spix
90
150
  private :path_from
91
151
 
92
152
  def address_from node
93
- @uri.merge URI.parse node.get_attribute("href")
153
+ @uri.merge parse_uri node.get_attribute("href").to_s
94
154
  end
95
155
  private :address_from
96
156
 
@@ -101,6 +161,10 @@ module Spix
101
161
  def feed?
102
162
  %w[rss feed].include? @document.root.name
103
163
  end
164
+
165
+ def parse_uri path
166
+ URI.parse URI.encode path.to_s
167
+ end
104
168
 
105
169
  end
106
170
  end
@@ -4,11 +4,14 @@ module Spix
4
4
 
5
5
  def initialize url
6
6
  self.url = url.to_s
7
- self.favicon = get_favicon
8
- self.title = get_title
7
+ self.similars = []
8
+ self.exceptions = []
9
+ yield self if block_given?
10
+ rescue => error
11
+ self.errors = [error]
9
12
  end
10
13
 
11
- %w[url favicon title].each do |attr|
14
+ %w[url favicon title exceptions similars].each do |attr|
12
15
  define_method attr do
13
16
  self[attr.to_sym]
14
17
  end
@@ -18,21 +21,20 @@ module Spix
18
21
  end
19
22
  end
20
23
 
21
- def get_title
24
+ def set_title
22
25
  node = content.search('title').first
23
- node.content if node
26
+ self.title = node.content if node
24
27
  end
25
- private :get_title
26
28
 
27
- def get_favicon
29
+ def set_favicon
28
30
  if node = content.search('link').first
29
- path = URI.parse node.content.strip
30
- shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
31
+ path = parse_uri node.content.strip
32
+ self.favicon = shortcut_from parse_uri path.select(:scheme, :host).join("://") rescue nil
31
33
  end
32
34
  end
33
35
 
34
36
  def shortcut_from base_uri
35
- doc = get base_uri
37
+ doc = fetch_html base_uri
36
38
  shortcuts = find_shortcut_in doc
37
39
  shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
38
40
 
@@ -49,21 +51,31 @@ module Spix
49
51
  ).map { |node| node.get_attribute "href" }
50
52
  end
51
53
 
52
- def get uri
54
+ def fetch uri, limit = 10
55
+ raise ArgumentError, 'HTTP redirect too deep' if limit == 0
53
56
  resp = Net::HTTP.get_response uri
54
- doc = Nokogiri::HTML(resp.body)
55
57
  if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
56
58
  path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
57
- get URI.parse path
59
+ from_redirect = parse_uri path
60
+ self.url = from_redirect.to_s
61
+ fetch from_redirect, limit - 1
58
62
  else
59
- doc
63
+ resp.body
60
64
  end
61
65
  rescue
62
- Nokogiri::HTML('')
66
+ String.new
67
+ end
68
+
69
+ def fetch_xml uri
70
+ Nokogiri::XML fetch uri
71
+ end
72
+
73
+ def fetch_html uri
74
+ Nokogiri::HTML fetch uri
63
75
  end
64
76
 
65
77
  def base_uri
66
- @base_uri ||= URI.parse uri.select(:scheme, :host).join("://")
78
+ @base_uri ||= parse_uri uri.select(:scheme, :host).join("://")
67
79
  end
68
80
  private :base_uri
69
81
 
@@ -73,20 +85,19 @@ module Spix
73
85
  private :content
74
86
 
75
87
  def load_content
76
- req = Net::HTTP.new uri.host, uri.port
77
- path = uri - uri.select(:scheme, :host).join("://")
78
- resp = req.request_get path.to_s
79
- Nokogiri::XML(resp.body)
88
+ fetch_xml uri
80
89
  end
81
90
  private :load_content
82
91
 
83
92
  def uri
84
- @uri ||= URI.parse url
93
+ @uri ||= parse_uri url
85
94
  end
86
95
  private :uri
87
96
 
88
- def to_hash
97
+ def parse_uri path
98
+ URI.parse URI.encode path
89
99
  end
100
+
90
101
  end
91
102
  end
92
103
  end
@@ -4,7 +4,7 @@ module Spix
4
4
  module Version
5
5
  MAJOR = 1
6
6
  MINOR = 6
7
- TINY = 6
7
+ TINY = 7
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
@@ -4,17 +4,17 @@ describe Spix::FeedDiscovery::Document do
4
4
 
5
5
  let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
6
6
 
7
- describe '#feed_uris' do
7
+ describe '#feed' do
8
8
 
9
9
  context 'when the uri exists' do
10
10
 
11
11
  it 'should return only feed uris' do
12
- document.should_receive(:feed_uris).and_return expected_feeds
13
- document.feed_uris
12
+ document.should_receive(:feeds).and_return expected_feeds
13
+ document.feeds
14
14
  end
15
15
 
16
16
  it 'should yield feed with correct uri content' do
17
- document.feed_uris do |feed|
17
+ document.feeds do |feed|
18
18
  expected_feeds.should include(feed)
19
19
  end
20
20
  end
@@ -29,14 +29,16 @@ describe Spix::FeedDiscovery::Document do
29
29
 
30
30
  describe '#html?' do
31
31
  it 'should return true if is a html document' do
32
- content = load_fixture('rss_list.html')
33
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
32
+ content = Nokogiri::XML load_fixture('rss_list.html')
33
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
34
+ document.load_content
34
35
  document.html?.should eql true
35
36
  end
36
37
 
37
38
  it 'should return false if is a rss/feed document' do
38
- content = load_fixture('feed.rss')
39
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
39
+ content = Nokogiri::XML load_fixture('feed.rss')
40
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
41
+ document.load_content
40
42
  document.html?.should eql false
41
43
  end
42
44
 
@@ -48,14 +50,16 @@ describe Spix::FeedDiscovery::Document do
48
50
 
49
51
  describe '#feed?' do
50
52
  it 'should return true if a feed document' do
51
- content = load_fixture('feed.rss')
52
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
53
+ content = Nokogiri::XML load_fixture('feed.rss')
54
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
55
+ document.load_content
53
56
  document.feed?.should eql true
54
57
  end
55
58
 
56
59
  it 'should return false if hot a html document' do
57
- content = load_fixture('rss_list.html')
58
- Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
60
+ content = Nokogiri::XML load_fixture('rss_list.html')
61
+ document.load_content
62
+ Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
59
63
  document.feed?.should eql false
60
64
  end
61
65
 
@@ -84,6 +88,6 @@ end
84
88
 
85
89
  def expected_feeds
86
90
  @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
87
- Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
91
+ Spix::FeedDiscovery::Feed.new(@domain + '/' + path)
88
92
  }
89
93
  end
@@ -4,23 +4,20 @@ describe Spix::FeedDiscovery::Feed do
4
4
 
5
5
  context 'given an expecific uri' do
6
6
 
7
- let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }
8
-
9
- it 'should set the favicon' do
10
- feed.favicon.should == @favicon_uri
11
- end
12
-
13
7
  it 'should set the url' do
8
+ feed = described_class.new @feed_uri
14
9
  feed.url.should == @feed_uri
15
10
  end
16
11
 
17
12
  it 'should set title' do
13
+ feed = described_class.new @feed_uri do |f|
14
+ f.set_title
15
+ end
18
16
  feed.title.should == @document.search('title').first.content
19
17
  end
20
18
 
21
19
  before :all do
22
20
  @feed_uri = "http://myfeed.com/feed.rss"
23
- @favicon_uri = "http://myfeed.com/images/favicon.ico"
24
21
  @document = Nokogiri::XML load_fixture 'feed.rss'
25
22
  FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
26
23
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.6.6
5
+ version: 1.6.7
6
6
  platform: ruby
7
7
  authors:
8
8
  - Marcio Lopes de Faria
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-06-03 00:00:00 -03:00
17
+ date: 2011-06-06 00:00:00 -03:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency