rawler 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -1
- data/lib/rawler.rb +1 -1
- data/lib/rawler/crawler.rb +7 -1
- data/spec/unit/crawler_spec.rb +29 -1
- metadata +3 -3
data/History.txt
CHANGED
data/lib/rawler.rb
CHANGED
@@ -6,7 +6,7 @@ $:.unshift(File.dirname(__FILE__)) unless
|
|
6
6
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
7
7
|
|
8
8
|
module Rawler
|
9
|
-
VERSION = '0.0.1'
|
9
|
+
VERSION = '0.0.2'
|
10
10
|
|
11
11
|
autoload :Base, "rawler/base"
|
12
12
|
autoload :Crawler, "rawler/crawler"
|
data/lib/rawler/crawler.rb
CHANGED
@@ -12,11 +12,17 @@ module Rawler
|
|
12
12
|
content = Net::HTTP.get(URI.parse(url))
|
13
13
|
|
14
14
|
doc = Nokogiri::HTML(content)
|
15
|
-
doc.css('a').map { |a| a['href'] }
|
15
|
+
doc.css('a').map { |a| absolute_url(a['href']) }
|
16
16
|
rescue Errno::ECONNREFUSED
|
17
17
|
$output.puts "Couldn't connect to #{url}"
|
18
18
|
[]
|
19
19
|
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def absolute_url(path)
|
24
|
+
URI.parse(url).merge(path.to_s).to_s
|
25
|
+
end
|
20
26
|
|
21
27
|
end
|
22
28
|
|
data/spec/unit/crawler_spec.rb
CHANGED
@@ -3,12 +3,40 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
|
|
3
3
|
describe Rawler::Crawler do
|
4
4
|
|
5
5
|
it "should parse all links" do
|
6
|
-
url = 'http://example.com'
|
6
|
+
url = 'http://example.com/'
|
7
7
|
register(url, site)
|
8
8
|
|
9
9
|
Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
|
10
10
|
end
|
11
11
|
|
12
|
+
it "should return an empty array when raising Errno::ECONNREFUSED" do
|
13
|
+
url = 'http://example.com'
|
14
|
+
register(url, site)
|
15
|
+
|
16
|
+
Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
|
17
|
+
|
18
|
+
crawler = Rawler::Crawler.new(url).links.should == []
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should parse relative links" do
|
22
|
+
url = 'http://example.com/path'
|
23
|
+
register(url, '<a href="/foo">foo</a>')
|
24
|
+
|
25
|
+
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
26
|
+
end
|
27
|
+
|
28
|
+
# it "should print a message when raising Errno::ECONNREFUSED" do
|
29
|
+
# pending "refactor output. Don't use a global variable"
|
30
|
+
# url = 'http://example.com'
|
31
|
+
# register(url, site)
|
32
|
+
#
|
33
|
+
# Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
|
34
|
+
#
|
35
|
+
# $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
|
36
|
+
#
|
37
|
+
# Rawler::Crawler.new(url).links
|
38
|
+
# end
|
39
|
+
|
12
40
|
private
|
13
41
|
|
14
42
|
def site
|
metadata
CHANGED