rawler 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -1
- data/lib/rawler.rb +1 -1
- data/lib/rawler/crawler.rb +7 -1
- data/spec/unit/crawler_spec.rb +29 -1
- metadata +3 -3
data/History.txt
CHANGED
data/lib/rawler.rb
CHANGED
@@ -6,7 +6,7 @@ $:.unshift(File.dirname(__FILE__)) unless
|
|
6
6
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
7
7
|
|
8
8
|
module Rawler
|
9
|
-
VERSION = '0.0.1'
|
9
|
+
VERSION = '0.0.2'
|
10
10
|
|
11
11
|
autoload :Base, "rawler/base"
|
12
12
|
autoload :Crawler, "rawler/crawler"
|
data/lib/rawler/crawler.rb
CHANGED
@@ -12,11 +12,17 @@ module Rawler
|
|
12
12
|
content = Net::HTTP.get(URI.parse(url))
|
13
13
|
|
14
14
|
doc = Nokogiri::HTML(content)
|
15
|
-
doc.css('a').map { |a| a['href'] }
|
15
|
+
doc.css('a').map { |a| absolute_url(a['href']) }
|
16
16
|
rescue Errno::ECONNREFUSED
|
17
17
|
$output.puts "Couldn't connect to #{url}"
|
18
18
|
[]
|
19
19
|
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def absolute_url(path)
|
24
|
+
URI.parse(url).merge(path.to_s).to_s
|
25
|
+
end
|
20
26
|
|
21
27
|
end
|
22
28
|
|
data/spec/unit/crawler_spec.rb
CHANGED
@@ -3,12 +3,40 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
|
|
3
3
|
describe Rawler::Crawler do
|
4
4
|
|
5
5
|
it "should parse all links" do
|
6
|
-
url = 'http://example.com'
|
6
|
+
url = 'http://example.com/'
|
7
7
|
register(url, site)
|
8
8
|
|
9
9
|
Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
|
10
10
|
end
|
11
11
|
|
12
|
+
it "should return an empty array when raising Errno::ECONNREFUSED" do
|
13
|
+
url = 'http://example.com'
|
14
|
+
register(url, site)
|
15
|
+
|
16
|
+
Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
|
17
|
+
|
18
|
+
crawler = Rawler::Crawler.new(url).links.should == []
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should parse relative links" do
|
22
|
+
url = 'http://example.com/path'
|
23
|
+
register(url, '<a href="/foo">foo</a>')
|
24
|
+
|
25
|
+
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
26
|
+
end
|
27
|
+
|
28
|
+
# it "should print a message when raising Errno::ECONNREFUSED" do
|
29
|
+
# pending "refactor output. Don't use a global variable"
|
30
|
+
# url = 'http://example.com'
|
31
|
+
# register(url, site)
|
32
|
+
#
|
33
|
+
# Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
|
34
|
+
#
|
35
|
+
# $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
|
36
|
+
#
|
37
|
+
# Rawler::Crawler.new(url).links
|
38
|
+
# end
|
39
|
+
|
12
40
|
private
|
13
41
|
|
14
42
|
def site
|
metadata
CHANGED