rawler 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +1 -2
- data/README.txt +2 -0
- data/bin/rawler +3 -4
- data/lib/rawler.rb +2 -1
- data/lib/rawler/base.rb +5 -5
- data/lib/rawler/crawler.rb +4 -6
- data/spec/lib/rawler/crawler_spec.rb +30 -10
- data/spec/lib/rawler_spec.rb +7 -1
- metadata +3 -5
- data/History.txt +0 -12
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
@@ -15,6 +15,7 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
15
15
|
where [options] are:
|
16
16
|
--username, -u <s>: HTTP Basic Username
|
17
17
|
--password, -p <s>: HTTP Basic Password
|
18
|
+
--wait: Number of seconds to wait between requests (default 3)
|
18
19
|
--version, -v: Print version and exit
|
19
20
|
--help, -h: Show this message
|
20
21
|
|
@@ -46,6 +47,7 @@ And add them to the Manifest file.
|
|
46
47
|
* Hugh Sasse
|
47
48
|
* Ken Egozi
|
48
49
|
* Robert Glaser
|
50
|
+
* Stefan Schüßler
|
49
51
|
* Vesa Vänskä
|
50
52
|
|
51
53
|
See also https://github.com/oscardelben/rawler/contributors
|
data/bin/rawler
CHANGED
@@ -14,13 +14,12 @@ Usage:
|
|
14
14
|
where [options] are:
|
15
15
|
EOS
|
16
16
|
|
17
|
-
# opt :domain, "domain that you want to test", :type => :string
|
18
17
|
opt :username, "HTT Basic Username", :type => :string
|
19
18
|
opt :password, "HTT Basic Password", :type => :string
|
19
|
+
opt :wait, "Number of seconds to wait betweet requests", :default => 3
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
domain = ARGV.shift.dup
|
22
|
+
domain = ARGV.shift
|
24
23
|
|
25
24
|
if domain.nil?
|
26
25
|
Trollop::die "Domain name is mandatory. Type --help for help"
|
@@ -30,5 +29,5 @@ else
|
|
30
29
|
end
|
31
30
|
end
|
32
31
|
|
33
|
-
Rawler::Base.new(domain, $stdout, opts
|
32
|
+
Rawler::Base.new(domain, $stdout, opts).validate
|
34
33
|
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -4,13 +4,14 @@ module Rawler
|
|
4
4
|
|
5
5
|
attr_accessor :responses
|
6
6
|
|
7
|
-
def initialize(url, output,
|
7
|
+
def initialize(url, output, options={})
|
8
8
|
@responses = {}
|
9
9
|
|
10
10
|
Rawler.url = URI.escape(url)
|
11
11
|
Rawler.output = Logger.new(output)
|
12
|
-
Rawler.username = username
|
13
|
-
Rawler.password = password
|
12
|
+
Rawler.username = options[:username]
|
13
|
+
Rawler.password = options[:password]
|
14
|
+
Rawler.wait = options[:wait].to_i
|
14
15
|
end
|
15
16
|
|
16
17
|
def validate
|
@@ -22,8 +23,7 @@ module Rawler
|
|
22
23
|
def validate_links_in_page(current_url)
|
23
24
|
Rawler::Crawler.new(current_url).links.each do |page_url|
|
24
25
|
validate_page(page_url, current_url)
|
25
|
-
|
26
|
-
sleep(3)
|
26
|
+
sleep(Rawler.wait)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
data/lib/rawler/crawler.rb
CHANGED
@@ -30,13 +30,11 @@ module Rawler
|
|
30
30
|
private
|
31
31
|
|
32
32
|
def absolute_url(path)
|
33
|
-
path = URI.encode(path.strip)
|
34
|
-
if path
|
35
|
-
URI.parse(url).merge(path.to_s).to_s
|
36
|
-
elsif URI.parse(path).scheme.nil?
|
37
|
-
URI.parse(url).merge("/#{path.to_s}").to_s
|
38
|
-
else
|
33
|
+
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
|
34
|
+
if URI.parse(path).scheme
|
39
35
|
path
|
36
|
+
else
|
37
|
+
URI.parse(url).merge(path).to_s
|
40
38
|
end
|
41
39
|
rescue URI::InvalidURIError
|
42
40
|
write("Invalid url: #{path} - Called from: #{url}")
|
@@ -36,16 +36,36 @@ describe Rawler::Crawler do
|
|
36
36
|
|
37
37
|
context "relative paths" do
|
38
38
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
39
|
+
context "base URL ends with a slash" do
|
40
|
+
|
41
|
+
let(:url) { 'http://example.com/dir1/dir2/' }
|
42
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
43
|
+
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
|
44
|
+
|
45
|
+
before(:each) do
|
46
|
+
register(url, content)
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should parse relative links" do
|
50
|
+
crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
|
51
|
+
end
|
52
|
+
|
45
53
|
end
|
46
54
|
|
47
|
-
|
48
|
-
|
55
|
+
context "base URL doesn't end with a slash" do
|
56
|
+
|
57
|
+
let(:url) { 'http://example.com/dir1/dir2' }
|
58
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
59
|
+
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
|
60
|
+
|
61
|
+
before(:each) do
|
62
|
+
register(url, content)
|
63
|
+
end
|
64
|
+
|
65
|
+
it "should parse relative links" do
|
66
|
+
crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
|
67
|
+
end
|
68
|
+
|
49
69
|
end
|
50
70
|
|
51
71
|
end
|
@@ -77,8 +97,8 @@ describe Rawler::Crawler do
|
|
77
97
|
register(url, content)
|
78
98
|
end
|
79
99
|
|
80
|
-
it "should
|
81
|
-
crawler.links.should == ['http://example.com/foo
|
100
|
+
it "should not encode hashtags" do
|
101
|
+
crawler.links.should == ['http://example.com/foo#bar']
|
82
102
|
end
|
83
103
|
|
84
104
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -99,12 +99,18 @@ describe Rawler::Base do
|
|
99
99
|
end
|
100
100
|
|
101
101
|
it "should save username and password" do
|
102
|
-
rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
|
102
|
+
rawler = Rawler::Base.new('http://example.com', output, {:username => 'my_user', :password => 'secret'})
|
103
103
|
|
104
104
|
Rawler.username.should == 'my_user'
|
105
105
|
Rawler.password.should == 'secret'
|
106
106
|
end
|
107
107
|
|
108
|
+
it "should save wait" do
|
109
|
+
rawler = Rawler::Base.new('http://example.com', output, {:wait => 5})
|
110
|
+
|
111
|
+
Rawler.wait.should == 5
|
112
|
+
end
|
113
|
+
|
108
114
|
it "should rescue from Errno::ECONNREFUSED" do
|
109
115
|
url = 'http://example.com'
|
110
116
|
from = 'http://other.com'
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
+
- 1
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.9
|
9
|
+
version: 0.1.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Oscar Del Ben
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-29 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -71,13 +71,11 @@ executables:
|
|
71
71
|
extensions: []
|
72
72
|
|
73
73
|
extra_rdoc_files:
|
74
|
-
- History.txt
|
75
74
|
- Manifest.txt
|
76
75
|
- README.txt
|
77
76
|
files:
|
78
77
|
- Gemfile
|
79
78
|
- Gemfile.lock
|
80
|
-
- History.txt
|
81
79
|
- Manifest.txt
|
82
80
|
- README.txt
|
83
81
|
- Rakefile
|