rawler 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +1 -2
- data/README.txt +2 -0
- data/bin/rawler +3 -4
- data/lib/rawler.rb +2 -1
- data/lib/rawler/base.rb +5 -5
- data/lib/rawler/crawler.rb +4 -6
- data/spec/lib/rawler/crawler_spec.rb +30 -10
- data/spec/lib/rawler_spec.rb +7 -1
- metadata +3 -5
- data/History.txt +0 -12
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
@@ -15,6 +15,7 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
15
15
|
where [options] are:
|
16
16
|
--username, -u <s>: HTTP Basic Username
|
17
17
|
--password, -p <s>: HTTP Basic Password
|
18
|
+
--wait: Number of seconds to wait between requests (default 3)
|
18
19
|
--version, -v: Print version and exit
|
19
20
|
--help, -h: Show this message
|
20
21
|
|
@@ -46,6 +47,7 @@ And add them to the Manifest file.
|
|
46
47
|
* Hugh Sasse
|
47
48
|
* Ken Egozi
|
48
49
|
* Robert Glaser
|
50
|
+
* Stefan Schüßler
|
49
51
|
* Vesa Vänskä
|
50
52
|
|
51
53
|
See also https://github.com/oscardelben/rawler/contributors
|
data/bin/rawler
CHANGED
@@ -14,13 +14,12 @@ Usage:
|
|
14
14
|
where [options] are:
|
15
15
|
EOS
|
16
16
|
|
17
|
-
# opt :domain, "domain that you want to test", :type => :string
|
18
17
|
opt :username, "HTT Basic Username", :type => :string
|
19
18
|
opt :password, "HTT Basic Password", :type => :string
|
19
|
+
opt :wait, "Number of seconds to wait betweet requests", :default => 3
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
domain = ARGV.shift.dup
|
22
|
+
domain = ARGV.shift
|
24
23
|
|
25
24
|
if domain.nil?
|
26
25
|
Trollop::die "Domain name is mandatory. Type --help for help"
|
@@ -30,5 +29,5 @@ else
|
|
30
29
|
end
|
31
30
|
end
|
32
31
|
|
33
|
-
Rawler::Base.new(domain, $stdout, opts
|
32
|
+
Rawler::Base.new(domain, $stdout, opts).validate
|
34
33
|
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -4,13 +4,14 @@ module Rawler
|
|
4
4
|
|
5
5
|
attr_accessor :responses
|
6
6
|
|
7
|
-
def initialize(url, output,
|
7
|
+
def initialize(url, output, options={})
|
8
8
|
@responses = {}
|
9
9
|
|
10
10
|
Rawler.url = URI.escape(url)
|
11
11
|
Rawler.output = Logger.new(output)
|
12
|
-
Rawler.username = username
|
13
|
-
Rawler.password = password
|
12
|
+
Rawler.username = options[:username]
|
13
|
+
Rawler.password = options[:password]
|
14
|
+
Rawler.wait = options[:wait].to_i
|
14
15
|
end
|
15
16
|
|
16
17
|
def validate
|
@@ -22,8 +23,7 @@ module Rawler
|
|
22
23
|
def validate_links_in_page(current_url)
|
23
24
|
Rawler::Crawler.new(current_url).links.each do |page_url|
|
24
25
|
validate_page(page_url, current_url)
|
25
|
-
|
26
|
-
sleep(3)
|
26
|
+
sleep(Rawler.wait)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
data/lib/rawler/crawler.rb
CHANGED
@@ -30,13 +30,11 @@ module Rawler
|
|
30
30
|
private
|
31
31
|
|
32
32
|
def absolute_url(path)
|
33
|
-
path = URI.encode(path.strip)
|
34
|
-
if path
|
35
|
-
URI.parse(url).merge(path.to_s).to_s
|
36
|
-
elsif URI.parse(path).scheme.nil?
|
37
|
-
URI.parse(url).merge("/#{path.to_s}").to_s
|
38
|
-
else
|
33
|
+
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
|
34
|
+
if URI.parse(path).scheme
|
39
35
|
path
|
36
|
+
else
|
37
|
+
URI.parse(url).merge(path).to_s
|
40
38
|
end
|
41
39
|
rescue URI::InvalidURIError
|
42
40
|
write("Invalid url: #{path} - Called from: #{url}")
|
@@ -36,16 +36,36 @@ describe Rawler::Crawler do
|
|
36
36
|
|
37
37
|
context "relative paths" do
|
38
38
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
39
|
+
context "base URL ends with a slash" do
|
40
|
+
|
41
|
+
let(:url) { 'http://example.com/dir1/dir2/' }
|
42
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
43
|
+
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
|
44
|
+
|
45
|
+
before(:each) do
|
46
|
+
register(url, content)
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should parse relative links" do
|
50
|
+
crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
|
51
|
+
end
|
52
|
+
|
45
53
|
end
|
46
54
|
|
47
|
-
|
48
|
-
|
55
|
+
context "base URL doesn't end with a slash" do
|
56
|
+
|
57
|
+
let(:url) { 'http://example.com/dir1/dir2' }
|
58
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
59
|
+
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
|
60
|
+
|
61
|
+
before(:each) do
|
62
|
+
register(url, content)
|
63
|
+
end
|
64
|
+
|
65
|
+
it "should parse relative links" do
|
66
|
+
crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
|
67
|
+
end
|
68
|
+
|
49
69
|
end
|
50
70
|
|
51
71
|
end
|
@@ -77,8 +97,8 @@ describe Rawler::Crawler do
|
|
77
97
|
register(url, content)
|
78
98
|
end
|
79
99
|
|
80
|
-
it "should
|
81
|
-
crawler.links.should == ['http://example.com/foo
|
100
|
+
it "should not encode hashtags" do
|
101
|
+
crawler.links.should == ['http://example.com/foo#bar']
|
82
102
|
end
|
83
103
|
|
84
104
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -99,12 +99,18 @@ describe Rawler::Base do
|
|
99
99
|
end
|
100
100
|
|
101
101
|
it "should save username and password" do
|
102
|
-
rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
|
102
|
+
rawler = Rawler::Base.new('http://example.com', output, {:username => 'my_user', :password => 'secret'})
|
103
103
|
|
104
104
|
Rawler.username.should == 'my_user'
|
105
105
|
Rawler.password.should == 'secret'
|
106
106
|
end
|
107
107
|
|
108
|
+
it "should save wait" do
|
109
|
+
rawler = Rawler::Base.new('http://example.com', output, {:wait => 5})
|
110
|
+
|
111
|
+
Rawler.wait.should == 5
|
112
|
+
end
|
113
|
+
|
108
114
|
it "should rescue from Errno::ECONNREFUSED" do
|
109
115
|
url = 'http://example.com'
|
110
116
|
from = 'http://other.com'
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
+
- 1
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.9
|
9
|
+
version: 0.1.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Oscar Del Ben
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-29 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -71,13 +71,11 @@ executables:
|
|
71
71
|
extensions: []
|
72
72
|
|
73
73
|
extra_rdoc_files:
|
74
|
-
- History.txt
|
75
74
|
- Manifest.txt
|
76
75
|
- README.txt
|
77
76
|
files:
|
78
77
|
- Gemfile
|
79
78
|
- Gemfile.lock
|
80
|
-
- History.txt
|
81
79
|
- Manifest.txt
|
82
80
|
- README.txt
|
83
81
|
- Rakefile
|