rawler 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -0
- data/VERSION +1 -1
- data/bin/rawler +2 -1
- data/lib/rawler.rb +1 -1
- data/lib/rawler/base.rb +11 -3
- data/lib/rawler/crawler.rb +1 -1
- data/rawler.gemspec +3 -3
- data/spec/lib/rawler/crawler_spec.rb +16 -1
- data/spec/lib/rawler_spec.rb +43 -0
- metadata +4 -4
data/README.md
CHANGED
@@ -13,6 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
13
13
|
--password, -p <s>: HTTP Basic Password
|
14
14
|
--wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
|
15
15
|
--log, -l: Log results to file rawler_log.txt
|
16
|
+
--logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
|
17
|
+
--css, -c: Check CSS links
|
16
18
|
--version, -v: Print version and exit
|
17
19
|
--help, -h: Show this message
|
18
20
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.5
|
1
|
+
0.1.6
|
data/bin/rawler
CHANGED
@@ -17,7 +17,8 @@ EOS
|
|
17
17
|
opt :username, "HTT Basic Username", :type => :string
|
18
18
|
opt :password, "HTT Basic Password", :type => :string
|
19
19
|
opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
|
20
|
-
opt :log, "Log results to file
|
20
|
+
opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
|
21
|
+
opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
|
21
22
|
opt :css, "Check CSS links", :type => :boolean, :default => false
|
22
23
|
end
|
23
24
|
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
module Rawler
|
2
2
|
class Base
|
3
3
|
|
4
|
+
DEFAULT_LOGFILE = "rawler_log.txt"
|
5
|
+
|
4
6
|
attr_accessor :responses
|
5
7
|
|
6
8
|
def initialize(url, output, options={})
|
@@ -12,9 +14,13 @@ module Rawler
|
|
12
14
|
Rawler.username = options[:username]
|
13
15
|
Rawler.password = options[:password]
|
14
16
|
Rawler.wait = options[:wait]
|
15
|
-
Rawler.log = options[:log]
|
16
17
|
Rawler.css = options[:css]
|
17
|
-
|
18
|
+
|
19
|
+
# Using a custom logfile implies logging.
|
20
|
+
Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
|
21
|
+
Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
|
22
|
+
|
23
|
+
@logfile = File.new(Rawler.logfile, "w") if Rawler.log
|
18
24
|
end
|
19
25
|
|
20
26
|
def validate
|
@@ -55,9 +61,11 @@ module Rawler
|
|
55
61
|
def add_status_code(link, from_url)
|
56
62
|
response = Rawler::Request.get(link)
|
57
63
|
|
58
|
-
validate_page(response['Location'], from_url) if response['Location']
|
59
64
|
record_response(response.code, link, from_url, response['Location'])
|
60
65
|
responses[link] = { :status => response.code.to_i }
|
66
|
+
|
67
|
+
validate_page(response['Location'], from_url) if response['Location']
|
68
|
+
|
61
69
|
rescue Errno::ECONNREFUSED
|
62
70
|
error("Connection refused - #{link} - Called from: #{from_url}")
|
63
71
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
data/lib/rawler/crawler.rb
CHANGED
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "rawler"
|
8
|
-
s.version = "0.1.5"
|
8
|
+
s.version = "0.1.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-07-17"
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
14
14
|
s.email = "info@oscardelben.com"
|
15
15
|
s.executables = ["rawler"]
|
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
|
|
45
45
|
s.homepage = "http://github.com/oscardelben/rawler"
|
46
46
|
s.licenses = ["MIT"]
|
47
47
|
s.require_paths = ["lib"]
|
48
|
-
s.rubygems_version = "1.8.
|
48
|
+
s.rubygems_version = "1.8.23"
|
49
49
|
s.summary = "Rawler is a tool that crawls the links of your website"
|
50
50
|
|
51
51
|
if s.respond_to? :specification_version then
|
@@ -306,5 +306,20 @@ describe Rawler::Crawler do
|
|
306
306
|
crawler.links.should == []
|
307
307
|
end
|
308
308
|
end
|
309
|
-
|
309
|
+
|
310
|
+
context "invalid mailto" do
|
311
|
+
let(:content) { '<a href="mailto:obfuscated(at)example(dot)com">foo</a>' }
|
312
|
+
let(:url) { 'http://example.com' }
|
313
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
314
|
+
|
315
|
+
before(:each) do
|
316
|
+
register(url, content)
|
317
|
+
end
|
318
|
+
|
319
|
+
it "should notify about the invalid url" do
|
320
|
+
output.should_receive(:error).with('Invalid url: mailto:obfuscated(at)example(dot)com - Called from: http://example.com')
|
321
|
+
crawler.links.should == []
|
322
|
+
end
|
323
|
+
end
|
324
|
+
|
310
325
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -12,6 +12,40 @@ describe Rawler::Base do
|
|
12
12
|
register('http://example.com', site)
|
13
13
|
end
|
14
14
|
|
15
|
+
describe "logfile" do
|
16
|
+
it "should have default value" do
|
17
|
+
url = 'http://example.com'
|
18
|
+
|
19
|
+
Rawler::Base.new(url, output)
|
20
|
+
Rawler.logfile.should == Rawler::Base::DEFAULT_LOGFILE
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should honor logfile option" do
|
24
|
+
url = 'http://example.com'
|
25
|
+
logfile = 'custom_logfile'.freeze
|
26
|
+
|
27
|
+
Rawler::Base.new(url, output, :logfile => logfile)
|
28
|
+
Rawler.logfile.should == logfile
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "log" do
|
33
|
+
it "should be turned off by default" do
|
34
|
+
url = 'http://example.com'
|
35
|
+
|
36
|
+
Rawler::Base.new(url, output)
|
37
|
+
Rawler.log.should == false
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should be turned on when assigning custom logfile" do
|
41
|
+
url = 'http://example.com'
|
42
|
+
logfile = 'custom_logfile'
|
43
|
+
|
44
|
+
Rawler::Base.new(url, output, :logfile => logfile)
|
45
|
+
Rawler.log.should == true
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
15
49
|
describe "url encoding" do
|
16
50
|
it "should encode url" do
|
17
51
|
original = 'http://example.com/写程序容易出现的几个不好的地方'
|
@@ -95,6 +129,15 @@ describe Rawler::Base do
|
|
95
129
|
rawler.validate
|
96
130
|
end
|
97
131
|
|
132
|
+
it "should handle circular redirections" do
|
133
|
+
register('http://example.com', '<a href="/foo">foo</a>')
|
134
|
+
register('http://example.com/foo', '', 301, :location => 'http://example.com/foo')
|
135
|
+
|
136
|
+
output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/foo')
|
137
|
+
|
138
|
+
rawler.validate
|
139
|
+
end
|
140
|
+
|
98
141
|
end
|
99
142
|
|
100
143
|
describe "get_status_code" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-07-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -170,7 +170,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
170
170
|
version: '0'
|
171
171
|
segments:
|
172
172
|
- 0
|
173
|
-
hash:
|
173
|
+
hash: -2845352824216855200
|
174
174
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
175
175
|
none: false
|
176
176
|
requirements:
|
@@ -179,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
179
179
|
version: '0'
|
180
180
|
requirements: []
|
181
181
|
rubyforge_project:
|
182
|
-
rubygems_version: 1.8.
|
182
|
+
rubygems_version: 1.8.23
|
183
183
|
signing_key:
|
184
184
|
specification_version: 3
|
185
185
|
summary: Rawler is a tool that crawls the links of your website
|