rawler 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -13,6 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
13
13
  --password, -p <s>: HTTP Basic Password
14
14
  --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
15
15
  --log, -l: Log results to file rawler_log.txt
16
+ --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
17
+ --css, -c: Check CSS links
16
18
  --version, -v: Print version and exit
17
19
  --help, -h: Show this message
18
20
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.1.6
data/bin/rawler CHANGED
@@ -17,7 +17,8 @@ EOS
17
17
  opt :username, "HTT Basic Username", :type => :string
18
18
  opt :password, "HTT Basic Password", :type => :string
19
19
  opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
20
- opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
20
+ opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
21
+ opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
21
22
  opt :css, "Check CSS links", :type => :boolean, :default => false
22
23
  end
23
24
 
@@ -10,7 +10,7 @@ module Rawler
10
10
  mattr_accessor :url
11
11
  mattr_accessor :wait
12
12
  mattr_accessor :username, :password
13
- mattr_accessor :log
13
+ mattr_accessor :log, :logfile
14
14
  mattr_accessor :css
15
15
 
16
16
  autoload :Base, "rawler/base"
@@ -1,6 +1,8 @@
1
1
  module Rawler
2
2
  class Base
3
3
 
4
+ DEFAULT_LOGFILE = "rawler_log.txt"
5
+
4
6
  attr_accessor :responses
5
7
 
6
8
  def initialize(url, output, options={})
@@ -12,9 +14,13 @@ module Rawler
12
14
  Rawler.username = options[:username]
13
15
  Rawler.password = options[:password]
14
16
  Rawler.wait = options[:wait]
15
- Rawler.log = options[:log]
16
17
  Rawler.css = options[:css]
17
- @logfile = File.new("rawler_log.txt", "w") if Rawler.log
18
+
19
+ # Using a custom logfile implies logging.
20
+ Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
21
+ Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
22
+
23
+ @logfile = File.new(Rawler.logfile, "w") if Rawler.log
18
24
  end
19
25
 
20
26
  def validate
@@ -55,9 +61,11 @@ module Rawler
55
61
  def add_status_code(link, from_url)
56
62
  response = Rawler::Request.get(link)
57
63
 
58
- validate_page(response['Location'], from_url) if response['Location']
59
64
  record_response(response.code, link, from_url, response['Location'])
60
65
  responses[link] = { :status => response.code.to_i }
66
+
67
+ validate_page(response['Location'], from_url) if response['Location']
68
+
61
69
  rescue Errno::ECONNREFUSED
62
70
  error("Connection refused - #{link} - Called from: #{from_url}")
63
71
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
@@ -55,7 +55,7 @@ module Rawler
55
55
  else
56
56
  URI.parse(url).merge(path).to_s
57
57
  end
58
- rescue URI::InvalidURIError
58
+ rescue URI::InvalidURIError, URI::InvalidComponentError
59
59
  write("Invalid url: #{path} - Called from: #{url}")
60
60
  nil
61
61
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.5"
8
+ s.version = "0.1.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-04-14"
12
+ s.date = "2012-07-17"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
45
45
  s.homepage = "http://github.com/oscardelben/rawler"
46
46
  s.licenses = ["MIT"]
47
47
  s.require_paths = ["lib"]
48
- s.rubygems_version = "1.8.21"
48
+ s.rubygems_version = "1.8.23"
49
49
  s.summary = "Rawler is a tool that crawls the links of your website"
50
50
 
51
51
  if s.respond_to? :specification_version then
@@ -306,5 +306,20 @@ describe Rawler::Crawler do
306
306
  crawler.links.should == []
307
307
  end
308
308
  end
309
-
309
+
310
+ context "invalid mailto" do
311
+ let(:content) { '<a href="mailto:obfuscated(at)example(dot)com">foo</a>' }
312
+ let(:url) { 'http://example.com' }
313
+ let(:crawler) { Rawler::Crawler.new(url) }
314
+
315
+ before(:each) do
316
+ register(url, content)
317
+ end
318
+
319
+ it "should notify about the invalid url" do
320
+ output.should_receive(:error).with('Invalid url: mailto:obfuscated(at)example(dot)com - Called from: http://example.com')
321
+ crawler.links.should == []
322
+ end
323
+ end
324
+
310
325
  end
@@ -12,6 +12,40 @@ describe Rawler::Base do
12
12
  register('http://example.com', site)
13
13
  end
14
14
 
15
+ describe "logfile" do
16
+ it "should have default value" do
17
+ url = 'http://example.com'
18
+
19
+ Rawler::Base.new(url, output)
20
+ Rawler.logfile.should == Rawler::Base::DEFAULT_LOGFILE
21
+ end
22
+
23
+ it "should honor logfile option" do
24
+ url = 'http://example.com'
25
+ logfile = 'custom_logfile'.freeze
26
+
27
+ Rawler::Base.new(url, output, :logfile => logfile)
28
+ Rawler.logfile.should == logfile
29
+ end
30
+ end
31
+
32
+ describe "log" do
33
+ it "should be turned off by default" do
34
+ url = 'http://example.com'
35
+
36
+ Rawler::Base.new(url, output)
37
+ Rawler.log.should == false
38
+ end
39
+
40
+ it "should be turned on when assigning custom logfile" do
41
+ url = 'http://example.com'
42
+ logfile = 'custom_logfile'
43
+
44
+ Rawler::Base.new(url, output, :logfile => logfile)
45
+ Rawler.log.should == true
46
+ end
47
+ end
48
+
15
49
  describe "url encoding" do
16
50
  it "should encode url" do
17
51
  original = 'http://example.com/写程序容易出现的几个不好的地方'
@@ -95,6 +129,15 @@ describe Rawler::Base do
95
129
  rawler.validate
96
130
  end
97
131
 
132
+ it "should handle circular redirections" do
133
+ register('http://example.com', '<a href="/foo">foo</a>')
134
+ register('http://example.com/foo', '', 301, :location => 'http://example.com/foo')
135
+
136
+ output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/foo')
137
+
138
+ rawler.validate
139
+ end
140
+
98
141
  end
99
142
 
100
143
  describe "get_status_code" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-14 00:00:00.000000000 Z
12
+ date: 2012-07-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -170,7 +170,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
170
170
  version: '0'
171
171
  segments:
172
172
  - 0
173
- hash: 2522129833142198431
173
+ hash: -2845352824216855200
174
174
  required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  none: false
176
176
  requirements:
@@ -179,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
179
179
  version: '0'
180
180
  requirements: []
181
181
  rubyforge_project:
182
- rubygems_version: 1.8.21
182
+ rubygems_version: 1.8.23
183
183
  signing_key:
184
184
  specification_version: 3
185
185
  summary: Rawler is a tool that crawls the links of your website