rawler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -13,6 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
13
13
  --password, -p <s>: HTTP Basic Password
14
14
  --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
15
15
  --log, -l: Log results to file rawler_log.txt
16
+ --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
17
+ --css, -c: Check CSS links
16
18
  --version, -v: Print version and exit
17
19
  --help, -h: Show this message
18
20
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.1.6
data/bin/rawler CHANGED
@@ -17,7 +17,8 @@ EOS
17
17
  opt :username, "HTT Basic Username", :type => :string
18
18
  opt :password, "HTT Basic Password", :type => :string
19
19
  opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
20
- opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
20
+ opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
21
+ opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
21
22
  opt :css, "Check CSS links", :type => :boolean, :default => false
22
23
  end
23
24
 
@@ -10,7 +10,7 @@ module Rawler
10
10
  mattr_accessor :url
11
11
  mattr_accessor :wait
12
12
  mattr_accessor :username, :password
13
- mattr_accessor :log
13
+ mattr_accessor :log, :logfile
14
14
  mattr_accessor :css
15
15
 
16
16
  autoload :Base, "rawler/base"
@@ -1,6 +1,8 @@
1
1
  module Rawler
2
2
  class Base
3
3
 
4
+ DEFAULT_LOGFILE = "rawler_log.txt"
5
+
4
6
  attr_accessor :responses
5
7
 
6
8
  def initialize(url, output, options={})
@@ -12,9 +14,13 @@ module Rawler
12
14
  Rawler.username = options[:username]
13
15
  Rawler.password = options[:password]
14
16
  Rawler.wait = options[:wait]
15
- Rawler.log = options[:log]
16
17
  Rawler.css = options[:css]
17
- @logfile = File.new("rawler_log.txt", "w") if Rawler.log
18
+
19
+ # Using a custom logfile implies logging.
20
+ Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
21
+ Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
22
+
23
+ @logfile = File.new(Rawler.logfile, "w") if Rawler.log
18
24
  end
19
25
 
20
26
  def validate
@@ -55,9 +61,11 @@ module Rawler
55
61
  def add_status_code(link, from_url)
56
62
  response = Rawler::Request.get(link)
57
63
 
58
- validate_page(response['Location'], from_url) if response['Location']
59
64
  record_response(response.code, link, from_url, response['Location'])
60
65
  responses[link] = { :status => response.code.to_i }
66
+
67
+ validate_page(response['Location'], from_url) if response['Location']
68
+
61
69
  rescue Errno::ECONNREFUSED
62
70
  error("Connection refused - #{link} - Called from: #{from_url}")
63
71
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
@@ -55,7 +55,7 @@ module Rawler
55
55
  else
56
56
  URI.parse(url).merge(path).to_s
57
57
  end
58
- rescue URI::InvalidURIError
58
+ rescue URI::InvalidURIError, URI::InvalidComponentError
59
59
  write("Invalid url: #{path} - Called from: #{url}")
60
60
  nil
61
61
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.5"
8
+ s.version = "0.1.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-04-14"
12
+ s.date = "2012-07-17"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
45
45
  s.homepage = "http://github.com/oscardelben/rawler"
46
46
  s.licenses = ["MIT"]
47
47
  s.require_paths = ["lib"]
48
- s.rubygems_version = "1.8.21"
48
+ s.rubygems_version = "1.8.23"
49
49
  s.summary = "Rawler is a tool that crawls the links of your website"
50
50
 
51
51
  if s.respond_to? :specification_version then
@@ -306,5 +306,20 @@ describe Rawler::Crawler do
306
306
  crawler.links.should == []
307
307
  end
308
308
  end
309
-
309
+
310
+ context "invalid mailto" do
311
+ let(:content) { '<a href="mailto:obfuscated(at)example(dot)com">foo</a>' }
312
+ let(:url) { 'http://example.com' }
313
+ let(:crawler) { Rawler::Crawler.new(url) }
314
+
315
+ before(:each) do
316
+ register(url, content)
317
+ end
318
+
319
+ it "should notify about the invalid url" do
320
+ output.should_receive(:error).with('Invalid url: mailto:obfuscated(at)example(dot)com - Called from: http://example.com')
321
+ crawler.links.should == []
322
+ end
323
+ end
324
+
310
325
  end
@@ -12,6 +12,40 @@ describe Rawler::Base do
12
12
  register('http://example.com', site)
13
13
  end
14
14
 
15
+ describe "logfile" do
16
+ it "should have default value" do
17
+ url = 'http://example.com'
18
+
19
+ Rawler::Base.new(url, output)
20
+ Rawler.logfile.should == Rawler::Base::DEFAULT_LOGFILE
21
+ end
22
+
23
+ it "should honor logfile option" do
24
+ url = 'http://example.com'
25
+ logfile = 'custom_logfile'.freeze
26
+
27
+ Rawler::Base.new(url, output, :logfile => logfile)
28
+ Rawler.logfile.should == logfile
29
+ end
30
+ end
31
+
32
+ describe "log" do
33
+ it "should be turned off by default" do
34
+ url = 'http://example.com'
35
+
36
+ Rawler::Base.new(url, output)
37
+ Rawler.log.should == false
38
+ end
39
+
40
+ it "should be turned on when assigning custom logfile" do
41
+ url = 'http://example.com'
42
+ logfile = 'custom_logfile'
43
+
44
+ Rawler::Base.new(url, output, :logfile => logfile)
45
+ Rawler.log.should == true
46
+ end
47
+ end
48
+
15
49
  describe "url encoding" do
16
50
  it "should encode url" do
17
51
  original = 'http://example.com/写程序容易出现的几个不好的地方'
@@ -95,6 +129,15 @@ describe Rawler::Base do
95
129
  rawler.validate
96
130
  end
97
131
 
132
+ it "should handle circular redirections" do
133
+ register('http://example.com', '<a href="/foo">foo</a>')
134
+ register('http://example.com/foo', '', 301, :location => 'http://example.com/foo')
135
+
136
+ output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/foo')
137
+
138
+ rawler.validate
139
+ end
140
+
98
141
  end
99
142
 
100
143
  describe "get_status_code" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-14 00:00:00.000000000 Z
12
+ date: 2012-07-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -170,7 +170,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
170
170
  version: '0'
171
171
  segments:
172
172
  - 0
173
- hash: 2522129833142198431
173
+ hash: -2845352824216855200
174
174
  required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  none: false
176
176
  requirements:
@@ -179,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
179
179
  version: '0'
180
180
  requirements: []
181
181
  rubyforge_project:
182
- rubygems_version: 1.8.21
182
+ rubygems_version: 1.8.23
183
183
  signing_key:
184
184
  specification_version: 3
185
185
  summary: Rawler is a tool that crawls the links of your website