rawler 0.0.5 → 0.0.6

data/Gemfile ADDED
@@ -0,0 +1,9 @@
+ source "http://rubygems.org"
+
+ gem "nokogiri", "1.4.4"
+
+ group :development, :test do
+   gem "hoe", "2.6.2"
+   gem "rspec", "2.4.0"
+   gem "fakeweb", "1.3.0"
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
+ GEM
+   remote: http://rubygems.org/
+   specs:
+     diff-lcs (1.1.2)
+     fakeweb (1.3.0)
+     nokogiri (1.4.4)
+     rspec (2.4.0)
+       rspec-core (~> 2.4.0)
+       rspec-expectations (~> 2.4.0)
+       rspec-mocks (~> 2.4.0)
+     rspec-core (2.4.0)
+     rspec-expectations (2.4.0)
+       diff-lcs (~> 1.1.2)
+     rspec-mocks (2.4.0)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   fakeweb (= 1.3.0)
+   nokogiri (= 1.4.4)
+   rspec (= 2.4.0)
data/Manifest.txt CHANGED
@@ -1,4 +1,5 @@
- .autotest
+ Gemfile
+ Gemfile.lock
  History.txt
  Manifest.txt
  README.txt
@@ -10,13 +11,12 @@ lib/rawler/core_extensions.rb
  lib/rawler/core_extensions/module.rb
  lib/rawler/crawler.rb
  lib/rawler/request.rb
+ spec/lib/base_spec.rb
+ spec/lib/rawler/base_spec.rb
+ spec/lib/rawler/crawler_spec.rb
+ spec/lib/rawler_spec.rb
  spec/spec.opts
  spec/spec_helper.rb
- spec/unit/base_spec.rb
- spec/unit/crawler/base_spec.rb
- spec/unit/crawler/content_type_spec.rb
- spec/unit/crawler/exceptions_spec.rb
- spec/unit/crawler/http_basic_spec.rb
- spec/unit/crawler/url_domain_spec.rb
+ specs.watchr
  tasks/rspec.rake
  vendor/lib-trollop.rb
data/README.txt CHANGED
@@ -8,8 +8,6 @@ Rawler is a Ruby library that crawls your website and checks the status code for
 
  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 
- Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
-
  == SYNOPSIS:
 
    rawler http://example.com [options]
@@ -24,16 +22,40 @@ Please note: I had to temporarily remove url encoding in order to resolve some i
 
    gem install rawler
 
+ == DEVELOPMENT:
+
+ Run bundle install to install everything you need, then run the specs with:
+
+   rake test
+
+ To package and run the gem locally:
+
+   rake package
+   cd pkg
+   gem install rawler-#{version}.gem
+
+ If you add files, run:
+
+   rake check_manifest
+
+ and add them to the Manifest file.
+
  == TODO
 
+ * Add logger levels
  * Follow redirects, but still inform about them
  * Respect robots.txt
  * Export to html
 
  == CONTRIBUTORS:
 
- * Vesa Vänskä https://github.com/vesan
+ * bcoob
  * Hugh Sasse
+ * Ken Egozi
+ * Robert Glaser
+ * Vesa Vänskä
+
+ See also https://github.com/oscardelben/rawler/contributors
 
  == LICENSE:
 
@@ -58,4 +80,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile CHANGED
@@ -3,6 +3,12 @@
  require 'rubygems'
  require 'hoe'
 
+ # require 'bundler'
+ # Bundler::GemHelper.install_tasks
+
+ require 'rspec/core/rake_task'
+ RSpec::Core::RakeTask.new(:test)
+
  # Hoe.plugin :compiler
  # Hoe.plugin :gem_prelude_sucks
  # Hoe.plugin :inline
@@ -21,4 +27,9 @@ Hoe.spec 'rawler' do
    extra_deps << ['nokogiri']
  end
 
+ desc 'Console'
+ task :console do
+   exec 'irb -rubygems -I lib -r rawler.rb'
+ end
+
  # vim: syntax=ruby
data/bin/rawler CHANGED
@@ -19,7 +19,8 @@ EOS
    opt :password, "HTTP Basic Password", :type => :string
  end
 
- domain = ARGV.shift
+ # Use dup to get an unfrozen copy of the string
+ domain = ARGV.shift.dup
 
  if domain.nil?
    Trollop::die "Domain name is mandatory. Type --help for help"
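
The dup matters because the new Rawler.url= setter (added in lib/rawler.rb below) calls strip! on the value it receives, and the comment above suggests the ARGV entry can arrive frozen. A minimal illustration of why the copy is needed, assuming a frozen input:

  domain = 'http://example.com '.freeze

  # domain.strip!    # would raise: can't modify frozen string
  domain.dup.strip!  # fine: strip! mutates the unfrozen copy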
data/lib/rawler.rb CHANGED
@@ -2,11 +2,12 @@ require 'rubygems'
  require 'net/http'
  require 'net/https'
  require 'nokogiri'
+ require 'logger'
 
  require 'rawler/core_extensions'
 
  module Rawler
-   VERSION = '0.0.5'
+   VERSION = '0.0.6'
 
    mattr_accessor :output
    mattr_accessor :url
@@ -16,4 +17,14 @@ module Rawler
    autoload :Base, "rawler/base"
    autoload :Crawler, "rawler/crawler"
    autoload :Request, "rawler/request"
- end
+
+   def self.url=(url)
+     url.strip!
+
+     if (url =~ /http:\/\//) != 0
+       url = 'http://' + url
+     end
+
+     @@url = url
+   end
+ end
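
Note that the scheme check above only matches http://, so an https:// URL would get http:// prepended in front of its existing scheme. A scheme-aware sketch of the same normalization (a suggestion, not what ships in 0.0.6; normalize_url is a hypothetical helper):

  require 'uri'

  # Add a default scheme only when none is present; leave https
  # (or any other scheme) alone.
  def normalize_url(url)
    url = url.strip
    URI.parse(url).scheme ? url : "http://#{url}"
  end

  normalize_url('example.com')          # => "http://example.com"
  normalize_url('https://example.com')  # => "https://example.com"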
data/lib/rawler/base.rb CHANGED
@@ -8,7 +8,7 @@ module Rawler
    @responses = {}
 
    Rawler.url = url
-   Rawler.output = output
+   Rawler.output = Logger.new(output)
    Rawler.username = username
    Rawler.password = password
  end
@@ -37,13 +37,13 @@ module Rawler
  def add_status_code(link)
    response = Rawler::Request.get(link)
 
-   write("#{response.code} - #{link}")
+   record_response(response.code, link)
    responses[link] = { :status => response.code.to_i }
  rescue Errno::ECONNREFUSED
-   write("Connection refused - '#{link}'")
+   Rawler.output.error("Connection refused - '#{link}'")
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
         EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
-   write("Connection problems - '#{link}'")
+   Rawler.output.error("Connection problems - '#{link}'")
  end
 
  def same_domain?(link)
@@ -55,9 +55,30 @@ module Rawler
  end
 
  def write(message)
-   Rawler.output.puts(message)
+   # TODO: This may not always be an error message,
+   # but that will make it show up most of the time
+   Rawler.output.error(message)
+ end
+
+ def record_response(code, link)
+   message = "#{code} - #{link}"
+   code = code.to_i
+   case code / 100
+   when 1
+     # TODO: check that if a 100 is received
+     # then there is another status code as well
+     Rawler.output.info(message)
+   when 2
+     Rawler.output.info(message)
+   when 3
+     Rawler.output.warn(message)
+   when 4, 5
+     Rawler.output.error(message)
+   else
+     Rawler.output.error("Unknown code #{message}")
+   end
  end
 
  end
 
- end
+ end
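
record_response buckets the status code with integer division: code / 100 yields the status class (2 for 2xx, 3 for 3xx, and so on), which selects the logger level. A standalone check of that mapping with a plain Logger, illustrative only:

  require 'logger'

  logger = Logger.new($stdout)

  [100, 200, 302, 404, 503, 600].each do |code|
    message = "#{code} - http://example.com/"
    case code / 100  # integer division: 1, 2, 3, 4, 5, 6
    when 1, 2 then logger.info(message)
    when 3    then logger.warn(message)
    when 4, 5 then logger.error(message)
    else           logger.error("Unknown code #{message}")
    end
  end
  # 2xx (and 1xx) log at INFO, 3xx at WARN, 4xx/5xx at ERROR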
data/lib/rawler/crawler.rb CHANGED
@@ -5,7 +5,7 @@ module Rawler
    attr_accessor :url, :links
 
    def initialize(url)
-     @url = url
+     @url = url.strip
    end
 
    def links
@@ -16,7 +16,7 @@ module Rawler
    response = Rawler::Request.get(url)
 
    doc = Nokogiri::HTML(response.body)
-   doc.css('a').map { |a| absolute_url(a['href']) }.select { |url| valid_url?(url) }
+   doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
  rescue Errno::ECONNREFUSED
    write("Couldn't connect to #{url}")
    []
@@ -28,11 +28,16 @@ module Rawler
    private
 
    def absolute_url(path)
-     URI.parse(url).merge(path.to_s).to_s
+     path.strip!
+     if path[0].chr == '/'
+       URI.parse(url).merge(path.to_s).to_s
+     else
+       path
+     end
    end
 
    def write(message)
-     Rawler.output.puts(message)
+     Rawler.output.error(message)
    end
 
    def different_domain?(url_1, url_2)
@@ -44,11 +49,23 @@ module Rawler
    end
 
    def valid_url?(url)
+     return false unless url
+
+     url.strip!
      scheme = URI.parse(url).scheme
+
+     if ['http', 'https'].include?(scheme)
+       true
+     else
+       write("Invalid url - #{url}")
+       false
+     end
 
-     ['http', 'https'].include?(scheme)
+   rescue URI::InvalidURIError
+     write("Invalid url - #{url}")
+     false
    end
 
  end
 
- end
+ end
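
One consequence of the reworked absolute_url: only root-relative paths (those starting with '/') are resolved against the current page, so a document-relative href such as foo/bar now passes through unresolved and is then dropped by the scheme check in valid_url?. If full resolution were ever wanted, URI#merge already handles every relative form; a sketch, not part of this release:

  require 'uri'

  base = URI.parse('http://example.com/dir/page')

  # URI#merge implements RFC 3986 reference resolution for all cases:
  base.merge('/foo').to_s                # => "http://example.com/foo"
  base.merge('foo').to_s                 # => "http://example.com/dir/foo"
  base.merge('http://other.com/x').to_s  # => "http://other.com/x"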
data/spec/lib/rawler_spec.rb ADDED
@@ -0,0 +1,5 @@
+ require File.dirname(__FILE__) + '/../spec_helper.rb'
+
+ describe Rawler do
+
+ end
data/spec/lib/base_spec.rb — file without changes
data/spec/lib/rawler/crawler_spec.rb ADDED
@@ -0,0 +1,214 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+   let(:url) { 'http://example.com' }
+   let(:output) { double("output", :error => nil) }
+
+   before(:each) do
+     Rawler.stub!(:url).and_return(url)
+     Rawler.stub!(:output).and_return(output)
+   end
+
+   context "basic functionality" do
+
+     let(:url) { 'http://example.com' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+     let(:content) {
+       <<-content
+         <p><a href="http://example.com/foo">foo</a></p>
+
+         <p><a href="http://external.com/bar">bar</a></p>
+       content
+     }
+
+     before(:each) do
+       register(url, content)
+     end
+
+     it "should parse all links" do
+       crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
+     end
+
+   end
+
+   context "relative paths" do
+
+     let(:url) { 'http://example.com/path' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+     let(:content) { '<a href="/foo">foo</a>' }
+
+     before(:each) do
+       register(url, content)
+     end
+
+     it "should parse relative links" do
+       crawler.links.should == ['http://example.com/foo']
+     end
+
+   end
+
+   context "different domains" do
+
+     let(:url) { 'http://external.com/path' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+     let(:content) { '<a href="/foo">foo</a>' }
+
+     before(:each) do
+       Rawler.stub!(:url).and_return('http://example.com')
+       register(url, content)
+     end
+
+     it "should ignore pages on a different domain" do
+       crawler.links.should == []
+     end
+
+   end
+
+   context "urls with hash tags" do
+
+     let(:url) { 'http://example.com/path' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+     let(:content) { '<a href="/foo#bar">foo</a>' }
+
+     before(:each) do
+       register(url, content)
+     end
+
+     it "should keep the hash fragment" do
+       crawler.links.should == ['http://example.com/foo#bar']
+     end
+
+   end
+
+   context "invalid urls" do
+
+     let(:url) { 'http://example.com/path' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+     let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
+     let(:content) { "<a href=\"#{js_url}\">foo</a>" }
+
+     before(:each) do
+       register(url, content)
+     end
+
+     it "should ignore invalid links" do
+       crawler.links.should == []
+     end
+
+     it "should report the error" do
+       crawler.should_receive(:write).with("Invalid url - #{js_url}")
+       crawler.links
+     end
+
+   end
+
+   context "content type" do
+
+     ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+       let(:url) { 'http://example.com' }
+       let(:crawler) { Rawler::Crawler.new(url) }
+
+       before(:each) do
+         register(url, '', 200, :content_type => content_type)
+       end
+
+       it "should ignore '#{content_type}'" do
+         crawler.links.should == []
+       end
+
+     end
+   end
+
+   context "Exceptions" do
+
+     let(:url) { 'http://example.com' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+     let(:output) { double('output', :error => nil) }
+
+     before(:each) do
+       register(url, '')
+       Rawler.stub!(:output).and_return(output)
+     end
+
+     context "Errno::ECONNREFUSED" do
+
+       before(:each) do
+         Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
+       end
+
+       it "should return an empty array" do
+         crawler.links.should == []
+       end
+
+       it "should print a message when raising Errno::ECONNREFUSED" do
+         output.should_receive(:error).with("Couldn't connect to #{url}")
+
+         crawler.links
+       end
+
+     end
+
+     context "Errno::ETIMEDOUT" do
+
+       before(:each) do
+         Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
+       end
+
+       it "should return an empty array when raising Errno::ETIMEDOUT" do
+         crawler.links.should == []
+       end
+
+       it "should print a message when raising Errno::ETIMEDOUT" do
+         output.should_receive(:error).with("Connection to #{url} timed out")
+
+         crawler.links
+       end
+
+     end
+
+   end
+
+   context "http basic" do
+
+     let(:url) { 'http://example.com' }
+     let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
+     let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
+
+     before(:each) do
+       register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+       register('http://foo:bar@example.com/secret', content)
+
+       Rawler.stub!(:username).and_return('foo')
+       Rawler.stub!(:password).and_return('bar')
+     end
+
+     it "should crawl http basic pages" do
+       crawler.links.should == ['http://example.com/secret-path']
+     end
+
+   end
+
+   context "url domain" do
+
+     let(:content) {
+       <<-content
+         <a href="http://example.com/valid">foo</a>
+         <a href="mailto:info@example.com">invalid</a>
+         <a href="https://foo.com">valid</a>
+         <a href=" http://fooo.com ">valid with illegal whitespaces</a>
+       content
+     }
+     let(:url) { 'http://example.com' }
+     let(:crawler) { Rawler::Crawler.new(url) }
+
+     before(:each) do
+       register(url, content)
+     end
+
+     it "should ignore links other than http or https" do
+       crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
+     end
+   end
+
+ end
data/spec/lib/rawler/base_spec.rb CHANGED
@@ -6,6 +6,7 @@ describe Rawler::Base do
    let(:rawler) { Rawler::Base.new('http://example.com', output) }
 
    before(:each) do
+     Rawler.stub!(:output).and_return(output)
      register('http://example.com', site)
    end
 
@@ -42,10 +43,10 @@ describe Rawler::Base do
    register('http://external.com', '')
    register('http://external.com/foo', '', 302)
 
-   output.should_receive(:puts).with('200 - http://example.com/foo1')
-   output.should_receive(:puts).with('200 - http://example.com/foo2')
-   output.should_receive(:puts).with('200 - http://external.com')
-   output.should_receive(:puts).with('302 - http://external.com/foo')
+   output.should_receive(:info).with('200 - http://example.com/foo1')
+   output.should_receive(:info).with('200 - http://example.com/foo2')
+   output.should_receive(:info).with('200 - http://external.com')
+   output.should_receive(:warn).with('302 - http://external.com/foo')
 
    rawler.validate
  end
@@ -54,7 +55,7 @@ describe Rawler::Base do
    register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
    register('http://example.com/page-with', '')
 
-   output.should_receive(:puts).with('200 - http://example.com/page-with#hashtag')
+   output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
 
    rawler.validate
  end
@@ -93,7 +94,7 @@ describe Rawler::Base do
 
    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
 
-   output.should_receive(:puts).with("Connection refused - '#{url}'")
+   output.should_receive(:error).with("Connection refused - '#{url}'")
 
    rawler.send(:add_status_code, url)
  end
@@ -105,7 +106,7 @@ describe Rawler::Base do
 
    Rawler::Request.should_receive(:get).and_raise error
 
-   output.should_receive(:puts).with("Connection problems - '#{url}'")
+   output.should_receive(:error).with("Connection problems - '#{url}'")
 
    rawler.send(:add_status_code, url)
  end
@@ -113,6 +114,76 @@ describe Rawler::Base do
 
  end
 
+ describe "record_response" do
+
+   let(:message) { 'foo' }
+
+   context "response code 100" do
+     %w[100 150 199].each do |code|
+
+       it "logger should receive info" do
+         output.should_receive(:info).with("#{code} - #{message}")
+         rawler.send(:record_response, code, message)
+       end
+
+     end
+   end
+
+   context "response code 200" do
+     %w[200 250 299].each do |code|
+
+       it "logger should receive info" do
+         output.should_receive(:info).with("#{code} - #{message}")
+         rawler.send(:record_response, code, message)
+       end
+
+     end
+   end
+
+   context "response code 300" do
+     %w[300 350 399].each do |code|
+
+       it "logger should receive warn" do
+         output.should_receive(:warn).with("#{code} - #{message}")
+         rawler.send(:record_response, code, message)
+       end
+
+     end
+   end
+
+   context "response code 400" do
+     %w[400 450 499].each do |code|
+
+       it "logger should receive error" do
+         output.should_receive(:error).with("#{code} - #{message}")
+         rawler.send(:record_response, code, message)
+       end
+
+     end
+   end
+
+   context "response code 500" do
+     %w[500 550 599].each do |code|
+
+       it "logger should receive error" do
+         output.should_receive(:error).with("#{code} - #{message}")
+         rawler.send(:record_response, code, message)
+       end
+
+     end
+   end
+
+   context "response code invalid" do
+     let(:code) { 600 }
+
+     it "logger should receive error" do
+       output.should_receive(:error).with("Unknown code #{code} - #{message}")
+       rawler.send(:record_response, code, message)
+     end
+   end
+
+ end
+
 
  private
 
@@ -129,4 +200,4 @@ describe Rawler::Base do
    site
  end
 
- end
+ end
data/spec/spec_helper.rb CHANGED
@@ -15,4 +15,4 @@ FakeWeb.allow_net_connect = false
 
  def register(uri, content, status=200, options={})
    FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
- end
+ end
data/specs.watchr ADDED
@@ -0,0 +1,59 @@
+ # Run me with:
+ #
+ #   $ watchr specs.watchr
+
+ # --------------------------------------------------
+ # Convenience Methods
+ # --------------------------------------------------
+ def all_test_files
+   Dir['spec/**/*_spec.rb']
+ end
+
+ def run_test_matching(thing_to_match)
+   matches = all_test_files.grep(/#{thing_to_match}/i)
+   if matches.empty?
+     puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
+   else
+     run matches.join(' ')
+   end
+ end
+
+ def run(files_to_run)
+   puts("Running: #{files_to_run}")
+   system("clear;rspec -cfs #{files_to_run}")
+   no_int_for_you
+ end
+
+ def run_all_tests
+   run(all_test_files.join(' '))
+ end
+
+ # --------------------------------------------------
+ # Watchr Rules
+ # --------------------------------------------------
+ watch('^spec/(.*)_spec\.rb')   { |m| run_test_matching(m[1]) }
+ watch('^lib/(.*)\.rb')         { |m| run_test_matching(m[1]) }
+ watch('^spec/spec_helper\.rb') { run_all_tests }
+
+ # --------------------------------------------------
+ # Signal Handling
+ # --------------------------------------------------
+
+ def no_int_for_you
+   @sent_an_int = nil
+ end
+
+ Signal.trap 'INT' do
+   if @sent_an_int
+     puts " A second INT? Ok, I get the message. Shutting down now."
+     exit
+   else
+     puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
+     @sent_an_int = true
+     Kernel.sleep 1.5
+     run_all_tests
+   end
+ end
+
+ # vim:ft=ruby
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: rawler
  version: !ruby/object:Gem::Version
-   hash: 21
+   hash: 19
    prerelease:
    segments:
    - 0
    - 0
-   - 5
-   version: 0.0.5
+   - 6
+   version: 0.0.6
  platform: ruby
  authors:
  - Oscar Del Ben
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2011-01-21 00:00:00 +01:00
+ date: 2011-02-04 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -52,8 +52,6 @@ description: |-
    Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
 
    Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
-
-   Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
  email:
  - info@oscardelben.com
  executables:
@@ -65,7 +63,8 @@ extra_rdoc_files:
  - Manifest.txt
  - README.txt
  files:
- - .autotest
+ - Gemfile
+ - Gemfile.lock
  - History.txt
  - Manifest.txt
  - README.txt
@@ -77,14 +76,13 @@ files:
  - lib/rawler/core_extensions/module.rb
  - lib/rawler/crawler.rb
  - lib/rawler/request.rb
+ - spec/lib/base_spec.rb
+ - spec/lib/rawler/base_spec.rb
+ - spec/lib/rawler/crawler_spec.rb
+ - spec/lib/rawler_spec.rb
  - spec/spec.opts
  - spec/spec_helper.rb
- - spec/unit/base_spec.rb
- - spec/unit/crawler/base_spec.rb
- - spec/unit/crawler/content_type_spec.rb
- - spec/unit/crawler/exceptions_spec.rb
- - spec/unit/crawler/http_basic_spec.rb
- - spec/unit/crawler/url_domain_spec.rb
+ - specs.watchr
  - tasks/rspec.rake
  - vendor/lib-trollop.rb
  has_rdoc: true
data/.autotest DELETED
@@ -1,23 +0,0 @@
- # -*- ruby -*-
-
- require 'autotest/restart'
-
- # Autotest.add_hook :initialize do |at|
- #   at.extra_files << "../some/external/dependency.rb"
- #
- #   at.libs << ":../some/external"
- #
- #   at.add_exception 'vendor'
- #
- #   at.add_mapping(/dependency.rb/) do |f, _|
- #     at.files_matching(/test_.*rb$/)
- #   end
- #
- #   %w(TestA TestB).each do |klass|
- #     at.extra_class_map[klass] = "test/test_misc.rb"
- #   end
- # end
-
- # Autotest.add_hook :run_command do |at|
- #   system "rake build"
- # end
data/spec/unit/crawler/base_spec.rb DELETED
@@ -1,75 +0,0 @@
- require File.dirname(__FILE__) + '/../../spec_helper.rb'
-
- describe Rawler::Crawler do
-
-   context "basic functionality" do
-
-     let(:url) { 'http://example.com' }
-     let(:crawler) { Rawler::Crawler.new(url) }
-     let(:content) {
-       content = <<-content
-         <p><a href="http://example.com/foo">foo</a></p>
-
-         <p><a href="http://external.com/bar">bar</a></p>
-       content
-     }
-
-     before(:each) do
-       register(url, content)
-     end
-
-     it "should parse all links" do
-       crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
-     end
-
-   end
-
-   context "relative paths" do
-
-     let(:url) { 'http://example.com/path' }
-     let(:crawler) { Rawler::Crawler.new(url) }
-     let(:content) { '<a href="/foo">foo</a>' }
-
-     before(:each) do
-       register(url, content)
-     end
-
-     it "should parse relative links" do
-       crawler.links.should == ['http://example.com/foo']
-     end
-
-   end
-
-   context "different domains" do
-
-     let(:url) { 'http://external.com/path' }
-     let(:crawler) { Rawler::Crawler.new(url) }
-     let(:content) { '<a href="/foo">foo</a>' }
-
-     before(:each) do
-       register(url, content)
-     end
-
-     it "should parse relative links" do
-       crawler.links.should == []
-     end
-
-   end
-
-   context "urls with hash tags" do
-
-     let(:url) { 'http://example.com/path' }
-     let(:crawler) { Rawler::Crawler.new(url) }
-     let(:content) { '<a href="/foo#bar">foo</a>' }
-
-     before(:each) do
-       register(url, content)
-     end
-
-     it "should parse relative links" do
-       crawler.links.should == ['http://example.com/foo#bar']
-     end
-
-   end
-
- end
data/spec/unit/crawler/content_type_spec.rb DELETED
@@ -1,23 +0,0 @@
- require File.dirname(__FILE__) + '/../../spec_helper.rb'
-
- describe Rawler::Crawler do
-
-   context "content type" do
-
-     ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
-
-       let(:url) { 'http://example.com' }
-       let(:crawler) { Rawler::Crawler.new(url) }
-
-       before(:each) do
-         register(url, '', 200, :content_type => content_type)
-       end
-
-       it "should ignore '#{content_type}'" do
-         crawler.links.should == []
-       end
-
-     end
-   end
-
- end
data/spec/unit/crawler/exceptions_spec.rb DELETED
@@ -1,54 +0,0 @@
- require File.dirname(__FILE__) + '/../../spec_helper.rb'
-
- describe Rawler::Crawler do
-
-   context "Exceptions" do
-
-     let(:url) { 'http://example.com' }
-     let(:crawler) { Rawler::Crawler.new(url) }
-     let(:output) { double('output', :puts => nil) }
-
-     before(:each) do
-       register(url, '')
-       Rawler.stub!(:output).and_return(output)
-     end
-
-     context "Errno::ECONNREFUSED" do
-
-       before(:each) do
-         Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
-       end
-
-       it "should return an empty array" do
-         crawler.links.should == []
-       end
-
-       it "should print a message when raising Errno::ECONNREFUSED" do
-         output.should_receive(:puts).with("Couldn't connect to #{url}")
-
-         crawler.links
-       end
-
-     end
-
-     context "Errno::ETIMEDOUT" do
-
-       before(:each) do
-         Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
-       end
-
-       it "should return an empty array when raising Errno::ETIMEDOUT" do
-         crawler.links.should == []
-       end
-
-       it "should print a message when raising Errno::ETIMEDOUT" do
-         output.should_receive(:puts).with("Connection to #{url} timed out")
-
-         crawler.links
-       end
-
-     end
-
-   end
-
- end
data/spec/unit/crawler/http_basic_spec.rb DELETED
@@ -1,25 +0,0 @@
- require File.dirname(__FILE__) + '/../../spec_helper.rb'
-
- describe Rawler::Crawler do
-
-   context "http basic" do
-
-     let(:url) { 'http://example.com' }
-     let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
-     let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
-
-     before(:each) do
-       register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
-       register('http://foo:bar@example.com/secret', content)
-
-       Rawler.stub!(:username).and_return('foo')
-       Rawler.stub!(:password).and_return('bar')
-     end
-
-     it "should crawl http basic pages" do
-       crawler.links.should == ['http://example.com/secret-path']
-     end
-
-   end
-
- end
data/spec/unit/crawler/url_domain_spec.rb DELETED
@@ -1,26 +0,0 @@
- require File.dirname(__FILE__) + '/../../spec_helper.rb'
-
- describe Rawler::Crawler do
-
-   context "url domain" do
-
-     let(:content) {
-       content = <<-content
-         <a href="http://example.com/valid">foo</a>
-         <a href="mailto:info@example.com">invalid</a>
-         <a href="https://foo.com">valid</a>
-       content
-     }
-     let(:url) { 'http://example.com' }
-     let(:crawler) { Rawler::Crawler.new(url) }
-
-     before(:each) do
-       register(url, content)
-     end
-
-     it "should ignore links other than http or https" do
-       crawler.links.should == ['http://example.com/valid', 'https://foo.com']
-     end
-   end
-
- end