rawler 0.0.4 → 0.0.5

data/.autotest ADDED
@@ -0,0 +1,23 @@
+ # -*- ruby -*-
+
+ require 'autotest/restart'
+
+ # Autotest.add_hook :initialize do |at|
+ # at.extra_files << "../some/external/dependency.rb"
+ #
+ # at.libs << ":../some/external"
+ #
+ # at.add_exception 'vendor'
+ #
+ # at.add_mapping(/dependency.rb/) do |f, _|
+ # at.files_matching(/test_.*rb$/)
+ # end
+ #
+ # %w(TestA TestB).each do |klass|
+ # at.extra_class_map[klass] = "test/test_misc.rb"
+ # end
+ # end
+
+ # Autotest.add_hook :run_command do |at|
+ # system "rake build"
+ # end
data/Manifest.txt CHANGED
@@ -1,17 +1,22 @@
+ .autotest
  History.txt
  Manifest.txt
  README.txt
  Rakefile
  bin/rawler
+ lib/rawler.rb
  lib/rawler/base.rb
- lib/rawler/core_extensions/module.rb
  lib/rawler/core_extensions.rb
+ lib/rawler/core_extensions/module.rb
  lib/rawler/crawler.rb
  lib/rawler/request.rb
- lib/rawler.rb
  spec/spec.opts
  spec/spec_helper.rb
  spec/unit/base_spec.rb
- spec/unit/crawler_spec.rb
+ spec/unit/crawler/base_spec.rb
+ spec/unit/crawler/content_type_spec.rb
+ spec/unit/crawler/exceptions_spec.rb
+ spec/unit/crawler/http_basic_spec.rb
+ spec/unit/crawler/url_domain_spec.rb
  tasks/rspec.rake
  vendor/lib-trollop.rb
data/README.txt CHANGED
@@ -1,6 +1,6 @@
  = rawler
 
- * http://github.com/#{github_username}/#{project_name}
+ * http://github.com/oscardelben/rawler
 
  == DESCRIPTION:
 
@@ -8,6 +8,8 @@ Rawler is a Ruby library that crawls your website and checks the status code for
 
  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 
+ Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
+
  == SYNOPSIS:
 
  rawler http://example.com [options]
@@ -24,8 +26,15 @@ gem install rawler
 
  == TODO
 
+ * Follow redirects, but still inform about them
+ * Respect robots.txt
  * Export to html
 
+ == CONTRIBUTORS:
+
+ * Vesa Vänskä https://github.com/vesan
+ * Hugh Sasse
+
  == LICENSE:
 
  (The MIT License)
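An aside on the "temporarily removed url encoding" note above: the 0.0.4 code wrapped URLs in URI.encode before parsing them (see the lib/rawler/base.rb, lib/rawler/crawler.rb and lib/rawler/request.rb hunks below, where those calls are dropped). A minimal sketch of the kind of breakage that can cause, assuming the default unsafe-character handling of the old stdlib URI.encode/URI.escape; the example URLs are made up, and the snippet only runs on Rubies that still ship URI.encode (it was removed in Ruby 3.0):

  require 'uri'

  # '#' is not in URI.escape's default safe set, so fragment links get mangled:
  URI.encode('http://example.com/page-with#hashtag')
  # => "http://example.com/page-with%23hashtag"

  # '%' is re-escaped, so already-encoded URLs end up double-encoded:
  URI.encode('http://example.com/a%20b')
  # => "http://example.com/a%2520b"

That would also explain the "links with #hashtags" spec added to spec/unit/base_spec.rb in this release.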
data/lib/rawler.rb CHANGED
@@ -6,7 +6,7 @@ require 'nokogiri'
  require 'rawler/core_extensions'
 
  module Rawler
- VERSION = '0.0.4'
+ VERSION = '0.0.5'
 
  mattr_accessor :output
  mattr_accessor :url
data/lib/rawler/base.rb CHANGED
@@ -22,6 +22,8 @@ module Rawler
  def validate_links_in_page(current_url)
  Rawler::Crawler.new(current_url).links.each do |page_url|
  validate_page(page_url)
+ # Todo: include this in a configuration option
+ sleep(3)
  end
  end
 
@@ -39,13 +41,13 @@ module Rawler
  responses[link] = { :status => response.code.to_i }
  rescue Errno::ECONNREFUSED
  write("Connection refused - '#{link}'")
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
  write("Connection problems - '#{link}'")
  end
 
  def same_domain?(link)
- URI.parse(URI.encode(Rawler.url)).host == URI.parse(URI.encode(link)).host
+ URI.parse(Rawler.url).host == URI.parse(link).host
  end
 
  def not_yet_parsed?(link)
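The hard-coded sleep(3) added in the first base.rb hunk above carries a TODO about making it a configuration option. A sketch of how that could look, reusing the mattr_accessor pattern that lib/rawler.rb already uses for output and url; the wait accessor name and its default are hypothetical, not part of this release:

  # lib/rawler.rb (hypothetical sketch)
  module Rawler
    mattr_accessor :wait   # assumed name: seconds to pause between requests
    self.wait = 3
  end

  # lib/rawler/base.rb (hypothetical sketch)
  def validate_links_in_page(current_url)
    Rawler::Crawler.new(current_url).links.each do |page_url|
      validate_page(page_url)
      sleep(Rawler.wait) if Rawler.wait.to_i > 0
    end
  end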
data/lib/rawler/crawler.rb CHANGED
@@ -20,12 +20,15 @@ module Rawler
  rescue Errno::ECONNREFUSED
  write("Couldn't connect to #{url}")
  []
+ rescue Errno::ETIMEDOUT
+ write("Connection to #{url} timed out")
+ []
  end
 
  private
 
  def absolute_url(path)
- URI.parse(URI.encode(url)).merge(URI.encode(path.to_s)).to_s
+ URI.parse(url).merge(path.to_s).to_s
  end
 
  def write(message)
@@ -33,7 +36,7 @@ module Rawler
  end
 
  def different_domain?(url_1, url_2)
- URI.parse(URI.encode(url_1)).host != URI.parse(URI.encode(url_2)).host
+ URI.parse(url_1).host != URI.parse(url_2).host
  end
 
  def not_html?(url)
@@ -41,7 +44,7 @@ module Rawler
  end
 
  def valid_url?(url)
- scheme = URI.parse(URI.encode(url)).scheme
+ scheme = URI.parse(url).scheme
 
  ['http', 'https'].include?(scheme)
  end
data/lib/rawler/request.rb CHANGED
@@ -15,7 +15,7 @@ module Rawler
  private
 
  def perform_request(method, url)
- uri = URI.parse(URI.encode(url))
+ uri = URI.parse(url)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == 'https')
 
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,11 @@
+ module Kernel
+
+ def sleep(duration)
+ nil
+ end
+
+ end
+
 
  $:.unshift(File.dirname(__FILE__) + '/../lib')
  require 'rawler'
data/spec/unit/base_spec.rb CHANGED
@@ -49,6 +49,15 @@ describe Rawler::Base do
 
  rawler.validate
  end
+
+ it "should validate links with #hashtags" do
+ register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
+ register('http://example.com/page-with', '')
+
+ output.should_receive(:puts).with('200 - http://example.com/page-with#hashtag')
+
+ rawler.validate
+ end
 
  end
 
@@ -89,7 +98,7 @@ describe Rawler::Base do
  rawler.send(:add_status_code, url)
  end
 
- [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+ [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
  Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
  it "should rescue from #{error}" do
  url = 'http://example.com'
data/spec/unit/crawler/base_spec.rb ADDED
@@ -0,0 +1,75 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "basic functionality" do
+
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) {
+ content = <<-content
+ <p><a href="http://example.com/foo">foo</a></p>
+
+ <p><a href="http://external.com/bar">bar</a></p>
+ content
+ }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse all links" do
+ crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
+ end
+
+ end
+
+ context "relative paths" do
+
+ let(:url) { 'http://example.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { '<a href="/foo">foo</a>' }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse relative links" do
+ crawler.links.should == ['http://example.com/foo']
+ end
+
+ end
+
+ context "different domains" do
+
+ let(:url) { 'http://external.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { '<a href="/foo">foo</a>' }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse relative links" do
+ crawler.links.should == []
+ end
+
+ end
+
+ context "urls with hash tags" do
+
+ let(:url) { 'http://example.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { '<a href="/foo#bar">foo</a>' }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse relative links" do
+ crawler.links.should == ['http://example.com/foo#bar']
+ end
+
+ end
+
+ end
data/spec/unit/crawler/content_type_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "content type" do
+
+ ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+
+ before(:each) do
+ register(url, '', 200, :content_type => content_type)
+ end
+
+ it "should ignore '#{content_type}'" do
+ crawler.links.should == []
+ end
+
+ end
+ end
+
+ end
data/spec/unit/crawler/exceptions_spec.rb ADDED
@@ -0,0 +1,54 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "Exceptions" do
+
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:output) { double('output', :puts => nil) }
+
+ before(:each) do
+ register(url, '')
+ Rawler.stub!(:output).and_return(output)
+ end
+
+ context "Errno::ECONNREFUSED" do
+
+ before(:each) do
+ Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
+ end
+
+ it "should return an empty array" do
+ crawler.links.should == []
+ end
+
+ it "should print a message when raising Errno::ECONNREFUSED" do
+ output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+ crawler.links
+ end
+
+ end
+
+ context "Errno::ETIMEDOUT" do
+
+ before(:each) do
+ Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
+ end
+
+ it "should return an empty array when raising Errno::ETIMEDOUT" do
+ crawler.links.should == []
+ end
+
+ it "should print a message when raising Errno::ETIMEDOUT" do
+ output.should_receive(:puts).with("Connection to #{url} timed out")
+
+ crawler.links
+ end
+
+ end
+
+ end
+
+ end
data/spec/unit/crawler/http_basic_spec.rb ADDED
@@ -0,0 +1,25 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "http basic" do
+
+ let(:url) { 'http://example.com' }
+ let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
+ let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
+
+ before(:each) do
+ register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+ register('http://foo:bar@example.com/secret', content)
+
+ Rawler.stub!(:username).and_return('foo')
+ Rawler.stub!(:password).and_return('bar')
+ end
+
+ it "should crawl http basic pages" do
+ crawler.links.should == ['http://example.com/secret-path']
+ end
+
+ end
+
+ end
data/spec/unit/crawler/url_domain_spec.rb ADDED
@@ -0,0 +1,26 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "url domain" do
+
+ let(:content) {
+ content = <<-content
+ <a href="http://example.com/valid">foo</a>
+ <a href="mailto:info@example.com">invalid</a>
+ <a href="https://foo.com">valid</a>
+ content
+ }
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should ignore links other than http or https" do
+ crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+ end
+ end
+
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: rawler
  version: !ruby/object:Gem::Version
- hash: 23
+ hash: 21
  prerelease:
  segments:
  - 0
  - 0
- - 4
- version: 0.0.4
+ - 5
+ version: 0.0.5
  platform: ruby
  authors:
  - Oscar Del Ben
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2011-01-11 00:00:00 +01:00
+ date: 2011-01-21 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -52,6 +52,8 @@ description: |-
  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
 
  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
+
+ Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
  email:
  - info@oscardelben.com
  executables:
@@ -63,25 +65,30 @@ extra_rdoc_files:
  - Manifest.txt
  - README.txt
  files:
+ - .autotest
  - History.txt
  - Manifest.txt
  - README.txt
  - Rakefile
  - bin/rawler
+ - lib/rawler.rb
  - lib/rawler/base.rb
- - lib/rawler/core_extensions/module.rb
  - lib/rawler/core_extensions.rb
+ - lib/rawler/core_extensions/module.rb
  - lib/rawler/crawler.rb
  - lib/rawler/request.rb
- - lib/rawler.rb
  - spec/spec.opts
  - spec/spec_helper.rb
  - spec/unit/base_spec.rb
- - spec/unit/crawler_spec.rb
+ - spec/unit/crawler/base_spec.rb
+ - spec/unit/crawler/content_type_spec.rb
+ - spec/unit/crawler/exceptions_spec.rb
+ - spec/unit/crawler/http_basic_spec.rb
+ - spec/unit/crawler/url_domain_spec.rb
  - tasks/rspec.rake
  - vendor/lib-trollop.rb
  has_rdoc: true
- homepage: http://github.com/#{github_username}/#{project_name}
+ homepage: http://github.com/oscardelben/rawler
  licenses: []
 
  post_install_message:
data/spec/unit/crawler_spec.rb DELETED
@@ -1,114 +0,0 @@
- require File.dirname(__FILE__) + '/../spec_helper.rb'
-
- describe Rawler::Crawler do
-
- let(:url) { 'http://example.com' }
-
- before(:each) do
- Rawler.stub!(:url).and_return(url)
- end
-
- it "should parse all links" do
- register(url, site)
-
- Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
- end
-
- it "should parse relative links" do
- url = 'http://example.com/path'
- register(url, '<a href="/foo">foo</a>')
-
- Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
- end
-
- it "should parse links only if the page is in the same domain as the main url" do
- url = 'http://external.com/path'
- register(url, '<a href="/foo">foo</a>')
-
- Rawler.should_receive(:url).and_return('http://example.com')
-
- Rawler::Crawler.new(url).links.should == []
- end
-
- it "should return an empty array when raising Errno::ECONNREFUSED" do
- register(url, site)
- crawler = Rawler::Crawler.new(url)
-
- Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
-
- crawler.links.should == []
- end
-
- it "should print a message when raising Errno::ECONNREFUSED" do
- output = double('output')
- register(url, site)
-
- crawler = Rawler::Crawler.new(url)
-
- Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
- Rawler.should_receive(:output).and_return(output)
- output.should_receive(:puts).with("Couldn't connect to #{url}")
-
- crawler.links
- end
-
- context "should ignore content type other than text/html" do
-
- ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
-
- it "should ignore '#{content_type}'" do
- register(url, site, 200, :content_type => content_type)
-
- crawler = Rawler::Crawler.new(url)
- crawler.links.should == []
- end
-
- end
-
- end
-
- it "should ignore links other than http or https" do
- content = <<-content
- <a href="http://example.com/valid">foo</a>
- <a href="mailto:info@example.com">invalid</a>
- <a href="https://foo.com">valid</a>
- content
-
- register(url, content)
-
- crawler = Rawler::Crawler.new(url)
- crawler.links.should == ['http://example.com/valid', 'https://foo.com']
- end
-
- it "should crawl http basic pages" do
- content = '<a href="http://example.com/secret-path">foo</a>'
-
- register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
- register('http://foo:bar@example.com/secret', content)
-
- Rawler.stub!(:username).and_return('foo')
- Rawler.stub!(:password).and_return('bar')
-
- crawler = Rawler::Crawler.new('http://example.com/secret')
- crawler.links.should == ['http://example.com/secret-path']
- end
-
- private
-
- def site
- <<-site
- <!DOCTYPE html>
- <html>
- <body>
- <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
-
- <p><a href="http://example.com/foo">foo</a></p>
-
- <p><a href="http://external.com/bar">bar</a></p>
-
- </body>
- </html>
- site
- end
-
- end