rawler 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
1
1
  Gemfile
2
2
  Gemfile.lock
3
- History.txt
4
3
  Manifest.txt
5
4
  README.txt
6
5
  Rakefile
@@ -19,4 +18,4 @@ spec/spec.opts
19
18
  spec/spec_helper.rb
20
19
  specs.watchr
21
20
  tasks/rspec.rake
22
- vendor/lib-trollop.rb
21
+ vendor/lib-trollop.rb
data/README.txt CHANGED
@@ -15,6 +15,7 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
15
15
  where [options] are:
16
16
  --username, -u <s>: HTTP Basic Username
17
17
  --password, -p <s>: HTTP Basic Password
18
+ --wait: Number of seconds to wait between requests (default 3)
18
19
  --version, -v: Print version and exit
19
20
  --help, -h: Show this message
20
21
 
@@ -46,6 +47,7 @@ And add them to the Manifest file.
46
47
  * Hugh Sasse
47
48
  * Ken Egozi
48
49
  * Robert Glaser
50
+ * Stefan Schüßler
49
51
  * Vesa Vänskä
50
52
 
51
53
  See also https://github.com/oscardelben/rawler/contributors
data/bin/rawler CHANGED
@@ -14,13 +14,12 @@ Usage:
14
14
  where [options] are:
15
15
  EOS
16
16
 
17
- # opt :domain, "domain that you want to test", :type => :string
18
17
  opt :username, "HTT Basic Username", :type => :string
19
18
  opt :password, "HTT Basic Password", :type => :string
19
+ opt :wait, "Number of seconds to wait betweet requests", :default => 3
20
20
  end
21
21
 
22
- # Use dup to unfrozen string
23
- domain = ARGV.shift.dup
22
+ domain = ARGV.shift
24
23
 
25
24
  if domain.nil?
26
25
  Trollop::die "Domain name is mandatory. Type --help for help"
@@ -30,5 +29,5 @@ else
30
29
  end
31
30
  end
32
31
 
33
- Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
32
+ Rawler::Base.new(domain, $stdout, opts).validate
34
33
 
@@ -7,10 +7,11 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.9'
10
+ VERSION = '0.1.0'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
14
+ mattr_accessor :wait
14
15
 
15
16
  mattr_accessor :username, :password
16
17
 
@@ -4,13 +4,14 @@ module Rawler
4
4
 
5
5
  attr_accessor :responses
6
6
 
7
- def initialize(url, output, username=nil, password=nil)
7
+ def initialize(url, output, options={})
8
8
  @responses = {}
9
9
 
10
10
  Rawler.url = URI.escape(url)
11
11
  Rawler.output = Logger.new(output)
12
- Rawler.username = username
13
- Rawler.password = password
12
+ Rawler.username = options[:username]
13
+ Rawler.password = options[:password]
14
+ Rawler.wait = options[:wait].to_i
14
15
  end
15
16
 
16
17
  def validate
@@ -22,8 +23,7 @@ module Rawler
22
23
  def validate_links_in_page(current_url)
23
24
  Rawler::Crawler.new(current_url).links.each do |page_url|
24
25
  validate_page(page_url, current_url)
25
- # Todo: include this in a configuration option
26
- sleep(3)
26
+ sleep(Rawler.wait)
27
27
  end
28
28
  end
29
29
 
@@ -30,13 +30,11 @@ module Rawler
30
30
  private
31
31
 
32
32
  def absolute_url(path)
33
- path = URI.encode(path.strip)
34
- if path[0].chr == '/'
35
- URI.parse(url).merge(path.to_s).to_s
36
- elsif URI.parse(path).scheme.nil?
37
- URI.parse(url).merge("/#{path.to_s}").to_s
38
- else
33
+ path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
34
+ if URI.parse(path).scheme
39
35
  path
36
+ else
37
+ URI.parse(url).merge(path).to_s
40
38
  end
41
39
  rescue URI::InvalidURIError
42
40
  write("Invalid url: #{path} - Called from: #{url}")
@@ -36,16 +36,36 @@ describe Rawler::Crawler do
36
36
 
37
37
  context "relative paths" do
38
38
 
39
- let(:url) { 'http://example.com/path' }
40
- let(:crawler) { Rawler::Crawler.new(url) }
41
- let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
42
-
43
- before(:each) do
44
- register(url, content)
39
+ context "base URL ends with a slash" do
40
+
41
+ let(:url) { 'http://example.com/dir1/dir2/' }
42
+ let(:crawler) { Rawler::Crawler.new(url) }
43
+ let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
44
+
45
+ before(:each) do
46
+ register(url, content)
47
+ end
48
+
49
+ it "should parse relative links" do
50
+ crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
51
+ end
52
+
45
53
  end
46
54
 
47
- it "should parse relative links" do
48
- crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
55
+ context "base URL doesn't end with a slash" do
56
+
57
+ let(:url) { 'http://example.com/dir1/dir2' }
58
+ let(:crawler) { Rawler::Crawler.new(url) }
59
+ let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
60
+
61
+ before(:each) do
62
+ register(url, content)
63
+ end
64
+
65
+ it "should parse relative links" do
66
+ crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
67
+ end
68
+
49
69
  end
50
70
 
51
71
  end
@@ -77,8 +97,8 @@ describe Rawler::Crawler do
77
97
  register(url, content)
78
98
  end
79
99
 
80
- it "should parse urls with hashtags" do
81
- crawler.links.should == ['http://example.com/foo%23bar']
100
+ it "should not encode hashtags" do
101
+ crawler.links.should == ['http://example.com/foo#bar']
82
102
  end
83
103
 
84
104
  end
@@ -99,12 +99,18 @@ describe Rawler::Base do
99
99
  end
100
100
 
101
101
  it "should save username and password" do
102
- rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
102
+ rawler = Rawler::Base.new('http://example.com', output, {:username => 'my_user', :password => 'secret'})
103
103
 
104
104
  Rawler.username.should == 'my_user'
105
105
  Rawler.password.should == 'secret'
106
106
  end
107
107
 
108
+ it "should save wait" do
109
+ rawler = Rawler::Base.new('http://example.com', output, {:wait => 5})
110
+
111
+ Rawler.wait.should == 5
112
+ end
113
+
108
114
  it "should rescue from Errno::ECONNREFUSED" do
109
115
  url = 'http://example.com'
110
116
  from = 'http://other.com'
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
+ - 1
7
8
  - 0
8
- - 9
9
- version: 0.0.9
9
+ version: 0.1.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Oscar Del Ben
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-25 00:00:00 +01:00
17
+ date: 2011-03-29 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -71,13 +71,11 @@ executables:
71
71
  extensions: []
72
72
 
73
73
  extra_rdoc_files:
74
- - History.txt
75
74
  - Manifest.txt
76
75
  - README.txt
77
76
  files:
78
77
  - Gemfile
79
78
  - Gemfile.lock
80
- - History.txt
81
79
  - Manifest.txt
82
80
  - README.txt
83
81
  - Rakefile
@@ -1,12 +0,0 @@
1
- === 0.0.2 / 2011-01-10
2
-
3
- * 1 major enhancement
4
-
5
- * Handle relative urls
6
-
7
- === 0.0.1 / 2011-01-10
8
-
9
- * 1 major enhancement
10
-
11
- * Birthday!
12
-