rawler 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,5 @@
1
1
  Gemfile
2
2
  Gemfile.lock
3
- History.txt
4
3
  Manifest.txt
5
4
  README.txt
6
5
  Rakefile
@@ -19,4 +18,4 @@ spec/spec.opts
19
18
  spec/spec_helper.rb
20
19
  specs.watchr
21
20
  tasks/rspec.rake
22
- vendor/lib-trollop.rb
21
+ vendor/lib-trollop.rb
data/README.txt CHANGED
@@ -15,6 +15,7 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
15
15
  where [options] are:
16
16
  --username, -u <s>: HTTP Basic Username
17
17
  --password, -p <s>: HTTP Basic Password
18
+ --wait: Number of seconds to wait between requests (default 3)
18
19
  --version, -v: Print version and exit
19
20
  --help, -h: Show this message
20
21
 
@@ -46,6 +47,7 @@ And add them to the Manifest file.
46
47
  * Hugh Sasse
47
48
  * Ken Egozi
48
49
  * Robert Glaser
50
+ * Stefan Schüßler
49
51
  * Vesa Vänskä
50
52
 
51
53
  See also https://github.com/oscardelben/rawler/contributors
data/bin/rawler CHANGED
@@ -14,13 +14,12 @@ Usage:
14
14
  where [options] are:
15
15
  EOS
16
16
 
17
- # opt :domain, "domain that you want to test", :type => :string
18
17
  opt :username, "HTT Basic Username", :type => :string
19
18
  opt :password, "HTT Basic Password", :type => :string
19
+ opt :wait, "Number of seconds to wait betweet requests", :default => 3
20
20
  end
21
21
 
22
- # Use dup to unfrozen string
23
- domain = ARGV.shift.dup
22
+ domain = ARGV.shift
24
23
 
25
24
  if domain.nil?
26
25
  Trollop::die "Domain name is mandatory. Type --help for help"
@@ -30,5 +29,5 @@ else
30
29
  end
31
30
  end
32
31
 
33
- Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
32
+ Rawler::Base.new(domain, $stdout, opts).validate
34
33
 
@@ -7,10 +7,11 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.9'
10
+ VERSION = '0.1.0'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
14
+ mattr_accessor :wait
14
15
 
15
16
  mattr_accessor :username, :password
16
17
 
@@ -4,13 +4,14 @@ module Rawler
4
4
 
5
5
  attr_accessor :responses
6
6
 
7
- def initialize(url, output, username=nil, password=nil)
7
+ def initialize(url, output, options={})
8
8
  @responses = {}
9
9
 
10
10
  Rawler.url = URI.escape(url)
11
11
  Rawler.output = Logger.new(output)
12
- Rawler.username = username
13
- Rawler.password = password
12
+ Rawler.username = options[:username]
13
+ Rawler.password = options[:password]
14
+ Rawler.wait = options[:wait].to_i
14
15
  end
15
16
 
16
17
  def validate
@@ -22,8 +23,7 @@ module Rawler
22
23
  def validate_links_in_page(current_url)
23
24
  Rawler::Crawler.new(current_url).links.each do |page_url|
24
25
  validate_page(page_url, current_url)
25
- # Todo: include this in a configuration option
26
- sleep(3)
26
+ sleep(Rawler.wait)
27
27
  end
28
28
  end
29
29
 
@@ -30,13 +30,11 @@ module Rawler
30
30
  private
31
31
 
32
32
  def absolute_url(path)
33
- path = URI.encode(path.strip)
34
- if path[0].chr == '/'
35
- URI.parse(url).merge(path.to_s).to_s
36
- elsif URI.parse(path).scheme.nil?
37
- URI.parse(url).merge("/#{path.to_s}").to_s
38
- else
33
+ path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
34
+ if URI.parse(path).scheme
39
35
  path
36
+ else
37
+ URI.parse(url).merge(path).to_s
40
38
  end
41
39
  rescue URI::InvalidURIError
42
40
  write("Invalid url: #{path} - Called from: #{url}")
@@ -36,16 +36,36 @@ describe Rawler::Crawler do
36
36
 
37
37
  context "relative paths" do
38
38
 
39
- let(:url) { 'http://example.com/path' }
40
- let(:crawler) { Rawler::Crawler.new(url) }
41
- let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
42
-
43
- before(:each) do
44
- register(url, content)
39
+ context "base URL ends with a slash" do
40
+
41
+ let(:url) { 'http://example.com/dir1/dir2/' }
42
+ let(:crawler) { Rawler::Crawler.new(url) }
43
+ let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
44
+
45
+ before(:each) do
46
+ register(url, content)
47
+ end
48
+
49
+ it "should parse relative links" do
50
+ crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
51
+ end
52
+
45
53
  end
46
54
 
47
- it "should parse relative links" do
48
- crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
55
+ context "base URL doesn't end with a slash" do
56
+
57
+ let(:url) { 'http://example.com/dir1/dir2' }
58
+ let(:crawler) { Rawler::Crawler.new(url) }
59
+ let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
60
+
61
+ before(:each) do
62
+ register(url, content)
63
+ end
64
+
65
+ it "should parse relative links" do
66
+ crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
67
+ end
68
+
49
69
  end
50
70
 
51
71
  end
@@ -77,8 +97,8 @@ describe Rawler::Crawler do
77
97
  register(url, content)
78
98
  end
79
99
 
80
- it "should parse urls with hashtags" do
81
- crawler.links.should == ['http://example.com/foo%23bar']
100
+ it "should not encode hashtags" do
101
+ crawler.links.should == ['http://example.com/foo#bar']
82
102
  end
83
103
 
84
104
  end
@@ -99,12 +99,18 @@ describe Rawler::Base do
99
99
  end
100
100
 
101
101
  it "should save username and password" do
102
- rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
102
+ rawler = Rawler::Base.new('http://example.com', output, {:username => 'my_user', :password => 'secret'})
103
103
 
104
104
  Rawler.username.should == 'my_user'
105
105
  Rawler.password.should == 'secret'
106
106
  end
107
107
 
108
+ it "should save wait" do
109
+ rawler = Rawler::Base.new('http://example.com', output, {:wait => 5})
110
+
111
+ Rawler.wait.should == 5
112
+ end
113
+
108
114
  it "should rescue from Errno::ECONNREFUSED" do
109
115
  url = 'http://example.com'
110
116
  from = 'http://other.com'
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
+ - 1
7
8
  - 0
8
- - 9
9
- version: 0.0.9
9
+ version: 0.1.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Oscar Del Ben
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-25 00:00:00 +01:00
17
+ date: 2011-03-29 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -71,13 +71,11 @@ executables:
71
71
  extensions: []
72
72
 
73
73
  extra_rdoc_files:
74
- - History.txt
75
74
  - Manifest.txt
76
75
  - README.txt
77
76
  files:
78
77
  - Gemfile
79
78
  - Gemfile.lock
80
- - History.txt
81
79
  - Manifest.txt
82
80
  - README.txt
83
81
  - Rakefile
@@ -1,12 +0,0 @@
1
- === 0.0.2 / 2011-01-10
2
-
3
- * 1 major enhancement
4
-
5
- * Handle relative urls
6
-
7
- === 0.0.1 / 2011-01-10
8
-
9
- * 1 major enhancement
10
-
11
- * Birthday!
12
-