rawler 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock CHANGED
@@ -3,7 +3,12 @@ GEM
3
3
  specs:
4
4
  diff-lcs (1.1.2)
5
5
  fakeweb (1.3.0)
6
+ hoe (2.6.2)
7
+ rake (>= 0.8.7)
8
+ rubyforge (>= 2.0.4)
9
+ json_pure (1.5.1)
6
10
  nokogiri (1.4.4)
11
+ rake (0.8.7)
7
12
  rspec (2.4.0)
8
13
  rspec-core (~> 2.4.0)
9
14
  rspec-expectations (~> 2.4.0)
@@ -12,11 +17,14 @@ GEM
12
17
  rspec-expectations (2.4.0)
13
18
  diff-lcs (~> 1.1.2)
14
19
  rspec-mocks (2.4.0)
20
+ rubyforge (2.0.4)
21
+ json_pure (>= 1.1.7)
15
22
 
16
23
  PLATFORMS
17
24
  ruby
18
25
 
19
26
  DEPENDENCIES
20
27
  fakeweb (= 1.3.0)
28
+ hoe (= 2.6.2)
21
29
  nokogiri (= 1.4.4)
22
30
  rspec (= 2.4.0)
data/lib/rawler/base.rb CHANGED
@@ -7,7 +7,7 @@ module Rawler
7
7
  def initialize(url, output, username=nil, password=nil)
8
8
  @responses = {}
9
9
 
10
- Rawler.url = url
10
+ Rawler.url = URI.escape(url)
11
11
  Rawler.output = Logger.new(output)
12
12
  Rawler.username = username
13
13
  Rawler.password = password
@@ -16,7 +16,7 @@ module Rawler
16
16
  response = Rawler::Request.get(url)
17
17
 
18
18
  doc = Nokogiri::HTML(response.body)
19
- doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
19
+ doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
20
20
  rescue Errno::ECONNREFUSED
21
21
  write("Couldn't connect to #{url}")
22
22
  []
@@ -28,9 +28,11 @@ module Rawler
28
28
  private
29
29
 
30
30
  def absolute_url(path)
31
- path.strip!
31
+ path = URI.encode(path.strip)
32
32
  if path[0].chr == '/'
33
33
  URI.parse(url).merge(path.to_s).to_s
34
+ elsif URI.parse(path).scheme.nil?
35
+ URI.parse(url).merge("/#{path.to_s}").to_s
34
36
  else
35
37
  path
36
38
  end
data/lib/rawler.rb CHANGED
@@ -7,7 +7,7 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.6'
10
+ VERSION = '0.0.7'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require File.dirname(__FILE__) + '/../../spec_helper.rb'
2
4
 
3
5
  describe Rawler::Crawler do
@@ -36,14 +38,14 @@ describe Rawler::Crawler do
36
38
 
37
39
  let(:url) { 'http://example.com/path' }
38
40
  let(:crawler) { Rawler::Crawler.new(url) }
39
- let(:content) { '<a href="/foo">foo</a>' }
41
+ let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
40
42
 
41
43
  before(:each) do
42
44
  register(url, content)
43
45
  end
44
46
 
45
47
  it "should parse relative links" do
46
- crawler.links.should == ['http://example.com/foo']
48
+ crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
47
49
  end
48
50
 
49
51
  end
@@ -75,8 +77,24 @@ describe Rawler::Crawler do
75
77
  register(url, content)
76
78
  end
77
79
 
78
- it "should parse relative links" do
79
- crawler.links.should == ['http://example.com/foo#bar']
80
+ it "should parse urls with hashtags" do
81
+ crawler.links.should == ['http://example.com/foo%23bar']
82
+ end
83
+
84
+ end
85
+
86
+ context "urls with unicode characters" do
87
+
88
+ let(:url) { 'http://example.com' }
89
+ let(:crawler) { Rawler::Crawler.new(url) }
90
+ let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
91
+
92
+ before(:each) do
93
+ register(url, content)
94
+ end
95
+
96
+ it "should parse unicode links" do
97
+ crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
80
98
  end
81
99
 
82
100
  end
@@ -85,7 +103,7 @@ describe Rawler::Crawler do
85
103
  let(:url) { 'http://example.com/path' }
86
104
  let(:crawler) { Rawler::Crawler.new(url) }
87
105
  let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
88
- let(:content) { "<a href=\"#{js_url}\">foo</a>" }
106
+ let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
89
107
 
90
108
  before(:each) do
91
109
  register(url, content)
@@ -94,9 +112,9 @@ describe Rawler::Crawler do
94
112
  it "should parse relative links" do
95
113
  crawler.links.should == []
96
114
  end
97
-
115
+
98
116
  it "should report the error" do
99
- crawler.should_receive(:write).with("Invalid url - #{js_url}")
117
+ crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
100
118
  crawler.links
101
119
  end
102
120
  end
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require File.dirname(__FILE__) + '/../spec_helper.rb'
2
4
 
3
5
  describe Rawler::Base do
@@ -9,6 +11,16 @@ describe Rawler::Base do
9
11
  Rawler.stub!(:output).and_return(output)
10
12
  register('http://example.com', site)
11
13
  end
14
+
15
+ describe "url encoding" do
16
+ it "should encode url" do
17
+ original = 'http://example.com/写程序容易出现的几个不好的地方'
18
+ expected = 'http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9'
19
+
20
+ Rawler::Base.new(original, output)
21
+ Rawler.url.should == expected
22
+ end
23
+ end
12
24
 
13
25
  describe "validate_links" do
14
26
 
@@ -50,16 +62,7 @@ describe Rawler::Base do
50
62
 
51
63
  rawler.validate
52
64
  end
53
-
54
- it "should validate links with #hashtags" do
55
- register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
56
- register('http://example.com/page-with', '')
57
-
58
- output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
59
-
60
- rawler.validate
61
- end
62
-
65
+
63
66
  end
64
67
 
65
68
  describe "get_status_code" do
@@ -200,4 +203,4 @@ describe Rawler::Base do
200
203
  site
201
204
  end
202
205
 
203
- end
206
+ end
data/specs.watchr CHANGED
@@ -25,7 +25,6 @@ def run(files_to_run)
25
25
  end
26
26
 
27
27
  def run_all_tests
28
- puts "foo"
29
28
  run(all_test_files.join(' '))
30
29
  end
31
30
 
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease:
4
+ prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 0
9
- - 6
10
- version: 0.0.6
8
+ - 7
9
+ version: 0.0.7
11
10
  platform: ruby
12
11
  authors:
13
12
  - Oscar Del Ben
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-02-04 00:00:00 +01:00
17
+ date: 2011-03-07 00:00:00 +01:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,28 +25,41 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 3
30
28
  segments:
31
29
  - 0
32
30
  version: "0"
33
31
  type: :runtime
34
32
  version_requirements: *id001
35
33
  - !ruby/object:Gem::Dependency
36
- name: hoe
34
+ name: rubyforge
37
35
  prerelease: false
38
36
  requirement: &id002 !ruby/object:Gem::Requirement
39
37
  none: false
40
38
  requirements:
41
39
  - - ">="
42
40
  - !ruby/object:Gem::Version
43
- hash: 47
44
41
  segments:
45
42
  - 2
46
- - 8
47
43
  - 0
48
- version: 2.8.0
44
+ - 4
45
+ version: 2.0.4
49
46
  type: :development
50
47
  version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: hoe
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ segments:
57
+ - 2
58
+ - 6
59
+ - 2
60
+ version: 2.6.2
61
+ type: :development
62
+ version_requirements: *id003
51
63
  description: |-
52
64
  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
53
65
 
@@ -100,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
100
112
  requirements:
101
113
  - - ">="
102
114
  - !ruby/object:Gem::Version
103
- hash: 3
104
115
  segments:
105
116
  - 0
106
117
  version: "0"
@@ -109,14 +120,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
120
  requirements:
110
121
  - - ">="
111
122
  - !ruby/object:Gem::Version
112
- hash: 3
113
123
  segments:
114
124
  - 0
115
125
  version: "0"
116
126
  requirements: []
117
127
 
118
128
  rubyforge_project: oscardelben
119
- rubygems_version: 1.4.1
129
+ rubygems_version: 1.3.7
120
130
  signing_key:
121
131
  specification_version: 3
122
132
  summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links