rawler 0.0.6 → 0.0.7

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/Gemfile.lock CHANGED
@@ -3,7 +3,12 @@ GEM
3
3
  specs:
4
4
  diff-lcs (1.1.2)
5
5
  fakeweb (1.3.0)
6
+ hoe (2.6.2)
7
+ rake (>= 0.8.7)
8
+ rubyforge (>= 2.0.4)
9
+ json_pure (1.5.1)
6
10
  nokogiri (1.4.4)
11
+ rake (0.8.7)
7
12
  rspec (2.4.0)
8
13
  rspec-core (~> 2.4.0)
9
14
  rspec-expectations (~> 2.4.0)
@@ -12,11 +17,14 @@ GEM
12
17
  rspec-expectations (2.4.0)
13
18
  diff-lcs (~> 1.1.2)
14
19
  rspec-mocks (2.4.0)
20
+ rubyforge (2.0.4)
21
+ json_pure (>= 1.1.7)
15
22
 
16
23
  PLATFORMS
17
24
  ruby
18
25
 
19
26
  DEPENDENCIES
20
27
  fakeweb (= 1.3.0)
28
+ hoe (= 2.6.2)
21
29
  nokogiri (= 1.4.4)
22
30
  rspec (= 2.4.0)
data/lib/rawler/base.rb CHANGED
@@ -7,7 +7,7 @@ module Rawler
7
7
  def initialize(url, output, username=nil, password=nil)
8
8
  @responses = {}
9
9
 
10
- Rawler.url = url
10
+ Rawler.url = URI.escape(url)
11
11
  Rawler.output = Logger.new(output)
12
12
  Rawler.username = username
13
13
  Rawler.password = password
@@ -16,7 +16,7 @@ module Rawler
16
16
  response = Rawler::Request.get(url)
17
17
 
18
18
  doc = Nokogiri::HTML(response.body)
19
- doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
19
+ doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
20
20
  rescue Errno::ECONNREFUSED
21
21
  write("Couldn't connect to #{url}")
22
22
  []
@@ -28,9 +28,11 @@ module Rawler
28
28
  private
29
29
 
30
30
  def absolute_url(path)
31
- path.strip!
31
+ path = URI.encode(path.strip)
32
32
  if path[0].chr == '/'
33
33
  URI.parse(url).merge(path.to_s).to_s
34
+ elsif URI.parse(path).scheme.nil?
35
+ URI.parse(url).merge("/#{path.to_s}").to_s
34
36
  else
35
37
  path
36
38
  end
data/lib/rawler.rb CHANGED
@@ -7,7 +7,7 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.6'
10
+ VERSION = '0.0.7'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require File.dirname(__FILE__) + '/../../spec_helper.rb'
2
4
 
3
5
  describe Rawler::Crawler do
@@ -36,14 +38,14 @@ describe Rawler::Crawler do
36
38
 
37
39
  let(:url) { 'http://example.com/path' }
38
40
  let(:crawler) { Rawler::Crawler.new(url) }
39
- let(:content) { '<a href="/foo">foo</a>' }
41
+ let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
40
42
 
41
43
  before(:each) do
42
44
  register(url, content)
43
45
  end
44
46
 
45
47
  it "should parse relative links" do
46
- crawler.links.should == ['http://example.com/foo']
48
+ crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
47
49
  end
48
50
 
49
51
  end
@@ -75,8 +77,24 @@ describe Rawler::Crawler do
75
77
  register(url, content)
76
78
  end
77
79
 
78
- it "should parse relative links" do
79
- crawler.links.should == ['http://example.com/foo#bar']
80
+ it "should parse urls with hashtags" do
81
+ crawler.links.should == ['http://example.com/foo%23bar']
82
+ end
83
+
84
+ end
85
+
86
+ context "urls with unicode characters" do
87
+
88
+ let(:url) { 'http://example.com' }
89
+ let(:crawler) { Rawler::Crawler.new(url) }
90
+ let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
91
+
92
+ before(:each) do
93
+ register(url, content)
94
+ end
95
+
96
+ it "should parse unicode links" do
97
+ crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
80
98
  end
81
99
 
82
100
  end
@@ -85,7 +103,7 @@ describe Rawler::Crawler do
85
103
  let(:url) { 'http://example.com/path' }
86
104
  let(:crawler) { Rawler::Crawler.new(url) }
87
105
  let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
88
- let(:content) { "<a href=\"#{js_url}\">foo</a>" }
106
+ let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
89
107
 
90
108
  before(:each) do
91
109
  register(url, content)
@@ -94,9 +112,9 @@ describe Rawler::Crawler do
94
112
  it "should parse relative links" do
95
113
  crawler.links.should == []
96
114
  end
97
-
115
+
98
116
  it "should report the error" do
99
- crawler.should_receive(:write).with("Invalid url - #{js_url}")
117
+ crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
100
118
  crawler.links
101
119
  end
102
120
  end
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require File.dirname(__FILE__) + '/../spec_helper.rb'
2
4
 
3
5
  describe Rawler::Base do
@@ -9,6 +11,16 @@ describe Rawler::Base do
9
11
  Rawler.stub!(:output).and_return(output)
10
12
  register('http://example.com', site)
11
13
  end
14
+
15
+ describe "url encoding" do
16
+ it "should encode url" do
17
+ original = 'http://example.com/写程序容易出现的几个不好的地方'
18
+ expected = 'http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9'
19
+
20
+ Rawler::Base.new(original, output)
21
+ Rawler.url.should == expected
22
+ end
23
+ end
12
24
 
13
25
  describe "validate_links" do
14
26
 
@@ -50,16 +62,7 @@ describe Rawler::Base do
50
62
 
51
63
  rawler.validate
52
64
  end
53
-
54
- it "should validate links with #hashtags" do
55
- register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
56
- register('http://example.com/page-with', '')
57
-
58
- output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
59
-
60
- rawler.validate
61
- end
62
-
65
+
63
66
  end
64
67
 
65
68
  describe "get_status_code" do
@@ -200,4 +203,4 @@ describe Rawler::Base do
200
203
  site
201
204
  end
202
205
 
203
- end
206
+ end
data/specs.watchr CHANGED
@@ -25,7 +25,6 @@ def run(files_to_run)
25
25
  end
26
26
 
27
27
  def run_all_tests
28
- puts "foo"
29
28
  run(all_test_files.join(' '))
30
29
  end
31
30
 
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease:
4
+ prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 0
9
- - 6
10
- version: 0.0.6
8
+ - 7
9
+ version: 0.0.7
11
10
  platform: ruby
12
11
  authors:
13
12
  - Oscar Del Ben
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-02-04 00:00:00 +01:00
17
+ date: 2011-03-07 00:00:00 +01:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,28 +25,41 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 3
30
28
  segments:
31
29
  - 0
32
30
  version: "0"
33
31
  type: :runtime
34
32
  version_requirements: *id001
35
33
  - !ruby/object:Gem::Dependency
36
- name: hoe
34
+ name: rubyforge
37
35
  prerelease: false
38
36
  requirement: &id002 !ruby/object:Gem::Requirement
39
37
  none: false
40
38
  requirements:
41
39
  - - ">="
42
40
  - !ruby/object:Gem::Version
43
- hash: 47
44
41
  segments:
45
42
  - 2
46
- - 8
47
43
  - 0
48
- version: 2.8.0
44
+ - 4
45
+ version: 2.0.4
49
46
  type: :development
50
47
  version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: hoe
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ segments:
57
+ - 2
58
+ - 6
59
+ - 2
60
+ version: 2.6.2
61
+ type: :development
62
+ version_requirements: *id003
51
63
  description: |-
52
64
  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
53
65
 
@@ -100,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
100
112
  requirements:
101
113
  - - ">="
102
114
  - !ruby/object:Gem::Version
103
- hash: 3
104
115
  segments:
105
116
  - 0
106
117
  version: "0"
@@ -109,14 +120,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
120
  requirements:
110
121
  - - ">="
111
122
  - !ruby/object:Gem::Version
112
- hash: 3
113
123
  segments:
114
124
  - 0
115
125
  version: "0"
116
126
  requirements: []
117
127
 
118
128
  rubyforge_project: oscardelben
119
- rubygems_version: 1.4.1
129
+ rubygems_version: 1.3.7
120
130
  signing_key:
121
131
  specification_version: 3
122
132
  summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links