rawler 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +8 -0
- data/lib/rawler/base.rb +1 -1
- data/lib/rawler/crawler.rb +4 -2
- data/lib/rawler.rb +1 -1
- data/spec/lib/rawler/crawler_spec.rb +25 -7
- data/spec/lib/rawler_spec.rb +14 -11
- data/specs.watchr +0 -1
- metadata +23 -13
data/Gemfile.lock
CHANGED
@@ -3,7 +3,12 @@ GEM
|
|
3
3
|
specs:
|
4
4
|
diff-lcs (1.1.2)
|
5
5
|
fakeweb (1.3.0)
|
6
|
+
hoe (2.6.2)
|
7
|
+
rake (>= 0.8.7)
|
8
|
+
rubyforge (>= 2.0.4)
|
9
|
+
json_pure (1.5.1)
|
6
10
|
nokogiri (1.4.4)
|
11
|
+
rake (0.8.7)
|
7
12
|
rspec (2.4.0)
|
8
13
|
rspec-core (~> 2.4.0)
|
9
14
|
rspec-expectations (~> 2.4.0)
|
@@ -12,11 +17,14 @@ GEM
|
|
12
17
|
rspec-expectations (2.4.0)
|
13
18
|
diff-lcs (~> 1.1.2)
|
14
19
|
rspec-mocks (2.4.0)
|
20
|
+
rubyforge (2.0.4)
|
21
|
+
json_pure (>= 1.1.7)
|
15
22
|
|
16
23
|
PLATFORMS
|
17
24
|
ruby
|
18
25
|
|
19
26
|
DEPENDENCIES
|
20
27
|
fakeweb (= 1.3.0)
|
28
|
+
hoe (= 2.6.2)
|
21
29
|
nokogiri (= 1.4.4)
|
22
30
|
rspec (= 2.4.0)
|
data/lib/rawler/base.rb
CHANGED
data/lib/rawler/crawler.rb
CHANGED
@@ -16,7 +16,7 @@ module Rawler
|
|
16
16
|
response = Rawler::Request.get(url)
|
17
17
|
|
18
18
|
doc = Nokogiri::HTML(response.body)
|
19
|
-
doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
19
|
+
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
20
20
|
rescue Errno::ECONNREFUSED
|
21
21
|
write("Couldn't connect to #{url}")
|
22
22
|
[]
|
@@ -28,9 +28,11 @@ module Rawler
|
|
28
28
|
private
|
29
29
|
|
30
30
|
def absolute_url(path)
|
31
|
-
path.strip
|
31
|
+
path = URI.encode(path.strip)
|
32
32
|
if path[0].chr == '/'
|
33
33
|
URI.parse(url).merge(path.to_s).to_s
|
34
|
+
elsif URI.parse(path).scheme.nil?
|
35
|
+
URI.parse(url).merge("/#{path.to_s}").to_s
|
34
36
|
else
|
35
37
|
path
|
36
38
|
end
|
data/lib/rawler.rb
CHANGED
data/spec/lib/rawler/crawler_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
4
|
|
3
5
|
describe Rawler::Crawler do
|
@@ -36,14 +38,14 @@ describe Rawler::Crawler do
|
|
36
38
|
|
37
39
|
let(:url) { 'http://example.com/path' }
|
38
40
|
let(:crawler) { Rawler::Crawler.new(url) }
|
39
|
-
let(:content) { '<a href="/foo">foo</a>' }
|
41
|
+
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
|
40
42
|
|
41
43
|
before(:each) do
|
42
44
|
register(url, content)
|
43
45
|
end
|
44
46
|
|
45
47
|
it "should parse relative links" do
|
46
|
-
crawler.links.should == ['http://example.com/foo']
|
48
|
+
crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
|
47
49
|
end
|
48
50
|
|
49
51
|
end
|
@@ -75,8 +77,24 @@ describe Rawler::Crawler do
|
|
75
77
|
register(url, content)
|
76
78
|
end
|
77
79
|
|
78
|
-
it "should parse
|
79
|
-
crawler.links.should == ['http://example.com/foo
|
80
|
+
it "should parse urls with hashtags" do
|
81
|
+
crawler.links.should == ['http://example.com/foo%23bar']
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
context "urls with unicode characters" do
|
87
|
+
|
88
|
+
let(:url) { 'http://example.com' }
|
89
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
90
|
+
let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
|
91
|
+
|
92
|
+
before(:each) do
|
93
|
+
register(url, content)
|
94
|
+
end
|
95
|
+
|
96
|
+
it "should parse unicode links" do
|
97
|
+
crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
|
80
98
|
end
|
81
99
|
|
82
100
|
end
|
@@ -85,7 +103,7 @@ describe Rawler::Crawler do
|
|
85
103
|
let(:url) { 'http://example.com/path' }
|
86
104
|
let(:crawler) { Rawler::Crawler.new(url) }
|
87
105
|
let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
|
88
|
-
let(:content) { "<a href=\"#{js_url}\">foo</a>" }
|
106
|
+
let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
|
89
107
|
|
90
108
|
before(:each) do
|
91
109
|
register(url, content)
|
@@ -94,9 +112,9 @@ describe Rawler::Crawler do
|
|
94
112
|
it "should parse relative links" do
|
95
113
|
crawler.links.should == []
|
96
114
|
end
|
97
|
-
|
115
|
+
|
98
116
|
it "should report the error" do
|
99
|
-
crawler.should_receive(:write).with("Invalid url -
|
117
|
+
crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
|
100
118
|
crawler.links
|
101
119
|
end
|
102
120
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
4
|
|
3
5
|
describe Rawler::Base do
|
@@ -9,6 +11,16 @@ describe Rawler::Base do
|
|
9
11
|
Rawler.stub!(:output).and_return(output)
|
10
12
|
register('http://example.com', site)
|
11
13
|
end
|
14
|
+
|
15
|
+
describe "url encoding" do
|
16
|
+
it "should encode url" do
|
17
|
+
original = 'http://example.com/写程序容易出现的几个不好的地方'
|
18
|
+
expected = 'http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9'
|
19
|
+
|
20
|
+
Rawler::Base.new(original, output)
|
21
|
+
Rawler.url.should == expected
|
22
|
+
end
|
23
|
+
end
|
12
24
|
|
13
25
|
describe "validate_links" do
|
14
26
|
|
@@ -50,16 +62,7 @@ describe Rawler::Base do
|
|
50
62
|
|
51
63
|
rawler.validate
|
52
64
|
end
|
53
|
-
|
54
|
-
it "should validate links with #hashtags" do
|
55
|
-
register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
|
56
|
-
register('http://example.com/page-with', '')
|
57
|
-
|
58
|
-
output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
|
59
|
-
|
60
|
-
rawler.validate
|
61
|
-
end
|
62
|
-
|
65
|
+
|
63
66
|
end
|
64
67
|
|
65
68
|
describe "get_status_code" do
|
@@ -200,4 +203,4 @@ describe Rawler::Base do
|
|
200
203
|
site
|
201
204
|
end
|
202
205
|
|
203
|
-
end
|
206
|
+
end
|
data/specs.watchr
CHANGED
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
4
|
+
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 0
|
9
|
-
- 6
|
10
|
-
version: 0.0.6
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Oscar Del Ben
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-
|
17
|
+
date: 2011-03-07 00:00:00 +01:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,28 +25,41 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 3
|
30
28
|
segments:
|
31
29
|
- 0
|
32
30
|
version: "0"
|
33
31
|
type: :runtime
|
34
32
|
version_requirements: *id001
|
35
33
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
34
|
+
name: rubyforge
|
37
35
|
prerelease: false
|
38
36
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
37
|
none: false
|
40
38
|
requirements:
|
41
39
|
- - ">="
|
42
40
|
- !ruby/object:Gem::Version
|
43
|
-
hash: 47
|
44
41
|
segments:
|
45
42
|
- 2
|
46
|
-
- 8
|
47
43
|
- 0
|
48
|
-
|
44
|
+
- 4
|
45
|
+
version: 2.0.4
|
49
46
|
type: :development
|
50
47
|
version_requirements: *id002
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: hoe
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
segments:
|
57
|
+
- 2
|
58
|
+
- 6
|
59
|
+
- 2
|
60
|
+
version: 2.6.2
|
61
|
+
type: :development
|
62
|
+
version_requirements: *id003
|
51
63
|
description: |-
|
52
64
|
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
53
65
|
|
@@ -100,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
100
112
|
requirements:
|
101
113
|
- - ">="
|
102
114
|
- !ruby/object:Gem::Version
|
103
|
-
hash: 3
|
104
115
|
segments:
|
105
116
|
- 0
|
106
117
|
version: "0"
|
@@ -109,14 +120,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
120
|
requirements:
|
110
121
|
- - ">="
|
111
122
|
- !ruby/object:Gem::Version
|
112
|
-
hash: 3
|
113
123
|
segments:
|
114
124
|
- 0
|
115
125
|
version: "0"
|
116
126
|
requirements: []
|
117
127
|
|
118
128
|
rubyforge_project: oscardelben
|
119
|
-
rubygems_version: 1.
|
129
|
+
rubygems_version: 1.3.7
|
120
130
|
signing_key:
|
121
131
|
specification_version: 3
|
122
132
|
summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
|