rawler 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +8 -0
- data/lib/rawler/base.rb +1 -1
- data/lib/rawler/crawler.rb +4 -2
- data/lib/rawler.rb +1 -1
- data/spec/lib/rawler/crawler_spec.rb +25 -7
- data/spec/lib/rawler_spec.rb +14 -11
- data/specs.watchr +0 -1
- metadata +23 -13
data/Gemfile.lock
CHANGED
@@ -3,7 +3,12 @@ GEM
|
|
3
3
|
specs:
|
4
4
|
diff-lcs (1.1.2)
|
5
5
|
fakeweb (1.3.0)
|
6
|
+
hoe (2.6.2)
|
7
|
+
rake (>= 0.8.7)
|
8
|
+
rubyforge (>= 2.0.4)
|
9
|
+
json_pure (1.5.1)
|
6
10
|
nokogiri (1.4.4)
|
11
|
+
rake (0.8.7)
|
7
12
|
rspec (2.4.0)
|
8
13
|
rspec-core (~> 2.4.0)
|
9
14
|
rspec-expectations (~> 2.4.0)
|
@@ -12,11 +17,14 @@ GEM
|
|
12
17
|
rspec-expectations (2.4.0)
|
13
18
|
diff-lcs (~> 1.1.2)
|
14
19
|
rspec-mocks (2.4.0)
|
20
|
+
rubyforge (2.0.4)
|
21
|
+
json_pure (>= 1.1.7)
|
15
22
|
|
16
23
|
PLATFORMS
|
17
24
|
ruby
|
18
25
|
|
19
26
|
DEPENDENCIES
|
20
27
|
fakeweb (= 1.3.0)
|
28
|
+
hoe (= 2.6.2)
|
21
29
|
nokogiri (= 1.4.4)
|
22
30
|
rspec (= 2.4.0)
|
data/lib/rawler/base.rb
CHANGED
data/lib/rawler/crawler.rb
CHANGED
@@ -16,7 +16,7 @@ module Rawler
|
|
16
16
|
response = Rawler::Request.get(url)
|
17
17
|
|
18
18
|
doc = Nokogiri::HTML(response.body)
|
19
|
-
doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
19
|
+
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
20
20
|
rescue Errno::ECONNREFUSED
|
21
21
|
write("Couldn't connect to #{url}")
|
22
22
|
[]
|
@@ -28,9 +28,11 @@ module Rawler
|
|
28
28
|
private
|
29
29
|
|
30
30
|
def absolute_url(path)
|
31
|
-
path.strip
|
31
|
+
path = URI.encode(path.strip)
|
32
32
|
if path[0].chr == '/'
|
33
33
|
URI.parse(url).merge(path.to_s).to_s
|
34
|
+
elsif URI.parse(path).scheme.nil?
|
35
|
+
URI.parse(url).merge("/#{path.to_s}").to_s
|
34
36
|
else
|
35
37
|
path
|
36
38
|
end
|
data/lib/rawler.rb
CHANGED
data/spec/lib/rawler/crawler_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
4
|
|
3
5
|
describe Rawler::Crawler do
|
@@ -36,14 +38,14 @@ describe Rawler::Crawler do
|
|
36
38
|
|
37
39
|
let(:url) { 'http://example.com/path' }
|
38
40
|
let(:crawler) { Rawler::Crawler.new(url) }
|
39
|
-
let(:content) { '<a href="/foo">foo</a>' }
|
41
|
+
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
|
40
42
|
|
41
43
|
before(:each) do
|
42
44
|
register(url, content)
|
43
45
|
end
|
44
46
|
|
45
47
|
it "should parse relative links" do
|
46
|
-
crawler.links.should == ['http://example.com/foo']
|
48
|
+
crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
|
47
49
|
end
|
48
50
|
|
49
51
|
end
|
@@ -75,8 +77,24 @@ describe Rawler::Crawler do
|
|
75
77
|
register(url, content)
|
76
78
|
end
|
77
79
|
|
78
|
-
it "should parse
|
79
|
-
crawler.links.should == ['http://example.com/foo
|
80
|
+
it "should parse urls with hashtags" do
|
81
|
+
crawler.links.should == ['http://example.com/foo%23bar']
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
context "urls with unicode characters" do
|
87
|
+
|
88
|
+
let(:url) { 'http://example.com' }
|
89
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
90
|
+
let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
|
91
|
+
|
92
|
+
before(:each) do
|
93
|
+
register(url, content)
|
94
|
+
end
|
95
|
+
|
96
|
+
it "should parse unicode links" do
|
97
|
+
crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
|
80
98
|
end
|
81
99
|
|
82
100
|
end
|
@@ -85,7 +103,7 @@ describe Rawler::Crawler do
|
|
85
103
|
let(:url) { 'http://example.com/path' }
|
86
104
|
let(:crawler) { Rawler::Crawler.new(url) }
|
87
105
|
let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
|
88
|
-
let(:content) { "<a href=\"#{js_url}\">foo</a>" }
|
106
|
+
let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
|
89
107
|
|
90
108
|
before(:each) do
|
91
109
|
register(url, content)
|
@@ -94,9 +112,9 @@ describe Rawler::Crawler do
|
|
94
112
|
it "should parse relative links" do
|
95
113
|
crawler.links.should == []
|
96
114
|
end
|
97
|
-
|
115
|
+
|
98
116
|
it "should report the error" do
|
99
|
-
crawler.should_receive(:write).with("Invalid url -
|
117
|
+
crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
|
100
118
|
crawler.links
|
101
119
|
end
|
102
120
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
4
|
|
3
5
|
describe Rawler::Base do
|
@@ -9,6 +11,16 @@ describe Rawler::Base do
|
|
9
11
|
Rawler.stub!(:output).and_return(output)
|
10
12
|
register('http://example.com', site)
|
11
13
|
end
|
14
|
+
|
15
|
+
describe "url encoding" do
|
16
|
+
it "should encode url" do
|
17
|
+
original = 'http://example.com/写程序容易出现的几个不好的地方'
|
18
|
+
expected = 'http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9'
|
19
|
+
|
20
|
+
Rawler::Base.new(original, output)
|
21
|
+
Rawler.url.should == expected
|
22
|
+
end
|
23
|
+
end
|
12
24
|
|
13
25
|
describe "validate_links" do
|
14
26
|
|
@@ -50,16 +62,7 @@ describe Rawler::Base do
|
|
50
62
|
|
51
63
|
rawler.validate
|
52
64
|
end
|
53
|
-
|
54
|
-
it "should validate links with #hashtags" do
|
55
|
-
register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
|
56
|
-
register('http://example.com/page-with', '')
|
57
|
-
|
58
|
-
output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
|
59
|
-
|
60
|
-
rawler.validate
|
61
|
-
end
|
62
|
-
|
65
|
+
|
63
66
|
end
|
64
67
|
|
65
68
|
describe "get_status_code" do
|
@@ -200,4 +203,4 @@ describe Rawler::Base do
|
|
200
203
|
site
|
201
204
|
end
|
202
205
|
|
203
|
-
end
|
206
|
+
end
|
data/specs.watchr
CHANGED
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
4
|
+
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 0
|
9
|
-
- 6
|
10
|
-
version: 0.0.6
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Oscar Del Ben
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-
|
17
|
+
date: 2011-03-07 00:00:00 +01:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,28 +25,41 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 3
|
30
28
|
segments:
|
31
29
|
- 0
|
32
30
|
version: "0"
|
33
31
|
type: :runtime
|
34
32
|
version_requirements: *id001
|
35
33
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
34
|
+
name: rubyforge
|
37
35
|
prerelease: false
|
38
36
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
37
|
none: false
|
40
38
|
requirements:
|
41
39
|
- - ">="
|
42
40
|
- !ruby/object:Gem::Version
|
43
|
-
hash: 47
|
44
41
|
segments:
|
45
42
|
- 2
|
46
|
-
- 8
|
47
43
|
- 0
|
48
|
-
|
44
|
+
- 4
|
45
|
+
version: 2.0.4
|
49
46
|
type: :development
|
50
47
|
version_requirements: *id002
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: hoe
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
segments:
|
57
|
+
- 2
|
58
|
+
- 6
|
59
|
+
- 2
|
60
|
+
version: 2.6.2
|
61
|
+
type: :development
|
62
|
+
version_requirements: *id003
|
51
63
|
description: |-
|
52
64
|
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
53
65
|
|
@@ -100,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
100
112
|
requirements:
|
101
113
|
- - ">="
|
102
114
|
- !ruby/object:Gem::Version
|
103
|
-
hash: 3
|
104
115
|
segments:
|
105
116
|
- 0
|
106
117
|
version: "0"
|
@@ -109,14 +120,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
120
|
requirements:
|
110
121
|
- - ">="
|
111
122
|
- !ruby/object:Gem::Version
|
112
|
-
hash: 3
|
113
123
|
segments:
|
114
124
|
- 0
|
115
125
|
version: "0"
|
116
126
|
requirements: []
|
117
127
|
|
118
128
|
rubyforge_project: oscardelben
|
119
|
-
rubygems_version: 1.
|
129
|
+
rubygems_version: 1.3.7
|
120
130
|
signing_key:
|
121
131
|
specification_version: 3
|
122
132
|
summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
|