rawler 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bin/rawler +2 -0
- data/lib/rawler.rb +1 -0
- data/lib/rawler/base.rb +15 -0
- data/lib/rawler/crawler.rb +27 -1
- data/rawler.gemspec +3 -3
- data/spec/lib/rawler/crawler_spec.rb +5 -1
- metadata +110 -113
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.4
|
1
|
+
0.1.5
|
data/bin/rawler
CHANGED
@@ -18,8 +18,10 @@ EOS
|
|
18
18
|
opt :password, "HTT Basic Password", :type => :string
|
19
19
|
opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
|
20
20
|
opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
|
21
|
+
opt :css, "Check CSS links", :type => :boolean, :default => false
|
21
22
|
end
|
22
23
|
|
24
|
+
|
23
25
|
domain = ARGV.shift
|
24
26
|
|
25
27
|
if domain.nil?
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -13,6 +13,7 @@ module Rawler
|
|
13
13
|
Rawler.password = options[:password]
|
14
14
|
Rawler.wait = options[:wait]
|
15
15
|
Rawler.log = options[:log]
|
16
|
+
Rawler.css = options[:css]
|
16
17
|
@logfile = File.new("rawler_log.txt", "w") if Rawler.log
|
17
18
|
end
|
18
19
|
|
@@ -30,10 +31,24 @@ module Rawler
|
|
30
31
|
end
|
31
32
|
end
|
32
33
|
|
34
|
+
def validate_css_links_in_page(page)
|
35
|
+
Rawler::Crawler.new(page).css_links.each do |page_url|
|
36
|
+
validate_non_html(page_url, page)
|
37
|
+
sleep(Rawler.wait)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
33
41
|
def validate_page(page_url, from_url)
|
34
42
|
if not_yet_parsed?(page_url)
|
35
43
|
add_status_code(page_url, from_url)
|
36
44
|
validate_links_in_page(page_url) if same_domain?(page_url)
|
45
|
+
validate_css_links_in_page(page_url) if same_domain?(page_url) and Rawler.css
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def validate_non_html(page_url, from_url)
|
50
|
+
if not_yet_parsed?(page_url)
|
51
|
+
add_status_code(page_url, from_url)
|
37
52
|
end
|
38
53
|
end
|
39
54
|
|
data/lib/rawler/crawler.rb
CHANGED
@@ -27,6 +27,24 @@ module Rawler
|
|
27
27
|
[]
|
28
28
|
end
|
29
29
|
|
30
|
+
def css_links
|
31
|
+
if different_domain?(url, Rawler.url) || not_html?(url)
|
32
|
+
return []
|
33
|
+
end
|
34
|
+
|
35
|
+
response = Rawler::Request.get(url)
|
36
|
+
|
37
|
+
doc = Nokogiri::HTML(response.body)
|
38
|
+
|
39
|
+
doc.css('link').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
40
|
+
rescue Errno::ECONNREFUSED
|
41
|
+
write("Couldn't connect to #{url}")
|
42
|
+
[]
|
43
|
+
rescue Errno::ETIMEDOUT
|
44
|
+
write("Connection to #{url} timed out")
|
45
|
+
[]
|
46
|
+
end
|
47
|
+
|
30
48
|
private
|
31
49
|
|
32
50
|
def absolute_url(path)
|
@@ -50,8 +68,16 @@ module Rawler
|
|
50
68
|
URI.parse(url_1).host != URI.parse(url_2).host
|
51
69
|
end
|
52
70
|
|
71
|
+
def content_type(url)
|
72
|
+
Rawler::Request.head(url).content_type
|
73
|
+
end
|
74
|
+
|
53
75
|
def not_html?(url)
|
54
|
-
|
76
|
+
content_type(url) != 'text/html'
|
77
|
+
end
|
78
|
+
|
79
|
+
def not_css?(url)
|
80
|
+
content_type(url) != 'text/css'
|
55
81
|
end
|
56
82
|
|
57
83
|
def valid_url?(url)
|
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "rawler"
|
8
|
-
s.version = "0.1.4"
|
8
|
+
s.version = "0.1.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-04-14"
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
14
14
|
s.email = "info@oscardelben.com"
|
15
15
|
s.executables = ["rawler"]
|
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
|
|
45
45
|
s.homepage = "http://github.com/oscardelben/rawler"
|
46
46
|
s.licenses = ["MIT"]
|
47
47
|
s.require_paths = ["lib"]
|
48
|
-
s.rubygems_version = "1.8.
|
48
|
+
s.rubygems_version = "1.8.21"
|
49
49
|
s.summary = "Rawler is a tool that crawls the links of your website"
|
50
50
|
|
51
51
|
if s.respond_to? :specification_version then
|
data/spec/lib/rawler/crawler_spec.rb
CHANGED
@@ -18,7 +18,8 @@ describe Rawler::Crawler do
|
|
18
18
|
let(:crawler) { Rawler::Crawler.new(url) }
|
19
19
|
let(:content) {
|
20
20
|
content = <<-content
|
21
|
-
<p><a href="http://example.com/foo">foo</a></p>
|
21
|
+
<link rel="stylesheet" href="css/styles.css" />
|
22
|
+
<p><a href="http://example.com/foo">foo</a></p>
|
22
23
|
|
23
24
|
<p><a href="http://external.com/bar">bar</a></p>
|
24
25
|
content
|
@@ -32,6 +33,9 @@ describe Rawler::Crawler do
|
|
32
33
|
crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
|
33
34
|
end
|
34
35
|
|
36
|
+
it "should parse css links" do
|
37
|
+
crawler.css_links.should == ['http://example.com/css/styles.css']
|
38
|
+
end
|
35
39
|
end
|
36
40
|
|
37
41
|
context "relative paths" do
|
metadata
CHANGED
@@ -1,134 +1,137 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
|
-
- 4
|
10
|
-
version: 0.1.4
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Oscar Del Ben
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
12
|
+
date: 2012-04-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
22
17
|
none: false
|
23
|
-
requirements:
|
24
|
-
- -
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
|
27
|
-
segments:
|
28
|
-
- 0
|
29
|
-
version: "0"
|
30
|
-
requirement: *id001
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
31
22
|
type: :runtime
|
32
23
|
prerelease: false
|
33
|
-
|
34
|
-
- !ruby/object:Gem::Dependency
|
35
|
-
version_requirements: &id002 !ruby/object:Gem::Requirement
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
25
|
none: false
|
37
|
-
requirements:
|
38
|
-
- -
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
|
41
|
-
|
42
|
-
- 0
|
43
|
-
version: "0"
|
44
|
-
requirement: *id002
|
45
|
-
type: :development
|
46
|
-
prerelease: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
47
31
|
name: fakeweb
|
48
|
-
|
49
|
-
version_requirements: &id003 !ruby/object:Gem::Requirement
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
50
33
|
none: false
|
51
|
-
requirements:
|
52
|
-
- -
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
|
55
|
-
segments:
|
56
|
-
- 0
|
57
|
-
version: "0"
|
58
|
-
requirement: *id003
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
59
38
|
type: :development
|
60
39
|
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
61
47
|
name: rspec
|
62
|
-
|
63
|
-
version_requirements: &id004 !ruby/object:Gem::Requirement
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
64
49
|
none: false
|
65
|
-
requirements:
|
66
|
-
- -
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
|
69
|
-
segments:
|
70
|
-
- 0
|
71
|
-
version: "0"
|
72
|
-
requirement: *id004
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
73
54
|
type: :development
|
74
55
|
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
75
63
|
name: shoulda
|
76
|
-
|
77
|
-
version_requirements: &id005 !ruby/object:Gem::Requirement
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
78
65
|
none: false
|
79
|
-
requirements:
|
80
|
-
- -
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
|
83
|
-
segments:
|
84
|
-
- 1
|
85
|
-
- 0
|
86
|
-
- 0
|
87
|
-
version: 1.0.0
|
88
|
-
requirement: *id005
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
89
70
|
type: :development
|
90
71
|
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
91
79
|
name: bundler
|
92
|
-
|
93
|
-
version_requirements: &id006 !ruby/object:Gem::Requirement
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
94
81
|
none: false
|
95
|
-
requirements:
|
82
|
+
requirements:
|
96
83
|
- - ~>
|
97
|
-
- !ruby/object:Gem::Version
|
98
|
-
|
99
|
-
segments:
|
100
|
-
- 1
|
101
|
-
- 6
|
102
|
-
- 4
|
103
|
-
version: 1.6.4
|
104
|
-
requirement: *id006
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 1.0.0
|
105
86
|
type: :development
|
106
87
|
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.0.0
|
94
|
+
- !ruby/object:Gem::Dependency
|
107
95
|
name: jeweler
|
108
|
-
|
109
|
-
version_requirements: &id007 !ruby/object:Gem::Requirement
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
110
97
|
none: false
|
111
|
-
requirements:
|
112
|
-
- -
|
113
|
-
- !ruby/object:Gem::Version
|
114
|
-
|
115
|
-
segments:
|
116
|
-
- 0
|
117
|
-
version: "0"
|
118
|
-
requirement: *id007
|
98
|
+
requirements:
|
99
|
+
- - ~>
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.6.4
|
119
102
|
type: :development
|
120
103
|
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ~>
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.6.4
|
110
|
+
- !ruby/object:Gem::Dependency
|
121
111
|
name: rcov
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
122
126
|
description: Rawler is a tool that crawls the links of your website
|
123
127
|
email: info@oscardelben.com
|
124
|
-
executables:
|
128
|
+
executables:
|
125
129
|
- rawler
|
126
130
|
extensions: []
|
127
|
-
|
128
|
-
extra_rdoc_files:
|
131
|
+
extra_rdoc_files:
|
129
132
|
- LICENSE.txt
|
130
133
|
- README.md
|
131
|
-
files:
|
134
|
+
files:
|
132
135
|
- Gemfile
|
133
136
|
- Gemfile.lock
|
134
137
|
- LICENSE.txt
|
@@ -153,37 +156,31 @@ files:
|
|
153
156
|
- test/test_rawler.rb
|
154
157
|
- vendor/lib-trollop.rb
|
155
158
|
homepage: http://github.com/oscardelben/rawler
|
156
|
-
licenses:
|
159
|
+
licenses:
|
157
160
|
- MIT
|
158
161
|
post_install_message:
|
159
162
|
rdoc_options: []
|
160
|
-
|
161
|
-
require_paths:
|
163
|
+
require_paths:
|
162
164
|
- lib
|
163
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
165
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
164
166
|
none: false
|
165
|
-
requirements:
|
166
|
-
- -
|
167
|
-
- !ruby/object:Gem::Version
|
168
|
-
|
169
|
-
segments:
|
167
|
+
requirements:
|
168
|
+
- - ! '>='
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0'
|
171
|
+
segments:
|
170
172
|
- 0
|
171
|
-
|
172
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
173
|
+
hash: 2522129833142198431
|
174
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
173
175
|
none: false
|
174
|
-
requirements:
|
175
|
-
- -
|
176
|
-
- !ruby/object:Gem::Version
|
177
|
-
|
178
|
-
segments:
|
179
|
-
- 0
|
180
|
-
version: "0"
|
176
|
+
requirements:
|
177
|
+
- - ! '>='
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0'
|
181
180
|
requirements: []
|
182
|
-
|
183
181
|
rubyforge_project:
|
184
|
-
rubygems_version: 1.8.
|
182
|
+
rubygems_version: 1.8.21
|
185
183
|
signing_key:
|
186
184
|
specification_version: 3
|
187
185
|
summary: Rawler is a tool that crawls the links of your website
|
188
186
|
test_files: []
|
189
|
-
|