rawler 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.4
1
+ 0.1.5
data/bin/rawler CHANGED
@@ -18,8 +18,10 @@ EOS
18
18
  opt :password, "HTT Basic Password", :type => :string
19
19
  opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
20
20
  opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
21
+ opt :css, "Check CSS links", :type => :boolean, :default => false
21
22
  end
22
23
 
24
+
23
25
  domain = ARGV.shift
24
26
 
25
27
  if domain.nil?
data/lib/rawler.rb CHANGED
@@ -11,6 +11,7 @@ module Rawler
11
11
  mattr_accessor :wait
12
12
  mattr_accessor :username, :password
13
13
  mattr_accessor :log
14
+ mattr_accessor :css
14
15
 
15
16
  autoload :Base, "rawler/base"
16
17
  autoload :Crawler, "rawler/crawler"
data/lib/rawler/base.rb CHANGED
@@ -13,6 +13,7 @@ module Rawler
13
13
  Rawler.password = options[:password]
14
14
  Rawler.wait = options[:wait]
15
15
  Rawler.log = options[:log]
16
+ Rawler.css = options[:css]
16
17
  @logfile = File.new("rawler_log.txt", "w") if Rawler.log
17
18
  end
18
19
 
@@ -30,10 +31,24 @@ module Rawler
30
31
  end
31
32
  end
32
33
 
34
+ def validate_css_links_in_page(page)
35
+ Rawler::Crawler.new(page).css_links.each do |page_url|
36
+ validate_non_html(page_url, page)
37
+ sleep(Rawler.wait)
38
+ end
39
+ end
40
+
33
41
  def validate_page(page_url, from_url)
34
42
  if not_yet_parsed?(page_url)
35
43
  add_status_code(page_url, from_url)
36
44
  validate_links_in_page(page_url) if same_domain?(page_url)
45
+ validate_css_links_in_page(page_url) if same_domain?(page_url) and Rawler.css
46
+ end
47
+ end
48
+
49
+ def validate_non_html(page_url, from_url)
50
+ if not_yet_parsed?(page_url)
51
+ add_status_code(page_url, from_url)
37
52
  end
38
53
  end
39
54
 
@@ -27,6 +27,24 @@ module Rawler
27
27
  []
28
28
  end
29
29
 
30
+ def css_links
31
+ if different_domain?(url, Rawler.url) || not_html?(url)
32
+ return []
33
+ end
34
+
35
+ response = Rawler::Request.get(url)
36
+
37
+ doc = Nokogiri::HTML(response.body)
38
+
39
+ doc.css('link').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
40
+ rescue Errno::ECONNREFUSED
41
+ write("Couldn't connect to #{url}")
42
+ []
43
+ rescue Errno::ETIMEDOUT
44
+ write("Connection to #{url} timed out")
45
+ []
46
+ end
47
+
30
48
  private
31
49
 
32
50
  def absolute_url(path)
@@ -50,8 +68,16 @@ module Rawler
50
68
  URI.parse(url_1).host != URI.parse(url_2).host
51
69
  end
52
70
 
71
+ def content_type(url)
72
+ Rawler::Request.head(url).content_type
73
+ end
74
+
53
75
  def not_html?(url)
54
- Rawler::Request.head(url).content_type != 'text/html'
76
+ content_type(url) != 'text/html'
77
+ end
78
+
79
+ def not_css?(url)
80
+ content_type(url) != 'text/css'
55
81
  end
56
82
 
57
83
  def valid_url?(url)
data/rawler.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.4"
8
+ s.version = "0.1.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-02-27"
12
+ s.date = "2012-04-14"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
45
45
  s.homepage = "http://github.com/oscardelben/rawler"
46
46
  s.licenses = ["MIT"]
47
47
  s.require_paths = ["lib"]
48
- s.rubygems_version = "1.8.10"
48
+ s.rubygems_version = "1.8.21"
49
49
  s.summary = "Rawler is a tool that crawls the links of your website"
50
50
 
51
51
  if s.respond_to? :specification_version then
@@ -18,7 +18,8 @@ describe Rawler::Crawler do
18
18
  let(:crawler) { Rawler::Crawler.new(url) }
19
19
  let(:content) {
20
20
  content = <<-content
21
- <p><a href="http://example.com/foo">foo</a></p>
21
+ <link rel="stylesheet" href="css/styles.css" />
22
+ <p><a href="http://example.com/foo">foo</a></p>
22
23
 
23
24
  <p><a href="http://external.com/bar">bar</a></p>
24
25
  content
@@ -32,6 +33,9 @@ describe Rawler::Crawler do
32
33
  crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
33
34
  end
34
35
 
36
+ it "should parse css links" do
37
+ crawler.css_links.should == ['http://example.com/css/styles.css']
38
+ end
35
39
  end
36
40
 
37
41
  context "relative paths" do
metadata CHANGED
@@ -1,134 +1,137 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
- version: !ruby/object:Gem::Version
4
- hash: 19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.5
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 4
10
- version: 0.1.4
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Oscar Del Ben
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-02-27 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- version_requirements: &id001 !ruby/object:Gem::Requirement
12
+ date: 2012-04-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
22
17
  none: false
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- hash: 3
27
- segments:
28
- - 0
29
- version: "0"
30
- requirement: *id001
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
31
22
  type: :runtime
32
23
  prerelease: false
33
- name: nokogiri
34
- - !ruby/object:Gem::Dependency
35
- version_requirements: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
36
25
  none: false
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- hash: 3
41
- segments:
42
- - 0
43
- version: "0"
44
- requirement: *id002
45
- type: :development
46
- prerelease: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
47
31
  name: fakeweb
48
- - !ruby/object:Gem::Dependency
49
- version_requirements: &id003 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
50
33
  none: false
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- hash: 3
55
- segments:
56
- - 0
57
- version: "0"
58
- requirement: *id003
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
59
38
  type: :development
60
39
  prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
61
47
  name: rspec
62
- - !ruby/object:Gem::Dependency
63
- version_requirements: &id004 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
64
49
  none: false
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- hash: 3
69
- segments:
70
- - 0
71
- version: "0"
72
- requirement: *id004
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
73
54
  type: :development
74
55
  prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
75
63
  name: shoulda
76
- - !ruby/object:Gem::Dependency
77
- version_requirements: &id005 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
78
65
  none: false
79
- requirements:
80
- - - ~>
81
- - !ruby/object:Gem::Version
82
- hash: 23
83
- segments:
84
- - 1
85
- - 0
86
- - 0
87
- version: 1.0.0
88
- requirement: *id005
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
89
70
  type: :development
90
71
  prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
91
79
  name: bundler
92
- - !ruby/object:Gem::Dependency
93
- version_requirements: &id006 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
94
81
  none: false
95
- requirements:
82
+ requirements:
96
83
  - - ~>
97
- - !ruby/object:Gem::Version
98
- hash: 7
99
- segments:
100
- - 1
101
- - 6
102
- - 4
103
- version: 1.6.4
104
- requirement: *id006
84
+ - !ruby/object:Gem::Version
85
+ version: 1.0.0
105
86
  type: :development
106
87
  prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.0.0
94
+ - !ruby/object:Gem::Dependency
107
95
  name: jeweler
108
- - !ruby/object:Gem::Dependency
109
- version_requirements: &id007 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
110
97
  none: false
111
- requirements:
112
- - - ">="
113
- - !ruby/object:Gem::Version
114
- hash: 3
115
- segments:
116
- - 0
117
- version: "0"
118
- requirement: *id007
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: 1.6.4
119
102
  type: :development
120
103
  prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.6.4
110
+ - !ruby/object:Gem::Dependency
121
111
  name: rcov
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
122
126
  description: Rawler is a tool that crawls the links of your website
123
127
  email: info@oscardelben.com
124
- executables:
128
+ executables:
125
129
  - rawler
126
130
  extensions: []
127
-
128
- extra_rdoc_files:
131
+ extra_rdoc_files:
129
132
  - LICENSE.txt
130
133
  - README.md
131
- files:
134
+ files:
132
135
  - Gemfile
133
136
  - Gemfile.lock
134
137
  - LICENSE.txt
@@ -153,37 +156,31 @@ files:
153
156
  - test/test_rawler.rb
154
157
  - vendor/lib-trollop.rb
155
158
  homepage: http://github.com/oscardelben/rawler
156
- licenses:
159
+ licenses:
157
160
  - MIT
158
161
  post_install_message:
159
162
  rdoc_options: []
160
-
161
- require_paths:
163
+ require_paths:
162
164
  - lib
163
- required_ruby_version: !ruby/object:Gem::Requirement
165
+ required_ruby_version: !ruby/object:Gem::Requirement
164
166
  none: false
165
- requirements:
166
- - - ">="
167
- - !ruby/object:Gem::Version
168
- hash: 3
169
- segments:
167
+ requirements:
168
+ - - ! '>='
169
+ - !ruby/object:Gem::Version
170
+ version: '0'
171
+ segments:
170
172
  - 0
171
- version: "0"
172
- required_rubygems_version: !ruby/object:Gem::Requirement
173
+ hash: 2522129833142198431
174
+ required_rubygems_version: !ruby/object:Gem::Requirement
173
175
  none: false
174
- requirements:
175
- - - ">="
176
- - !ruby/object:Gem::Version
177
- hash: 3
178
- segments:
179
- - 0
180
- version: "0"
176
+ requirements:
177
+ - - ! '>='
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
181
180
  requirements: []
182
-
183
181
  rubyforge_project:
184
- rubygems_version: 1.8.10
182
+ rubygems_version: 1.8.21
185
183
  signing_key:
186
184
  specification_version: 3
187
185
  summary: Rawler is a tool that crawls the links of your website
188
186
  test_files: []
189
-