rawler 0.1.4 → 0.1.5

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.4
1
+ 0.1.5
data/bin/rawler CHANGED
@@ -18,8 +18,10 @@ EOS
18
18
  opt :password, "HTT Basic Password", :type => :string
19
19
  opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
20
20
  opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
21
+ opt :css, "Check CSS links", :type => :boolean, :default => false
21
22
  end
22
23
 
24
+
23
25
  domain = ARGV.shift
24
26
 
25
27
  if domain.nil?
data/lib/rawler.rb CHANGED
@@ -11,6 +11,7 @@ module Rawler
11
11
  mattr_accessor :wait
12
12
  mattr_accessor :username, :password
13
13
  mattr_accessor :log
14
+ mattr_accessor :css
14
15
 
15
16
  autoload :Base, "rawler/base"
16
17
  autoload :Crawler, "rawler/crawler"
data/lib/rawler/base.rb CHANGED
@@ -13,6 +13,7 @@ module Rawler
13
13
  Rawler.password = options[:password]
14
14
  Rawler.wait = options[:wait]
15
15
  Rawler.log = options[:log]
16
+ Rawler.css = options[:css]
16
17
  @logfile = File.new("rawler_log.txt", "w") if Rawler.log
17
18
  end
18
19
 
@@ -30,10 +31,24 @@ module Rawler
30
31
  end
31
32
  end
32
33
 
34
+ def validate_css_links_in_page(page)
35
+ Rawler::Crawler.new(page).css_links.each do |page_url|
36
+ validate_non_html(page_url, page)
37
+ sleep(Rawler.wait)
38
+ end
39
+ end
40
+
33
41
  def validate_page(page_url, from_url)
34
42
  if not_yet_parsed?(page_url)
35
43
  add_status_code(page_url, from_url)
36
44
  validate_links_in_page(page_url) if same_domain?(page_url)
45
+ validate_css_links_in_page(page_url) if same_domain?(page_url) and Rawler.css
46
+ end
47
+ end
48
+
49
+ def validate_non_html(page_url, from_url)
50
+ if not_yet_parsed?(page_url)
51
+ add_status_code(page_url, from_url)
37
52
  end
38
53
  end
39
54
 
@@ -27,6 +27,24 @@ module Rawler
27
27
  []
28
28
  end
29
29
 
30
+ def css_links
31
+ if different_domain?(url, Rawler.url) || not_html?(url)
32
+ return []
33
+ end
34
+
35
+ response = Rawler::Request.get(url)
36
+
37
+ doc = Nokogiri::HTML(response.body)
38
+
39
+ doc.css('link').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
40
+ rescue Errno::ECONNREFUSED
41
+ write("Couldn't connect to #{url}")
42
+ []
43
+ rescue Errno::ETIMEDOUT
44
+ write("Connection to #{url} timed out")
45
+ []
46
+ end
47
+
30
48
  private
31
49
 
32
50
  def absolute_url(path)
@@ -50,8 +68,16 @@ module Rawler
50
68
  URI.parse(url_1).host != URI.parse(url_2).host
51
69
  end
52
70
 
71
+ def content_type(url)
72
+ Rawler::Request.head(url).content_type
73
+ end
74
+
53
75
  def not_html?(url)
54
- Rawler::Request.head(url).content_type != 'text/html'
76
+ content_type(url) != 'text/html'
77
+ end
78
+
79
+ def not_css?(url)
80
+ content_type(url) != 'text/css'
55
81
  end
56
82
 
57
83
  def valid_url?(url)
data/rawler.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.4"
8
+ s.version = "0.1.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-02-27"
12
+ s.date = "2012-04-14"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
45
45
  s.homepage = "http://github.com/oscardelben/rawler"
46
46
  s.licenses = ["MIT"]
47
47
  s.require_paths = ["lib"]
48
- s.rubygems_version = "1.8.10"
48
+ s.rubygems_version = "1.8.21"
49
49
  s.summary = "Rawler is a tool that crawls the links of your website"
50
50
 
51
51
  if s.respond_to? :specification_version then
@@ -18,7 +18,8 @@ describe Rawler::Crawler do
18
18
  let(:crawler) { Rawler::Crawler.new(url) }
19
19
  let(:content) {
20
20
  content = <<-content
21
- <p><a href="http://example.com/foo">foo</a></p>
21
+ <link rel="stylesheet" href="css/styles.css" />
22
+ <p><a href="http://example.com/foo">foo</a></p>
22
23
 
23
24
  <p><a href="http://external.com/bar">bar</a></p>
24
25
  content
@@ -32,6 +33,9 @@ describe Rawler::Crawler do
32
33
  crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
33
34
  end
34
35
 
36
+ it "should parse css links" do
37
+ crawler.css_links.should == ['http://example.com/css/styles.css']
38
+ end
35
39
  end
36
40
 
37
41
  context "relative paths" do
metadata CHANGED
@@ -1,134 +1,137 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
- version: !ruby/object:Gem::Version
4
- hash: 19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.5
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 4
10
- version: 0.1.4
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Oscar Del Ben
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-02-27 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- version_requirements: &id001 !ruby/object:Gem::Requirement
12
+ date: 2012-04-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
22
17
  none: false
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- hash: 3
27
- segments:
28
- - 0
29
- version: "0"
30
- requirement: *id001
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
31
22
  type: :runtime
32
23
  prerelease: false
33
- name: nokogiri
34
- - !ruby/object:Gem::Dependency
35
- version_requirements: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
36
25
  none: false
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- hash: 3
41
- segments:
42
- - 0
43
- version: "0"
44
- requirement: *id002
45
- type: :development
46
- prerelease: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
47
31
  name: fakeweb
48
- - !ruby/object:Gem::Dependency
49
- version_requirements: &id003 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
50
33
  none: false
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- hash: 3
55
- segments:
56
- - 0
57
- version: "0"
58
- requirement: *id003
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
59
38
  type: :development
60
39
  prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
61
47
  name: rspec
62
- - !ruby/object:Gem::Dependency
63
- version_requirements: &id004 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
64
49
  none: false
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- hash: 3
69
- segments:
70
- - 0
71
- version: "0"
72
- requirement: *id004
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
73
54
  type: :development
74
55
  prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
75
63
  name: shoulda
76
- - !ruby/object:Gem::Dependency
77
- version_requirements: &id005 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
78
65
  none: false
79
- requirements:
80
- - - ~>
81
- - !ruby/object:Gem::Version
82
- hash: 23
83
- segments:
84
- - 1
85
- - 0
86
- - 0
87
- version: 1.0.0
88
- requirement: *id005
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
89
70
  type: :development
90
71
  prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
91
79
  name: bundler
92
- - !ruby/object:Gem::Dependency
93
- version_requirements: &id006 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
94
81
  none: false
95
- requirements:
82
+ requirements:
96
83
  - - ~>
97
- - !ruby/object:Gem::Version
98
- hash: 7
99
- segments:
100
- - 1
101
- - 6
102
- - 4
103
- version: 1.6.4
104
- requirement: *id006
84
+ - !ruby/object:Gem::Version
85
+ version: 1.0.0
105
86
  type: :development
106
87
  prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.0.0
94
+ - !ruby/object:Gem::Dependency
107
95
  name: jeweler
108
- - !ruby/object:Gem::Dependency
109
- version_requirements: &id007 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
110
97
  none: false
111
- requirements:
112
- - - ">="
113
- - !ruby/object:Gem::Version
114
- hash: 3
115
- segments:
116
- - 0
117
- version: "0"
118
- requirement: *id007
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: 1.6.4
119
102
  type: :development
120
103
  prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.6.4
110
+ - !ruby/object:Gem::Dependency
121
111
  name: rcov
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
122
126
  description: Rawler is a tool that crawls the links of your website
123
127
  email: info@oscardelben.com
124
- executables:
128
+ executables:
125
129
  - rawler
126
130
  extensions: []
127
-
128
- extra_rdoc_files:
131
+ extra_rdoc_files:
129
132
  - LICENSE.txt
130
133
  - README.md
131
- files:
134
+ files:
132
135
  - Gemfile
133
136
  - Gemfile.lock
134
137
  - LICENSE.txt
@@ -153,37 +156,31 @@ files:
153
156
  - test/test_rawler.rb
154
157
  - vendor/lib-trollop.rb
155
158
  homepage: http://github.com/oscardelben/rawler
156
- licenses:
159
+ licenses:
157
160
  - MIT
158
161
  post_install_message:
159
162
  rdoc_options: []
160
-
161
- require_paths:
163
+ require_paths:
162
164
  - lib
163
- required_ruby_version: !ruby/object:Gem::Requirement
165
+ required_ruby_version: !ruby/object:Gem::Requirement
164
166
  none: false
165
- requirements:
166
- - - ">="
167
- - !ruby/object:Gem::Version
168
- hash: 3
169
- segments:
167
+ requirements:
168
+ - - ! '>='
169
+ - !ruby/object:Gem::Version
170
+ version: '0'
171
+ segments:
170
172
  - 0
171
- version: "0"
172
- required_rubygems_version: !ruby/object:Gem::Requirement
173
+ hash: 2522129833142198431
174
+ required_rubygems_version: !ruby/object:Gem::Requirement
173
175
  none: false
174
- requirements:
175
- - - ">="
176
- - !ruby/object:Gem::Version
177
- hash: 3
178
- segments:
179
- - 0
180
- version: "0"
176
+ requirements:
177
+ - - ! '>='
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
181
180
  requirements: []
182
-
183
181
  rubyforge_project:
184
- rubygems_version: 1.8.10
182
+ rubygems_version: 1.8.21
185
183
  signing_key:
186
184
  specification_version: 3
187
185
  summary: Rawler is a tool that crawls the links of your website
188
186
  test_files: []
189
-