crawl 0.0.5 → 0.1.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
data/bin/crawl CHANGED
@@ -32,7 +32,7 @@ end
32
32
  crawler.run
33
33
  crawler.summarize
34
34
 
35
- unless crawler.errors.empty?
35
+ if crawler.errors?
36
36
  puts 'Errors during crawling'
37
37
  exit -1
38
38
  end
data/crawl.gemspec CHANGED
@@ -17,4 +17,6 @@ Gem::Specification.new do |gem|
17
17
  gem.add_dependency('nokogiri')
18
18
  gem.add_dependency('rest-client')
19
19
  gem.add_dependency('ci_reporter')
20
+ gem.add_dependency('eventmachine', '~> 1.0.0.beta.4')
21
+ gem.add_dependency('em-http-request')
20
22
  end
data/lib/crawl/engine.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+
2
3
  class Crawl::Engine
3
4
  DEFAULT_OPTIONS = {:domain => '',
4
5
  :start => ['/'],
@@ -13,69 +14,42 @@ class Crawl::Engine
13
14
  MAX_REDIRECTS = 3
14
15
  LINE_WIDTH = 78
15
16
 
16
- Result = Struct.new(:url, :object)
17
-
18
- attr_reader :options, :errors
19
-
17
+ attr_reader :options
20
18
 
21
19
  def initialize(caller_options = {})
22
20
  @options = DEFAULT_OPTIONS.merge(caller_options)
23
21
  @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
24
-
25
- @found_links = options[:start].to_set
26
- @link_sources = {}
27
- @found_links.each {|target| @link_sources[target] = 'Initial'}
28
- @visited_links = Set[]
29
- @visited_documents = Set[]
30
- @invalid_links = Set[]
31
- @broken_pages = []
32
- @errors = []
33
22
  @verbose = options[:verbose] || ENV['VERBOSE']
34
- @number_of_dots = 0
35
- @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
36
23
  @validate_markup = options[:markup]
24
+ @register = Crawl::Register.new(options[:start].to_a)
25
+
26
+ @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
37
27
  end
38
28
 
39
29
  def run
40
- until (links = @found_links - (@visited_links + @invalid_links)).empty? do
41
- links.each do |link|
42
- puts "\nChecking #{link}" if @verbose
43
- next unless response = retrieve(link)
44
- next unless response.headers[:content_type] =~ %r{text/html}
45
- @visited_documents << link
46
- @found_links += links = find_links(link, response.to_str)
47
- validate(link, response.body) if @validate_markup
48
- end
30
+ EventMachine.run do
31
+ process_next
49
32
  end
50
33
  end
51
34
 
52
-
35
+ def process_next
36
+ return if @register.processing_size >= EM.threadpool_size
37
+ if @register.finished?
38
+ EventMachine.stop
39
+ elsif (link = @register.next_link)
40
+ puts "\nChecking #{link}" if @verbose
41
+ retrieve(link)
42
+ # validate(link, response.body) if @validate_markup
43
+ process_next
44
+ end
45
+ end
53
46
 
54
47
  def summarize
55
- if @errors.size > 0
56
-
57
- @errors.each do |error|
58
- puts "\n#{error.url}"
59
- puts " Linked from #{linked_from(error.url)}"
60
- puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
61
- end
62
-
63
- print(<<-SUM)
64
-
65
- Pages crawled: #{@visited_documents.size}
66
- Pages with errors: #{@errors.size - @invalid_links.size}
67
- Broken pages: #{@broken_pages.size}
68
- Invalid links: #{@invalid_links.size}
69
-
70
- I=Invalid P=Parse Error S=Status code bad
71
-
72
- SUM
73
- exit(@errors.size)
74
- else
75
- puts "\n\n#{@visited_documents.size} pages crawled"
76
- end
48
+ @register.summarize
49
+ end
77
50
 
78
- puts
51
+ def errors?
52
+ @register.errors?
79
53
  end
80
54
 
81
55
  private
@@ -96,7 +70,7 @@ private
96
70
  "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
97
71
  end.join("\n\n")
98
72
 
99
- @errors << Result.new(link, response)
73
+ @register.error link, response
100
74
  false
101
75
  end
102
76
  rescue RestClient::ServiceUnavailable
@@ -104,41 +78,72 @@ private
104
78
  false
105
79
  end
106
80
 
81
+ def register_error(link, message)
82
+ @register.error link, message
83
+ @register.returned_invalid link
84
+ process_next
85
+ end
86
+
107
87
  def retrieve(link)
108
- test_suite = CI::Reporter::TestSuite.new(link)
109
- test_case = CI::Reporter::TestCase.new(link)
110
- test_suite.start
111
- test_case.start
112
- puts " Fetching.." if @verbose
113
-
114
- attributes = {:method => :get, :url => options[:domain] + link}
115
- attributes.merge!(user: options[:username], password: options[:password])
116
- response = RestClient::Request.execute(attributes)
117
- test_suite.name = link
118
- test_case.name = link
119
- test_case.finish
120
- @visited_links << link
121
- unless VALID_RESPONSE_CODES.include?(response.code)
122
- @errors << Result.new(link, "Status code was #{response.code}")
123
- @broken_pages << link
124
- test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
125
- test_suite.testcases << test_case
126
- test_suite.finish
127
- @report_manager.write_report(test_suite) if options[:ci]
88
+ # test_suite = CI::Reporter::TestSuite.new(link)
89
+ # test_case = CI::Reporter::TestCase.new(link)
90
+ # test_suite.start
91
+ # test_case.start
92
+ # test_suite.name = link
93
+ # test_case.name = link
94
+
95
+ puts "Fetching #{options[:domain] + link} ..." if @verbose
96
+
97
+ unless link.start_with? '/'
98
+ register_error(link, "Relative path found. Crawl does not support relative paths.")
128
99
  return nil
129
100
  end
130
- test_suite.testcases << test_case
131
- test_suite.finish
132
- @report_manager.write_report(test_suite) if options[:ci]
133
- return response
134
- rescue RestClient::InternalServerError, RestClient::ResourceNotFound, RestClient::Unauthorized => e
135
- @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
136
- @invalid_links << link
137
- return nil
101
+
102
+ http = EventMachine::HttpRequest.new(options[:domain] + link)
103
+ req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
104
+ req.timeout(30)
105
+
106
+ req.errback do
107
+ if req.nil?
108
+ @register.retry(link, 'WAT?')
109
+ process_next
110
+ elsif msg = req.error
111
+ register_error(link, msg)
112
+ elsif req.response.nil? || req.response.empty?
113
+ # no response at all?
114
+ @register.retry(link, 'Timeout?')
115
+ # register_error(link, 'Timeout?')
116
+ else
117
+ @register.retry(link, 'Partial response: Server Broke Connection?')
118
+ process_next
119
+ end
120
+ end
121
+
122
+ req.callback do
123
+ if VALID_RESPONSE_CODES.include?(req.response_header.status)
124
+ @register.returned link
125
+ if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
126
+ @register.add find_links(link, req.response.to_str)
127
+ end
128
+ else
129
+ @register.error link, "Status code was #{req.response_header.status}"
130
+ @register.returned_broken link
131
+ # test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
132
+ # test_suite.testcases << test_case
133
+ # test_suite.finish
134
+ # @report_manager.write_report(test_suite) if options[:ci]
135
+ end
136
+ process_next
137
+ end
138
+
139
+ # test_case.finish
140
+ # test_suite.testcases << test_case
141
+ # test_suite.finish
142
+ # @report_manager.write_report(test_suite) if options[:ci]
138
143
  end
139
144
 
140
145
  def linked_from(target)
141
- @link_sources[target] # => source
146
+ @register.source_for target
142
147
  end
143
148
 
144
149
  def find_links(source_link, body)
@@ -147,17 +152,17 @@ private
147
152
  anchors = doc.css('a').to_a
148
153
  anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
149
154
  anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
155
+ anchors.reject!{|anchor| anchor['data-remote'] =~ /true/ }
150
156
  anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
157
+ anchors.reject!{|anchor| anchor['rel'].to_s =~ /nofollow/}
151
158
  raw_links = anchors.map{|anchor| anchor['href']}
152
159
  raw_links.compact!
153
160
  raw_links.map!{|link| link.sub(options[:domain], '')}
154
- raw_links.delete_if{|link| link =~ %r{^http://}}
161
+ raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
155
162
  raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
156
163
  raw_links.each do |target_link|
157
- unless @found_links.include?(target_link)
158
- puts " Adding #{target_link} found on #{source_link}" if @verbose
159
- @link_sources[target_link] = source_link
160
- end
164
+ puts " Adding #{target_link} found on #{source_link}" if @verbose
165
+ @register.set_link_source(target_link, source_link)
161
166
  end
162
167
 
163
168
  raw_links
@@ -0,0 +1,102 @@
1
+ class Crawl::Register
2
+
3
+ Result = Struct.new(:url, :object)
4
+
5
+ def initialize(unprocessed)
6
+ @unprocessed = unprocessed
7
+ @processing = []
8
+ @processed = []
9
+
10
+ @invalid_links = Set[]
11
+ @broken_pages = Set[]
12
+
13
+ @errors = []
14
+ @link_sources = {}
15
+ end
16
+
17
+ def add(links)
18
+ new_links = links - @processed - @processing - @unprocessed
19
+ @unprocessed += new_links
20
+ end
21
+
22
+ def next_link
23
+ link = @unprocessed.shift
24
+ @processing << link if link
25
+ if @processing.size > EM.threadpool_size
26
+ puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
27
+ end
28
+ link
29
+ end
30
+
31
+ def set_link_source(link, source)
32
+ @link_sources[link] = source
33
+ end
34
+
35
+ def source_for(link)
36
+ @link_sources.fetch link, '?'
37
+ end
38
+
39
+ def error(link, object)
40
+ @errors << Result.new(link, object)
41
+ end
42
+
43
+ def returned_invalid(link)
44
+ returned link
45
+ @invalid_links << link
46
+ end
47
+
48
+ def returned_broken(link)
49
+ returned link
50
+ @broken_pages << link
51
+ end
52
+
53
+ def returned(link)
54
+ @processed << link
55
+ @processing -= [link]
56
+ end
57
+
58
+ def finished?
59
+ @unprocessed.size + @processing.size == 0
60
+ end
61
+
62
+ def processing_size
63
+ @processing.size
64
+ end
65
+
66
+ def retry(link, reason)
67
+ puts "Retrying #{link} : #{reason}"
68
+ @processing -= [link]
69
+ @unprocessed << link
70
+ end
71
+
72
+ def summarize
73
+ if @errors.size > 0
74
+
75
+ @errors.each do |error|
76
+ puts "\n#{error.url}"
77
+ puts " Linked from #{source_for error.url}"
78
+ puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
79
+ end
80
+
81
+ print(<<-SUM)
82
+
83
+ Pages crawled: #{@processed.size}
84
+ Pages with errors: #{@errors.size - @invalid_links.size}
85
+ Broken pages: #{@broken_pages.size}
86
+ Invalid links: #{@invalid_links.size}
87
+
88
+ I=Invalid P=Parse Error S=Status code bad
89
+
90
+ SUM
91
+ exit(@errors.size)
92
+ else
93
+ puts "\n\n#{@processed.size} pages crawled"
94
+ end
95
+
96
+ puts
97
+ end
98
+
99
+ def errors?
100
+ @errors.size > 0
101
+ end
102
+ end
data/lib/crawl/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Crawl
3
- VERSION = "0.0.5"
3
+ VERSION = "0.1.0.beta1"
4
4
  end
data/lib/crawl.rb CHANGED
@@ -2,7 +2,8 @@
2
2
  require('nokogiri')
3
3
  require('rest_client')
4
4
  require 'ci/reporter/core'
5
-
5
+ require 'eventmachine'
6
+ require 'em-http-request'
6
7
  require 'base64'
7
8
  require 'set'
8
9
  require 'fileutils'
@@ -14,4 +15,5 @@ require 'tmpdir'
14
15
  require_relative "crawl/version"
15
16
  require_relative "crawl/engine"
16
17
  require_relative "crawl/string"
17
- require_relative "crawl/failure"
18
+ require_relative "crawl/failure"
19
+ require_relative "crawl/register"
metadata CHANGED
@@ -1,19 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
5
- prerelease:
4
+ version: 0.1.0.beta1
5
+ prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Tor Erik Linnerud
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-21 00:00:00.000000000 Z
12
+ date: 2012-04-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70216317741600 !ruby/object:Gem::Requirement
16
+ requirement: &70366743805820 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70216317741600
24
+ version_requirements: *70366743805820
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rest-client
27
- requirement: &70216317740600 !ruby/object:Gem::Requirement
27
+ requirement: &70366743804960 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70216317740600
35
+ version_requirements: *70366743804960
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ci_reporter
38
- requirement: &70216317739980 !ruby/object:Gem::Requirement
38
+ requirement: &70366750375000 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,7 +43,29 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70216317739980
46
+ version_requirements: *70366750375000
47
+ - !ruby/object:Gem::Dependency
48
+ name: eventmachine
49
+ requirement: &70366750374440 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.0.0.beta.4
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70366750374440
58
+ - !ruby/object:Gem::Dependency
59
+ name: em-http-request
60
+ requirement: &70366750373840 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *70366750373840
47
69
  description: Crawl all pages on a domain, checking for errors
48
70
  email:
49
71
  - tor@alphasights.com
@@ -61,6 +83,7 @@ files:
61
83
  - lib/crawl.rb
62
84
  - lib/crawl/engine.rb
63
85
  - lib/crawl/failure.rb
86
+ - lib/crawl/register.rb
64
87
  - lib/crawl/string.rb
65
88
  - lib/crawl/version.rb
66
89
  homepage: http://github.com/alphasights/crawl
@@ -78,9 +101,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
78
101
  required_rubygems_version: !ruby/object:Gem::Requirement
79
102
  none: false
80
103
  requirements:
81
- - - ! '>='
104
+ - - ! '>'
82
105
  - !ruby/object:Gem::Version
83
- version: '0'
106
+ version: 1.3.1
84
107
  requirements: []
85
108
  rubyforge_project:
86
109
  rubygems_version: 1.8.11