crawl 0.1.0.beta2 → 0.1.0.beta3

data/bin/crawl CHANGED
@@ -8,10 +8,9 @@ optparse = OptionParser.new do |opts|
  opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
  opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
  opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
- opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
- opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
- opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
+ opts.on('-v', '--verbose', 'Give details when crawling') { |o| $verbose = o }
  opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
+ opts.on_tail("-v", "--version", "Print version") { |o| puts Crawl::VERSION; exit }
 end.parse!

 options.merge!(domain: optparse.first)
@@ -33,6 +32,5 @@ crawler.run
 crawler.summarize

 if crawler.errors?
-  puts 'Errors during crawling'
   exit -1
 end
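
The rewritten option block relies on OptionParser's on_tail, which keeps help-style switches at the bottom of the generated usage text. A standalone sketch of that behaviour (the flags mirror the diff above; the sample arguments are made up):

    require 'optparse'

    options = {}
    parser = OptionParser.new do |opts|
      opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
      opts.on('-v', '--verbose', 'Give details when crawling') { |o| $verbose = o }
      opts.on_tail('-h', '--help', 'Show this message') { puts opts; exit }
    end

    parser.parse!(['--start', '/,/about', '--verbose'])
    p options[:start]  # => ["/", "/about"]  (Array type splits on commas)
    p $verbose         # => true
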
data/crawl.gemspec CHANGED
@@ -16,7 +16,6 @@ Gem::Specification.new do |gem|
  gem.version = Crawl::VERSION
  gem.add_dependency('nokogiri')
  gem.add_dependency('rest-client')
- gem.add_dependency('ci_reporter')
- gem.add_dependency('eventmachine', '~> 1.0.0.beta.4')
+ gem.add_dependency('eventmachine', '~> 1.0.0.rc1')
  gem.add_dependency('em-http-request')
 end
data/lib/crawl/engine.rb CHANGED
@@ -19,11 +19,11 @@ class Crawl::Engine
   def initialize(caller_options = {})
     @options = DEFAULT_OPTIONS.merge(caller_options)
     @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
-    @verbose = options[:verbose] || ENV['VERBOSE']
-    @validate_markup = options[:markup]
-    @register = Crawl::Register.new(options[:start].to_a)
+    @register = Crawl::Register.new

-    @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
+    start_pages = options[:start].to_a.map{|page| Page.new(@register, page, 'the command line')}
+
+    @register.add(start_pages)
   end

   def run
@@ -36,10 +36,8 @@ class Crawl::Engine
     return if @register.processing_size >= EM.threadpool_size
     if @register.finished?
       EventMachine.stop
-    elsif (link = @register.next_link)
-      puts "\nChecking #{link}" if @verbose
-      retrieve(link)
-      # validate(link, response.body) if @validate_markup
+    elsif (page = @register.next_page)
+      retrieve(page)
       process_next
     end
   end
@@ -54,100 +52,45 @@ class Crawl::Engine

 private

-  def validate(link, body)
-    puts "  Validating..." if @verbose
-
-    json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
-    messages = JSON.parse(json_response.body)['messages']
-    error_messages = messages.select { |message| message['type'] != 'info' }
-
-    if error_messages.empty?
-      true
-    else
-      response = error_messages.map do |message|
-        type, message = message['type'], message['message']
-        type_color = type == 'error' ? 31 : 33
-        "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
-      end.join("\n\n")
-
-      @register.error link, response
-      false
-    end
-  rescue RestClient::ServiceUnavailable
-    handle_error('U')
-    false
-  end
-
-  def register_error(link, message)
-    @register.error link, message
-    @register.returned_invalid link
-    process_next
-  end
-
-  def retrieve(link)
-    # test_suite = CI::Reporter::TestSuite.new(link)
-    # test_case = CI::Reporter::TestCase.new(link)
-    # test_suite.start
-    # test_case.start
-    # test_suite.name = link
-    # test_case.name = link
-
-    puts "Fetching #{options[:domain] + link} ..." if @verbose
+  def retrieve(page)
+    puts "Fetching #{page.url} ..." if $verbose

-    unless link.start_with? '/'
-      register_error(link, "Relative path found. Crawl does not support relative paths.")
-      return nil
-    end
+    full_url = options[:domain] + page.url

-    http = EventMachine::HttpRequest.new(options[:domain] + link)
+    http = EventMachine::HttpRequest.new(full_url)
     req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
-    req.timeout(30)
+    req.timeout(15)

     req.errback do
       if req.nil?
-        @register.retry(link, 'WAT?')
-        process_next
+        page.intermittent("Req is nil. WAT?")
       elsif msg = req.error
-        register_error(link, msg)
+        page.intermittent(msg)
       elsif req.response.nil? || req.response.empty?
-        @register.retry(link, 'Timeout?')
-        process_next
-        # register_error(link, 'Timeout?')
+        page.intermittent('Timeout?')
       else
-        @register.retry(link, 'Partial response: Server Broke Connection?')
-        process_next
+        page.intermittent('Partial response: Server Broke Connection?')
       end
+      process_next
     end

     req.callback do
-      if VALID_RESPONSE_CODES.include?(req.response_header.status)
-        @register.returned link
+      status_code = req.response_header.status
+      if VALID_RESPONSE_CODES.include?(status_code)
+        page.success
         if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
-          @register.add find_links(link, req.response.to_str)
+          @register.add find_linked_pages(page, req.response.to_str)
         end
+      elsif(status_code == 503)
+        page.intermittent("Status code: 503")
       else
-        @register.error link, "Status code was #{req.response_header.status}"
-        @register.returned_broken link
-        # test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
-        # test_suite.testcases << test_case
-        # test_suite.finish
-        # @report_manager.write_report(test_suite) if options[:ci]
+        page.fatal("Status code: #{status_code}")
       end
       process_next
     end
-
-    # test_case.finish
-    # test_suite.testcases << test_case
-    # test_suite.finish
-    # @report_manager.write_report(test_suite) if options[:ci]
   end

-  def linked_from(target)
-    @register.source_for target
-  end
-
-  def find_links(source_link, body)
-    puts "  Finding links.." if @verbose
+  def find_linked_pages(page, body)
     doc = Nokogiri::HTML(body)
     anchors = doc.css('a').to_a
     anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
@@ -160,11 +103,6 @@ private
     raw_links.map!{|link| link.sub(options[:domain], '')}
     raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
     raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
-    raw_links.each do |target_link|
-      puts "  Adding #{target_link} found on #{source_link}" if @verbose
-      @register.set_link_source(target_link, source_link)
-    end
-
-    raw_links
+    raw_links.map{ |url| Page.new(@register, url, page.url) }
   end
 end
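
The reworked callback splits responses three ways instead of two. A minimal, runnable sketch of that classification outside EventMachine (SKETCH_VALID_CODES is an assumption standing in for the gem's VALID_RESPONSE_CODES constant):

    SKETCH_VALID_CODES = [200, 301, 302]  # assumption, not the gem's actual list

    def classify(status_code)
      if SKETCH_VALID_CODES.include?(status_code)
        :success        # page.success: mark done, harvest links if HTML
      elsif status_code == 503
        :intermittent   # page.intermittent: retried up to Page::ATTEMPTS times
      else
        :fatal          # page.fatal: recorded as an error immediately
      end
    end

    p classify(200)  # => :success
    p classify(503)  # => :intermittent
    p classify(404)  # => :fatal
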
data/lib/crawl/page.rb ADDED
@@ -0,0 +1,53 @@
+ class Page
+   include Comparable
+
+   attr_reader :register, :url, :source, :error
+
+   ATTEMPTS = 3
+
+   def initialize(register, url, source)
+     @register = register
+     @url = url
+     @source = source
+     @attempts = 0
+     @error = nil
+   end
+
+   def <=>(other)
+     url <=> other.url
+   end
+
+   def eql?(other)
+     url.eql?(other.url)
+   end
+
+   def hash
+     url.hash
+   end
+
+   def success
+     @error = nil
+     @register.completed(self)
+   end
+
+   def fatal(error)
+     puts " Fatal - #{error}" if $verbose
+     @error = error
+     @register.completed(self)
+   end
+
+   def intermittent(error)
+     puts " Intermittent - #{error}" if $verbose
+     if @attempts >= ATTEMPTS
+       @error = error
+       @register.completed(self)
+     else
+       @attempts += 1
+       @register.retry(self)
+     end
+   end
+
+   def to_s
+     "#{url} found on #{source} - #{error || 'OK'}"
+   end
+ end
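
A quick way to see the retry budget in action; StubRegister below is a hypothetical stand-in for Crawl::Register that just records the calls it receives:

    class StubRegister
      attr_reader :log
      def initialize; @log = []; end
      def completed(page); @log << [:completed, page.url]; end
      def retry(page);     @log << [:retry, page.url];     end
    end

    register = StubRegister.new
    page = Page.new(register, '/about', 'the command line')

    4.times { page.intermittent('Timeout?') }  # three retries, then gives up
    p register.log.map(&:first)  # => [:retry, :retry, :retry, :completed]
    puts page.to_s               # => /about found on the command line - Timeout?
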
data/lib/crawl/register.rb CHANGED
@@ -2,57 +2,38 @@ class Crawl::Register

   Result = Struct.new(:url, :object)

-  def initialize(unprocessed)
-    @unprocessed = unprocessed
-    @processing = []
-    @processed = []
-
-    @invalid_links = Set[]
-    @broken_pages = Set[]
-
-    @errors = []
-    @link_sources = {}
+  def initialize
+    @unprocessed = Set.new
+    @processing = Set.new
+    @processed = Set.new
   end

-  def add(links)
-    new_links = links - @processed - @processing - @unprocessed
-    @unprocessed += new_links
+  def add(pages)
+    new_pages = pages.to_set - @processed - @processing - @unprocessed
+    new_pages.each do |new_page|
+      puts " Adding #{new_page.url}" if $verbose
+    end
+    @unprocessed.merge(new_pages)
   end

-  def next_link
-    link = @unprocessed.shift
-    @processing << link if link
+  def next_page
+    page = @unprocessed.first
+    @unprocessed.delete(page)
+    @processing << page if page
     if @processing.size > EM.threadpool_size
       puts "WARNING: #{@processing.size} pages are being processed when the EM threadpool only has #{EM.threadpool_size} threads."
     end
-    link
-  end
-
-  def set_link_source(link, source)
-    @link_sources[link] = source
-  end
-
-  def source_for(link)
-    @link_sources.fetch link, '?'
-  end
-
-  def error(link, object)
-    @errors << Result.new(link, object)
+    page
   end

-  def returned_invalid(link)
-    returned link
-    @invalid_links << link
+  def retry(page)
+    @unprocessed << page
+    @processing.delete(page)
   end

-  def returned_broken(link)
-    returned link
-    @broken_pages << link
-  end
-
-  def returned(link)
-    @processed << link
-    @processing -= [link]
+  def completed(page)
+    @processed << page
+    @processing.delete(page)
   end

   def finished?
@@ -63,40 +44,22 @@ class Crawl::Register
     @processing.size
   end

-  def retry(link, reason)
-    puts "Retrying #{link} : #{reason}"
-    @processing -= [link]
-    @unprocessed << link
+  def error_pages
+    @processed.select{ |page| page.error }
   end

-  def summarize
-    if @errors.size > 0
+  def errors?
+    !error_pages.empty?
+  end

-      @errors.each do |error|
-        puts "\n#{error.url}"
-        puts "  Linked from #{source_for error.url}"
-        puts error.object.to_s.word_wrap.split("\n").map{|line| '  ' + line}
+  def summarize
+    if errors?
+      puts "\nPages with errors:"
+      error_pages.each do |page|
+        puts page.to_s
       end
-
-      print(<<-SUM)
-
- Pages crawled: #{@processed.size}
- Pages with errors: #{@errors.size - @invalid_links.size}
- Broken pages: #{@broken_pages.size}
- Invalid links: #{@invalid_links.size}
-
- I=Invalid P=Parse Error S=Status code bad
-
- SUM
-      exit(@errors.size)
     else
-      puts "\n\n#{@processed.size} pages crawled"
+      puts "\n#{@processed.size} pages crawled without errors."
     end
-
-    puts
-  end
-
-  def errors?
-    @errors.size > 0
   end
 end
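
Register's switch from Arrays to Sets is what makes Page's eql?/hash overrides matter: two Page objects for the same URL found on different source pages collapse to a single entry. A sketch, assuming the Page class above (NullRegister is hypothetical):

    require 'set'

    class NullRegister
      def completed(_page); end
      def retry(_page); end
    end

    register = NullRegister.new
    a = Page.new(register, '/pricing', '/')
    b = Page.new(register, '/pricing', '/about')

    pages = Set.new([a, b])
    p pages.size   # => 1, deduplicated by url via Page#hash/#eql?
    p a.eql?(b)    # => true even though the sources differ
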
data/lib/crawl/version.rb CHANGED
@@ -1,4 +1,4 @@
 # encoding: utf-8
 module Crawl
-  VERSION = "0.1.0.beta2"
+  VERSION = "0.1.0.beta3"
 end
data/lib/crawl.rb CHANGED
@@ -16,4 +16,5 @@ require_relative "crawl/version"
 require_relative "crawl/engine"
 require_relative "crawl/string"
 require_relative "crawl/failure"
-require_relative "crawl/register"
+require_relative "crawl/register"
+require_relative "crawl/page"
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawl
 version: !ruby/object:Gem::Version
-  version: 0.1.0.beta2
+  version: 0.1.0.beta3
 prerelease: 6
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-05-01 00:00:00.000000000 Z
+date: 2012-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70140204969220 !ruby/object:Gem::Requirement
+  requirement: &70174767878200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70140204969220
+  version_requirements: *70174767878200
 - !ruby/object:Gem::Dependency
   name: rest-client
-  requirement: &70140204968500 !ruby/object:Gem::Requirement
+  requirement: &70174767877780 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,32 +32,21 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70140204968500
-- !ruby/object:Gem::Dependency
-  name: ci_reporter
-  requirement: &70140204967840 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: *70140204967840
+  version_requirements: *70174767877780
 - !ruby/object:Gem::Dependency
   name: eventmachine
-  requirement: &70140204967240 !ruby/object:Gem::Requirement
+  requirement: &70174767877280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 1.0.0.beta.4
+        version: 1.0.0.rc1
   type: :runtime
   prerelease: false
-  version_requirements: *70140204967240
+  version_requirements: *70174767877280
 - !ruby/object:Gem::Dependency
   name: em-http-request
-  requirement: &70140204966740 !ruby/object:Gem::Requirement
+  requirement: &70174767876860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,7 +54,7 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70140204966740
+  version_requirements: *70174767876860
 description: Crawl all pages on a domain, checking for errors
 email:
 - tor@alphasights.com
@@ -83,6 +72,7 @@ files:
 - lib/crawl.rb
 - lib/crawl/engine.rb
 - lib/crawl/failure.rb
+- lib/crawl/page.rb
 - lib/crawl/register.rb
 - lib/crawl/string.rb
 - lib/crawl/version.rb