crawl 0.1.0.beta2 → 0.1.0.beta3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/crawl CHANGED
@@ -8,10 +8,9 @@ optparse = OptionParser.new do |opts|
8
8
  opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
9
9
  opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
10
10
  opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
11
- opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
12
- opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
13
- opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
11
+ opts.on('-v', '--verbose', 'Give details when crawling') { |o| $verbose = o }
14
12
  opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
13
+ opts.on_tail("-v", "--version", "Print version") { |o| puts Crawl::VERSION; exit }
15
14
  end.parse!
16
15
 
17
16
  options.merge!(domain: optparse.first)
@@ -33,6 +32,5 @@ crawler.run
33
32
  crawler.summarize
34
33
 
35
34
  if crawler.errors?
36
- puts 'Errors during crawling'
37
35
  exit -1
38
36
  end
data/crawl.gemspec CHANGED
@@ -16,7 +16,6 @@ Gem::Specification.new do |gem|
16
16
  gem.version = Crawl::VERSION
17
17
  gem.add_dependency('nokogiri')
18
18
  gem.add_dependency('rest-client')
19
- gem.add_dependency('ci_reporter')
20
- gem.add_dependency('eventmachine', '~> 1.0.0.beta.4')
19
+ gem.add_dependency('eventmachine', '~> 1.0.0.rc1')
21
20
  gem.add_dependency('em-http-request')
22
21
  end
data/lib/crawl/engine.rb CHANGED
@@ -19,11 +19,11 @@ class Crawl::Engine
19
19
  def initialize(caller_options = {})
20
20
  @options = DEFAULT_OPTIONS.merge(caller_options)
21
21
  @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
22
- @verbose = options[:verbose] || ENV['VERBOSE']
23
- @validate_markup = options[:markup]
24
- @register = Crawl::Register.new(options[:start].to_a)
22
+ @register = Crawl::Register.new
25
23
 
26
- @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
24
+ start_pages = options[:start].to_a.map{|page| Page.new(@register, page, 'the command line')}
25
+
26
+ @register.add(start_pages)
27
27
  end
28
28
 
29
29
  def run
@@ -36,10 +36,8 @@ class Crawl::Engine
36
36
  return if @register.processing_size >= EM.threadpool_size
37
37
  if @register.finished?
38
38
  EventMachine.stop
39
- elsif (link = @register.next_link)
40
- puts "\nChecking #{link}" if @verbose
41
- retrieve(link)
42
- # validate(link, response.body) if @validate_markup
39
+ elsif (page = @register.next_page)
40
+ retrieve(page)
43
41
  process_next
44
42
  end
45
43
  end
@@ -54,100 +52,45 @@ class Crawl::Engine
54
52
 
55
53
  private
56
54
 
57
- def validate(link, body)
58
- puts " Validating..." if @verbose
59
-
60
- json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
61
- messages = JSON.parse(json_response.body)['messages']
62
- error_messages = messages.select { |message| message['type'] != 'info' }
63
-
64
- if error_messages.empty?
65
- true
66
- else
67
- response = error_messages.map do |message|
68
- type, message = message['type'], message['message']
69
- type_color = type == 'error' ? 31 : 33
70
- "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
71
- end.join("\n\n")
72
-
73
- @register.error link, response
74
- false
75
- end
76
- rescue RestClient::ServiceUnavailable
77
- handle_error('U')
78
- false
79
- end
80
-
81
- def register_error(link, message)
82
- @register.error link, message
83
- @register.returned_invalid link
84
- process_next
85
- end
86
-
87
- def retrieve(link)
88
- # test_suite = CI::Reporter::TestSuite.new(link)
89
- # test_case = CI::Reporter::TestCase.new(link)
90
- # test_suite.start
91
- # test_case.start
92
- # test_suite.name = link
93
- # test_case.name = link
94
-
95
- puts "Fetching #{options[:domain] + link} ..." if @verbose
55
+ def retrieve(page)
56
+ puts "Fetching #{page.url} ..." if $verbose
96
57
 
97
- unless link.start_with? '/'
98
- register_error(link, "Relative path found. Crawl does not support relative paths.")
99
- return nil
100
- end
58
+ full_url = options[:domain] + page.url
101
59
 
102
- http = EventMachine::HttpRequest.new(options[:domain] + link)
60
+ http = EventMachine::HttpRequest.new(full_url)
103
61
  req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
104
- req.timeout(30)
62
+ req.timeout(15)
105
63
 
106
64
  req.errback do
107
65
  if req.nil?
108
- @register.retry(link, 'WAT?')
109
- process_next
66
+ page.intermittent("Req is nil. WAT?")
110
67
  elsif msg = req.error
111
- register_error(link, msg)
68
+ page.intermittent(msg)
112
69
  elsif req.response.nil? || req.response.empty?
113
- @register.retry(link, 'Timeout?')
114
- process_next
115
- # register_error(link, 'Timeout?')
70
+ page.intermittent('Timeout?')
116
71
  else
117
- @register.retry(link, 'Partial response: Server Broke Connection?')
118
- process_next
72
+ page.intermittent('Partial response: Server Broke Connection?')
119
73
  end
74
+ process_next
120
75
  end
121
76
 
122
77
  req.callback do
123
- if VALID_RESPONSE_CODES.include?(req.response_header.status)
124
- @register.returned link
78
+ status_code = req.response_header.status
79
+ if VALID_RESPONSE_CODES.include?(status_code)
80
+ page.success
125
81
  if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
126
- @register.add find_links(link, req.response.to_str)
82
+ @register.add find_linked_pages(page, req.response.to_str)
127
83
  end
84
+ elsif(status_code == 503)
85
+ page.intermittent("Status code: 503")
128
86
  else
129
- @register.error link, "Status code was #{req.response_header.status}"
130
- @register.returned_broken link
131
- # test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
132
- # test_suite.testcases << test_case
133
- # test_suite.finish
134
- # @report_manager.write_report(test_suite) if options[:ci]
87
+ page.fatal("Status code: #{status_code}")
135
88
  end
136
89
  process_next
137
90
  end
138
-
139
- # test_case.finish
140
- # test_suite.testcases << test_case
141
- # test_suite.finish
142
- # @report_manager.write_report(test_suite) if options[:ci]
143
91
  end
144
92
 
145
- def linked_from(target)
146
- @register.source_for target
147
- end
148
-
149
- def find_links(source_link, body)
150
- puts " Finding links.." if @verbose
93
+ def find_linked_pages(page, body)
151
94
  doc = Nokogiri::HTML(body)
152
95
  anchors = doc.css('a').to_a
153
96
  anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
@@ -160,11 +103,6 @@ private
160
103
  raw_links.map!{|link| link.sub(options[:domain], '')}
161
104
  raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
162
105
  raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
163
- raw_links.each do |target_link|
164
- puts " Adding #{target_link} found on #{source_link}" if @verbose
165
- @register.set_link_source(target_link, source_link)
166
- end
167
-
168
- raw_links
106
+ raw_links.map{ |url| Page.new(@register, url, page.url) }
169
107
  end
170
108
  end
data/lib/crawl/page.rb ADDED
@@ -0,0 +1,53 @@
1
+ class Page
2
+ include Comparable
3
+
4
+ attr_reader :register, :url, :source, :error
5
+
6
+ ATTEMPTS = 3
7
+
8
+ def initialize(register, url, source)
9
+ @register = register
10
+ @url = url
11
+ @source = source
12
+ @attempts = 0
13
+ @errors = nil
14
+ end
15
+
16
+ def <=>(other)
17
+ url <=> other.url
18
+ end
19
+
20
+ def eql?(other)
21
+ url.eql?(other.url)
22
+ end
23
+
24
+ def hash
25
+ url.hash
26
+ end
27
+
28
+ def success
29
+ @error = nil
30
+ @register.completed(self)
31
+ end
32
+
33
+ def fatal(error)
34
+ puts " Fatal - #{error}" if $VERBOSE
35
+ @error = error
36
+ @register.completed(self)
37
+ end
38
+
39
+ def intermittent(error)
40
+ puts " Intermittent - #{error}" if $VERBOSE
41
+ if @attempts >= ATTEMPTS
42
+ @error = error
43
+ @register.completed(self)
44
+ else
45
+ @attempts += 1
46
+ @register.retry(self)
47
+ end
48
+ end
49
+
50
+ def to_s
51
+ "#{url} found on #{source} - #{error || 'OK'}"
52
+ end
53
+ end
@@ -2,57 +2,38 @@ class Crawl::Register
2
2
 
3
3
  Result = Struct.new(:url, :object)
4
4
 
5
- def initialize(unprocessed)
6
- @unprocessed = unprocessed
7
- @processing = []
8
- @processed = []
9
-
10
- @invalid_links = Set[]
11
- @broken_pages = Set[]
12
-
13
- @errors = []
14
- @link_sources = {}
5
+ def initialize
6
+ @unprocessed = Set.new
7
+ @processing = Set.new
8
+ @processed = Set.new
15
9
  end
16
10
 
17
- def add(links)
18
- new_links = links - @processed - @processing - @unprocessed
19
- @unprocessed += new_links
11
+ def add(pages)
12
+ new_pages = pages.to_set - @processed - @processing - @unprocessed
13
+ new_pages.each do |new_page|
14
+ puts " Adding #{new_page.url}" if $verbose
15
+ end
16
+ @unprocessed.merge(new_pages)
20
17
  end
21
18
 
22
- def next_link
23
- link = @unprocessed.shift
24
- @processing << link if link
19
+ def next_page
20
+ page = @unprocessed.first
21
+ @unprocessed.delete(page)
22
+ @processing << page if page
25
23
  if @processing.size > EM.threadpool_size
26
24
  puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
27
25
  end
28
- link
29
- end
30
-
31
- def set_link_source(link, source)
32
- @link_sources[link] = source
33
- end
34
-
35
- def source_for(link)
36
- @link_sources.fetch link, '?'
37
- end
38
-
39
- def error(link, object)
40
- @errors << Result.new(link, object)
26
+ page
41
27
  end
42
28
 
43
- def returned_invalid(link)
44
- returned link
45
- @invalid_links << link
29
+ def retry(page)
30
+ @unprocessed << page
31
+ @processing.delete(page)
46
32
  end
47
33
 
48
- def returned_broken(link)
49
- returned link
50
- @broken_pages << link
51
- end
52
-
53
- def returned(link)
54
- @processed << link
55
- @processing -= [link]
34
+ def completed(page)
35
+ @processed << page
36
+ @processing.delete(page)
56
37
  end
57
38
 
58
39
  def finished?
@@ -63,40 +44,22 @@ class Crawl::Register
63
44
  @processing.size
64
45
  end
65
46
 
66
- def retry(link, reason)
67
- puts "Retrying #{link} : #{reason}"
68
- @processing -= [link]
69
- @unprocessed << link
47
+ def error_pages
48
+ @processed.select{ |page| page.error }
70
49
  end
71
50
 
72
- def summarize
73
- if @errors.size > 0
51
+ def errors?
52
+ !error_pages.empty?
53
+ end
74
54
 
75
- @errors.each do |error|
76
- puts "\n#{error.url}"
77
- puts " Linked from #{source_for error.url}"
78
- puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
55
+ def summarize
56
+ if errors?
57
+ puts "\nPages with errors:"
58
+ error_pages.each do |page|
59
+ puts page.to_s
79
60
  end
80
-
81
- print(<<-SUM)
82
-
83
- Pages crawled: #{@processed.size}
84
- Pages with errors: #{@errors.size - @invalid_links.size}
85
- Broken pages: #{@broken_pages.size}
86
- Invalid links: #{@invalid_links.size}
87
-
88
- I=Invalid P=Parse Error S=Status code bad
89
-
90
- SUM
91
- exit(@errors.size)
92
61
  else
93
- puts "\n\n#{@processed.size} pages crawled"
62
+ puts "\n#{@processed.size} pages crawled without errors."
94
63
  end
95
-
96
- puts
97
- end
98
-
99
- def errors?
100
- @errors.size > 0
101
64
  end
102
65
  end
data/lib/crawl/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Crawl
3
- VERSION = "0.1.0.beta2"
3
+ VERSION = "0.1.0.beta3"
4
4
  end
data/lib/crawl.rb CHANGED
@@ -16,4 +16,5 @@ require_relative "crawl/version"
16
16
  require_relative "crawl/engine"
17
17
  require_relative "crawl/string"
18
18
  require_relative "crawl/failure"
19
- require_relative "crawl/register"
19
+ require_relative "crawl/register"
20
+ require_relative "crawl/page"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.beta2
4
+ version: 0.1.0.beta3
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-01 00:00:00.000000000 Z
12
+ date: 2012-06-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70140204969220 !ruby/object:Gem::Requirement
16
+ requirement: &70174767878200 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70140204969220
24
+ version_requirements: *70174767878200
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rest-client
27
- requirement: &70140204968500 !ruby/object:Gem::Requirement
27
+ requirement: &70174767877780 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,32 +32,21 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70140204968500
36
- - !ruby/object:Gem::Dependency
37
- name: ci_reporter
38
- requirement: &70140204967840 !ruby/object:Gem::Requirement
39
- none: false
40
- requirements:
41
- - - ! '>='
42
- - !ruby/object:Gem::Version
43
- version: '0'
44
- type: :runtime
45
- prerelease: false
46
- version_requirements: *70140204967840
35
+ version_requirements: *70174767877780
47
36
  - !ruby/object:Gem::Dependency
48
37
  name: eventmachine
49
- requirement: &70140204967240 !ruby/object:Gem::Requirement
38
+ requirement: &70174767877280 !ruby/object:Gem::Requirement
50
39
  none: false
51
40
  requirements:
52
41
  - - ~>
53
42
  - !ruby/object:Gem::Version
54
- version: 1.0.0.beta.4
43
+ version: 1.0.0.rc1
55
44
  type: :runtime
56
45
  prerelease: false
57
- version_requirements: *70140204967240
46
+ version_requirements: *70174767877280
58
47
  - !ruby/object:Gem::Dependency
59
48
  name: em-http-request
60
- requirement: &70140204966740 !ruby/object:Gem::Requirement
49
+ requirement: &70174767876860 !ruby/object:Gem::Requirement
61
50
  none: false
62
51
  requirements:
63
52
  - - ! '>='
@@ -65,7 +54,7 @@ dependencies:
65
54
  version: '0'
66
55
  type: :runtime
67
56
  prerelease: false
68
- version_requirements: *70140204966740
57
+ version_requirements: *70174767876860
69
58
  description: Crawl all pages on a domain, checking for errors
70
59
  email:
71
60
  - tor@alphasights.com
@@ -83,6 +72,7 @@ files:
83
72
  - lib/crawl.rb
84
73
  - lib/crawl/engine.rb
85
74
  - lib/crawl/failure.rb
75
+ - lib/crawl/page.rb
86
76
  - lib/crawl/register.rb
87
77
  - lib/crawl/string.rb
88
78
  - lib/crawl/version.rb