crawl 0.0.5 → 0.1.0.beta1

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
data/bin/crawl CHANGED
@@ -32,7 +32,7 @@ end
  crawler.run
  crawler.summarize
 
- unless crawler.errors.empty?
+ if crawler.errors?
  puts 'Errors during crawling'
  exit -1
  end
data/crawl.gemspec CHANGED
@@ -17,4 +17,6 @@ Gem::Specification.new do |gem|
  gem.add_dependency('nokogiri')
  gem.add_dependency('rest-client')
  gem.add_dependency('ci_reporter')
+ gem.add_dependency('eventmachine', '~> 1.0.0.beta.4')
+ gem.add_dependency('em-http-request')
  end
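
Both new dependencies back the EventMachine rewrite in engine.rb below. Because eventmachine is pinned to a prerelease (~> 1.0.0.beta.4), this release itself ships as a prerelease gem, so a plain `gem install crawl` keeps resolving 0.0.5. A minimal Gemfile sketch for opting in (the rubygems.org source line is an assumption; the version pins come from this diff):

    # Gemfile -- sketch only; the source URL is an assumption,
    # the pinned versions are taken from this diff.
    source 'https://rubygems.org'

    gem 'crawl', '0.1.0.beta1'  # eventmachine ~> 1.0.0.beta.4 and
                                # em-http-request come in as runtime deps

On the command line, `gem install crawl --pre` achieves the same.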
data/lib/crawl/engine.rb CHANGED
@@ -1,4 +1,5 @@
  # encoding: utf-8
+
  class Crawl::Engine
  DEFAULT_OPTIONS = {:domain => '',
  :start => ['/'],
@@ -13,69 +14,42 @@ class Crawl::Engine
  MAX_REDIRECTS = 3
  LINE_WIDTH = 78
 
- Result = Struct.new(:url, :object)
-
- attr_reader :options, :errors
-
+ attr_reader :options
 
  def initialize(caller_options = {})
  @options = DEFAULT_OPTIONS.merge(caller_options)
  @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
-
- @found_links = options[:start].to_set
- @link_sources = {}
- @found_links.each {|target| @link_sources[target] = 'Initial'}
- @visited_links = Set[]
- @visited_documents = Set[]
- @invalid_links = Set[]
- @broken_pages = []
- @errors = []
  @verbose = options[:verbose] || ENV['VERBOSE']
- @number_of_dots = 0
- @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
  @validate_markup = options[:markup]
+ @register = Crawl::Register.new(options[:start].to_a)
+
+ @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
  end
 
  def run
- until (links = @found_links - (@visited_links + @invalid_links)).empty? do
- links.each do |link|
- puts "\nChecking #{link}" if @verbose
- next unless response = retrieve(link)
- next unless response.headers[:content_type] =~ %r{text/html}
- @visited_documents << link
- @found_links += links = find_links(link, response.to_str)
- validate(link, response.body) if @validate_markup
- end
+ EventMachine.run do
+ process_next
  end
  end
 
-
+ def process_next
+ return if @register.processing_size >= EM.threadpool_size
+ if @register.finished?
+ EventMachine.stop
+ elsif (link = @register.next_link)
+ puts "\nChecking #{link}" if @verbose
+ retrieve(link)
+ # validate(link, response.body) if @validate_markup
+ process_next
+ end
+ end
 
  def summarize
- if @errors.size > 0
-
- @errors.each do |error|
- puts "\n#{error.url}"
- puts " Linked from #{linked_from(error.url)}"
- puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
- end
-
- print(<<-SUM)
-
- Pages crawled: #{@visited_documents.size}
- Pages with errors: #{@errors.size - @invalid_links.size}
- Broken pages: #{@broken_pages.size}
- Invalid links: #{@invalid_links.size}
-
- I=Invalid P=Parse Error S=Status code bad
-
- SUM
- exit(@errors.size)
- else
- puts "\n\n#{@visited_documents.size} pages crawled"
- end
+ @register.summarize
+ end
 
- puts
+ def errors?
+ @register.errors?
  end
 
  private
@@ -96,7 +70,7 @@ private
  "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
  end.join("\n\n")
 
- @errors << Result.new(link, response)
+ @register.error link, response
  false
  end
  rescue RestClient::ServiceUnavailable
@@ -104,41 +78,72 @@ private
  false
  end
 
+ def register_error(link, message)
+ @register.error link, message
+ @register.returned_invalid link
+ process_next
+ end
+
  def retrieve(link)
- test_suite = CI::Reporter::TestSuite.new(link)
- test_case = CI::Reporter::TestCase.new(link)
- test_suite.start
- test_case.start
- puts " Fetching.." if @verbose
-
- attributes = {:method => :get, :url => options[:domain] + link}
- attributes.merge!(user: options[:username], password: options[:password])
- response = RestClient::Request.execute(attributes)
- test_suite.name = link
- test_case.name = link
- test_case.finish
- @visited_links << link
- unless VALID_RESPONSE_CODES.include?(response.code)
- @errors << Result.new(link, "Status code was #{response.code}")
- @broken_pages << link
- test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
- test_suite.testcases << test_case
- test_suite.finish
- @report_manager.write_report(test_suite) if options[:ci]
+ # test_suite = CI::Reporter::TestSuite.new(link)
+ # test_case = CI::Reporter::TestCase.new(link)
+ # test_suite.start
+ # test_case.start
+ # test_suite.name = link
+ # test_case.name = link
+
+ puts "Fetching #{options[:domain] + link} ..." if @verbose
+
+ unless link.start_with? '/'
+ register_error(link, "Relative path found. Crawl does not support relative paths.")
  return nil
  end
- test_suite.testcases << test_case
- test_suite.finish
- @report_manager.write_report(test_suite) if options[:ci]
- return response
- rescue RestClient::InternalServerError, RestClient::ResourceNotFound, RestClient::Unauthorized => e
- @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
- @invalid_links << link
- return nil
+
+ http = EventMachine::HttpRequest.new(options[:domain] + link)
+ req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
+ req.timeout(30)
+
+ req.errback do
+ if req.nil?
+ @register.retry(link, 'WAT?')
+ process_next
+ elsif msg = req.error
+ register_error(link, msg)
+ elsif req.response.nil? || req.response.empty?
+ # no response at all?
+ @register.retry(link, 'Timeout?')
+ # register_error(link, 'Timeout?')
+ else
+ @register.retry(link, 'Partial response: Server Broke Connection?')
+ process_next
+ end
+ end
+
+ req.callback do
+ if VALID_RESPONSE_CODES.include?(req.response_header.status)
+ @register.returned link
+ if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
+ @register.add find_links(link, req.response.to_str)
+ end
+ else
+ @register.error link, "Status code was #{req.response_header.status}"
+ @register.returned_broken link
+ # test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
+ # test_suite.testcases << test_case
+ # test_suite.finish
+ # @report_manager.write_report(test_suite) if options[:ci]
+ end
+ process_next
+ end
+
+ # test_case.finish
+ # test_suite.testcases << test_case
+ # test_suite.finish
+ # @report_manager.write_report(test_suite) if options[:ci]
  end
 
  def linked_from(target)
- @link_sources[target] # => source
+ @register.source_for target
  end
 
  def find_links(source_link, body)
@@ -147,17 +152,17 @@ private
  anchors = doc.css('a').to_a
  anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
  anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
+ anchors.reject!{|anchor| anchor['data-remote'] =~ /true/ }
  anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
+ anchors.reject!{|anchor| anchor['rel'].to_s =~ /nofollow/}
  raw_links = anchors.map{|anchor| anchor['href']}
  raw_links.compact!
  raw_links.map!{|link| link.sub(options[:domain], '')}
- raw_links.delete_if{|link| link =~ %r{^http://}}
+ raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
  raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
  raw_links.each do |target_link|
- unless @found_links.include?(target_link)
- puts " Adding #{target_link} found on #{source_link}" if @verbose
- @link_sources[target_link] = source_link
- end
+ puts " Adding #{target_link} found on #{source_link}" if @verbose
+ @register.set_link_source(target_link, source_link)
  end
 
  raw_links
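
The new retrieve/process_next pair is the heart of the rewrite: run boots the EventMachine reactor, process_next keeps pulling links from the register until the in-flight count reaches EM.threadpool_size, and every em-http-request callback or errback re-enters process_next so the pool stays saturated until the register reports finished?. A minimal self-contained sketch of that pattern, assuming the eventmachine and em-http-request gems added above (the URLs and MAX_CONCURRENT constant are illustrative, not from the gem):

    # Sketch of the callback-driven fetch loop the new Engine uses.
    # URLs and MAX_CONCURRENT are illustrative assumptions.
    require 'eventmachine'
    require 'em-http-request'

    MAX_CONCURRENT = 10
    queue     = ['http://example.com/', 'http://example.com/about']
    in_flight = 0

    process_next = lambda do
      next if in_flight >= MAX_CONCURRENT        # pool saturated: wait for a callback
      if queue.empty? && in_flight.zero?
        EventMachine.stop                        # nothing queued, nothing in flight
      elsif (url = queue.shift)
        in_flight += 1
        req = EventMachine::HttpRequest.new(url).get :redirects => 3
        req.callback { in_flight -= 1; puts "#{req.response_header.status} #{url}"; process_next.call }
        req.errback  { in_flight -= 1; puts "FAILED #{url}: #{req.error}"; process_next.call }
        process_next.call                        # issue more requests up to the cap
      end
    end

    EventMachine.run { process_next.call }

The gem's own version differs mainly in that a timed-out or partial response is pushed back onto the register via retry rather than dropped.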
data/lib/crawl/register.rb ADDED
@@ -0,0 +1,102 @@
+ class Crawl::Register
+
+ Result = Struct.new(:url, :object)
+
+ def initialize(unprocessed)
+ @unprocessed = unprocessed
+ @processing = []
+ @processed = []
+
+ @invalid_links = Set[]
+ @broken_pages = Set[]
+
+ @errors = []
+ @link_sources = {}
+ end
+
+ def add(links)
+ new_links = links - @processed - @processing - @unprocessed
+ @unprocessed += new_links
+ end
+
+ def next_link
+ link = @unprocessed.shift
+ @processing << link if link
+ if @processing.size > EM.threadpool_size
+ puts "WARNING: #{@processing.size} pages are being processed when the EM threadpool only has #{EM.threadpool_size} threads."
+ end
+ link
+ end
+
+ def set_link_source(link, source)
+ @link_sources[link] = source
+ end
+
+ def source_for(link)
+ @link_sources.fetch link, '?'
+ end
+
+ def error(link, object)
+ @errors << Result.new(link, object)
+ end
+
+ def returned_invalid(link)
+ returned link
+ @invalid_links << link
+ end
+
+ def returned_broken(link)
+ returned link
+ @broken_pages << link
+ end
+
+ def returned(link)
+ @processed << link
+ @processing -= [link]
+ end
+
+ def finished?
+ @unprocessed.size + @processing.size == 0
+ end
+
+ def processing_size
+ @processing.size
+ end
+
+ def retry(link, reason)
+ puts "Retrying #{link} : #{reason}"
+ @processing -= [link]
+ @unprocessed << link
+ end
+
+ def summarize
+ if @errors.size > 0
+
+ @errors.each do |error|
+ puts "\n#{error.url}"
+ puts " Linked from #{source_for error.url}"
+ puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
+ end
+
+ print(<<-SUM)
+
+ Pages crawled: #{@processed.size}
+ Pages with errors: #{@errors.size - @invalid_links.size}
+ Broken pages: #{@broken_pages.size}
+ Invalid links: #{@invalid_links.size}
+
+ I=Invalid P=Parse Error S=Status code bad
+
+ SUM
+ exit(@errors.size)
+ else
+ puts "\n\n#{@processed.size} pages crawled"
+ end
+
+ puts
+ end
+
+ def errors?
+ @errors.size > 0
+ end
+ end
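
Register centralises the bookkeeping that previously lived in half a dozen Engine instance variables. A link flows unprocessed -> processing (next_link) -> processed (returned, returned_broken, returned_invalid), or back to unprocessed via retry. A minimal lifecycle sketch, assuming the gem and the class above are loaded (the paths and error text are illustrative):

    # Sketch of the Register lifecycle; paths and error text are illustrative.
    require 'crawl'

    register = Crawl::Register.new(['/'])

    link = register.next_link                # '/' moves unprocessed -> processing
    register.add(['/about'])                 # newly discovered; already-seen links are dropped
    register.set_link_source('/about', link)
    register.returned(link)                  # fetch succeeded: processing -> processed

    about = register.next_link
    register.error(about, 'Status code was 500')
    register.returned_broken(about)          # tallied under "Broken pages"

    register.finished?                       # => true: nothing queued or in flight
    register.errors?                         # => true
    register.summarize                       # prints the report, then exit(1)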
data/lib/crawl/version.rb CHANGED
@@ -1,4 +1,4 @@
  # encoding: utf-8
  module Crawl
- VERSION = "0.0.5"
+ VERSION = "0.1.0.beta1"
  end
data/lib/crawl.rb CHANGED
@@ -2,7 +2,8 @@
  require('nokogiri')
  require('rest_client')
  require 'ci/reporter/core'
-
+ require 'eventmachine'
+ require 'em-http-request'
  require 'base64'
  require 'set'
  require 'fileutils'
@@ -14,4 +15,5 @@ require 'tmpdir'
  require_relative "crawl/version"
  require_relative "crawl/engine"
  require_relative "crawl/string"
- require_relative "crawl/failure"
+ require_relative "crawl/failure"
+ require_relative "crawl/register"
metadata CHANGED
@@ -1,19 +1,19 @@
  --- !ruby/object:Gem::Specification
  name: crawl
  version: !ruby/object:Gem::Version
- version: 0.0.5
- prerelease:
+ version: 0.1.0.beta1
+ prerelease: 6
  platform: ruby
  authors:
  - Tor Erik Linnerud
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-02-21 00:00:00.000000000 Z
+ date: 2012-04-30 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
- requirement: &70216317741600 !ruby/object:Gem::Requirement
+ requirement: &70366743805820 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70216317741600
+ version_requirements: *70366743805820
  - !ruby/object:Gem::Dependency
  name: rest-client
- requirement: &70216317740600 !ruby/object:Gem::Requirement
+ requirement: &70366743804960 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70216317740600
+ version_requirements: *70366743804960
  - !ruby/object:Gem::Dependency
  name: ci_reporter
- requirement: &70216317739980 !ruby/object:Gem::Requirement
+ requirement: &70366750375000 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -43,7 +43,29 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70216317739980
+ version_requirements: *70366750375000
+ - !ruby/object:Gem::Dependency
+ name: eventmachine
+ requirement: &70366750374440 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 1.0.0.beta.4
+ type: :runtime
+ prerelease: false
+ version_requirements: *70366750374440
+ - !ruby/object:Gem::Dependency
+ name: em-http-request
+ requirement: &70366750373840 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: *70366750373840
  description: Crawl all pages on a domain, checking for errors
  email:
  - tor@alphasights.com
@@ -61,6 +83,7 @@ files:
  - lib/crawl.rb
  - lib/crawl/engine.rb
  - lib/crawl/failure.rb
+ - lib/crawl/register.rb
  - lib/crawl/string.rb
  - lib/crawl/version.rb
  homepage: http://github.com/alphasights/crawl
@@ -78,9 +101,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
- - - ! '>='
+ - - ! '>'
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.3.1
  requirements: []
  rubyforge_project:
  rubygems_version: 1.8.11