crawl 0.0.5 → 0.1.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/crawl +1 -1
- data/crawl.gemspec +2 -0
- data/lib/crawl/engine.rb +88 -83
- data/lib/crawl/register.rb +102 -0
- data/lib/crawl/version.rb +1 -1
- data/lib/crawl.rb +4 -2
- metadata +34 -11
data/bin/crawl
CHANGED
data/crawl.gemspec
CHANGED
data/lib/crawl/engine.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
2
3
|
class Crawl::Engine
|
3
4
|
DEFAULT_OPTIONS = {:domain => '',
|
4
5
|
:start => ['/'],
|
@@ -13,69 +14,42 @@ class Crawl::Engine
|
|
13
14
|
MAX_REDIRECTS = 3
|
14
15
|
LINE_WIDTH = 78
|
15
16
|
|
16
|
-
|
17
|
-
|
18
|
-
attr_reader :options, :errors
|
19
|
-
|
17
|
+
attr_reader :options
|
20
18
|
|
21
19
|
def initialize(caller_options = {})
|
22
20
|
@options = DEFAULT_OPTIONS.merge(caller_options)
|
23
21
|
@authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
|
24
|
-
|
25
|
-
@found_links = options[:start].to_set
|
26
|
-
@link_sources = {}
|
27
|
-
@found_links.each {|target| @link_sources[target] = 'Initial'}
|
28
|
-
@visited_links = Set[]
|
29
|
-
@visited_documents = Set[]
|
30
|
-
@invalid_links = Set[]
|
31
|
-
@broken_pages = []
|
32
|
-
@errors = []
|
33
22
|
@verbose = options[:verbose] || ENV['VERBOSE']
|
34
|
-
@number_of_dots = 0
|
35
|
-
@report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
|
36
23
|
@validate_markup = options[:markup]
|
24
|
+
@register = Crawl::Register.new(options[:start].to_a)
|
25
|
+
|
26
|
+
@report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
|
37
27
|
end
|
38
28
|
|
39
29
|
def run
|
40
|
-
|
41
|
-
|
42
|
-
puts "\nChecking #{link}" if @verbose
|
43
|
-
next unless response = retrieve(link)
|
44
|
-
next unless response.headers[:content_type] =~ %r{text/html}
|
45
|
-
@visited_documents << link
|
46
|
-
@found_links += links = find_links(link, response.to_str)
|
47
|
-
validate(link, response.body) if @validate_markup
|
48
|
-
end
|
30
|
+
EventMachine.run do
|
31
|
+
process_next
|
49
32
|
end
|
50
33
|
end
|
51
34
|
|
52
|
-
|
35
|
+
def process_next
|
36
|
+
return if @register.processing_size >= EM.threadpool_size
|
37
|
+
if @register.finished?
|
38
|
+
EventMachine.stop
|
39
|
+
elsif (link = @register.next_link)
|
40
|
+
puts "\nChecking #{link}" if @verbose
|
41
|
+
retrieve(link)
|
42
|
+
# validate(link, response.body) if @validate_markup
|
43
|
+
process_next
|
44
|
+
end
|
45
|
+
end
|
53
46
|
|
54
47
|
def summarize
|
55
|
-
|
56
|
-
|
57
|
-
@errors.each do |error|
|
58
|
-
puts "\n#{error.url}"
|
59
|
-
puts " Linked from #{linked_from(error.url)}"
|
60
|
-
puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
|
61
|
-
end
|
62
|
-
|
63
|
-
print(<<-SUM)
|
64
|
-
|
65
|
-
Pages crawled: #{@visited_documents.size}
|
66
|
-
Pages with errors: #{@errors.size - @invalid_links.size}
|
67
|
-
Broken pages: #{@broken_pages.size}
|
68
|
-
Invalid links: #{@invalid_links.size}
|
69
|
-
|
70
|
-
I=Invalid P=Parse Error S=Status code bad
|
71
|
-
|
72
|
-
SUM
|
73
|
-
exit(@errors.size)
|
74
|
-
else
|
75
|
-
puts "\n\n#{@visited_documents.size} pages crawled"
|
76
|
-
end
|
48
|
+
@register.summarize
|
49
|
+
end
|
77
50
|
|
78
|
-
|
51
|
+
def errors?
|
52
|
+
@register.errors?
|
79
53
|
end
|
80
54
|
|
81
55
|
private
|
@@ -96,7 +70,7 @@ private
|
|
96
70
|
"\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
|
97
71
|
end.join("\n\n")
|
98
72
|
|
99
|
-
@
|
73
|
+
@register.error link, response
|
100
74
|
false
|
101
75
|
end
|
102
76
|
rescue RestClient::ServiceUnavailable
|
@@ -104,41 +78,72 @@ private
|
|
104
78
|
false
|
105
79
|
end
|
106
80
|
|
81
|
+
def register_error(link, message)
|
82
|
+
@register.error link, message
|
83
|
+
@register.returned_invalid link
|
84
|
+
process_next
|
85
|
+
end
|
86
|
+
|
107
87
|
def retrieve(link)
|
108
|
-
test_suite = CI::Reporter::TestSuite.new(link)
|
109
|
-
test_case = CI::Reporter::TestCase.new(link)
|
110
|
-
test_suite.start
|
111
|
-
test_case.start
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
test_case.finish
|
120
|
-
@visited_links << link
|
121
|
-
unless VALID_RESPONSE_CODES.include?(response.code)
|
122
|
-
@errors << Result.new(link, "Status code was #{response.code}")
|
123
|
-
@broken_pages << link
|
124
|
-
test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
|
125
|
-
test_suite.testcases << test_case
|
126
|
-
test_suite.finish
|
127
|
-
@report_manager.write_report(test_suite) if options[:ci]
|
88
|
+
# test_suite = CI::Reporter::TestSuite.new(link)
|
89
|
+
# test_case = CI::Reporter::TestCase.new(link)
|
90
|
+
# test_suite.start
|
91
|
+
# test_case.start
|
92
|
+
# test_suite.name = link
|
93
|
+
# test_case.name = link
|
94
|
+
|
95
|
+
puts "Fetching #{options[:domain] + link} ..." if @verbose
|
96
|
+
|
97
|
+
unless link.start_with? '/'
|
98
|
+
register_error(link, "Relative path found. Crawl does not support relative paths.")
|
128
99
|
return nil
|
129
100
|
end
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
101
|
+
|
102
|
+
http = EventMachine::HttpRequest.new(options[:domain] + link)
|
103
|
+
req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
|
104
|
+
req.timeout(30)
|
105
|
+
|
106
|
+
req.errback do
|
107
|
+
if req.nil?
|
108
|
+
@register.retry(link, 'WAT?')
|
109
|
+
process_next
|
110
|
+
elsif msg = req.error
|
111
|
+
register_error(link, msg)
|
112
|
+
elsif req.response.nil? || req.response.empty?
|
113
|
+
# no response at all?
|
114
|
+
@register.retry(link, 'Timeout?')
|
115
|
+
# register_error(link, 'Timeout?')
|
116
|
+
else
|
117
|
+
@register.retry(link, 'Partial response: Server Broke Connection?')
|
118
|
+
process_next
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
req.callback do
|
123
|
+
if VALID_RESPONSE_CODES.include?(req.response_header.status)
|
124
|
+
@register.returned link
|
125
|
+
if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
|
126
|
+
@register.add find_links(link, req.response.to_str)
|
127
|
+
end
|
128
|
+
else
|
129
|
+
@register.error link, "Status code was #{req.response_header.status}"
|
130
|
+
@register.returned_broken link
|
131
|
+
# test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
|
132
|
+
# test_suite.testcases << test_case
|
133
|
+
# test_suite.finish
|
134
|
+
# @report_manager.write_report(test_suite) if options[:ci]
|
135
|
+
end
|
136
|
+
process_next
|
137
|
+
end
|
138
|
+
|
139
|
+
# test_case.finish
|
140
|
+
# test_suite.testcases << test_case
|
141
|
+
# test_suite.finish
|
142
|
+
# @report_manager.write_report(test_suite) if options[:ci]
|
138
143
|
end
|
139
144
|
|
140
145
|
def linked_from(target)
|
141
|
-
@
|
146
|
+
@register.source_for target
|
142
147
|
end
|
143
148
|
|
144
149
|
def find_links(source_link, body)
|
@@ -147,17 +152,17 @@ private
|
|
147
152
|
anchors = doc.css('a').to_a
|
148
153
|
anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
|
149
154
|
anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
|
155
|
+
anchors.reject!{|anchor| anchor['data-remote'] =~ /true/ }
|
150
156
|
anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
|
157
|
+
anchors.reject!{|anchor| anchor['rel'].to_s =~ /nofollow/}
|
151
158
|
raw_links = anchors.map{|anchor| anchor['href']}
|
152
159
|
raw_links.compact!
|
153
160
|
raw_links.map!{|link| link.sub(options[:domain], '')}
|
154
|
-
raw_links.delete_if{|link| link =~ %r{^http
|
161
|
+
raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
|
155
162
|
raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
|
156
163
|
raw_links.each do |target_link|
|
157
|
-
|
158
|
-
|
159
|
-
@link_sources[target_link] = source_link
|
160
|
-
end
|
164
|
+
puts " Adding #{target_link} found on #{source_link}" if @verbose
|
165
|
+
@register.set_link_source(target_link, source_link)
|
161
166
|
end
|
162
167
|
|
163
168
|
raw_links
|
@@ -0,0 +1,102 @@
|
|
1
|
+
class Crawl::Register
|
2
|
+
|
3
|
+
Result = Struct.new(:url, :object)
|
4
|
+
|
5
|
+
def initialize(unprocessed)
|
6
|
+
@unprocessed = unprocessed
|
7
|
+
@processing = []
|
8
|
+
@processed = []
|
9
|
+
|
10
|
+
@invalid_links = Set[]
|
11
|
+
@broken_pages = Set[]
|
12
|
+
|
13
|
+
@errors = []
|
14
|
+
@link_sources = {}
|
15
|
+
end
|
16
|
+
|
17
|
+
def add(links)
|
18
|
+
new_links = links - @processed - @processing - @unprocessed
|
19
|
+
@unprocessed += new_links
|
20
|
+
end
|
21
|
+
|
22
|
+
def next_link
|
23
|
+
link = @unprocessed.shift
|
24
|
+
@processing << link if link
|
25
|
+
if @processing.size > EM.threadpool_size
|
26
|
+
puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
|
27
|
+
end
|
28
|
+
link
|
29
|
+
end
|
30
|
+
|
31
|
+
def set_link_source(link, source)
|
32
|
+
@link_sources[link] = source
|
33
|
+
end
|
34
|
+
|
35
|
+
def source_for(link)
|
36
|
+
@link_sources.fetch link, '?'
|
37
|
+
end
|
38
|
+
|
39
|
+
def error(link, object)
|
40
|
+
@errors << Result.new(link, object)
|
41
|
+
end
|
42
|
+
|
43
|
+
def returned_invalid(link)
|
44
|
+
returned link
|
45
|
+
@invalid_links << link
|
46
|
+
end
|
47
|
+
|
48
|
+
def returned_broken(link)
|
49
|
+
returned link
|
50
|
+
@broken_pages << link
|
51
|
+
end
|
52
|
+
|
53
|
+
def returned(link)
|
54
|
+
@processed << link
|
55
|
+
@processing -= [link]
|
56
|
+
end
|
57
|
+
|
58
|
+
def finished?
|
59
|
+
@unprocessed.size + @processing.size == 0
|
60
|
+
end
|
61
|
+
|
62
|
+
def processing_size
|
63
|
+
@processing.size
|
64
|
+
end
|
65
|
+
|
66
|
+
def retry(link, reason)
|
67
|
+
puts "Retrying #{link} : #{reason}"
|
68
|
+
@processing -= [link]
|
69
|
+
@unprocessed << link
|
70
|
+
end
|
71
|
+
|
72
|
+
def summarize
|
73
|
+
if @errors.size > 0
|
74
|
+
|
75
|
+
@errors.each do |error|
|
76
|
+
puts "\n#{error.url}"
|
77
|
+
puts " Linked from #{source_for error.url}"
|
78
|
+
puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
|
79
|
+
end
|
80
|
+
|
81
|
+
print(<<-SUM)
|
82
|
+
|
83
|
+
Pages crawled: #{@processed.size}
|
84
|
+
Pages with errors: #{@errors.size - @invalid_links.size}
|
85
|
+
Broken pages: #{@broken_pages.size}
|
86
|
+
Invalid links: #{@invalid_links.size}
|
87
|
+
|
88
|
+
I=Invalid P=Parse Error S=Status code bad
|
89
|
+
|
90
|
+
SUM
|
91
|
+
exit(@errors.size)
|
92
|
+
else
|
93
|
+
puts "\n\n#{@processed.size} pages crawled"
|
94
|
+
end
|
95
|
+
|
96
|
+
puts
|
97
|
+
end
|
98
|
+
|
99
|
+
def errors?
|
100
|
+
@errors.size > 0
|
101
|
+
end
|
102
|
+
end
|
data/lib/crawl/version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -2,7 +2,8 @@
|
|
2
2
|
require('nokogiri')
|
3
3
|
require('rest_client')
|
4
4
|
require 'ci/reporter/core'
|
5
|
-
|
5
|
+
require 'eventmachine'
|
6
|
+
require 'em-http-request'
|
6
7
|
require 'base64'
|
7
8
|
require 'set'
|
8
9
|
require 'fileutils'
|
@@ -14,4 +15,5 @@ require 'tmpdir'
|
|
14
15
|
require_relative "crawl/version"
|
15
16
|
require_relative "crawl/engine"
|
16
17
|
require_relative "crawl/string"
|
17
|
-
require_relative "crawl/failure"
|
18
|
+
require_relative "crawl/failure"
|
19
|
+
require_relative "crawl/register"
|
metadata
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0.beta1
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Tor Erik Linnerud
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70366743805820 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70366743805820
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rest-client
|
27
|
-
requirement: &
|
27
|
+
requirement: &70366743804960 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70366743804960
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ci_reporter
|
38
|
-
requirement: &
|
38
|
+
requirement: &70366750375000 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,29 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70366750375000
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: eventmachine
|
49
|
+
requirement: &70366750374440 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.0.0.beta.4
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70366750374440
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: em-http-request
|
60
|
+
requirement: &70366750373840 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :runtime
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70366750373840
|
47
69
|
description: Crawl all pages on a domain, checking for errors
|
48
70
|
email:
|
49
71
|
- tor@alphasights.com
|
@@ -61,6 +83,7 @@ files:
|
|
61
83
|
- lib/crawl.rb
|
62
84
|
- lib/crawl/engine.rb
|
63
85
|
- lib/crawl/failure.rb
|
86
|
+
- lib/crawl/register.rb
|
64
87
|
- lib/crawl/string.rb
|
65
88
|
- lib/crawl/version.rb
|
66
89
|
homepage: http://github.com/alphasights/crawl
|
@@ -78,9 +101,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
78
101
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
102
|
none: false
|
80
103
|
requirements:
|
81
|
-
- - ! '
|
104
|
+
- - ! '>'
|
82
105
|
- !ruby/object:Gem::Version
|
83
|
-
version:
|
106
|
+
version: 1.3.1
|
84
107
|
requirements: []
|
85
108
|
rubyforge_project:
|
86
109
|
rubygems_version: 1.8.11
|