crawl 0.1.0.beta2 → 0.1.0.beta3
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/crawl +2 -4
- data/crawl.gemspec +1 -2
- data/lib/crawl/engine.rb +25 -87
- data/lib/crawl/page.rb +53 -0
- data/lib/crawl/register.rb +32 -69
- data/lib/crawl/version.rb +1 -1
- data/lib/crawl.rb +2 -1
- metadata +12 -22
data/bin/crawl
CHANGED
@@ -8,10 +8,9 @@ optparse = OptionParser.new do |opts|
|
|
8
8
|
opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
|
9
9
|
opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
|
10
10
|
opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
|
11
|
-
opts.on('-
|
12
|
-
opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
|
13
|
-
opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
|
11
|
+
opts.on('-v', '--verbose', 'Give details when crawling') { |o| $verbose = o }
|
14
12
|
opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
|
13
|
+
opts.on_tail("-v", "--version", "Print version") { |o| puts Crawl::VERSION; exit }
|
15
14
|
end.parse!
|
16
15
|
|
17
16
|
options.merge!(domain: optparse.first)
|
@@ -33,6 +32,5 @@ crawler.run
|
|
33
32
|
crawler.summarize
|
34
33
|
|
35
34
|
if crawler.errors?
|
36
|
-
puts 'Errors during crawling'
|
37
35
|
exit -1
|
38
36
|
end
|
data/crawl.gemspec
CHANGED
@@ -16,7 +16,6 @@ Gem::Specification.new do |gem|
|
|
16
16
|
gem.version = Crawl::VERSION
|
17
17
|
gem.add_dependency('nokogiri')
|
18
18
|
gem.add_dependency('rest-client')
|
19
|
-
gem.add_dependency('
|
20
|
-
gem.add_dependency('eventmachine', '~> 1.0.0.beta.4')
|
19
|
+
gem.add_dependency('eventmachine', '~> 1.0.0.rc1')
|
21
20
|
gem.add_dependency('em-http-request')
|
22
21
|
end
|
data/lib/crawl/engine.rb
CHANGED
@@ -19,11 +19,11 @@ class Crawl::Engine
|
|
19
19
|
def initialize(caller_options = {})
|
20
20
|
@options = DEFAULT_OPTIONS.merge(caller_options)
|
21
21
|
@authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
|
22
|
-
@
|
23
|
-
@validate_markup = options[:markup]
|
24
|
-
@register = Crawl::Register.new(options[:start].to_a)
|
22
|
+
@register = Crawl::Register.new
|
25
23
|
|
26
|
-
|
24
|
+
start_pages = options[:start].to_a.map{|page| Page.new(@register, page, 'the command line')}
|
25
|
+
|
26
|
+
@register.add(start_pages)
|
27
27
|
end
|
28
28
|
|
29
29
|
def run
|
@@ -36,10 +36,8 @@ class Crawl::Engine
|
|
36
36
|
return if @register.processing_size >= EM.threadpool_size
|
37
37
|
if @register.finished?
|
38
38
|
EventMachine.stop
|
39
|
-
elsif (
|
40
|
-
|
41
|
-
retrieve(link)
|
42
|
-
# validate(link, response.body) if @validate_markup
|
39
|
+
elsif (page = @register.next_page)
|
40
|
+
retrieve(page)
|
43
41
|
process_next
|
44
42
|
end
|
45
43
|
end
|
@@ -54,100 +52,45 @@ class Crawl::Engine
|
|
54
52
|
|
55
53
|
private
|
56
54
|
|
57
|
-
def
|
58
|
-
puts "
|
59
|
-
|
60
|
-
json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
|
61
|
-
messages = JSON.parse(json_response.body)['messages']
|
62
|
-
error_messages = messages.select { |message| message['type'] != 'info' }
|
63
|
-
|
64
|
-
if error_messages.empty?
|
65
|
-
true
|
66
|
-
else
|
67
|
-
response = error_messages.map do |message|
|
68
|
-
type, message = message['type'], message['message']
|
69
|
-
type_color = type == 'error' ? 31 : 33
|
70
|
-
"\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
|
71
|
-
end.join("\n\n")
|
72
|
-
|
73
|
-
@register.error link, response
|
74
|
-
false
|
75
|
-
end
|
76
|
-
rescue RestClient::ServiceUnavailable
|
77
|
-
handle_error('U')
|
78
|
-
false
|
79
|
-
end
|
80
|
-
|
81
|
-
def register_error(link, message)
|
82
|
-
@register.error link, message
|
83
|
-
@register.returned_invalid link
|
84
|
-
process_next
|
85
|
-
end
|
86
|
-
|
87
|
-
def retrieve(link)
|
88
|
-
# test_suite = CI::Reporter::TestSuite.new(link)
|
89
|
-
# test_case = CI::Reporter::TestCase.new(link)
|
90
|
-
# test_suite.start
|
91
|
-
# test_case.start
|
92
|
-
# test_suite.name = link
|
93
|
-
# test_case.name = link
|
94
|
-
|
95
|
-
puts "Fetching #{options[:domain] + link} ..." if @verbose
|
55
|
+
def retrieve(page)
|
56
|
+
puts "Fetching #{page.url} ..." if $verbose
|
96
57
|
|
97
|
-
|
98
|
-
register_error(link, "Relative path found. Crawl does not support relative paths.")
|
99
|
-
return nil
|
100
|
-
end
|
58
|
+
full_url = options[:domain] + page.url
|
101
59
|
|
102
|
-
http = EventMachine::HttpRequest.new(
|
60
|
+
http = EventMachine::HttpRequest.new(full_url)
|
103
61
|
req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
|
104
|
-
req.timeout(
|
62
|
+
req.timeout(15)
|
105
63
|
|
106
64
|
req.errback do
|
107
65
|
if req.nil?
|
108
|
-
|
109
|
-
process_next
|
66
|
+
page.intermittent("Req is nil. WAT?")
|
110
67
|
elsif msg = req.error
|
111
|
-
|
68
|
+
page.intermittent(msg)
|
112
69
|
elsif req.response.nil? || req.response.empty?
|
113
|
-
|
114
|
-
process_next
|
115
|
-
# register_error(link, 'Timeout?')
|
70
|
+
page.intermittent('Timeout?')
|
116
71
|
else
|
117
|
-
|
118
|
-
process_next
|
72
|
+
page.intermittent('Partial response: Server Broke Connection?')
|
119
73
|
end
|
74
|
+
process_next
|
120
75
|
end
|
121
76
|
|
122
77
|
req.callback do
|
123
|
-
|
124
|
-
|
78
|
+
status_code = req.response_header.status
|
79
|
+
if VALID_RESPONSE_CODES.include?(status_code)
|
80
|
+
page.success
|
125
81
|
if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
|
126
|
-
@register.add
|
82
|
+
@register.add find_linked_pages(page, req.response.to_str)
|
127
83
|
end
|
84
|
+
elsif(status_code == 503)
|
85
|
+
page.intermittent("Status code: 503")
|
128
86
|
else
|
129
|
-
|
130
|
-
@register.returned_broken link
|
131
|
-
# test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
|
132
|
-
# test_suite.testcases << test_case
|
133
|
-
# test_suite.finish
|
134
|
-
# @report_manager.write_report(test_suite) if options[:ci]
|
87
|
+
page.fatal("Status code: #{status_code}")
|
135
88
|
end
|
136
89
|
process_next
|
137
90
|
end
|
138
|
-
|
139
|
-
# test_case.finish
|
140
|
-
# test_suite.testcases << test_case
|
141
|
-
# test_suite.finish
|
142
|
-
# @report_manager.write_report(test_suite) if options[:ci]
|
143
91
|
end
|
144
92
|
|
145
|
-
def
|
146
|
-
@register.source_for target
|
147
|
-
end
|
148
|
-
|
149
|
-
def find_links(source_link, body)
|
150
|
-
puts " Finding links.." if @verbose
|
93
|
+
def find_linked_pages(page, body)
|
151
94
|
doc = Nokogiri::HTML(body)
|
152
95
|
anchors = doc.css('a').to_a
|
153
96
|
anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
|
@@ -160,11 +103,6 @@ private
|
|
160
103
|
raw_links.map!{|link| link.sub(options[:domain], '')}
|
161
104
|
raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
|
162
105
|
raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
|
163
|
-
raw_links.
|
164
|
-
puts " Adding #{target_link} found on #{source_link}" if @verbose
|
165
|
-
@register.set_link_source(target_link, source_link)
|
166
|
-
end
|
167
|
-
|
168
|
-
raw_links
|
106
|
+
raw_links.map{ |url| Page.new(@register, url, page.url) }
|
169
107
|
end
|
170
108
|
end
|
data/lib/crawl/page.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# A single URL to crawl, together with the outcome of crawling it.
#
# A page reports its own outcome (#success, #fatal or #intermittent) to the
# register it was created with, by calling +completed+ or +retry+ on it.
# Ordering, equality and hashing are all based on the URL alone, so two pages
# with the same URL but different sources count as the same element in a Set
# or as the same Hash key.
class Page
  include Comparable

  attr_reader :register, :url, :source, :error

  # Number of intermittent failures that are retried before the page is
  # given up on; the failure after that is recorded as the page's error.
  ATTEMPTS = 3

  # register - object responding to +completed(page)+ and +retry(page)+
  # url      - the URL of this page (used for ordering, equality and hashing)
  # source   - where the URL was found (only used in #to_s)
  def initialize(register, url, source)
    @register = register
    @url = url
    @source = source
    @attempts = 0
    # Must be the same variable that attr_reader :error exposes.
    @error = nil
  end

  # Orders pages by URL. Returns nil (not comparable) when +other+ has no
  # URL, which makes Comparable#== answer false instead of raising.
  def <=>(other)
    return nil unless other.respond_to?(:url)

    url <=> other.url
  end

  # Hash-key equality; kept consistent with #hash.
  def eql?(other)
    other.respond_to?(:url) && url.eql?(other.url)
  end

  def hash
    url.hash
  end

  # Marks the page as crawled without error.
  def success
    @error = nil
    @register.completed(self)
  end

  # Records a non-recoverable error and marks the page as completed.
  def fatal(error)
    # $verbose is the application's own flag; Ruby's built-in $VERBOSE is the
    # interpreter warning level and is unrelated to it.
    puts " Fatal - #{error}" if $verbose
    @error = error
    @register.completed(self)
  end

  # Reports a failure that may be transient. The page is handed back to the
  # register for a retry until ATTEMPTS retries have been used; after that the
  # error is recorded and the page is marked as completed.
  def intermittent(error)
    puts " Intermittent - #{error}" if $verbose
    if @attempts >= ATTEMPTS
      @error = error
      @register.completed(self)
    else
      @attempts += 1
      @register.retry(self)
    end
  end

  def to_s
    "#{url} found on #{source} - #{error || 'OK'}"
  end
end
|
data/lib/crawl/register.rb
CHANGED
@@ -2,57 +2,38 @@ class Crawl::Register
|
|
2
2
|
|
3
3
|
Result = Struct.new(:url, :object)
|
4
4
|
|
5
|
-
def initialize
|
6
|
-
@unprocessed =
|
7
|
-
@processing =
|
8
|
-
@processed =
|
9
|
-
|
10
|
-
@invalid_links = Set[]
|
11
|
-
@broken_pages = Set[]
|
12
|
-
|
13
|
-
@errors = []
|
14
|
-
@link_sources = {}
|
5
|
+
def initialize
|
6
|
+
@unprocessed = Set.new
|
7
|
+
@processing = Set.new
|
8
|
+
@processed = Set.new
|
15
9
|
end
|
16
10
|
|
17
|
-
def add(
|
18
|
-
|
19
|
-
|
11
|
+
def add(pages)
|
12
|
+
new_pages = pages.to_set - @processed - @processing - @unprocessed
|
13
|
+
new_pages.each do |new_page|
|
14
|
+
puts " Adding #{new_page.url}" if $verbose
|
15
|
+
end
|
16
|
+
@unprocessed.merge(new_pages)
|
20
17
|
end
|
21
18
|
|
22
|
-
def
|
23
|
-
|
24
|
-
@
|
19
|
+
def next_page
|
20
|
+
page = @unprocessed.first
|
21
|
+
@unprocessed.delete(page)
|
22
|
+
@processing << page if page
|
25
23
|
if @processing.size > EM.threadpool_size
|
26
24
|
puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
|
27
25
|
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
def set_link_source(link, source)
|
32
|
-
@link_sources[link] = source
|
33
|
-
end
|
34
|
-
|
35
|
-
def source_for(link)
|
36
|
-
@link_sources.fetch link, '?'
|
37
|
-
end
|
38
|
-
|
39
|
-
def error(link, object)
|
40
|
-
@errors << Result.new(link, object)
|
26
|
+
page
|
41
27
|
end
|
42
28
|
|
43
|
-
def
|
44
|
-
|
45
|
-
@
|
29
|
+
def retry(page)
|
30
|
+
@unprocessed << page
|
31
|
+
@processing.delete(page)
|
46
32
|
end
|
47
33
|
|
48
|
-
def
|
49
|
-
|
50
|
-
@
|
51
|
-
end
|
52
|
-
|
53
|
-
def returned(link)
|
54
|
-
@processed << link
|
55
|
-
@processing -= [link]
|
34
|
+
def completed(page)
|
35
|
+
@processed << page
|
36
|
+
@processing.delete(page)
|
56
37
|
end
|
57
38
|
|
58
39
|
def finished?
|
@@ -63,40 +44,22 @@ class Crawl::Register
|
|
63
44
|
@processing.size
|
64
45
|
end
|
65
46
|
|
66
|
-
def
|
67
|
-
|
68
|
-
@processing -= [link]
|
69
|
-
@unprocessed << link
|
47
|
+
def error_pages
|
48
|
+
@processed.select{ |page| page.error }
|
70
49
|
end
|
71
50
|
|
72
|
-
def
|
73
|
-
|
51
|
+
def errors?
|
52
|
+
!error_pages.empty?
|
53
|
+
end
|
74
54
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
55
|
+
def summarize
|
56
|
+
if errors?
|
57
|
+
puts "\nPages with errors:"
|
58
|
+
error_pages.each do |page|
|
59
|
+
puts page.to_s
|
79
60
|
end
|
80
|
-
|
81
|
-
print(<<-SUM)
|
82
|
-
|
83
|
-
Pages crawled: #{@processed.size}
|
84
|
-
Pages with errors: #{@errors.size - @invalid_links.size}
|
85
|
-
Broken pages: #{@broken_pages.size}
|
86
|
-
Invalid links: #{@invalid_links.size}
|
87
|
-
|
88
|
-
I=Invalid P=Parse Error S=Status code bad
|
89
|
-
|
90
|
-
SUM
|
91
|
-
exit(@errors.size)
|
92
61
|
else
|
93
|
-
puts "\n
|
62
|
+
puts "\n#{@processed.size} pages crawled without errors."
|
94
63
|
end
|
95
|
-
|
96
|
-
puts
|
97
|
-
end
|
98
|
-
|
99
|
-
def errors?
|
100
|
-
@errors.size > 0
|
101
64
|
end
|
102
65
|
end
|
data/lib/crawl/version.rb
CHANGED
data/lib/crawl.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.
|
4
|
+
version: 0.1.0.beta3
|
5
5
|
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70174767878200 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70174767878200
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rest-client
|
27
|
-
requirement: &
|
27
|
+
requirement: &70174767877780 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,32 +32,21 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: ci_reporter
|
38
|
-
requirement: &70140204967840 !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
|
-
requirements:
|
41
|
-
- - ! '>='
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: '0'
|
44
|
-
type: :runtime
|
45
|
-
prerelease: false
|
46
|
-
version_requirements: *70140204967840
|
35
|
+
version_requirements: *70174767877780
|
47
36
|
- !ruby/object:Gem::Dependency
|
48
37
|
name: eventmachine
|
49
|
-
requirement: &
|
38
|
+
requirement: &70174767877280 !ruby/object:Gem::Requirement
|
50
39
|
none: false
|
51
40
|
requirements:
|
52
41
|
- - ~>
|
53
42
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.0.0.
|
43
|
+
version: 1.0.0.rc1
|
55
44
|
type: :runtime
|
56
45
|
prerelease: false
|
57
|
-
version_requirements: *
|
46
|
+
version_requirements: *70174767877280
|
58
47
|
- !ruby/object:Gem::Dependency
|
59
48
|
name: em-http-request
|
60
|
-
requirement: &
|
49
|
+
requirement: &70174767876860 !ruby/object:Gem::Requirement
|
61
50
|
none: false
|
62
51
|
requirements:
|
63
52
|
- - ! '>='
|
@@ -65,7 +54,7 @@ dependencies:
|
|
65
54
|
version: '0'
|
66
55
|
type: :runtime
|
67
56
|
prerelease: false
|
68
|
-
version_requirements: *
|
57
|
+
version_requirements: *70174767876860
|
69
58
|
description: Crawl all pages on a domain, checking for errors
|
70
59
|
email:
|
71
60
|
- tor@alphasights.com
|
@@ -83,6 +72,7 @@ files:
|
|
83
72
|
- lib/crawl.rb
|
84
73
|
- lib/crawl/engine.rb
|
85
74
|
- lib/crawl/failure.rb
|
75
|
+
- lib/crawl/page.rb
|
86
76
|
- lib/crawl/register.rb
|
87
77
|
- lib/crawl/string.rb
|
88
78
|
- lib/crawl/version.rb
|