crawl 0.1.0.beta2 → 0.1.0.beta3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/crawl +2 -4
- data/crawl.gemspec +1 -2
- data/lib/crawl/engine.rb +25 -87
- data/lib/crawl/page.rb +53 -0
- data/lib/crawl/register.rb +32 -69
- data/lib/crawl/version.rb +1 -1
- data/lib/crawl.rb +2 -1
- metadata +12 -22
data/bin/crawl
CHANGED
@@ -8,10 +8,9 @@ optparse = OptionParser.new do |opts|
   opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
   opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
   opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
-  opts.on('-
-  opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
-  opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
+  opts.on('-v', '--verbose', 'Give details when crawling') { |o| $verbose = o }
   opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
+  opts.on_tail("-v", "--version", "Print version") { |o| puts Crawl::VERSION; exit }
 end.parse!
 
 options.merge!(domain: optparse.first)
@@ -33,6 +32,5 @@ crawler.run
 crawler.summarize
 
 if crawler.errors?
-  puts 'Errors during crawling'
   exit -1
 end
data/crawl.gemspec
CHANGED
@@ -16,7 +16,6 @@ Gem::Specification.new do |gem|
   gem.version = Crawl::VERSION
   gem.add_dependency('nokogiri')
   gem.add_dependency('rest-client')
-  gem.add_dependency('ci_reporter')
-  gem.add_dependency('eventmachine', '~> 1.0.0.beta.4')
+  gem.add_dependency('eventmachine', '~> 1.0.0.rc1')
   gem.add_dependency('em-http-request')
 end
data/lib/crawl/engine.rb
CHANGED
@@ -19,11 +19,11 @@ class Crawl::Engine
   def initialize(caller_options = {})
     @options = DEFAULT_OPTIONS.merge(caller_options)
     @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
-    @verbose = options[:verbose]
-    @validate_markup = options[:markup]
-    @register = Crawl::Register.new(options[:start].to_a)
+    @register = Crawl::Register.new
 
-
+    start_pages = options[:start].to_a.map{|page| Page.new(@register, page, 'the command line')}
+
+    @register.add(start_pages)
   end
 
   def run
@@ -36,10 +36,8 @@ class Crawl::Engine
     return if @register.processing_size >= EM.threadpool_size
     if @register.finished?
       EventMachine.stop
-    elsif (
-
-      retrieve(link)
-      # validate(link, response.body) if @validate_markup
+    elsif (page = @register.next_page)
+      retrieve(page)
       process_next
     end
   end
@@ -54,100 +52,45 @@ class Crawl::Engine
 
 private
 
-  def validate(link, body)
-    puts "
-
-    json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
-    messages = JSON.parse(json_response.body)['messages']
-    error_messages = messages.select { |message| message['type'] != 'info' }
-
-    if error_messages.empty?
-      true
-    else
-      response = error_messages.map do |message|
-        type, message = message['type'], message['message']
-        type_color = type == 'error' ? 31 : 33
-        "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
-      end.join("\n\n")
-
-      @register.error link, response
-      false
-    end
-  rescue RestClient::ServiceUnavailable
-    handle_error('U')
-    false
-  end
-
-  def register_error(link, message)
-    @register.error link, message
-    @register.returned_invalid link
-    process_next
-  end
-
-  def retrieve(link)
-    # test_suite = CI::Reporter::TestSuite.new(link)
-    # test_case = CI::Reporter::TestCase.new(link)
-    # test_suite.start
-    # test_case.start
-    # test_suite.name = link
-    # test_case.name = link
-
-    puts "Fetching #{options[:domain] + link} ..." if @verbose
+  def retrieve(page)
+    puts "Fetching #{page.url} ..." if $verbose
 
-
-      register_error(link, "Relative path found. Crawl does not support relative paths.")
-      return nil
-    end
+    full_url = options[:domain] + page.url
 
-    http = EventMachine::HttpRequest.new(
+    http = EventMachine::HttpRequest.new(full_url)
     req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
-    req.timeout(
+    req.timeout(15)
 
     req.errback do
       if req.nil?
-
-        process_next
+        page.intermittent("Req is nil. WAT?")
       elsif msg = req.error
-
+        page.intermittent(msg)
      elsif req.response.nil? || req.response.empty?
-
-        process_next
-        # register_error(link, 'Timeout?')
+        page.intermittent('Timeout?')
      else
-
-        process_next
+        page.intermittent('Partial response: Server Broke Connection?')
      end
+      process_next
    end
 
    req.callback do
-
-
+      status_code = req.response_header.status
+      if VALID_RESPONSE_CODES.include?(status_code)
+        page.success
        if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
-          @register.add
+          @register.add find_linked_pages(page, req.response.to_str)
        end
+      elsif(status_code == 503)
+        page.intermittent("Status code: 503")
      else
-
-        @register.returned_broken link
-        # test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
-        # test_suite.testcases << test_case
-        # test_suite.finish
-        # @report_manager.write_report(test_suite) if options[:ci]
+        page.fatal("Status code: #{status_code}")
      end
      process_next
    end
-
-    # test_case.finish
-    # test_suite.testcases << test_case
-    # test_suite.finish
-    # @report_manager.write_report(test_suite) if options[:ci]
   end
 
-  def linked_from(target)
-    @register.source_for target
-  end
-
-  def find_links(source_link, body)
-    puts " Finding links.." if @verbose
+  def find_linked_pages(page, body)
    doc = Nokogiri::HTML(body)
    anchors = doc.css('a').to_a
    anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
@@ -160,11 +103,6 @@ private
    raw_links.map!{|link| link.sub(options[:domain], '')}
    raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
    raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
-    raw_links.each do |target_link|
-      puts " Adding #{target_link} found on #{source_link}" if @verbose
-      @register.set_link_source(target_link, source_link)
-    end
-
-    raw_links
+    raw_links.map{ |url| Page.new(@register, url, page.url) }
  end
end
data/lib/crawl/page.rb
ADDED
@@ -0,0 +1,53 @@
+class Page
+  include Comparable
+
+  attr_reader :register, :url, :source, :error
+
+  ATTEMPTS = 3
+
+  def initialize(register, url, source)
+    @register = register
+    @url = url
+    @source = source
+    @attempts = 0
+    @errors = nil
+  end
+
+  def <=>(other)
+    url <=> other.url
+  end
+
+  def eql?(other)
+    url.eql?(other.url)
+  end
+
+  def hash
+    url.hash
+  end
+
+  def success
+    @error = nil
+    @register.completed(self)
+  end
+
+  def fatal(error)
+    puts " Fatal - #{error}" if $VERBOSE
+    @error = error
+    @register.completed(self)
+  end
+
+  def intermittent(error)
+    puts " Intermittent - #{error}" if $VERBOSE
+    if @attempts >= ATTEMPTS
+      @error = error
+      @register.completed(self)
+    else
+      @attempts += 1
+      @register.retry(self)
+    end
+  end
+
+  def to_s
+    "#{url} found on #{source} - #{error || 'OK'}"
+  end
+end
data/lib/crawl/register.rb
CHANGED
@@ -2,57 +2,38 @@ class Crawl::Register
 
   Result = Struct.new(:url, :object)
 
-  def initialize
-    @unprocessed =
-    @processing =
-    @processed =
-
-    @invalid_links = Set[]
-    @broken_pages = Set[]
-
-    @errors = []
-    @link_sources = {}
+  def initialize
+    @unprocessed = Set.new
+    @processing = Set.new
+    @processed = Set.new
   end
 
-  def add(
-
-
+  def add(pages)
+    new_pages = pages.to_set - @processed - @processing - @unprocessed
+    new_pages.each do |new_page|
+      puts " Adding #{new_page.url}" if $verbose
+    end
+    @unprocessed.merge(new_pages)
   end
 
-  def
-
-    @
+  def next_page
+    page = @unprocessed.first
+    @unprocessed.delete(page)
+    @processing << page if page
     if @processing.size > EM.threadpool_size
       puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
     end
-
-  end
-
-  def set_link_source(link, source)
-    @link_sources[link] = source
-  end
-
-  def source_for(link)
-    @link_sources.fetch link, '?'
-  end
-
-  def error(link, object)
-    @errors << Result.new(link, object)
+    page
   end
 
-  def
-
-    @
+  def retry(page)
+    @unprocessed << page
+    @processing.delete(page)
   end
 
-  def
-
-    @
-  end
-
-  def returned(link)
-    @processed << link
-    @processing -= [link]
+  def completed(page)
+    @processed << page
+    @processing.delete(page)
   end
 
   def finished?
@@ -63,40 +44,22 @@ class Crawl::Register
     @processing.size
   end
 
-  def
-
-    @processing -= [link]
-    @unprocessed << link
+  def error_pages
+    @processed.select{ |page| page.error }
   end
 
-  def summarize
-
+  def errors?
+    !error_pages.empty?
+  end
 
-
-
-
-
+  def summarize
+    if errors?
+      puts "\nPages with errors:"
+      error_pages.each do |page|
+        puts page.to_s
       end
-
-      print(<<-SUM)
-
-      Pages crawled: #{@processed.size}
-      Pages with errors: #{@errors.size - @invalid_links.size}
-      Broken pages: #{@broken_pages.size}
-      Invalid links: #{@invalid_links.size}
-
-      I=Invalid P=Parse Error S=Status code bad
-
-      SUM
-      exit(@errors.size)
     else
-      puts "\n
+      puts "\n#{@processed.size} pages crawled without errors."
     end
-
-    puts
-  end
-
-  def errors?
-    @errors.size > 0
   end
 end
data/lib/crawl/version.rb
CHANGED
data/lib/crawl.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawl
 version: !ruby/object:Gem::Version
-  version: 0.1.0.beta2
+  version: 0.1.0.beta3
 prerelease: 6
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70174767878200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70174767878200
 - !ruby/object:Gem::Dependency
   name: rest-client
-  requirement: &
+  requirement: &70174767877780 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,32 +32,21 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
-- !ruby/object:Gem::Dependency
-  name: ci_reporter
-  requirement: &70140204967840 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: *70140204967840
+  version_requirements: *70174767877780
 - !ruby/object:Gem::Dependency
   name: eventmachine
-  requirement: &
+  requirement: &70174767877280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
     - !ruby/object:Gem::Version
-      version: 1.0.0.beta.4
+      version: 1.0.0.rc1
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70174767877280
 - !ruby/object:Gem::Dependency
   name: em-http-request
-  requirement: &
+  requirement: &70174767876860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,7 +54,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70174767876860
 description: Crawl all pages on a domain, checking for errors
 email:
 - tor@alphasights.com
@@ -83,6 +72,7 @@ files:
 - lib/crawl.rb
 - lib/crawl/engine.rb
 - lib/crawl/failure.rb
+- lib/crawl/page.rb
 - lib/crawl/register.rb
 - lib/crawl/string.rb
 - lib/crawl/version.rb