crawl 0.0.5 → 0.1.0.beta1
- data/bin/crawl +1 -1
- data/crawl.gemspec +2 -0
- data/lib/crawl/engine.rb +88 -83
- data/lib/crawl/register.rb +102 -0
- data/lib/crawl/version.rb +1 -1
- data/lib/crawl.rb +4 -2
- metadata +34 -11
data/bin/crawl CHANGED
data/crawl.gemspec CHANGED
data/lib/crawl/engine.rb CHANGED
@@ -1,4 +1,5 @@
 # encoding: utf-8
+
 class Crawl::Engine
   DEFAULT_OPTIONS = {:domain => '',
                      :start => ['/'],
@@ -13,69 +14,42 @@ class Crawl::Engine
   MAX_REDIRECTS = 3
   LINE_WIDTH = 78
 
-
-
-  attr_reader :options, :errors
-
+  attr_reader :options
 
   def initialize(caller_options = {})
     @options = DEFAULT_OPTIONS.merge(caller_options)
     @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
-
-    @found_links = options[:start].to_set
-    @link_sources = {}
-    @found_links.each {|target| @link_sources[target] = 'Initial'}
-    @visited_links = Set[]
-    @visited_documents = Set[]
-    @invalid_links = Set[]
-    @broken_pages = []
-    @errors = []
     @verbose = options[:verbose] || ENV['VERBOSE']
-    @number_of_dots = 0
-    @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
     @validate_markup = options[:markup]
+    @register = Crawl::Register.new(options[:start].to_a)
+
+    @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
   end
 
   def run
-
-
-      puts "\nChecking #{link}" if @verbose
-      next unless response = retrieve(link)
-      next unless response.headers[:content_type] =~ %r{text/html}
-      @visited_documents << link
-      @found_links += links = find_links(link, response.to_str)
-      validate(link, response.body) if @validate_markup
-    end
+    EventMachine.run do
+      process_next
     end
   end
 
-
+  def process_next
+    return if @register.processing_size >= EM.threadpool_size
+    if @register.finished?
+      EventMachine.stop
+    elsif (link = @register.next_link)
+      puts "\nChecking #{link}" if @verbose
+      retrieve(link)
+      # validate(link, response.body) if @validate_markup
+      process_next
+    end
+  end
 
   def summarize
-
-
-      @errors.each do |error|
-        puts "\n#{error.url}"
-        puts " Linked from #{linked_from(error.url)}"
-        puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
-      end
-
-      print(<<-SUM)
-
-      Pages crawled: #{@visited_documents.size}
-      Pages with errors: #{@errors.size - @invalid_links.size}
-      Broken pages: #{@broken_pages.size}
-      Invalid links: #{@invalid_links.size}
-
-      I=Invalid P=Parse Error S=Status code bad
-
-      SUM
-      exit(@errors.size)
-    else
-      puts "\n\n#{@visited_documents.size} pages crawled"
-    end
+    @register.summarize
+  end
 
-
+  def errors?
+    @register.errors?
   end
 
 private
@@ -96,7 +70,7 @@ private
        "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
      end.join("\n\n")
 
-      @
+      @register.error link, response
      false
    end
  rescue RestClient::ServiceUnavailable
@@ -104,41 +78,72 @@ private
    false
  end
 
+  def register_error(link, message)
+    @register.error link, message
+    @register.returned_invalid link
+    process_next
+  end
+
  def retrieve(link)
-    test_suite = CI::Reporter::TestSuite.new(link)
-    test_case = CI::Reporter::TestCase.new(link)
-    test_suite.start
-    test_case.start
-
-
-
-
-
-
-
-    test_case.finish
-    @visited_links << link
-    unless VALID_RESPONSE_CODES.include?(response.code)
-      @errors << Result.new(link, "Status code was #{response.code}")
-      @broken_pages << link
-      test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
-      test_suite.testcases << test_case
-      test_suite.finish
-      @report_manager.write_report(test_suite) if options[:ci]
+    # test_suite = CI::Reporter::TestSuite.new(link)
+    # test_case = CI::Reporter::TestCase.new(link)
+    # test_suite.start
+    # test_case.start
+    # test_suite.name = link
+    # test_case.name = link
+
+    puts "Fetching #{options[:domain] + link} ..." if @verbose
+
+    unless link.start_with? '/'
+      register_error(link, "Relative path found. Crawl does not support relative paths.")
      return nil
    end
-
-
-
-
-
-
-
-
+
+    http = EventMachine::HttpRequest.new(options[:domain] + link)
+    req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
+    req.timeout(30)
+
+    req.errback do
+      if req.nil?
+        @register.retry(link, 'WAT?')
+        process_next
+      elsif msg = req.error
+        register_error(link, msg)
+      elsif req.response.nil? || req.response.empty?
+        # no response at all?
+        @register.retry(link, 'Timeout?')
+        # register_error(link, 'Timeout?')
+      else
+        @register.retry(link, 'Partial response: Server Broke Connection?')
+        process_next
+      end
+    end
+
+    req.callback do
+      if VALID_RESPONSE_CODES.include?(req.response_header.status)
+        @register.returned link
+        if req.response_header["CONTENT_TYPE"] =~ %r{text/html}
+          @register.add find_links(link, req.response.to_str)
+        end
+      else
+        @register.error link, "Status code was #{req.response_header.status}"
+        @register.returned_broken link
+        # test_case.failures << Crawl::Failure.new(link, req.response_header.status, linked_from(link))
+        # test_suite.testcases << test_case
+        # test_suite.finish
+        # @report_manager.write_report(test_suite) if options[:ci]
+      end
+      process_next
+    end
+
+    # test_case.finish
+    # test_suite.testcases << test_case
+    # test_suite.finish
+    # @report_manager.write_report(test_suite) if options[:ci]
  end
 
  def linked_from(target)
-    @
+    @register.source_for target
  end
 
  def find_links(source_link, body)
@@ -147,17 +152,17 @@ private
    anchors = doc.css('a').to_a
    anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
    anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
+    anchors.reject!{|anchor| anchor['data-remote'] =~ /true/ }
    anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
+    anchors.reject!{|anchor| anchor['rel'].to_s =~ /nofollow/}
    raw_links = anchors.map{|anchor| anchor['href']}
    raw_links.compact!
    raw_links.map!{|link| link.sub(options[:domain], '')}
-    raw_links.delete_if{|link| link =~ %r{^http
+    raw_links.delete_if{|link| link =~ %r{^http(s)?://}}
    raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
    raw_links.each do |target_link|
-
-
-        @link_sources[target_link] = source_link
-      end
+      puts " Adding #{target_link} found on #{source_link}" if @verbose
+      @register.set_link_source(target_link, source_link)
    end
 
    raw_links
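
Note on the rewritten engine above: 0.0.5 walked the link set serially with rest-client, while 0.1.0.beta1 hands fetching to EventMachine and em-http-request. process_next keeps issuing requests until @register.processing_size reaches EM.threadpool_size, and every callback and errback invokes process_next again, so the reactor stays saturated until the register reports finished?. A minimal standalone sketch of that pattern (not from the gem; the queue contents, domain, and counter names are illustrative):

    require 'eventmachine'
    require 'em-http-request'

    queue         = ['/', '/about']  # stand-in for the register's unprocessed list
    in_flight     = 0
    max_in_flight = 4                # the engine uses EM.threadpool_size here

    process_next = lambda do
      if queue.empty? && in_flight.zero?
        EventMachine.stop            # nothing queued, nothing pending: done
      elsif in_flight < max_in_flight && (path = queue.shift)
        in_flight += 1
        req = EventMachine::HttpRequest.new('http://example.com' + path).get :redirects => 3
        req.callback do
          in_flight -= 1
          puts "#{req.response_header.status} #{path}"
          process_next.call          # refill the pipeline as results arrive
        end
        req.errback do
          in_flight -= 1
          puts "failed: #{path}"
          process_next.call
        end
        process_next.call            # fan out until saturated
      end
    end

    EventMachine.run { process_next.call }
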
data/lib/crawl/register.rb ADDED
@@ -0,0 +1,102 @@
+class Crawl::Register
+
+  Result = Struct.new(:url, :object)
+
+  def initialize(unprocessed)
+    @unprocessed = unprocessed
+    @processing = []
+    @processed = []
+
+    @invalid_links = Set[]
+    @broken_pages = Set[]
+
+    @errors = []
+    @link_sources = {}
+  end
+
+  def add(links)
+    new_links = links - @processed - @processing - @unprocessed
+    @unprocessed += new_links
+  end
+
+  def next_link
+    link = @unprocessed.shift
+    @processing << link if link
+    if @processing.size > EM.threadpool_size
+      puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
+    end
+    link
+  end
+
+  def set_link_source(link, source)
+    @link_sources[link] = source
+  end
+
+  def source_for(link)
+    @link_sources.fetch link, '?'
+  end
+
+  def error(link, object)
+    @errors << Result.new(link, object)
+  end
+
+  def returned_invalid(link)
+    returned link
+    @invalid_links << link
+  end
+
+  def returned_broken(link)
+    returned link
+    @broken_pages << link
+  end
+
+  def returned(link)
+    @processed << link
+    @processing -= [link]
+  end
+
+  def finished?
+    @unprocessed.size + @processing.size == 0
+  end
+
+  def processing_size
+    @processing.size
+  end
+
+  def retry(link, reason)
+    puts "Retrying #{link} : #{reason}"
+    @processing -= [link]
+    @unprocessed << link
+  end
+
+  def summarize
+    if @errors.size > 0
+
+      @errors.each do |error|
+        puts "\n#{error.url}"
+        puts " Linked from #{source_for error.url}"
+        puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
+      end
+
+      print(<<-SUM)
+
+      Pages crawled: #{@processed.size}
+      Pages with errors: #{@errors.size - @invalid_links.size}
+      Broken pages: #{@broken_pages.size}
+      Invalid links: #{@invalid_links.size}
+
+      I=Invalid P=Parse Error S=Status code bad
+
+      SUM
+      exit(@errors.size)
+    else
+      puts "\n\n#{@processed.size} pages crawled"
+    end
+
+    puts
+  end
+
+  def errors?
+    @errors.size > 0
+  end
+end
data/lib/crawl/version.rb CHANGED
data/lib/crawl.rb CHANGED
@@ -2,7 +2,8 @@
 require('nokogiri')
 require('rest_client')
 require 'ci/reporter/core'
-
+require 'eventmachine'
+require 'em-http-request'
 require 'base64'
 require 'set'
 require 'fileutils'
@@ -14,4 +15,5 @@ require 'tmpdir'
 require_relative "crawl/version"
 require_relative "crawl/engine"
 require_relative "crawl/string"
-require_relative "crawl/failure"
+require_relative "crawl/failure"
+require_relative "crawl/register"
metadata CHANGED
@@ -1,19 +1,19 @@
 --- !ruby/object:Gem::Specification
 name: crawl
 version: !ruby/object:Gem::Version
-  version: 0.0.5
-  prerelease:
+  version: 0.1.0.beta1
+  prerelease: 6
 platform: ruby
 authors:
 - Tor Erik Linnerud
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-04-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70366743805820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70366743805820
 - !ruby/object:Gem::Dependency
   name: rest-client
-  requirement: &
+  requirement: &70366743804960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70366743804960
 - !ruby/object:Gem::Dependency
   name: ci_reporter
-  requirement: &
+  requirement: &70366750375000 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,7 +43,29 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70366750375000
+- !ruby/object:Gem::Dependency
+  name: eventmachine
+  requirement: &70366750374440 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.0.beta.4
+  type: :runtime
+  prerelease: false
+  version_requirements: *70366750374440
+- !ruby/object:Gem::Dependency
+  name: em-http-request
+  requirement: &70366750373840 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70366750373840
 description: Crawl all pages on a domain, checking for errors
 email:
 - tor@alphasights.com
@@ -61,6 +83,7 @@ files:
 - lib/crawl.rb
 - lib/crawl/engine.rb
 - lib/crawl/failure.rb
+- lib/crawl/register.rb
 - lib/crawl/string.rb
 - lib/crawl/version.rb
 homepage: http://github.com/alphasights/crawl
@@ -78,9 +101,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '
+  - - ! '>'
   - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.11
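
One practical consequence of the metadata changes: because 0.1.0.beta1 is a prerelease, RubyGems bumps required_rubygems_version to > 1.3.1 (last hunk above), and a plain install keeps resolving to 0.0.5. The beta has to be requested explicitly, either with gem install crawl --pre or with an exact pin:

    # Gemfile — explicit pin; a bare `gem 'crawl'` would still pick 0.0.5
    gem 'crawl', '0.1.0.beta1'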