just_crawl 1.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +20 -0
- data/README.md +53 -0
- data/Rakefile +2 -0
- data/bin/just_crawl +39 -0
- data/circle.yml +3 -0
- data/just_crawl.gemspec +26 -0
- data/lib/just_crawl/engine.rb +116 -0
- data/lib/just_crawl/failure.rb +29 -0
- data/lib/just_crawl/page.rb +59 -0
- data/lib/just_crawl/register.rb +69 -0
- data/lib/just_crawl/string.rb +7 -0
- data/lib/just_crawl/version.rb +3 -0
- data/lib/just_crawl.rb +18 -0
- data/spec/lib/just_crawl/engine_spec.rb +9 -0
- data/spec/lib/just_crawl/failure_spec.rb +10 -0
- data/spec/lib/just_crawl/page_spec.rb +24 -0
- data/spec/lib/just_crawl/register_spec.rb +15 -0
- data/spec/lib/just_crawl/string_spec.rb +5 -0
- data/spec/lib/just_crawl/version_spec.rb +8 -0
- data/spec/lib/just_crawl_spec.rb +5 -0
- data/spec/spec_helper.rb +18 -0
- metadata +213 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 97e1b466ee0a52002c3e35a3f5ed622e5fa7208761dd831fe695c9b3104ed7db
|
4
|
+
data.tar.gz: 4ded6f28ab8e36e144969fba44f7d017fd0405fa2bece4efeb45f67e9b5985d5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b0f0ea51add4876a3b397abb74c0b6a0d4c21585eae7ecdf58069178f38396c6ec9022f96af42dbc69ecb50755d7237d9a533c42777cc6cf9debd977e9362c1e
|
7
|
+
data.tar.gz: 16a5673b5aa8d7d727de7f4cfbdedb05abbd9dab576738f1bf53ed04494a842626f04a86e0ff59d8515879dbe7225938479fd0b3a7927991964f853d4f5bebba
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.5.1
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 AlphaSights
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# JustCrawl
|
2
|
+
|
3
|
+
JustCrawl crawls pages within a domain, reporting any page that returns a bad response code
|
4
|
+
|
5
|
+
Usage:
|
6
|
+
|
7
|
+
> just_crawl [options] domain
|
8
|
+
|
9
|
+
Usage: just_crawl [options] domain
|
10
|
+
-s, --start /home,/about Starting path(s), defaults to /
|
11
|
+
-u, --username username Basic auth username
|
12
|
+
-p, --password password Basic auth password
|
13
|
+
-c, --connections count Max number of parallel connections to use. The default is 5.
|
14
|
+
-v, --verbose Give details when crawling
|
15
|
+
-h, --help Show this message
|
16
|
+
--version Print version
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
Example:
|
21
|
+
|
22
|
+
> just_crawl https://vaskohandmade.com --connections=5 --start=/ --verbose
|
23
|
+
|
24
|
+
Adding /
|
25
|
+
Fetching / ...
|
26
|
+
Adding index.html
|
27
|
+
Adding assets/custom/images/vasko/portfolio/reverb-con-modulo-belton.jpg
|
28
|
+
Adding assets/custom/images/vasko/portfolio/amplificador-fender-champ.jpg
|
29
|
+
Adding assets/custom/images/vasko/portfolio/overdrive-simil-zendrive.jpg
|
30
|
+
Adding assets/custom/images/vasko/portfolio/booster-simil-super-hardon.jpg
|
31
|
+
Adding assets/custom/images/vasko/portfolio/amplificador-valvular.jpg
|
32
|
+
Adding assets/custom/images/vasko/portfolio/rehousing-fender-superchamp.jpg
|
33
|
+
Adding assets/custom/images/vasko/portfolio/fuzz-simil-big-muff-violet-ram.jpg
|
34
|
+
Adding assets/custom/images/vasko/portfolio/amplificador-valvular-simil-marshall-1974x.jpg
|
35
|
+
Adding assets/custom/images/vasko/portfolio/distorsion-high-gain-simil-triple-wreck.jpg
|
36
|
+
Adding assets/custom/images/vasko/portfolio/booster-simil-rc-booster.jpg
|
37
|
+
Adding assets/custom/images/vasko/portfolio/amplificador-blues-mojo-7w-valvular.jpg
|
38
|
+
Adding assets/custom/images/vasko/portfolio/rehousing-fender-super-champ.jpg
|
39
|
+
Fetching index.html ...
|
40
|
+
Fetching assets/custom/images/vasko/portfolio/reverb-con-modulo-belton.jpg ...
|
41
|
+
Fetching assets/custom/images/vasko/portfolio/amplificador-fender-champ.jpg ...
|
42
|
+
Fetching assets/custom/images/vasko/portfolio/overdrive-simil-zendrive.jpg ...
|
43
|
+
Fetching assets/custom/images/vasko/portfolio/booster-simil-super-hardon.jpg ...
|
44
|
+
Fetching assets/custom/images/vasko/portfolio/amplificador-valvular.jpg ...
|
45
|
+
Fetching assets/custom/images/vasko/portfolio/rehousing-fender-superchamp.jpg ...
|
46
|
+
Fetching assets/custom/images/vasko/portfolio/fuzz-simil-big-muff-violet-ram.jpg ...
|
47
|
+
Fetching assets/custom/images/vasko/portfolio/amplificador-valvular-simil-marshall-1974x.jpg ...
|
48
|
+
Fetching assets/custom/images/vasko/portfolio/distorsion-high-gain-simil-triple-wreck.jpg ...
|
49
|
+
Fetching assets/custom/images/vasko/portfolio/booster-simil-rc-booster.jpg ...
|
50
|
+
Fetching assets/custom/images/vasko/portfolio/amplificador-blues-mojo-7w-valvular.jpg ...
|
51
|
+
Fetching assets/custom/images/vasko/portfolio/rehousing-fender-super-champ.jpg ...
|
52
|
+
|
53
|
+
14 pages crawled without errors.
|
data/Rakefile
ADDED
data/bin/just_crawl
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point for just_crawl: parses the options, crawls the
# given domain, prints a summary and exits with a non-zero status when the
# crawl reported errors or found no links.
require 'optparse'
require_relative '../lib/just_crawl.rb'

# Default limit of parallel connections; --connections overrides it.
EM.threadpool_size = 5

options = {}
# parse! removes the recognised switches from ARGV and returns what is left,
# i.e. the positional arguments.
remaining_args = OptionParser.new do |opts|
  opts.banner = "JustCrawl pages within a domain, reporting any page that returns a bad response code\nUsage: just_crawl [options] domain"
  opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
  opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
  opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
  opts.on('-c', '--connections count', Integer, "Max number of parallel connections to use. The default is #{EM.threadpool_size}.") { |o| EM.threadpool_size = o }
  opts.on('-v', '--verbose', 'Give details when crawling') { |o| $verbose = o }
  opts.on_tail('-h', '--help', 'Show this message') { puts opts; exit }
  # Long form only: the short switch -v already belongs to --verbose.
  opts.on_tail('--version', 'Print version') { puts JustCrawl::VERSION; exit }
end.parse!

# The first positional argument is the domain to crawl.
options.merge!(domain: remaining_args.first)

unless options[:domain]
  puts 'Must provide a domain'
  exit(-1)
end

# Assume plain HTTP when the domain was given without a scheme.
options[:domain] = "http://#{options[:domain]}" unless options[:domain].include?('://')

crawler = JustCrawl::Engine.new(options)

# On Ctrl-C, still report what has been crawled so far before exiting.
trap('SIGINT') do
  puts "\n\nAborting just_crawl.."
  crawler.summarize
  exit(-1)
end

crawler.run
crawler.summarize

exit(-1) if crawler.errors? || crawler.no_links_found?
|
data/circle.yml
ADDED
data/just_crawl.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Gem specification for just_crawl. The version file is loaded first so that
# JustCrawl::VERSION can be referenced below.
require File.expand_path('../lib/just_crawl/version', __FILE__)

Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'just_crawl'
  spec.version     = JustCrawl::VERSION
  spec.licenses    = ['MIT']
  spec.authors     = ['Nicolas Sebastian Vidal']
  spec.email       = ['nicolas.s.vidal@gmail.com']
  spec.homepage    = 'http://github.com/nisevi/just_crawl'
  spec.summary     = 'JustCrawl crawls pages within a domain, reporting any page that returns a bad response code'
  spec.description = 'JustCrawl crawls all pages on a domain, checking for errors'

  # Packaged files, taken from what git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.required_ruby_version = '>= 2.5.1'

  # Runtime dependencies
  spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.5'
  spec.add_dependency 'eventmachine', '~> 1.2', '>= 1.2.6'
  spec.add_dependency 'nokogiri', '~> 1.8', '>= 1.8.2'
  spec.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'

  # Development dependencies
  spec.add_development_dependency 'rspec-core', '~> 3.7', '>= 3.7.1'
  spec.add_development_dependency 'rspec-expectations', '~> 3.7'
  spec.add_development_dependency 'rspec_junit_formatter', '~> 0.3.0'
  spec.add_development_dependency 'simplecov', '~> 0.16.1'
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# Crawls the pages reachable from the configured start paths of one domain,
# fetching them through EventMachine and recording every outcome in a
# JustCrawl::Register.
class JustCrawl::Engine
  DEFAULT_OPTIONS = {
    domain: '',
    start: ['/'],
    username: '',
    password: '',
    verbose: false,
    session_id: false
  }.freeze

  # Links matching any of these patterns are dropped and never crawled.
  IGNORE = [/#/, /mailto:/, /skype:/, /logout/, /javascript:/, %r{/xhr/}, /https:/, /\.pdf$/, /^$/, /tel:/].freeze
  # Status codes for which a page counts as fetched successfully.
  VALID_RESPONSE_CODES = [200, 302].freeze
  MAX_REDIRECTS = 3
  LINE_WIDTH = 78

  attr_reader :options

  # caller_options - Hash overriding any key of DEFAULT_OPTIONS.
  def initialize(caller_options = {})
    @options = DEFAULT_OPTIONS.merge(caller_options)
    @register = JustCrawl::Register.new

    # Array() also accepts a single path String (or nil), where #to_a would
    # raise NoMethodError for a String.
    start_pages = Array(options[:start]).map { |page| Page.new(@register, page, '/') }

    @register.add(start_pages)
  end

  # Runs the crawl inside an EventMachine reactor; returns once
  # process_next has stopped the reactor.
  def run
    EventMachine.run do
      process_next
    end
  end

  # Starts as many fetches as the connection limit (EM.threadpool_size)
  # allows, and stops the reactor once the register reports it is finished.
  def process_next
    return if @register.processing_size >= EM.threadpool_size
    if @register.finished?
      EventMachine.stop
    elsif (page = @register.next_page)
      retrieve(page)
      # Keep filling the remaining connection slots.
      process_next
    end
  end

  def summarize
    @register.summarize
  end

  def errors?
    @register.errors?
  end

  def no_links_found?
    @register.no_links_found?
  end

  private

  # Issues an asynchronous GET for the page and wires up the success and
  # failure handlers; each handler ends by scheduling further work.
  def retrieve(page)
    puts "Fetching #{page.url} ..." if $verbose

    absolute_url = options[:domain] + page.relative_url

    http = EventMachine::HttpRequest.new(absolute_url)
    req = http.get redirects: MAX_REDIRECTS,
                   connect_timeout: 20,
                   inactivity_timeout: 20,
                   head: request_headers

    req.errback do
      page.intermittent(failure_reason(req))
      process_next
    end

    req.callback do
      handle_response(page, req)
      process_next
    end
  end

  # Headers for every request. Basic-auth credentials are included only when
  # a username or password was configured, so that no credentials header is
  # sent to sites that were not given any.
  def request_headers
    return {} if options[:username].to_s.empty? && options[:password].to_s.empty?

    { 'authorization' => [options[:username], options[:password]] }
  end

  # Picks the message recorded for a request that ended in the errback.
  def failure_reason(req)
    return 'Req is nil. WAT?' if req.nil?

    error = req.error
    return error if error
    return 'Timeout?' if req.response.nil? || req.response.empty?

    'Partial response: Server Broke Connection?'
  end

  # Records the result of a completed request and, for HTML responses,
  # registers the pages it links to.
  def handle_response(page, req)
    status_code = req.response_header.status
    if VALID_RESPONSE_CODES.include?(status_code)
      page.success
      if req.response_header['CONTENT_TYPE'] =~ %r{text/html}
        @register.add find_linked_pages(page, req.response.to_str)
      end
    elsif status_code == 503
      # 503 is treated as temporary, so the page may be retried.
      page.intermittent('Status code: 503')
    else
      page.fatal("Status code: #{status_code}")
    end
  end

  # Extracts the crawlable links from an HTML body and wraps each in a Page
  # whose source is the page the link was found on.
  def find_linked_pages(page, body)
    doc = Nokogiri::HTML(body)
    anchors = doc.css('a').to_a
    anchors.reject! { |anchor| skippable_anchor?(anchor) }

    raw_links = anchors.map { |anchor| anchor['href'] }
    raw_links.compact!
    # Turn absolute links to our own domain into relative ones ...
    raw_links.map! { |link| link.sub(options[:domain], '') }
    # ... and drop absolute links that point anywhere else.
    raw_links.delete_if { |link| link =~ %r{^http(s)?://} && !link.include?(options[:domain]) }
    raw_links.delete_if { |link| IGNORE.any? { |pattern| link =~ pattern } }
    raw_links.map { |url| Page.new(@register, url, page.url) }
  end

  # True for anchors that must not be followed: ones that POST via onclick,
  # use a non-GET data-method, are remote, carry an unobtrusive_* class or
  # are marked rel=nofollow.
  def skippable_anchor?(anchor)
    anchor['onclick'].to_s =~ /f.method = 'POST'/ ||
      anchor['data-method'] =~ /put|post|delete/ ||
      anchor['data-remote'] =~ /true/ ||
      anchor['class'].to_s =~ /unobtrusive_/ ||
      anchor['rel'].to_s =~ /nofollow/
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Describes a link that came back with a bad status code, together with the
# place it was linked from.
class JustCrawl::Failure
  attr_reader :link, :code, :from

  # link - the link that failed
  # code - the status code reported for it
  # from - where the link was found (used in #location)
  def initialize(link, code, from)
    @link, @code, @from = link, code, from
  end

  # A Failure is always a failure ...
  def failure?
    true
  end

  # ... and consequently never an error.
  def error?
    !failure?
  end

  # The failing link doubles as the name of the failure.
  def name
    link
  end

  # Human-readable description of what went wrong.
  def message
    "Status code was #{code}"
  end

  # Human-readable description of where the link was found.
  def location
    "Linked from #{from}"
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# A single URL found during a crawl, the page it was found on, and the
# outcome of fetching it. Outcomes are reported back to the register the
# page was created with.
class Page
  include Comparable

  attr_reader :register, :url, :source, :error

  # How many times an intermittently failing page is retried before the
  # failure is recorded as final.
  ATTEMPTS = 3

  # register - object receiving #completed / #retry notifications
  # url      - the link as found (relative or absolute)
  # source   - the URL of the page the link was found on
  def initialize(register, url, source)
    @register = register
    @url = url
    @source = source
    @attempts = 0
    # Backs the :error reader; nil means no error has been recorded.
    @error = nil
  end

  # Path of the page, resolved against its source. The host is only a
  # placeholder needed by URI.join; just the path part is kept.
  def relative_url
    @relative_url ||= URI.join('http://example.com', source, url).path
  end

  # Pages are ordered, and considered equal, by their resolved path.
  def <=>(other)
    relative_url <=> other.relative_url
  end

  # eql? and hash make pages with the same path collapse into a single
  # Set / Hash entry.
  def eql?(other)
    relative_url.eql?(other.relative_url)
  end

  def hash
    relative_url.hash
  end

  # Marks the page as fetched without error.
  def success
    @error = nil
    @register.completed(self)
  end

  # Records a permanent failure; the page is not retried.
  def fatal(error)
    # $verbose is the project's own flag (set by --verbose), not Ruby's
    # built-in $VERBOSE warning level.
    puts " Fatal - #{error}" if $verbose
    @error = error
    @register.completed(self)
  end

  # Records a possibly temporary failure: the page is handed back for a
  # retry until ATTEMPTS retries are used up, then the error becomes final.
  def intermittent(error)
    puts " Intermittent - #{error}" if $verbose
    if @attempts >= ATTEMPTS
      @error = error
      @register.completed(self)
    else
      @attempts += 1
      @register.retry(self)
    end
  end

  def to_s
    "#{url} found on #{source} - #{error || 'OK'}"
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Bookkeeping for the crawl: every known page lives in exactly one of three
# sets (waiting, in flight, done) and moves between them as it is fetched.
class JustCrawl::Register
  Result = Struct.new(:url, :object)

  def initialize
    @unprocessed, @processing, @processed = Set.new, Set.new, Set.new
  end

  # Queues the given pages, skipping any page already known in one of the
  # three sets.
  def add(pages)
    already_known = @processed | @processing | @unprocessed
    fresh = pages.to_set - already_known
    fresh.each { |page| puts " Adding #{page.url}" if $verbose }
    @unprocessed.merge(fresh)
  end

  # Moves one waiting page to the in-flight set and returns it (nil when
  # nothing is waiting). Warns when more pages are in flight than
  # EM.threadpool_size allows.
  def next_page
    candidate = @unprocessed.first
    @unprocessed.delete(candidate)
    @processing.add(candidate) if candidate
    if @processing.size > EM.threadpool_size
      puts "WARNING: #{@processing.size} pages are being process when EM threadpool only has #{EM.threadpool_size} threads."
    end
    candidate
  end

  # Puts an in-flight page back into the waiting set.
  def retry(page)
    @unprocessed.add(page)
    @processing.delete(page)
  end

  # Moves an in-flight page to the done set.
  def completed(page)
    @processed.add(page)
    @processing.delete(page)
  end

  # True once nothing is waiting and nothing is in flight.
  def finished?
    @unprocessed.empty? && @processing.empty?
  end

  def processing_size
    @processing.size
  end

  # Done pages that recorded an error.
  def error_pages
    @processed.select(&:error)
  end

  def errors?
    @processed.any?(&:error)
  end

  # Prints either the list of failed pages or a success line with the
  # number of pages crawled.
  def summarize
    unless errors?
      puts "\n#{@processed.size} pages crawled without errors."
      return
    end

    puts "\nPages with errors:"
    error_pages.each { |page| puts page.to_s }
  end

  # True when at most one page was processed.
  def no_links_found?
    @processed.size < 2
  end
end
|
data/lib/just_crawl.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rest_client'
|
3
|
+
require 'eventmachine'
|
4
|
+
require 'em-http-request'
|
5
|
+
require 'base64'
|
6
|
+
require 'set'
|
7
|
+
require 'fileutils'
|
8
|
+
require 'digest/sha1'
|
9
|
+
require 'json'
|
10
|
+
require 'tempfile'
|
11
|
+
require 'tmpdir'
|
12
|
+
|
13
|
+
require_relative 'just_crawl/version'
|
14
|
+
require_relative 'just_crawl/engine'
|
15
|
+
require_relative 'just_crawl/string'
|
16
|
+
require_relative 'just_crawl/failure'
|
17
|
+
require_relative 'just_crawl/register'
|
18
|
+
require_relative 'just_crawl/page'
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require_relative '../../spec_helper'
# Resolved relative to this file so the spec also loads when rspec is not
# started from the project root.
require_relative '../../../lib/just_crawl/page'

RSpec.describe Page do
  xit 'initialize'

  # Each case is Page.new(register, url, source): the link is resolved
  # against the page it was found on and only the path is kept.
  it '#relative_url' do
    expect(Page.new(:register, '/', '/').relative_url).to eq '/'
    expect(Page.new(:register, './', '/').relative_url).to eq '/'
    expect(Page.new(:register, 'page.html', '').relative_url).to eq '/page.html'
    expect(Page.new(:register, '/interview', '/').relative_url).to eq '/interview'
    expect(Page.new(:register, 'overview.html', '/').relative_url).to eq '/overview.html'
    expect(Page.new(:register, 'post-5.html', '/posts/index.html').relative_url).to eq '/posts/post-5.html'
    expect(Page.new(:register, 'https://staging.alphasights.com/careers/meet-us', '/posts/foo').relative_url).to eq '/careers/meet-us'
  end

  # Not written yet.
  xit '#<=>'
  xit '#eql?'
  xit '#hash'
  xit '#success'
  xit '#fatal'
  xit '#intermittent'
  xit '#to_s'
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative '../../spec_helper'

# Placeholder spec: one skipped example per Register method, none of them
# written yet.
RSpec.describe 'JustCrawl::Register' do
  %w[
    #initialize
    #add
    #next_page
    #retry
    #completed
    #finished?
    #processing_size
    #error_pages
    #errors?
    #summarize
    #no_links_found?
  ].each { |method_name| xit method_name }
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Coverage is started before anything else is configured.
require 'simplecov'
SimpleCov.start

RSpec.configure do |rspec|
  rspec.expect_with(:rspec) do |matchers|
    matchers.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # Use the documentation formatter when only a single spec file is run.
  rspec.default_formatter = 'doc' if rspec.files_to_run.one?

  rspec.shared_context_metadata_behavior = :apply_to_host_groups
  rspec.filter_run_when_matching(:focus)
  rspec.example_status_persistence_file_path = 'spec/examples.txt'
  rspec.disable_monkey_patching!
  rspec.warnings = true

  # Random example order, with Kernel's RNG seeded from the same seed.
  rspec.order = :random
  Kernel.srand(rspec.seed)
end
|
metadata
ADDED
@@ -0,0 +1,213 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: just_crawl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.1.8
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nicolas Sebastian Vidal
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-05-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: em-http-request
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.1'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.1.5
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.1'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.1.5
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: eventmachine
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '1.2'
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 1.2.6
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '1.2'
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.2.6
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: nokogiri
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '1.8'
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.8.2
|
63
|
+
type: :runtime
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '1.8'
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 1.8.2
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: rest-client
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '2.0'
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 2.0.2
|
83
|
+
type: :runtime
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '2.0'
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 2.0.2
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: rspec-core
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - "~>"
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '3.7'
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: 3.7.1
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '3.7'
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: 3.7.1
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: rspec-expectations
|
115
|
+
requirement: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '3.7'
|
120
|
+
type: :development
|
121
|
+
prerelease: false
|
122
|
+
version_requirements: !ruby/object:Gem::Requirement
|
123
|
+
requirements:
|
124
|
+
- - "~>"
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '3.7'
|
127
|
+
- !ruby/object:Gem::Dependency
|
128
|
+
name: rspec_junit_formatter
|
129
|
+
requirement: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - "~>"
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: 0.3.0
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - "~>"
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: 0.3.0
|
141
|
+
- !ruby/object:Gem::Dependency
|
142
|
+
name: simplecov
|
143
|
+
requirement: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - "~>"
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: 0.16.1
|
148
|
+
type: :development
|
149
|
+
prerelease: false
|
150
|
+
version_requirements: !ruby/object:Gem::Requirement
|
151
|
+
requirements:
|
152
|
+
- - "~>"
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: 0.16.1
|
155
|
+
description: JustCrawl crawls all pages on a domain, checking for errors
|
156
|
+
email:
|
157
|
+
- nicolas.s.vidal@gmail.com
|
158
|
+
executables:
|
159
|
+
- just_crawl
|
160
|
+
extensions: []
|
161
|
+
extra_rdoc_files: []
|
162
|
+
files:
|
163
|
+
- ".gitignore"
|
164
|
+
- ".rspec"
|
165
|
+
- ".ruby-version"
|
166
|
+
- Gemfile
|
167
|
+
- LICENSE.txt
|
168
|
+
- README.md
|
169
|
+
- Rakefile
|
170
|
+
- bin/just_crawl
|
171
|
+
- circle.yml
|
172
|
+
- just_crawl.gemspec
|
173
|
+
- lib/just_crawl.rb
|
174
|
+
- lib/just_crawl/engine.rb
|
175
|
+
- lib/just_crawl/failure.rb
|
176
|
+
- lib/just_crawl/page.rb
|
177
|
+
- lib/just_crawl/register.rb
|
178
|
+
- lib/just_crawl/string.rb
|
179
|
+
- lib/just_crawl/version.rb
|
180
|
+
- spec/lib/just_crawl/engine_spec.rb
|
181
|
+
- spec/lib/just_crawl/failure_spec.rb
|
182
|
+
- spec/lib/just_crawl/page_spec.rb
|
183
|
+
- spec/lib/just_crawl/register_spec.rb
|
184
|
+
- spec/lib/just_crawl/string_spec.rb
|
185
|
+
- spec/lib/just_crawl/version_spec.rb
|
186
|
+
- spec/lib/just_crawl_spec.rb
|
187
|
+
- spec/spec_helper.rb
|
188
|
+
homepage: http://github.com/nisevi/just_crawl
|
189
|
+
licenses:
|
190
|
+
- MIT
|
191
|
+
metadata: {}
|
192
|
+
post_install_message:
|
193
|
+
rdoc_options: []
|
194
|
+
require_paths:
|
195
|
+
- lib
|
196
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
197
|
+
requirements:
|
198
|
+
- - ">="
|
199
|
+
- !ruby/object:Gem::Version
|
200
|
+
version: 2.5.1
|
201
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
202
|
+
requirements:
|
203
|
+
- - ">="
|
204
|
+
- !ruby/object:Gem::Version
|
205
|
+
version: '0'
|
206
|
+
requirements: []
|
207
|
+
rubyforge_project:
|
208
|
+
rubygems_version: 2.7.6
|
209
|
+
signing_key:
|
210
|
+
specification_version: 4
|
211
|
+
summary: JustCrawl crawls pages within a domain, reporting any page that returns a
|
212
|
+
bad response code
|
213
|
+
test_files: []
|