browser_crawler 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +277 -0
- data/Rakefile +7 -0
- data/bin/console +10 -0
- data/bin/crawl +51 -0
- data/bin/setup +8 -0
- data/browser_crawler.gemspec +47 -0
- data/lib/browser_crawler.rb +12 -0
- data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
- data/lib/browser_crawler/dsl/sign_in.rb +37 -0
- data/lib/browser_crawler/engine.rb +156 -0
- data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
- data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
- data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
- data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
- data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
- data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
- data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
- data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
- data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
- data/lib/browser_crawler/hooks_container.rb +31 -0
- data/lib/browser_crawler/hooks_operator.rb +44 -0
- data/lib/browser_crawler/options.rb +86 -0
- data/lib/browser_crawler/report_factory.rb +22 -0
- data/lib/browser_crawler/reports/csv_report.rb +75 -0
- data/lib/browser_crawler/reports/store.rb +114 -0
- data/lib/browser_crawler/reports/yaml_report.rb +15 -0
- data/lib/browser_crawler/screenshot_operator.rb +47 -0
- data/lib/browser_crawler/support/capybara.rb +20 -0
- data/lib/browser_crawler/url_tools.rb +32 -0
- data/lib/browser_crawler/version.rb +3 -0
- metadata +244 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative '../url_tools'
|
2
|
+
|
3
|
+
module BrowserCrawler
  module EngineUtilities
    # Wraps a single raw link extracted from a page and answers questions
    # about it: whether it parsed into a usable URI, whether it points
    # inside the crawled host, and what its normalized full URL is.
    class LinkInspector
      attr_reader :raw_link, :host_name, :uri

      # @param raw_link [String] the href value as found on the page
      # @param host_name [String] host of the site being crawled
      def initialize(raw_link:, host_name:)
        @raw_link = raw_link
        @host_name = host_name
        # UrlTools.uri parses the raw link; predicates below guard against
        # a nil result for unparseable input.
        @uri = UrlTools.uri(url: raw_link)
      end

      # @return [Boolean] true when the link points outside the crawled host.
      def external_url?
        !internal_url?
      end

      # A link is valid when it parsed into a URI carrying both host and scheme.
      # Coerces to a real boolean: the previous `uri.host && uri.scheme`
      # leaked the scheme string as the return value. `defined?`-based
      # memoization also caches a `false` result, which `||=` did not.
      # @return [Boolean]
      def link_valid?
        return @link_valid if defined?(@link_valid)

        @link_valid = !uri.nil? && !uri.host.nil? && !uri.scheme.nil?
      end

      # @return [Boolean] true when the URI host matches the crawled host.
      def internal_url?
        return @internal_url if defined?(@internal_url)

        @internal_url = !uri.nil? && uri.host == host_name
      end

      # @return [String] normalized absolute URL built from the parsed URI.
      def full_url
        @full_url ||= UrlTools.full_url(uri: uri)
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module BrowserCrawler
  module EngineUtilities
    # Extracts candidate links from a Capybara page. Only pages whose URL
    # is internal to the crawled host are scanned; external pages yield an
    # empty result.
    class LinkScanner
      include Capybara::DSL
      include HooksOperator

      attr_reader :link_inspector

      # @param link_inspector [LinkInspector] inspector for the current page URL
      def initialize(link_inspector:)
        @link_inspector = link_inspector
      end

      # @param page [Capybara::Session] the page to harvest links from
      # @return [Array<String>] href values found on the page
      #   (internal pages only; external pages return [])
      def scan(page:)
        return [] unless link_inspector.internal_url?

        collect_links(page)
      end

      private

      # Harvests hrefs from the page and drops nil/blank entries.
      def collect_links(page)
        extract_hrefs(page: page).reject { |href| href.nil? || href.empty? }
      end

      # Returns the array of hrefs found on the page.
      # When :scan_rules hooks are registered they replace this default lookup.
      def extract_hrefs(page:)
        exchange_on_hooks(type: :scan_rules) do
          page.all('a').map { |anchor| anchor['href'] }
        end
      end
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require_relative '../url_tools'
|
2
|
+
require_relative '../hooks_operator'
|
3
|
+
require_relative 'link_scanner'
|
4
|
+
|
5
|
+
module BrowserCrawler
  module EngineUtilities
    # Visits a single page with Capybara, collects its links (honouring any
    # registered :each hooks) and records the visit into the report store.
    class PageInspector
      include Capybara::DSL
      include HooksOperator

      attr_accessor :link_inspector,
                    :link_scanner,
                    :capybara_session,
                    :scan_result,
                    :report_store

      # @param link_inspector [LinkInspector] inspector for the page URL
      # @param capybara_session [Capybara::Session] browser session to drive
      # @param report_store [Reports::Store] destination for visit records
      def initialize(link_inspector:, capybara_session:, report_store:)
        @link_inspector = link_inspector
        @capybara_session = capybara_session
        @report_store = report_store
        @scan_result = []
        @link_scanner = LinkScanner.new(link_inspector: link_inspector)
      end

      # Opens the page and stores the extracted links into #scan_result.
      def visit_page
        uri = link_inspector.uri
        # Anchor Capybara's app_host to the page's origin so relative
        # navigation resolves against the crawled site.
        Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"

        visit link_inspector.full_url

        with_hooks_for(type: :each) do
          @scan_result = scanning
        end
      end

      # Persists the visit (links, HTTP status code, optional screenshot)
      # into the report store.
      def save_to_report(screenshot_operator: nil)
        report_store.record_page_visit(
          page: link_inspector.full_url,
          extracted_links: scan_result,
          screenshot_filename: capture_screenshot(screenshot_operator),
          external: link_inspector.external_url?,
          code: capybara_session.status_code
        )
      end

      # Extension point invoked before a page is scanned; intentionally a no-op.
      def before_page_scan; end

      private

      # Scans the current session for links; never returns nil.
      def scanning
        link_scanner.scan(page: capybara_session) || []
      end

      # Saves a screenshot of the current page when the operator is present
      # and enabled; returns the saved path, or nil when skipped.
      def capture_screenshot(operator)
        return unless operator&.save_screenshots?

        capybara_session.save_screenshot(
          operator.file_path(url: capybara_session.current_url)
        )
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module BrowserCrawler
  module Errors
    # Raised when a hook is registered under a type that is not declared
    # in HooksContainer::VALID_TYPES.
    class InvalidHooksType < StandardError
      # @param invalid_type [Symbol, String] the rejected hook type
      def initialize(invalid_type:)
        # Message grammar fixed ("has to apply one of the follow values"
        # -> "has to be one of the following values").
        message = "Passed hooks type `#{invalid_type}` is invalid." \
                  ' A type has to be one of the following values:' \
                  " #{HooksContainer::VALID_TYPES.join(', ')}"
        super(message)
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'erb'
|
2
|
+
module BrowserCrawler
  module Followups
    # Indexes screenshots captured by the crawler, creates index.html from the captured screenshots.
    # ERB Template can be provided that will receive the list of files.
    class ScreenshotsIndexer
      # @param template [String, nil] ERB template source; when nil, falls
      #   back to the bundled templates/index.html.erb.
      def initialize(template:)
        @template = template || File.read(default_template_file)
      end

      # Produce index.html with links to screenshots found in the `path` specified.
      # Optionally file_mask can be provided to filter out files to be indexed.
      # @return [String] path of the written index.html
      def index_directory(path, file_mask: '*.png')
        files = Dir[File.join(path, file_mask)].map { |file| File.basename(file) }
        html = render_index(files: files)
        index_path = File.join(path, 'index.html')
        File.write(index_path, html)
        index_path
      end

      # NOTE(review): this method looks unfinished. `sorted_pages.map { ... k }`
      # yields bare keys rather than [key, value] pairs, so `Hash[...]` discards
      # every element (returning {}), `files` is never used, and nothing is
      # rendered or written. Confirm the intended behavior (presumably rendering
      # an index from the report's screenshot paths) before relying on it.
      def index_report(report)
        sorted_pages = Hash[report.pages.sort_by { |(k, _v)| k }]
        files = Hash[sorted_pages.map do |(k, _v)|
          k
        end]
      end

      private

      # Path to the default ERB template shipped alongside this file.
      def default_template_file
        File.join(__dir__, 'templates/index.html.erb')
      end

      # Renders @template with `files` exposed to the ERB via this binding.
      def render_index(files:)
        renderer = ERB.new(@template)
        renderer.result(binding)
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- Screenshot gallery index page.
     Rendered by ScreenshotsIndexer#render_index, which exposes `files`
     (an array of screenshot filenames) to this template's binding. -->
<html>
<head>
  <title>Screenshots</title>

  <style type="text/css">
    body {
      color: #eee;
      margin-top: 20px;
      font-family: Arial, "Helvetica Neue", Helvetica, sans-serif;
    }

    a {
      color: #FFF;
    }

    a:hover {
      color: yellow;
      text-decoration: underline;
    }

    .thumbnails {
      overflow: scroll;
      height: 150px;
    }

    .thumbnails img {
      height: 80px;
      border: 4px solid #555;
      padding: 1px;
      margin: 0 10px 10px 0;
    }

    .thumbnails img:hover {
      border: 4px solid #00ccff;
      cursor: pointer;
    }

    .preview img {
      border: 4px solid #444;
      padding: 1px;
      width: 800px;
    }
  </style>

</head>
<body>

<div class="gallery" align="center">
  <h2>Screenshots captured</h2>

  <br/>

  <!-- Hovering a thumbnail swaps the large preview image below.
       NOTE(review): the onmouseover handler resolves img0, img1, ... through
       named-element globals - confirm target browsers support this. -->
  <div class="thumbnails">
    <% files.each_with_index do |img, index| %>
      <img onmouseover="preview.src=img<%= index %>.src" name="img<%= index %>" src="<%= img %>" alt=""/>
    <% end %>
  </div>
  <br/>

  <div class="preview" align="center">
    <img name="preview" src="<%= files.first %>" alt=""/>
  </div>

</div>

</body>
</html>
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'active_support/core_ext/string'
|
3
|
+
|
4
|
+
module BrowserCrawler
  module Followups
    # Updates the :paths section of the Wraith's config file.
    class WraithIntegrator
      # @param report [#pages, String] either a report store object that
      #   responds to #pages, or a YAML string of a serialized report.
      def initialize(report:)
        @report = if report.respond_to?(:pages)
                    report
                  else
                    # Keyword form: the positional permitted-classes argument
                    # (`safe_load(report, [Symbol])`) was deprecated and removed
                    # in newer Psych/Ruby releases.
                    YAML.safe_load(report, permitted_classes: [Symbol]).symbolize_keys
                  end
      end

      # Rewrites the 'paths' section of the given Wraith config file in place.
      # @param wraith_config_file [String] path to the YAML config
      # @param path_suffix [String, nil] optional suffix appended to each path
      def update_config(wraith_config_file, path_suffix: nil)
        config = YAML.safe_load(File.read(wraith_config_file))
        config['paths'] = paths(with_suffix: path_suffix)
        File.write(wraith_config_file, config.to_yaml)
      end

      # @return [Hash] sorted hash of page_name => path pair values appended with optional suffix.
      # Page name equals to path which makes it easy to navigate the page from the Wraith gallery.
      def paths(with_suffix: nil)
        Hash[sorted_pages.map { |(k, v)| [k, "#{v}#{with_suffix}"] }]
      end

      # Maps every crawled page URL to a parameterized-name => path pair.
      # NOTE: String#parameterize comes from ActiveSupport.
      def named_pages
        @report[:pages].each_with_object({}) do |(page_url, _links), h|
          page_path = URI(page_url.to_s).path
          page_name = page_path.parameterize
          h[page_name] = page_path
        end
      end

      # named_pages ordered by page name.
      def sorted_pages
        Hash[named_pages.sort_by { |(k, _v)| k }]
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require_relative 'errors/invalid_hooks_type'
|
3
|
+
|
4
|
+
module BrowserCrawler
  # Process-wide (Singleton) registry of crawler hooks, grouped first by
  # execution method key (e.g. :before, :after, :run_only_one) and then
  # by hook type.
  class HooksContainer
    include Singleton

    # Hook types callers are allowed to register under.
    VALID_TYPES = %i[each all unvisited_links scan_rules].freeze

    attr_reader :hooks_container

    def initialize
      reset
    end

    # Clears all registered hooks. Any unseen method key lazily receives a
    # fresh { type => [] } bucket for every valid type.
    def reset
      @hooks_container = Hash.new do |container, method_key|
        container[method_key] = VALID_TYPES.each_with_object({}) do |type, buckets|
          buckets[type] = []
        end
      end
    end

    # Registers a hook callable under the given method key and type.
    # @raise [Errors::InvalidHooksType] when the type is not in VALID_TYPES.
    def add_hook(method: :run_only_one, type:, hook: nil)
      raise Errors::InvalidHooksType.new(invalid_type: type) unless VALID_TYPES.include?(type)

      @hooks_container[method][type.to_sym] << hook
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module BrowserCrawler
  # Mixin that lets crawler components execute hooks registered in
  # BrowserCrawler::HooksContainer.
  module HooksOperator
    # Runs the :before hooks for `type`, then the given block, then the
    # :after hooks.
    # @return the block's value (previously the meaningless result of the
    #   after-hooks run was returned; returning the block value is
    #   backward-compatible since no caller could rely on the old result)
    def with_hooks_for(type:)
      run_hooks_for(method: :before, type: type)
      result = yield
      run_hooks_for(method: :after, type: type)
      result
    end

    # Executes the first :run_only_one hook registered for `type`; when none
    # is registered, falls back to the given default block (if any).
    # Hooks run via instance_exec so they see the including object's state.
    def exchange_on_hooks(type:, &default_block)
      hooks_array = BrowserCrawler::HooksContainer
                    .instance.hooks_container[:run_only_one][type]

      if hooks_array && !hooks_array.empty?
        instance_exec(&hooks_array[0])
      elsif block_given?
        instance_exec(&default_block)
      end
    end

    private

    # Shared implementation for the :before/:after phases — replaces the
    # previously duplicated run_before_hooks/run_after_hooks pair.
    def run_hooks_for(method:, type:)
      hooks = BrowserCrawler::HooksContainer.instance
                                            .hooks_container[method][type]
      return unless hooks

      hooks.each { |hook| instance_exec(&hook) }
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'optionparser'
|
2
|
+
|
3
|
+
module BrowserCrawler
  # Command-line option handling for the `crawl` executable.
  module Options
    module_function

    # Baseline settings used when the corresponding flag is omitted.
    # @return [Hash]
    def default_options
      {
        report_folder: 'tmp',
        report_format: 'yaml',
        window_width: 1024,
        window_height: 768
      }
    end

    # Parses ARGV into an options hash merged over #default_options.
    # A trailing bare argument (after flag parsing) is treated as the start
    # URL. Prints usage and exits when no options were given at all.
    # @return [Hash]
    def parse_args
      options = {}
      parser = OptionParser.new do |opts|
        opts.banner = 'Site crawler. Usage example: crawl http://localhost:3000'

        # NOTE(review): '[--url] URL' does not start with '-', so OptionParser
        # treats it as a description rather than a long-option/argument spec;
        # verify -U actually accepts an argument as intended.
        opts.on('-U', '[--url] URL', 'Crawls the site starting from the url specified. E.g. http://localhost:3000/welcome.') do |v|
          options[:url] = v
        end

        opts.on('-u', '--user USERNAME', 'The authentication user name (optional).') do |v|
          options[:username] = v
        end

        opts.on('-p', '--password PASSWORD', 'The authentication password (optional).') do |v|
          options[:password] = v
        end

        opts.on('-n', '--max_pages NUM', 'The maximum number of pages to visit.') do |v|
          options[:max_pages] = v.to_i
        end

        opts.on('-w', '--window_size WxH', 'Browser window size. Default 1024x768') do |v|
          # Coerce to integers so parsed sizes match the Integer defaults
          # (previously they were left as Strings).
          options[:window_width], options[:window_height] = v.split('x').map(&:to_i)
        end

        opts.on('-r', '--report FOLDER', 'The folder path to save report to. '\
                'Default: tmp') do |v|
          options[:report_folder] = v
        end

        opts.on('-f', '--report_format TYPE', 'The report type to save result '\
                'Default: yaml') do |v|
          options[:report_format] = v
        end

        opts.on('-s', '--screenshots_path PATH',
                'If specified along with the url, screenshots are captured visiting each page.'\
                ' Otherwise used to generate a screenshots index based on files captured previously. ') do |v|
          options[:screenshots_path] = v
        end

        opts.on('-t', '--template FILENAME',
                'Specify the template used for indexing.'\
                ' Default: followups/templates/index.html.erb') do |v|
          options[:index_template] = v
        end

        opts.on('-c', '--wraith_config FILENAME',
                'Update config "paths" section with the pages extracted.') do |v|
          options[:wraith_config] = v
        end

        opts.on('-h', '--help', 'Show this help message and exit.') do
          puts opts
          # NOTE(review): despite the description, the original does not exit
          # here; behavior preserved to avoid changing CLI flow.
        end
      end
      parser.parse!

      # Remaining bare argument is the start URL.
      options[:url] = ARGV.pop unless ARGV.empty?

      if options.empty?
        puts parser
        exit
      end

      default_options.merge(options)
    end
  end
end
|