browser_crawler 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +277 -0
- data/Rakefile +7 -0
- data/bin/console +10 -0
- data/bin/crawl +51 -0
- data/bin/setup +8 -0
- data/browser_crawler.gemspec +47 -0
- data/lib/browser_crawler.rb +12 -0
- data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
- data/lib/browser_crawler/dsl/sign_in.rb +37 -0
- data/lib/browser_crawler/engine.rb +156 -0
- data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
- data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
- data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
- data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
- data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
- data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
- data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
- data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
- data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
- data/lib/browser_crawler/hooks_container.rb +31 -0
- data/lib/browser_crawler/hooks_operator.rb +44 -0
- data/lib/browser_crawler/options.rb +86 -0
- data/lib/browser_crawler/report_factory.rb +22 -0
- data/lib/browser_crawler/reports/csv_report.rb +75 -0
- data/lib/browser_crawler/reports/store.rb +114 -0
- data/lib/browser_crawler/reports/yaml_report.rb +15 -0
- data/lib/browser_crawler/screenshot_operator.rb +47 -0
- data/lib/browser_crawler/support/capybara.rb +20 -0
- data/lib/browser_crawler/url_tools.rb +32 -0
- data/lib/browser_crawler/version.rb +3 -0
- metadata +244 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative '../url_tools'
|
2
|
+
|
3
|
+
module BrowserCrawler
  module EngineUtilities
    # Wraps a single raw link discovered during a crawl and answers questions
    # about it: does it parse into a usable URI, does it point at the crawled
    # host, and what is its normalized full URL.
    class LinkInspector
      attr_reader :raw_link, :host_name, :uri

      # @param raw_link  [String] the href exactly as extracted from the page
      # @param host_name [String] host of the site being crawled
      def initialize(raw_link:, host_name:)
        @raw_link = raw_link
        @host_name = host_name
        # UrlTools.uri is expected to return a URI object, or nil when the
        # raw link cannot be parsed (NOTE(review): confirm against UrlTools).
        @uri = UrlTools.uri(url: raw_link)
      end

      # @return [Boolean] true when the link points outside the crawled host
      def external_url?
        !internal_url?
      end

      # @return [Boolean] true when the link parsed and carries both a host
      #   and a scheme.
      # Fix: previously `@link_valid ||= !uri.nil? && uri.host && uri.scheme`
      # returned the scheme String (not a Boolean) and `||=` recomputed the
      # value on every call whenever the result was falsey.
      def link_valid?
        unless defined?(@link_valid)
          @link_valid = !uri.nil? && !uri.host.nil? && !uri.scheme.nil?
        end
        @link_valid
      end

      # @return [Boolean] true when the link's host equals the crawled host.
      # Fix: same memoization issue as link_valid? — `||=` never cached false.
      def internal_url?
        unless defined?(@internal_url)
          @internal_url = !uri.nil? && uri.host == host_name
        end
        @internal_url
      end

      # @return [String] the normalized absolute URL for this link (memoized).
      def full_url
        @full_url ||= UrlTools.full_url(uri: uri)
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module BrowserCrawler
  module EngineUtilities
    # Extracts candidate link targets (href values) from a Capybara page.
    # Only pages judged internal by the link inspector are scanned; external
    # pages always yield an empty list.
    class LinkScanner
      include Capybara::DSL
      include HooksOperator

      attr_reader :link_inspector

      def initialize(link_inspector:)
        @link_inspector = link_inspector
      end

      # @return [Array<String>] non-blank hrefs when the page is internal,
      #   [] otherwise.
      def scan(page:)
        return [] unless link_inspector.internal_url?

        get_page_links(page: page)
      end

      private

      # Collects raw hrefs and drops blank entries.
      def get_page_links(page:)
        collected = link_matcher(page: page)
        remove_blank_links(collected)
      end

      # Filters out nil and empty-string hrefs.
      def remove_blank_links(links)
        links.reject { |href| href.nil? || href.empty? }
      end

      # return Array consists of links from page
      # if hooks exists when execute them instead of basic behavior
      def link_matcher(page:)
        exchange_on_hooks(type: :scan_rules) do
          page.all('a').map { |anchor| anchor['href'] }
        end
      end
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require_relative '../url_tools'
|
2
|
+
require_relative '../hooks_operator'
|
3
|
+
require_relative 'link_scanner'
|
4
|
+
|
5
|
+
module BrowserCrawler
  module EngineUtilities
    # Visits a single page in the Capybara session, collects its links and
    # records the visit (optionally with a screenshot) into the report store.
    class PageInspector
      include Capybara::DSL
      include HooksOperator

      attr_accessor :link_inspector,
                    :link_scanner,
                    :capybara_session,
                    :scan_result,
                    :report_store

      def initialize(link_inspector:, capybara_session:, report_store:)
        @link_inspector = link_inspector
        @capybara_session = capybara_session
        @report_store = report_store
        @scan_result = []
        @link_scanner = LinkScanner.new(link_inspector: link_inspector)
      end

      # Navigates to the inspected link and scans the page for further
      # links; the scan is wrapped in the registered :each before/after hooks.
      def visit_page
        target_uri = link_inspector.uri
        # Pin the Capybara app host to the target page's origin.
        Capybara.app_host = "#{target_uri.scheme}://#{target_uri.host}:#{target_uri.port}"

        visit link_inspector.full_url

        with_hooks_for(type: :each) do
          @scan_result = scanning
        end
      end

      # Records the visit (URL, extracted links, HTTP status and optional
      # screenshot path) into the report store.
      def save_to_report(screenshot_operator: nil)
        # Take the screenshot first, before any report-side effects.
        screenshot_path = save_screenshot(screenshot_operator)

        report_store.record_page_visit(
          page: link_inspector.full_url,
          extracted_links: scan_result,
          screenshot_filename: screenshot_path,
          external: link_inspector.external_url?,
          code: capybara_session.status_code
        )
      end

      # Extension point for subclasses; intentionally a no-op by default.
      def before_page_scan; end

      private

      # Scans the current session for links; normalizes nil to [].
      def scanning
        link_scanner.scan(page: capybara_session) || []
      end

      # Captures a screenshot when an operator is supplied and enabled.
      # Returns the saved file path, or nil when screenshots are disabled.
      def save_screenshot(screenshot_operator)
        return unless screenshot_operator&.save_screenshots?

        target = screenshot_operator.file_path(url: capybara_session.current_url)
        capybara_session.save_screenshot(target)
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module BrowserCrawler
  module Errors
    # Raised when a hook is registered under a type that is not one of
    # HooksContainer::VALID_TYPES.
    class InvalidHooksType < StandardError
      def initialize(invalid_type:)
        allowed = HooksContainer::VALID_TYPES.join(', ')
        super("Passed hooks type `#{invalid_type}` is invalid." \
              ' A type has to apply one of the follow values:' \
              " #{allowed}")
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'erb'
|
2
|
+
module BrowserCrawler
  module Followups
    # Indexes screenshots captured by the crawler, creates index.html from the captured screenshots.
    # An ERB template can be provided; it receives the list of files as `files`.
    class ScreenshotsIndexer
      # @param template [String, nil] ERB template body; falls back to the
      #   bundled templates/index.html.erb when nil.
      # Fix: `template:` was a required keyword even though nil was already
      # handled, forcing callers to write `template: nil` explicitly.
      def initialize(template: nil)
        @template = template || File.read(default_template_file)
      end

      # Produce index.html with links to screenshots found in the `path` specified.
      # Optionally file_mask can be provided to filter out files to be indexed.
      # @return [String] path of the generated index.html
      def index_directory(path, file_mask: '*.png')
        files = Dir[File.join(path, file_mask)].map { |file| File.basename(file) }
        html = render_index(files: files)
        index_path = File.join(path, 'index.html')
        File.write(index_path, html)
        index_path
      end

      # Renders an index for the pages recorded in a crawl report.
      # Fix: the previous implementation built `Hash[...]` from a flat array
      # of keys (invalid — it produced an empty hash with a warning) and
      # never rendered anything.
      # @param report [#pages] report whose `pages` hash is keyed by page URL
      # @return [String] rendered HTML
      def index_report(report)
        files = report.pages.keys.map(&:to_s).sort
        render_index(files: files)
      end

      private

      # Absolute path of the default ERB template shipped with the gem.
      def default_template_file
        File.join(__dir__, 'templates/index.html.erb')
      end

      # Renders @template with `files` available to the ERB binding.
      def render_index(files:)
        renderer = ERB.new(@template)
        renderer.result(binding)
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Screenshots</title>
|
5
|
+
|
6
|
+
<style type="text/css">
|
7
|
+
body {
|
8
|
+
color: #eee;
|
9
|
+
margin-top: 20px;
|
10
|
+
font-family: Arial, "Helvetica Neue", Helvetica, sans-serif;
|
11
|
+
}
|
12
|
+
|
13
|
+
a {
|
14
|
+
color: #FFF;
|
15
|
+
}
|
16
|
+
|
17
|
+
a:hover {
|
18
|
+
color: yellow;
|
19
|
+
text-decoration: underline;
|
20
|
+
}
|
21
|
+
|
22
|
+
.thumbnails {
|
23
|
+
overflow: scroll;
|
24
|
+
height: 150px;
|
25
|
+
}
|
26
|
+
|
27
|
+
.thumbnails img {
|
28
|
+
height: 80px;
|
29
|
+
border: 4px solid #555;
|
30
|
+
padding: 1px;
|
31
|
+
margin: 0 10px 10px 0;
|
32
|
+
}
|
33
|
+
|
34
|
+
.thumbnails img:hover {
|
35
|
+
border: 4px solid #00ccff;
|
36
|
+
cursor: pointer;
|
37
|
+
}
|
38
|
+
|
39
|
+
.preview img {
|
40
|
+
border: 4px solid #444;
|
41
|
+
padding: 1px;
|
42
|
+
width: 800px;
|
43
|
+
}
|
44
|
+
</style>
|
45
|
+
|
46
|
+
</head>
|
47
|
+
<body>
|
48
|
+
|
49
|
+
<div class="gallery" align="center">
|
50
|
+
<h2>Screenshots captured</h2>
|
51
|
+
|
52
|
+
<br/>
|
53
|
+
|
54
|
+
<div class="thumbnails">
|
55
|
+
<% files.each_with_index do |img, index| %>
|
56
|
+
<img onmouseover="preview.src=img<%= index %>.src" name="img<%= index %>" src="<%= img %>" alt=""/>
|
57
|
+
<% end %>
|
58
|
+
</div>
|
59
|
+
<br/>
|
60
|
+
|
61
|
+
<div class="preview" align="center">
|
62
|
+
<img name="preview" src="<%= files.first %>" alt=""/>
|
63
|
+
</div>
|
64
|
+
|
65
|
+
</div>
|
66
|
+
|
67
|
+
|
68
|
+
</body>
|
69
|
+
</html>
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'active_support/core_ext/string'
|
3
|
+
|
4
|
+
module BrowserCrawler
  module Followups
    # Updates the :paths section of the Wraith's config file.
    class WraithIntegrator
      # @param report [#pages, String] a report object exposing #pages, or a
      #   YAML string to be parsed into one.
      def initialize(report:)
        @report = if report.respond_to?(:pages)
                    report
                  else
                    # Fix: the positional permitted-classes argument to
                    # YAML.safe_load was removed in Psych 4 (Ruby 3.1+);
                    # use the keyword form supported since Psych 3.1.
                    YAML.safe_load(report, permitted_classes: [Symbol]).symbolize_keys
                  end
      end

      # Rewrites the 'paths' section of the given Wraith config file in place.
      # @param path_suffix [String, nil] optional suffix appended to each path
      def update_config(wraith_config_file, path_suffix: nil)
        config = YAML.safe_load(File.read(wraith_config_file))
        config['paths'] = paths(with_suffix: path_suffix)
        File.write(wraith_config_file, config.to_yaml)
      end

      # @return [Hash] sorted hash of page_name => path pair values appended with optional suffix.
      # Page name equals to path which makes it easy to navigate the page from the Wraith gallery.
      def paths(with_suffix: nil)
        Hash[sorted_pages.map { |(name, path)| [name, "#{path}#{with_suffix}"] }]
      end

      # Maps every reported page URL to { parameterized_name => path }.
      # Relies on ActiveSupport's String#parameterize for the name.
      def named_pages
        @report[:pages].each_with_object({}) do |(page_url, _links), acc|
          page_path = URI(page_url.to_s).path
          acc[page_path.parameterize] = page_path
        end
      end

      # named_pages ordered by page name for stable, diff-friendly output.
      def sorted_pages
        Hash[named_pages.sort_by { |(name, _path)| name }]
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require_relative 'errors/invalid_hooks_type'
|
3
|
+
|
4
|
+
module BrowserCrawler
  # Process-wide registry of crawler hooks, grouped first by method key
  # (:before, :after, :run_only_one, ...) and then by hook type.
  class HooksContainer
    include Singleton

    VALID_TYPES = %i[each all unvisited_links scan_rules].freeze

    attr_reader :hooks_container

    def initialize
      reset
    end

    # Drops every registered hook and rebuilds the empty registry. Unknown
    # method keys lazily receive a fresh type => [] mapping on first access.
    def reset
      @hooks_container = Hash.new do |registry, method_key|
        registry[method_key] = VALID_TYPES.each_with_object({}) do |type, groups|
          groups[type] = []
        end
      end
    end

    # Registers a hook under the given method/type pair.
    # Raises Errors::InvalidHooksType when the type is not recognised.
    def add_hook(method: :run_only_one, type:, hook: nil)
      raise Errors::InvalidHooksType.new(invalid_type: type) unless VALID_TYPES.include?(type)

      @hooks_container[method][type.to_sym] << hook
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module BrowserCrawler
  # Mixin giving crawler components access to the globally registered hooks
  # held in HooksContainer.
  module HooksOperator
    # Runs the :before hooks for `type`, yields to the caller's block, then
    # runs the :after hooks for the same type.
    def with_hooks_for(type:)
      run_before_hooks(type: type)
      yield
      run_after_hooks(type: type)
    end

    # Executes the first registered :run_only_one hook for `type` when one
    # exists; otherwise falls back to the supplied default block (if any).
    def exchange_on_hooks(type:, &default_block)
      registered = BrowserCrawler::HooksContainer
                   .instance.hooks_container[:run_only_one][type]

      return instance_exec(&registered[0]) if registered && !registered.empty?

      instance_exec(&default_block) if block_given?
    end

    private

    def run_before_hooks(type:)
      hooks = lookup_hooks(:before, type)
      run_hooks(hooks) if hooks
    end

    def run_after_hooks(type:)
      hooks = lookup_hooks(:after, type)
      run_hooks(hooks) if hooks
    end

    # Fetches the hook list registered for a stage/type pair.
    def lookup_hooks(stage, type)
      BrowserCrawler::HooksContainer.instance.hooks_container[stage][type]
    end

    # Runs each hook in the caller's instance context.
    def run_hooks(hooks)
      hooks.each { |hook| instance_exec(&hook) }
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'optionparser'
|
2
|
+
|
3
|
+
module BrowserCrawler
  # Command-line option parsing for the `crawl` executable.
  module Options
    module_function

    # Defaults merged under whatever the user passes on the command line.
    def default_options
      {
        report_folder: 'tmp',
        report_format: 'yaml',
        window_width: 1024,
        window_height: 768
      }
    end

    # Parses ARGV into an options hash merged over default_options.
    # A trailing positional argument is treated as the start URL.
    # Prints usage and exits when no options are supplied at all.
    def parse_args
      options = {}
      parser = OptionParser.new do |opts|
        opts.banner = 'Site crawler. Usage example: crawl http://localhost:3000'

        opts.on('-U', '[--url] URL', 'Crawls the site starting from the url specified. E.g. http://localhost:3000/welcome.') do |v|
          options[:url] = v
        end

        opts.on('-u', '--user USERNAME', 'The authentication user name (optional).') do |v|
          options[:username] = v
        end

        opts.on('-p', '--password PASSWORD', 'The authentication password (optional).') do |v|
          options[:password] = v
        end

        opts.on('-n', '--max_pages NUM', 'The maximum number of pages to visit.') do |v|
          options[:max_pages] = v.to_i
        end

        opts.on('-w', '--window_size WxH', 'Browser window size. Default 1024x768') do |v|
          # Fix: keep the dimensions Integers, consistent with
          # default_options (they previously stayed Strings after split).
          options[:window_width], options[:window_height] = v.split('x').map(&:to_i)
        end

        opts.on('-r', '--report FOLDER', 'The folder path to save report to. '\
                'Default: tmp') do |v|
          options[:report_folder] = v
        end

        opts.on('-f', '--report_format TYPE', 'The report type to save result '\
                'Default: yaml') do |v|
          options[:report_format] = v
        end

        opts.on('-s', '--screenshots_path PATH',
                'If specified along with the url, screenshots are captured visiting each page.'\
                ' Otherwise used to generate a screenshots index based on files captured previously. ') do |v|
          options[:screenshots_path] = v
        end

        opts.on('-t', '--template FILENAME',
                'Specify the template used for indexing.'\
                ' Default: followups/templates/index.html.erb') do |v|
          options[:index_template] = v
        end

        opts.on('-c', '--wraith_config FILENAME',
                'Update config "paths" section with the pages extracted.') do |v|
          options[:wraith_config] = v
        end

        opts.on('-h', '--help', 'Show this help message and exit.') do
          # Fix: the help text promises to exit, but parsing previously
          # continued (and could then crawl) after printing usage.
          puts opts
          exit
        end
      end
      parser.parse!

      # A bare trailing argument is a convenience form of --url.
      options[:url] = ARGV.pop unless ARGV.empty?

      # Invoked with no recognised arguments: show usage and stop.
      if options.empty?
        puts parser
        exit
      end

      default_options.merge(options)
    end
  end
end
|