browser_crawler 0.4.0

Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0

data/lib/browser_crawler/engine_utilities/link_inspector.rb
@@ -0,0 +1,31 @@
+ require_relative '../url_tools'
+
+ module BrowserCrawler
+   module EngineUtilities
+     class LinkInspector
+       attr_reader :raw_link, :host_name, :uri
+
+       def initialize(raw_link:, host_name:)
+         @raw_link = raw_link
+         @host_name = host_name
+         @uri = UrlTools.uri(url: raw_link)
+       end
+
+       def external_url?
+         !internal_url?
+       end
+
+       def link_valid?
+         @link_valid ||= !uri.nil? && uri.host && uri.scheme
+       end
+
+       def internal_url?
+         @internal_url ||= !uri.nil? && uri.host == host_name
+       end
+
+       def full_url
+         @full_url ||= UrlTools.full_url(uri: uri)
+       end
+     end
+   end
+ end
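
LinkInspector classifies a raw href against the crawl host: a link is valid only if it parses to a URI with both a host and a scheme, and internal only if that host matches the crawler's host name. A minimal usage sketch (the URL and host below are illustrative, and UrlTools.uri is assumed to return nil for unparseable input):

    require 'browser_crawler'

    inspector = BrowserCrawler::EngineUtilities::LinkInspector.new(
      raw_link: 'http://localhost:3000/pricing',
      host_name: 'localhost'
    )
    inspector.link_valid?   # => truthy: the URI has a host and a scheme
    inspector.internal_url? # => true: the URI host matches host_name
    inspector.external_url? # => false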

data/lib/browser_crawler/engine_utilities/link_scanner.rb
@@ -0,0 +1,38 @@
+ module BrowserCrawler
+   module EngineUtilities
+     class LinkScanner
+       include Capybara::DSL
+       include HooksOperator
+
+       attr_reader :link_inspector
+
+       def initialize(link_inspector:)
+         @link_inspector = link_inspector
+       end
+
+       def scan(page:)
+         link_inspector.internal_url? ? get_page_links(page: page) : []
+       end
+
+       private
+
+       def get_page_links(page:)
+         remove_blank_links(link_matcher(page: page))
+       end
+
+       def remove_blank_links(links)
+         links.reject do |link|
+           link.nil? || link.empty?
+         end
+       end
+
+       # Returns an array of the links found on the page.
+       # If any hooks exist, they are executed instead of the basic behavior.
+       def link_matcher(page:)
+         exchange_on_hooks(type: :scan_rules) do
+           page.all('a').map { |a| a['href'] }
+         end
+       end
+     end
+   end
+ end
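
Because link_matcher goes through exchange_on_hooks, a registered :scan_rules hook replaces the default page.all('a') extraction entirely. A hedged sketch of registering such a hook straight on the HooksContainer (the engine also exposes a DSL for this; the CSS scope is illustrative):

    # Collect links only from the main content area instead of the whole page.
    BrowserCrawler::HooksContainer.instance.add_hook(
      method: :run_only_one,
      type: :scan_rules,
      hook: proc { page.all('main a').map { |a| a['href'] } }
    )

The hook is run with instance_exec inside LinkScanner, which includes Capybara::DSL, so `page` resolves to the current Capybara session.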

data/lib/browser_crawler/engine_utilities/page_inspector.rb
@@ -0,0 +1,65 @@
+ require_relative '../url_tools'
+ require_relative '../hooks_operator'
+ require_relative 'link_scanner'
+
+ module BrowserCrawler
+   module EngineUtilities
+     class PageInspector
+       include Capybara::DSL
+       include HooksOperator
+
+       attr_accessor :link_inspector,
+                     :link_scanner,
+                     :capybara_session,
+                     :scan_result,
+                     :report_store
+
+       def initialize(link_inspector:, capybara_session:, report_store:)
+         @link_inspector = link_inspector
+         @capybara_session = capybara_session
+         @report_store = report_store
+         @scan_result = []
+         @link_scanner = LinkScanner.new(link_inspector: link_inspector)
+       end
+
+       def visit_page
+         uri = link_inspector.uri
+         Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"
+
+         visit link_inspector.full_url
+
+         with_hooks_for(type: :each) do
+           @scan_result = scanning
+         end
+       end
+
+       def save_to_report(screenshot_operator: nil)
+         screenshot_path = save_screenshot(screenshot_operator)
+
+         report_store.record_page_visit(
+           page: link_inspector.full_url,
+           extracted_links: scan_result,
+           screenshot_filename: screenshot_path,
+           external: link_inspector.external_url?,
+           code: capybara_session.status_code
+         )
+       end
+
+       def before_page_scan; end
+
+       private
+
+       def scanning
+         link_scanner.scan(page: capybara_session) || []
+       end
+
+       def save_screenshot(screenshot_operator)
+         return unless screenshot_operator&.save_screenshots?
+
+         capybara_session.save_screenshot(
+           screenshot_operator.file_path(url: capybara_session.current_url)
+         )
+       end
+     end
+   end
+ end
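
PageInspector is the per-page unit of work: it points Capybara.app_host at the link's origin, visits the page inside the :each hooks, and records the visit together with the extracted links. A rough wiring sketch, assuming a configured Capybara session and a Reports::Store instance (the variable names below are illustrative):

    inspector = BrowserCrawler::EngineUtilities::PageInspector.new(
      link_inspector: link_inspector,   # a LinkInspector for the target URL
      capybara_session: Capybara.current_session,
      report_store: store               # a BrowserCrawler::Reports::Store
    )
    inspector.visit_page                # navigates and runs the :each hooks
    inspector.save_to_report            # records links, status code, screenshot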

data/lib/browser_crawler/errors/invalid_hooks_type.rb
@@ -0,0 +1,12 @@
+ module BrowserCrawler
+   module Errors
+     class InvalidHooksType < StandardError
+       def initialize(invalid_type:)
+         message = "Passed hooks type `#{invalid_type}` is invalid." \
+                   ' A type has to be one of the following values:' \
+                   " #{HooksContainer::VALID_TYPES.join(', ')}"
+         super(message)
+       end
+     end
+   end
+ end

data/lib/browser_crawler/followups/screenshots_indexer.rb
@@ -0,0 +1,40 @@
+ require 'erb'
+ module BrowserCrawler
+   module Followups
+     # Indexes screenshots captured by the crawler and creates an index.html from them.
+     # An ERB template can be provided; it receives the list of files.
+     class ScreenshotsIndexer
+       def initialize(template:)
+         @template = template || File.read(default_template_file)
+       end
+
+       # Produces index.html with links to the screenshots found in the `path` specified.
+       # Optionally, a file_mask can be provided to filter the files to be indexed.
+       def index_directory(path, file_mask: '*.png')
+         files = Dir[File.join(path, file_mask)].map { |file| File.basename(file) }
+         html = render_index(files: files)
+         index_path = File.join(path, 'index.html')
+         File.write(index_path, html)
+         index_path
+       end
+
+       # Renders an index of the pages recorded in a report, sorted by page URL.
+       def index_report(report)
+         sorted_pages = report.pages.sort_by { |(page, _data)| page }
+         files = sorted_pages.map { |(page, _data)| page }
+         render_index(files: files)
+       end
+
+       private
+
+       def default_template_file
+         File.join(__dir__, 'templates/index.html.erb')
+       end
+
+       def render_index(files:)
+         renderer = ERB.new(@template)
+         renderer.result(binding)
+       end
+     end
+   end
+ end
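
The indexer can be driven directly, outside the crawl binary; a short sketch (the directory path is illustrative, and passing template: nil falls back to the bundled ERB template):

    indexer = BrowserCrawler::Followups::ScreenshotsIndexer.new(template: nil)
    indexer.index_directory('tmp/screenshots', file_mask: '*.png')
    # => "tmp/screenshots/index.html" linking every captured screenshot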

data/lib/browser_crawler/followups/templates/index.html.erb
@@ -0,0 +1,69 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+ <html>
+ <head>
+   <title>Screenshots</title>
+
+   <style type="text/css">
+     body {
+       color: #eee;
+       margin-top: 20px;
+       font-family: Arial, "Helvetica Neue", Helvetica, sans-serif;
+     }
+
+     a {
+       color: #FFF;
+     }
+
+     a:hover {
+       color: yellow;
+       text-decoration: underline;
+     }
+
+     .thumbnails {
+       overflow: scroll;
+       height: 150px;
+     }
+
+     .thumbnails img {
+       height: 80px;
+       border: 4px solid #555;
+       padding: 1px;
+       margin: 0 10px 10px 0;
+     }
+
+     .thumbnails img:hover {
+       border: 4px solid #00ccff;
+       cursor: pointer;
+     }
+
+     .preview img {
+       border: 4px solid #444;
+       padding: 1px;
+       width: 800px;
+     }
+   </style>
+
+ </head>
+ <body>
+
+ <div class="gallery" align="center">
+   <h2>Screenshots captured</h2>
+
+   <br/>
+
+   <div class="thumbnails">
+     <% files.each_with_index do |img, index| %>
+       <img onmouseover="preview.src=img<%= index %>.src" name="img<%= index %>" src="<%= img %>" alt=""/>
+     <% end %>
+   </div>
+   <br/>
+
+   <div class="preview" align="center">
+     <img name="preview" src="<%= files.first %>" alt=""/>
+   </div>
+
+ </div>
+
+
+ </body>
+ </html>

data/lib/browser_crawler/followups/wraith_integrator.rb
@@ -0,0 +1,41 @@
+ require 'yaml'
+ require 'active_support/core_ext/string'
+
+ module BrowserCrawler
+   module Followups
+     # Updates the :paths section of the Wraith config file.
+     class WraithIntegrator
+       def initialize(report:)
+         @report = if report.respond_to?(:pages)
+                     report
+                   else
+                     YAML.safe_load(report, [Symbol]).symbolize_keys
+                   end
+       end
+
+       def update_config(wraith_config_file, path_suffix: nil)
+         config = YAML.safe_load(File.read(wraith_config_file))
+         config['paths'] = paths(with_suffix: path_suffix)
+         File.write(wraith_config_file, config.to_yaml)
+       end
+
+       # @return [Hash] sorted hash of page_name => path pairs, each appended with an optional suffix.
+       #   The page name equals the path, which makes it easy to navigate to the page from the Wraith gallery.
+       def paths(with_suffix: nil)
+         Hash[sorted_pages.map { |(k, v)| [k, "#{v}#{with_suffix}"] }]
+       end
+
+       def named_pages
+         @report[:pages].each_with_object({}) do |(page_url, _links), h|
+           page_path = URI(page_url.to_s).path
+           page_name = page_path.parameterize
+           h[page_name] = page_path
+         end
+       end
+
+       def sorted_pages
+         Hash[named_pages.sort_by { |(k, _v)| k }]
+       end
+     end
+   end
+ end
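
WraithIntegrator rewrites only the paths section of an existing Wraith config and leaves the rest of the file intact. A sketch, assuming a YAML report produced by a previous crawl and an existing wraith.yaml (both file names are illustrative):

    integrator = BrowserCrawler::Followups::WraithIntegrator.new(
      report: File.read('tmp/crawl_report.yaml')
    )
    # Optionally append a suffix to every recorded path.
    integrator.update_config('wraith.yaml', path_suffix: '?nocache=1')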

data/lib/browser_crawler/hooks_container.rb
@@ -0,0 +1,31 @@
+ require 'singleton'
+ require_relative 'errors/invalid_hooks_type'
+
+ module BrowserCrawler
+   class HooksContainer
+     include Singleton
+
+     VALID_TYPES = %i[each all unvisited_links scan_rules].freeze
+
+     def initialize
+       reset
+     end
+
+     def reset
+       @hooks_container = Hash.new { |h, k| h[k] = { each: [],
+                                                     all: [],
+                                                     unvisited_links: [],
+                                                     scan_rules: [] } }
+     end
+
+     attr_reader :hooks_container
+
+     def add_hook(method: :run_only_one, type:, hook: nil)
+       unless VALID_TYPES.include?(type)
+         raise Errors::InvalidHooksType.new(invalid_type: type)
+       end
+
+       @hooks_container[method][type.to_sym] << hook
+     end
+   end
+ end

data/lib/browser_crawler/hooks_operator.rb
@@ -0,0 +1,44 @@
+ module BrowserCrawler
+   module HooksOperator
+     def with_hooks_for(type:)
+       run_before_hooks(type: type)
+       yield
+       run_after_hooks(type: type)
+     end
+
+     def exchange_on_hooks(type:, &default_block)
+       hooks_array = BrowserCrawler::HooksContainer
+                     .instance.hooks_container[:run_only_one][type]
+
+       if hooks_array && !hooks_array.empty?
+         instance_exec(&hooks_array[0])
+       elsif block_given?
+         instance_exec(&default_block)
+       end
+     end
+
+     private
+
+     def run_before_hooks(type:)
+       before_hook = BrowserCrawler::HooksContainer.instance
+                     .hooks_container[:before][type]
+       return unless before_hook
+
+       run_hooks(before_hook)
+     end
+
+     def run_after_hooks(type:)
+       after_hook = BrowserCrawler::HooksContainer.instance
+                    .hooks_container[:after][type]
+       return unless after_hook
+
+       run_hooks(after_hook)
+     end
+
+     def run_hooks(hooks)
+       hooks.each do |hook|
+         instance_exec(&hook)
+       end
+     end
+   end
+ end
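
HooksOperator supports two execution modes: with_hooks_for runs any :before and :after hooks around a block, while exchange_on_hooks substitutes a :run_only_one hook for the block when one is registered. Before-hooks are looked up under the :before method key, so registering one might look like this (the log line is illustrative):

    BrowserCrawler::HooksContainer.instance.add_hook(
      method: :before,  # run ahead of the block wrapped by with_hooks_for
      type: :each,      # i.e. around every page visit
      hook: proc { puts "visiting #{page.current_url}" }
    )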

data/lib/browser_crawler/options.rb
@@ -0,0 +1,86 @@
+ require 'optparse'
+
+ module BrowserCrawler
+   module Options
+     module_function
+
+     def default_options
+       {
+         report_folder: 'tmp',
+         report_format: 'yaml',
+         window_width: 1024,
+         window_height: 768
+       }
+     end
+
+     def parse_args
+       options = {}
+       p = OptionParser.new do |opts|
+         opts.on_tail
+
+         opts.banner = 'Site crawler. Usage example: crawl http://localhost:3000'
+
+         opts.on('-U', '[--url] URL', 'Crawls the site starting from the URL specified. E.g. http://localhost:3000/welcome.') do |v|
+           options[:url] = v
+         end
+
+         opts.on('-u', '--user USERNAME', 'The authentication user name (optional).') do |v|
+           options[:username] = v
+         end
+
+         opts.on('-p', '--password PASSWORD', 'The authentication password (optional).') do |v|
+           options[:password] = v
+         end
+
+         opts.on('-n', '--max_pages NUM', 'The maximum number of pages to visit.') do |v|
+           options[:max_pages] = v.to_i
+         end
+
+         opts.on('-w', '--window_size WxH', 'Browser window size. Default: 1024x768.') do |v|
+           options[:window_width], options[:window_height] = v.split('x').map(&:to_i)
+         end
+
+         opts.on('-r', '--report FOLDER', 'The folder path to save the report to. '\
+                 'Default: tmp.') do |v|
+           options[:report_folder] = v
+         end
+
+         opts.on('-f', '--report_format TYPE', 'The report format to save the results in. '\
+                 'Default: yaml.') do |v|
+           options[:report_format] = v
+         end
+
+         opts.on('-s', '--screenshots_path PATH',
+                 'If specified along with the URL, screenshots are captured while visiting each page.'\
+                 ' Otherwise it is used to generate a screenshots index based on files captured previously.') do |v|
+           options[:screenshots_path] = v
+         end
+
+         opts.on('-t', '--template FILENAME',
+                 'Specify the template used for indexing.'\
+                 ' Default: followups/templates/index.html.erb.') do |v|
+           options[:index_template] = v
+         end
+
+         opts.on('-c', '--wraith_config FILENAME',
+                 'Update the config "paths" section with the pages extracted.') do |v|
+           options[:wraith_config] = v
+         end
+
+         opts.on('-h', '--help', 'Show this help message and exit.') do
+           puts opts
+         end
+       end
+       p.parse!
+
+       options[:url] = ARGV.pop unless ARGV.empty?
+
+       if options.empty?
+         puts p
+         exit
+       end
+
+       default_options.merge(options)
+     end
+   end
+ end
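
Options.parse_args backs the bin/crawl executable: parsed flags are merged over default_options, and a trailing bare argument becomes the start URL. A hedged sketch of how a driver script might consume it:

    # Hypothetical driver; bin/crawl wires these options into the engine.
    options = BrowserCrawler::Options.parse_args
    options[:url]           # e.g. 'http://localhost:3000' from the trailing argument
    options[:report_folder] # 'tmp' unless -r was given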