browser_crawler 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
@@ -0,0 +1,31 @@
1
+ require_relative '../url_tools'
2
+
3
+ module BrowserCrawler
4
+ module EngineUtilities
5
+ class LinkInspector
6
+ attr_reader :raw_link, :host_name, :uri
7
+
8
+ def initialize(raw_link:, host_name:)
9
+ @raw_link = raw_link
10
+ @host_name = host_name
11
+ @uri = UrlTools.uri(url: raw_link)
12
+ end
13
+
14
+ def external_url?
15
+ !internal_url?
16
+ end
17
+
18
+ def link_valid?
19
+ @link_valid ||= !uri.nil? && uri.host && uri.scheme
20
+ end
21
+
22
+ def internal_url?
23
+ @internal_url ||= !uri.nil? && uri.host == host_name
24
+ end
25
+
26
+ def full_url
27
+ @full_url ||= UrlTools.full_url(uri: uri)
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,38 @@
1
module BrowserCrawler
  module EngineUtilities
    # Collects candidate hrefs from the current page. Only pages on the
    # crawled host are scanned; external pages yield an empty list.
    class LinkScanner
      include Capybara::DSL
      include HooksOperator

      attr_reader :link_inspector

      def initialize(link_inspector:)
        @link_inspector = link_inspector
      end

      # @return [Array<String>] non-blank hrefs found on the page, or an
      #   empty array when the page is external to the crawled host.
      def scan(page:)
        return [] unless link_inspector.internal_url?

        get_page_links(page: page)
      end

      private

      def get_page_links(page:)
        remove_blank_links(link_matcher(page: page))
      end

      # Drops nil and empty-string hrefs.
      def remove_blank_links(links)
        links.reject { |href| href.nil? || href.empty? }
      end

      # Gathers every anchor's href attribute. When a :scan_rules hook is
      # registered, that hook runs instead of this default collection.
      def link_matcher(page:)
        exchange_on_hooks(type: :scan_rules) do
          page.all('a').map { |anchor| anchor['href'] }
        end
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
require_relative '../url_tools'
require_relative '../hooks_operator'
require_relative 'link_scanner'

module BrowserCrawler
  module EngineUtilities
    # Visits a single page through the Capybara session, scans it for links
    # (wrapped by the :each hooks), and records the visit — optionally with
    # a screenshot — into the report store.
    class PageInspector
      include Capybara::DSL
      include HooksOperator

      attr_accessor :link_inspector,
                    :link_scanner,
                    :capybara_session,
                    :scan_result,
                    :report_store

      # @param link_inspector [LinkInspector] the link being visited
      # @param capybara_session [Capybara::Session] driver session to browse with
      # @param report_store [Reports::Store] sink for visit records
      def initialize(link_inspector:, capybara_session:, report_store:)
        @link_inspector = link_inspector
        @capybara_session = capybara_session
        @report_store = report_store
        @scan_result = []
        @link_scanner = LinkScanner.new(link_inspector: link_inspector)
      end

      # Opens the page and collects its links into #scan_result.
      def visit_page
        uri = link_inspector.uri
        # app_host must be set before `visit` so relative paths resolve
        # against the link's own scheme/host/port.
        Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"

        visit link_inspector.full_url

        # :each before-hooks run prior to scanning; after-hooks run after.
        with_hooks_for(type: :each) do
          @scan_result = scanning
        end
      end

      # Persists the visit (links, status code, optional screenshot path).
      # @param screenshot_operator [ScreenshotOperator, nil] when present and
      #   enabled, a screenshot is captured and its path recorded
      def save_to_report(screenshot_operator: nil)
        screenshot_path = save_screenshot(screenshot_operator)

        report_store.record_page_visit(
          page: link_inspector.full_url,
          extracted_links: scan_result,
          screenshot_filename: screenshot_path,
          external: link_inspector.external_url?,
          code: capybara_session.status_code
        )
      end

      # Intentional no-op extension point invoked before a page scan.
      def before_page_scan; end

      private

      # Guards against a scanner returning nil so #scan_result stays an Array.
      def scanning
        link_scanner.scan(page: capybara_session) || []
      end

      # Captures a screenshot of the current page; returns its path, or nil
      # when no operator is given or screenshots are disabled.
      def save_screenshot(screenshot_operator)
        return unless screenshot_operator&.save_screenshots?

        capybara_session.save_screenshot(
          screenshot_operator.file_path(url: capybara_session.current_url)
        )
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module BrowserCrawler
  module Errors
    # Raised when a hook is registered under a type that is not one of
    # HooksContainer::VALID_TYPES.
    class InvalidHooksType < StandardError
      # @param invalid_type [Symbol, Object] the rejected hook type
      def initialize(invalid_type:)
        allowed = HooksContainer::VALID_TYPES.join(', ')
        super("Passed hooks type `#{invalid_type}` is invalid." \
              ' A type has to apply one of the follow values:' \
              " #{allowed}")
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
require 'erb'
module BrowserCrawler
  module Followups
    # Indexes screenshots captured by the crawler, creates index.html from the captured screenshots.
    # ERB Template can be provided that will receive the list of files.
    class ScreenshotsIndexer
      # @param template [String, nil] ERB template source; when nil the
      #   bundled templates/index.html.erb is read and used instead.
      def initialize(template:)
        @template = template || File.read(default_template_file)
      end

      # Produce index.html with links to screenshots found in the `path` specified.
      # Optionally file_mask can be provided to filter out files to be indexed.
      # @return [String] path of the index.html that was written
      def index_directory(path, file_mask: '*.png')
        files = Dir[File.join(path, file_mask)].map { |file| File.basename(file) }
        html = render_index(files: files)
        index_path = File.join(path, 'index.html')
        File.write(index_path, html)
        index_path
      end

      # NOTE(review): this method looks unfinished — `Hash[...]` is fed a
      # flat array of page keys (not pairs), the result is never rendered
      # or written anywhere, and the assigned `files` is unused. Confirm
      # the intended behavior before relying on it.
      def index_report(report)
        sorted_pages = Hash[report.pages.sort_by { |(k, _v)| k }]
        files = Hash[sorted_pages.map do |(k, _v)|
          k
        end]
      end

      private

      # Path of the default ERB template shipped with the gem.
      def default_template_file
        File.join(__dir__, 'templates/index.html.erb')
      end

      # Renders @template; the local `files` is exposed to the ERB via
      # `binding`, so templates reference it directly.
      def render_index(files:)
        renderer = ERB.new(@template)
        renderer.result(binding)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
<%#
  Screenshot gallery template rendered by Followups::ScreenshotsIndexer.
  Receives `files`, an array of screenshot file names relative to the
  generated index.html. Hovering a thumbnail swaps its src into the large
  named `preview` image below. (ERB comment: emits nothing into output.)
%>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
  <title>Screenshots</title>

  <style type="text/css">
    body {
      color: #eee;
      margin-top: 20px;
      font-family: Arial, "Helvetica Neue", Helvetica, sans-serif;
    }

    a {
      color: #FFF;
    }

    a:hover {
      color: yellow;
      text-decoration: underline;
    }

    .thumbnails {
      overflow: scroll;
      height: 150px;
    }

    .thumbnails img {
      height: 80px;
      border: 4px solid #555;
      padding: 1px;
      margin: 0 10px 10px 0;
    }

    .thumbnails img:hover {
      border: 4px solid #00ccff;
      cursor: pointer;
    }

    .preview img {
      border: 4px solid #444;
      padding: 1px;
      width: 800px;
    }
  </style>

</head>
<body>

<div class="gallery" align="center">
  <h2>Screenshots captured</h2>

  <br/>

  <div class="thumbnails">
    <% files.each_with_index do |img, index| %>
      <img onmouseover="preview.src=img<%= index %>.src" name="img<%= index %>" src="<%= img %>" alt=""/>
    <% end %>
  </div>
  <br/>

  <div class="preview" align="center">
    <img name="preview" src="<%= files.first %>" alt=""/>
  </div>

</div>


</body>
</html>
@@ -0,0 +1,41 @@
1
require 'yaml'
require 'active_support/core_ext/string'

module BrowserCrawler
  module Followups
    # Updates the :paths section of the Wraith's config file.
    class WraithIntegrator
      # @param report [#pages, String] either a report store object that
      #   responds to #pages, or a YAML string of a previously saved report.
      # NOTE(review): the positional permitted-classes form of
      # YAML.safe_load was removed in Psych 4 (Ruby 3.1+); newer Rubies
      # require `permitted_classes: [Symbol]` — confirm the supported range.
      # NOTE(review): #named_pages indexes @report with [:pages] even on the
      # object branch — verify the store object supports #[] as well.
      def initialize(report:)
        @report = if report.respond_to?(:pages)
                    report
                  else
                    YAML.safe_load(report, [Symbol]).symbolize_keys
                  end
      end

      # Rewrites the 'paths' section of the given Wraith config file with
      # the crawled pages, each optionally suffixed (e.g. a query string).
      def update_config(wraith_config_file, path_suffix: nil)
        config = YAML.safe_load(File.read(wraith_config_file))
        config['paths'] = paths(with_suffix: path_suffix)
        File.write(wraith_config_file, config.to_yaml)
      end

      # @return [Hash] sorted hash of page_name => path pair values appended with optional suffix.
      # Page name equals to path which makes it easy to navigate the page from the Wraith gallery.
      def paths(with_suffix: nil)
        Hash[sorted_pages.map { |(k, v)| [k, "#{v}#{with_suffix}"] }]
      end

      # Maps each crawled page URL to a parameterized name => path pair;
      # the name is derived from the path via String#parameterize.
      def named_pages
        @report[:pages].each_with_object({}) do |(page_url, _links), h|
          page_path = URI(page_url.to_s).path
          page_name = page_path.parameterize
          h[page_name] = page_path
        end
      end

      # #named_pages sorted by page name for stable config output.
      def sorted_pages
        Hash[named_pages.sort_by { |(k, _v)| k }]
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'singleton'
2
+ require_relative 'errors/invalid_hooks_type'
3
+
4
module BrowserCrawler
  # Process-wide (Singleton) registry of crawler hooks, keyed first by a
  # method bucket (:run_only_one by default; callers also use :before /
  # :after) and then by hook type.
  class HooksContainer
    include Singleton

    VALID_TYPES = %i[each all unvisited_links scan_rules].freeze

    attr_reader :hooks_container

    def initialize
      reset
    end

    # Drops every registered hook. Each bucket lazily receives a fresh
    # hash mapping every valid type to its own empty array.
    def reset
      @hooks_container = Hash.new do |container, bucket|
        container[bucket] = VALID_TYPES.each_with_object({}) do |type, hooks|
          hooks[type] = []
        end
      end
    end

    # Registers `hook` under the given bucket and type.
    # @raise [Errors::InvalidHooksType] when `type` is not a valid type
    def add_hook(method: :run_only_one, type:, hook: nil)
      unless VALID_TYPES.include?(type)
        raise Errors::InvalidHooksType.new(invalid_type: type)
      end

      @hooks_container[method][type.to_sym] << hook
    end
  end
end
@@ -0,0 +1,44 @@
1
module BrowserCrawler
  # Mixin that executes hooks registered in HooksContainer around — or in
  # place of — the including object's behavior. Hooks run via
  # instance_exec, so they see the includer's instance state.
  module HooksOperator
    # Runs the :before hooks for `type`, then the given block, then the
    # :after hooks.
    def with_hooks_for(type:)
      run_before_hooks(type: type)
      yield
      run_after_hooks(type: type)
    end

    # Executes the first registered :run_only_one hook for `type` instead
    # of the default block; falls back to the block when none is present.
    def exchange_on_hooks(type:, &default_block)
      hooks = BrowserCrawler::HooksContainer
              .instance.hooks_container[:run_only_one][type]

      if hooks.nil? || hooks.empty?
        instance_exec(&default_block) if block_given?
      else
        instance_exec(&hooks.first)
      end
    end

    private

    def run_before_hooks(type:)
      hooks = BrowserCrawler::HooksContainer.instance
                                            .hooks_container[:before][type]
      run_hooks(hooks) if hooks
    end

    def run_after_hooks(type:)
      hooks = BrowserCrawler::HooksContainer.instance
                                            .hooks_container[:after][type]
      run_hooks(hooks) if hooks
    end

    # Runs each hook in registration order in the includer's context.
    def run_hooks(hooks)
      hooks.each { |hook| instance_exec(&hook) }
    end
  end
end
@@ -0,0 +1,86 @@
1
require 'optparse'

module BrowserCrawler
  # Command line option parsing for the `crawl` executable.
  module Options
    module_function

    # Defaults applied for any flag the user did not supply.
    def default_options
      {
        report_folder: 'tmp',
        report_format: 'yaml',
        window_width: 1024,
        window_height: 768
      }
    end

    # Parses ARGV into an options hash merged over #default_options.
    # A trailing bare argument is treated as the start URL. Prints usage
    # and exits when no options were supplied or when -h/--help is given.
    # @return [Hash] symbol-keyed options
    def parse_args
      options = {}
      parser = OptionParser.new do |opts|
        opts.on_tail

        opts.banner = 'Site crawler. Usage example: crawl http://localhost:3000'

        opts.on('-U', '[--url] URL', 'Crawls the site starting from the url specified. E.g. http://localhost:3000/welcome.') do |v|
          options[:url] = v
        end

        opts.on('-u', '--user USERNAME', 'The authentication user name (optional).') do |v|
          options[:username] = v
        end

        opts.on('-p', '--password PASSWORD', 'The authentication password (optional).') do |v|
          options[:password] = v
        end

        opts.on('-n', '--max_pages NUM', 'The maximum number of pages to visit.') do |v|
          options[:max_pages] = v.to_i
        end

        opts.on('-w', '--window_size WxH', 'Browser window size. Default 1024x768') do |v|
          # Coerce to integers so explicit sizes match the integer defaults
          # (previously these stayed strings, unlike default_options).
          options[:window_width], options[:window_height] = v.split('x').map(&:to_i)
        end

        opts.on('-r', '--report FOLDER', 'The folder path to save report to. '\
                'Default: tmp') do |v|
          options[:report_folder] = v
        end

        opts.on('-f', '--report_format TYPE', 'The report type to save result '\
                'Default: yaml') do |v|
          options[:report_format] = v
        end

        opts.on('-s', '--screenshots_path PATH',
                'If specified along with the url, screenshots are captured visiting each page.'\
                ' Otherwise used to generate a screenshots index based on files captured previously. ') do |v|
          options[:screenshots_path] = v
        end

        opts.on('-t', '--template FILENAME',
                'Specify the template used for indexing.'\
                ' Default: followups/templates/index.html.erb') do |v|
          options[:index_template] = v
        end

        opts.on('-c', '--wraith_config FILENAME',
                'Update config "paths" section with the pages extracted.') do |v|
          options[:wraith_config] = v
        end

        opts.on('-h', '--help', 'Show this help message and exit.') do
          puts opts
          # The flag promises "and exit"; without this the crawl would
          # continue after printing the help text.
          exit
        end
      end
      parser.parse!

      # A bare trailing argument is the start URL.
      options[:url] = ARGV.pop unless ARGV.empty?

      if options.empty?
        puts parser
        exit
      end

      default_options.merge(options)
    end
  end
end