browser_crawler 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/browser_crawler.gemspec ADDED
@@ -0,0 +1,47 @@
+ # coding: utf-8
+ lib = File.expand_path('lib', __dir__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'browser_crawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = 'browser_crawler'
+   spec.version = BrowserCrawler::VERSION
+   spec.required_ruby_version = '>= 2.5.0'
+   spec.authors = ['Dmytro Samodurov',
+                   'Artem Rumiantcev',
+                   'Denys Ivanchuk',
+                   'Sergiy Tyatin']
+   spec.email = ['dimasamodurov@gmail.com', 'tema.place@gmail.com']
+   spec.licenses = ['MIT']
+
+   spec.summary = 'Simple site crawler using Capybara'
+   spec.description = ''
+   spec.homepage = 'https://github.com/DimaSamodurov/browser_crawler'
+
+   # Prevent pushing this gem to RubyGems.org.
+   # To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host
+   # or delete this section to allow pushing to any host.
+   if spec.respond_to?(:metadata)
+     spec.metadata['homepage_uri'] = spec.homepage
+     spec.metadata['source_code_uri'] = spec.homepage
+   else
+     raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
+   end
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = 'bin'
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.require_paths = ['lib']
+
+   spec.add_dependency 'activesupport', '~> 5.2', '>= 5.2.2'
+   spec.add_dependency 'capybara', '~> 3.24', '>= 3.24.0'
+   spec.add_dependency 'chromedriver-helper', '~> 2.1', '>= 2.1.0'
+   spec.add_dependency 'cuprite', '~> 0.6.0'
+
+   spec.add_development_dependency 'bundler', '~> 1.17.2', '>= 1.17.2'
+   spec.add_development_dependency 'pry-byebug', '~> 3.6', '>= 3.6'
+   spec.add_development_dependency 'rake', '~> 10.0'
+   spec.add_development_dependency 'rspec', '~> 3.0'
+   spec.add_development_dependency 'rubocop', '~> 0.66'
+ end
data/lib/browser_crawler.rb ADDED
@@ -0,0 +1,12 @@
+ require 'browser_crawler/version'
+ require 'browser_crawler/options'
+ require 'browser_crawler/engine'
+
+ require 'browser_crawler/followups/screenshots_indexer'
+ require 'browser_crawler/followups/wraith_integrator'
+
+ # Crawls a web site and extracts the links available.
+
+ module BrowserCrawler
+   # Your code goes here...
+ end
data/lib/browser_crawler/dsl/js_helpers.rb ADDED
@@ -0,0 +1,13 @@
+ module BrowserCrawler
+   module DSL
+     module JsHelpers
+       def wait_for_page_to_load
+         10.times do
+           return if page.evaluate_script('document.readyState') == 'complete'
+
+           sleep(0.5)
+         end
+       end
+     end
+   end
+ end
data/lib/browser_crawler/dsl/sign_in.rb ADDED
@@ -0,0 +1,37 @@
+ module BrowserCrawler
+   module DSL
+     module SignIn
+       def sign_in
+         visit '/'
+         pingfed_o365_login
+       end
+
+       def pingfed_login(force: true)
+         if force || page.has_content?('Enter your credentials')
+           fill_in 'input_username', with: ENV.fetch('username')
+           fill_in 'input_password', with: ENV.fetch('password')
+           click_on 'Login'
+         end
+       end
+
+       def o365_login(force: true)
+         if force || page.has_content?('Stay signed in?')
+           check 'DontShowAgain'
+           click_on 'Yes'
+         end
+       end
+
+       def o365_stay_signed_in(force: true)
+         if force || page.has_content?('Stay signed in?')
+           check 'DontShowAgain'
+           click_on 'Yes'
+         end
+       end
+
+       def pingfed_o365_login(force: true)
+         pingfed_login(force: force)
+         o365_stay_signed_in(force: force)
+       end
+     end
+   end
+ end
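The sign-in helpers above pull credentials straight from the environment, and the Engine class further down in this diff mixes in DSL::SignIn, so an authenticated crawl would be wired up roughly as follows. This is a hedged sketch: the ENV keys 'username' and 'password' are exactly the ones ENV.fetch expects above, but the credential values and the target URL are placeholders, and triggering sign_in from a before hook is an assumption about intended usage, not something this diff shows.

    ENV['username'] = 'crawler@example.com'  # placeholder value; the key name is what ENV.fetch reads
    ENV['password'] = 'change-me'            # placeholder value

    crawler = BrowserCrawler::Engine.new
    # Assumption: the before hook runs in a context where the Capybara DSL and
    # DSL::SignIn are available, as they are on the Engine instance itself.
    crawler.before { sign_in }
    crawler.extract_links(url: 'https://intranet.example.com')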
data/lib/browser_crawler/engine.rb ADDED
@@ -0,0 +1,156 @@
+ require 'capybara'
+ require 'capybara/dsl'
+ require 'logger'
+
+ require_relative 'dsl/sign_in'
+ require_relative 'dsl/js_helpers'
+ require_relative 'report_factory'
+ require_relative 'reports/store'
+ require_relative 'support/capybara'
+ require_relative 'screenshot_operator'
+ require_relative 'url_tools'
+ require_relative 'engine_utilities/crawl_manager'
+ require_relative 'hooks_operator'
+ require_relative 'hooks_container'
+
+ module BrowserCrawler
+   class Engine
+     include Capybara::DSL
+     include HooksOperator
+     include DSL::SignIn
+     include DSL::JsHelpers
+
+     class UnavailableCallBackMethod < StandardError
+     end
+
+     REPORT_SAVE_FOLDER_PATH = 'tmp'.freeze
+
+     CUPRITE_OPTIONS = {
+       window_size: [1280, 1600]
+     }.freeze
+
+     SCREENSHOT_OPERATOR_OPTIONS = {
+       save_screenshots: false,
+       save_screenshots_to: nil,
+       format: 'png',
+       filename: nil
+     }.freeze
+
+     attr_reader :report_store,
+                 :screenshot_operator,
+                 :crawl_manager,
+                 :logger
+
+     def initialize(browser_options: {},
+                    screenshots_options: {},
+                    max_pages: nil,
+                    deep_visit: false,
+                    logger: nil)
+       screenshots_operator_options = SCREENSHOT_OPERATOR_OPTIONS
+                                      .merge(screenshots_options)
+       @screenshot_operator = ScreenshotOperator.new(screenshots_operator_options)
+
+       cuprite_options = CUPRITE_OPTIONS.merge(browser_options)
+
+       @logger = logger || Logger.new(STDOUT)
+
+       register_chrome_driver(cuprite_options)
+       initialize_report_store(cuprite_options)
+       initialize_crawl_manager(max_pages, deep_visit)
+     end
+
+     def js_before_run(javascript: '')
+       return if javascript.empty?
+
+       @javascript_before_run = javascript
+     end
+
+     def extract_links(url:)
+       initialize_crawler(url)
+
+       begin
+         with_hooks_for(type: :all) do
+           crawl_manager.crawl(
+             target_url: url,
+             capybara_session: Capybara.current_session,
+             screenshot_operator: screenshot_operator
+           )
+         end
+       rescue StandardError => error
+         logger
+           .fatal("#{error.message} \n #{error.backtrace.join("\n")}")
+       ensure
+         @report_store.finish
+       end
+       self
+     end
+
+     def report_save(folder_path: '', type: :yaml)
+       save_folder_path = folder_path.empty? ? REPORT_SAVE_FOLDER_PATH : folder_path
+       ReportFactory.save(store: @report_store,
+                          type: type.to_sym,
+                          save_folder_path: save_folder_path)
+     end
+
+     def before(type: :all, &hook)
+       HooksContainer.instance.add_hook(method: :before, type: type, hook: hook)
+     end
+
+     def after(type: :all, &hook)
+       HooksContainer.instance.add_hook(method: :after, type: type, hook: hook)
+     end
+
+     def unvisited_links(&hook)
+       HooksContainer.instance.add_hook(type: :unvisited_links, hook: hook)
+     end
+
+     def change_page_scan_rules(&hook)
+       HooksContainer.instance.add_hook(type: :scan_rules, hook: hook)
+     end
+
+     private
+
+     def initialize_crawler(url)
+       Capybara.current_session.quit
+
+       uri = UrlTools.uri!(url: url)
+       Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"
+
+       @report_store.start(url: url)
+
+       return if @javascript_before_run.nil?
+
+       Capybara.current_session
+               .driver
+               .browser
+               .page
+               .command('Page.addScriptToEvaluateOnNewDocument',
+                        source: @javascript_before_run)
+     end
+
+     def initialize_report_store(cuprite_options)
+       @report_store = Reports::Store.new
+       @report_store.metadata[:screenshots_path] = screenshot_operator
+                                                   .screenshots_folder
+       @report_store.metadata[:window_width] = cuprite_options[:window_size][0]
+       @report_store.metadata[:window_height] = cuprite_options[:window_size][1]
+     end
+
+     def register_chrome_driver(cuprite_options)
+       Capybara.register_chrome_driver(:cuprite_chrome, options: cuprite_options)
+       Capybara.run_server = false
+       Capybara.default_driver = :cuprite_chrome
+       # a workaround for extracting data from inactive tabs, dialogs, etc.
+       Capybara.ignore_hidden_elements = false
+     end
+
+     def initialize_crawl_manager(max_pages, deep_visit)
+       @crawl_manager = EngineUtilities::CrawlManager.new(
+         report_store: report_store,
+         max_pages: max_pages.to_i,
+         deep_visit: deep_visit,
+         logger: @logger
+       )
+     end
+   end
+ end
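Taken together, Engine exposes a small public surface: construct it with browser, screenshot, and crawl-limit options, call extract_links, then persist the collected data with report_save. A minimal usage sketch based only on the signatures above; the target URL is a placeholder, reports default to the tmp folder via REPORT_SAVE_FOLDER_PATH, and :csv is assumed to be accepted by ReportFactory only because reports/csv_report.rb ships in this release.

    require 'browser_crawler'

    crawler = BrowserCrawler::Engine.new(
      browser_options: { window_size: [1280, 1600] },  # merged over CUPRITE_OPTIONS
      max_pages: 50,      # nil or 0 means no limit (see CrawlManager#limit_reached?)
      deep_visit: false   # when true, pages outside the target host are inspected too
    )

    crawler.js_before_run(javascript: 'window.__crawler = true;')  # optional script injected into each new document
    crawler.extract_links(url: 'https://example.com')
    crawler.report_save(type: :yaml)   # writes to tmp/ unless folder_path: is given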
data/lib/browser_crawler/engine_utilities/crawl_manager.rb ADDED
@@ -0,0 +1,100 @@
+ require_relative '../url_tools'
+ require_relative 'link_inspector'
+ require_relative 'page_inspector'
+ require_relative 'inspect_page_process'
+
+ module BrowserCrawler
+   module EngineUtilities
+     # The main orchestrating class; it controls the queue of unvisited links.
+     class CrawlManager
+
+       attr_reader :target_url,
+                   :unvisited_links_queue,
+                   :report_store,
+                   :host_name,
+                   :deep_visit,
+                   :max_pages,
+                   :logger,
+                   :page_inspector
+
+       def initialize(report_store:,
+                      max_pages: 0,
+                      deep_visit: false,
+                      logger: nil)
+         @report_store = report_store
+         @max_pages = max_pages
+         @deep_visit = deep_visit
+         @logger = logger || Logger.new(STDOUT)
+       end
+
+       def crawl(target_url:, capybara_session:, screenshot_operator: nil)
+         @host_name = UrlTools.uri!(url: target_url).host
+         @unvisited_links_queue = [target_url]
+
+         loop do
+           break if unvisited_links_queue.empty? || limit_reached?
+
+           unvisited_link = unvisited_links_queue.shift
+
+           link_inspector = LinkInspector.new(raw_link: unvisited_link,
+                                              host_name: host_name)
+
+           unless link_valid?(link_inspector)
+             @logger.info("Skipped visited #{unvisited_link}")
+             report_store.record_unrecognized_link(unvisited_link)
+             next
+           end
+
+           inspect_page(link_inspector: link_inspector,
+                        capybara_session: capybara_session,
+                        screenshot_operator: screenshot_operator)
+         end
+       end
+
+       def link_valid?(link_inspector)
+         link_inspector.link_valid? &&
+           internal_resource?(link_inspector) &&
+           page_unvisited?(link_inspector)
+       end
+
+       private
+
+       def inspect_page(link_inspector:, capybara_session:, screenshot_operator:)
+         InspectPageProcess.new(link_inspector: link_inspector,
+                                capybara_session: capybara_session,
+                                screenshot_operator: screenshot_operator,
+                                report_store: report_store,
+                                logger: logger)
+                           .call(unvisited_links_queue: unvisited_links_queue)
+       rescue StandardError => error
+         error_handler(link: link_inspector.raw_link, error: error)
+       end
+
+       def internal_resource?(link_inspector)
+         link_inspector.internal_url? || deep_visit
+       end
+
+       def page_unvisited?(link_inspector)
+         !visited_pages.include?(link_inspector.full_url)
+       end
+
+       def limit_reached?
+         return false if max_pages.zero?
+
+         visited_pages.count >= max_pages
+       end
+
+       def visited_pages
+         report_store.visited_pages
+       end
+
+       def error_handler(link:, error:)
+         error_link = "visiting link - #{link};\n"
+         error_message = "error message: #{error.message};\n"
+         error_backtrace = "error backtrace: #{error.backtrace.join("\n")};\n"
+         logger.error("Error: #{error_link} #{error_message} #{error_backtrace}")
+         report_store.record_crawler_error(link: link, error: error)
+       end
+     end
+   end
+ end
data/lib/browser_crawler/engine_utilities/inspect_page_process.rb ADDED
@@ -0,0 +1,74 @@
+ require_relative '../hooks_operator'
+
+ module BrowserCrawler
+   module EngineUtilities
+     # Inspects a given link and updates the crawl queue when necessary.
+     class InspectPageProcess
+       include Capybara::DSL
+       include HooksOperator
+
+       attr_reader :page_inspector,
+                   :screenshot_operator,
+                   :link_inspector,
+                   :logger
+
+       def initialize(link_inspector:,
+                      capybara_session:,
+                      report_store:,
+                      screenshot_operator: nil,
+                      logger:)
+         @page_inspector = PageInspector.new(
+           link_inspector: link_inspector,
+           capybara_session: capybara_session,
+           report_store: report_store
+         )
+         @link_inspector = link_inspector
+         @screenshot_operator = screenshot_operator
+         @logger = logger
+       end
+
+       def call(unvisited_links_queue:)
+         visit_page
+         update_queue(unvisited_links_queue: unvisited_links_queue)
+       end
+
+       private
+
+       def add_to_queue?(links:)
+         links && !links.empty?
+       end
+
+       # Returns an array of unvisited links; if an :unvisited_links hook is
+       # registered, it is executed in place of the default scan result.
+       def find_unvisited_links
+         exchange_on_hooks(type: :unvisited_links) do
+           @page_inspector.scan_result
+         end
+       end
+
+       def visit_page
+         logger.info("Visiting #{link_inspector.raw_link}")
+
+         @page_inspector.visit_page
+
+         @page_inspector.save_to_report(screenshot_operator: screenshot_operator)
+
+         logger
+           .info("#{@page_inspector.scan_result.size} links found on the page.")
+       end
+
+       def update_queue(unvisited_links_queue:)
+         unvisited_links = find_unvisited_links
+
+         logger
+           .info("#{unvisited_links.size} will add to unvisited links queue.")
+
+         return unless add_to_queue?(links: unvisited_links)
+
+         unvisited_links_queue.push(*unvisited_links).uniq!
+
+         logger.info("#{unvisited_links_queue.size} - current state the queue.")
+       end
+     end
+   end
+ end
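find_unvisited_links runs a registered :unvisited_links hook in place of the default @page_inspector.scan_result, which is the mechanism behind Engine#unvisited_links. Below is a hedged sketch of overriding link discovery this way; what, if anything, exchange_on_hooks yields to the block is not visible in this diff, so the block takes no arguments and simply pins the queue to a fixed set of URLs.

    crawler = BrowserCrawler::Engine.new
    # Assumption: the hook's return value is used verbatim as the list of links
    # to enqueue, exactly as scan_result would be in the default path.
    crawler.unvisited_links do
      ['https://example.com/docs', 'https://example.com/pricing']
    end
    crawler.extract_links(url: 'https://example.com')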