browser_crawler 0.4.0

Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/browser_crawler.gemspec ADDED
@@ -0,0 +1,47 @@
+ # coding: utf-8
+ lib = File.expand_path('lib', __dir__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'browser_crawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = 'browser_crawler'
+   spec.version = BrowserCrawler::VERSION
+   spec.required_ruby_version = '>= 2.5.0'
+   spec.authors = ['Dmytro Samodurov',
+                   'Artem Rumiantcev',
+                   'Denys Ivanchuk',
+                   'Sergiy Tyatin']
+   spec.email = ['dimasamodurov@gmail.com', 'tema.place@gmail.com']
+   spec.licenses = ['MIT']
+
+   spec.summary = 'Simple site crawler using Capybara'
+   spec.description = ''
+   spec.homepage = 'https://github.com/DimaSamodurov/browser_crawler'
+
+   # Prevent pushing this gem to RubyGems.org.
+   # To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host
+   # or delete this section to allow pushing to any host.
+   if spec.respond_to?(:metadata)
+     spec.metadata['homepage_uri'] = spec.homepage
+     spec.metadata['source_code_uri'] = spec.homepage
+   else
+     raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
+   end
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = 'bin'
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.require_paths = ['lib']
+
+   spec.add_dependency 'activesupport', '~> 5.2', '>= 5.2.2'
+   spec.add_dependency 'capybara', '~> 3.24', '>= 3.24.0'
+   spec.add_dependency 'chromedriver-helper', '~> 2.1', '>= 2.1.0'
+   spec.add_dependency 'cuprite', '~> 0.6.0'
+
+   spec.add_development_dependency 'bundler', '~> 1.17.2', '>= 1.17.2'
+   spec.add_development_dependency 'pry-byebug', '~> 3.6', '>= 3.6'
+   spec.add_development_dependency 'rake', '~> 10.0'
+   spec.add_development_dependency 'rspec', '~> 3.0'
+   spec.add_development_dependency 'rubocop', '~> 0.66'
+ end
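
For projects consuming the gem, the dependencies above are resolved by Bundler; a minimal Gemfile sketch follows (the version constraint simply mirrors the release shown on this page and is illustrative):

    # Gemfile of an application that wants to crawl its own pages.
    # Constraint mirrors this release; adjust as needed.
    source 'https://rubygems.org'

    gem 'browser_crawler', '~> 0.4.0'

Running `bundle install` then pulls in activesupport, capybara, chromedriver-helper and cuprite as declared above.
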
data/lib/browser_crawler.rb ADDED
@@ -0,0 +1,12 @@
+ require 'browser_crawler/version'
+ require 'browser_crawler/options'
+ require 'browser_crawler/engine'
+
+ require 'browser_crawler/followups/screenshots_indexer'
+ require 'browser_crawler/followups/wraith_integrator'
+
+ # Crawls a web site and extracts the links available.
+
+ module BrowserCrawler
+   # Your code goes here...
+ end
data/lib/browser_crawler/dsl/js_helpers.rb ADDED
@@ -0,0 +1,13 @@
+ module BrowserCrawler
+   module DSL
+     module JsHelpers
+       def wait_for_page_to_load
+         10.times do
+           return if page.evaluate_script('document.readyState') == 'complete'
+
+           sleep(0.5)
+         end
+       end
+     end
+   end
+ end
data/lib/browser_crawler/dsl/sign_in.rb ADDED
@@ -0,0 +1,37 @@
+ module BrowserCrawler
+   module DSL
+     module SignIn
+       def sign_in
+         visit '/'
+         pingfed_o365_login
+       end
+
+       def pingfed_login(force: true)
+         if force || page.has_content?('Enter your credentials')
+           fill_in 'input_username', with: ENV.fetch('username')
+           fill_in 'input_password', with: ENV.fetch('password')
+           click_on 'Login'
+         end
+       end
+
+       def o365_login(force: true)
+         if force || page.has_content?('Stay signed in?')
+           check 'DontShowAgain'
+           click_on 'Yes'
+         end
+       end
+
+       def o365_stay_signed_in(force: true)
+         if force || page.has_content?('Stay signed in?')
+           check 'DontShowAgain'
+           click_on 'Yes'
+         end
+       end
+
+       def pingfed_o365_login(force: true)
+         pingfed_login(force: force)
+         o365_stay_signed_in(force: force)
+       end
+     end
+   end
+ end
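
These helpers target PingFederate/Office 365 style login screens and read credentials from the `username` and `password` environment variables via `ENV.fetch`. A minimal sketch of wiring them into a crawl through the Engine class added further down in this changeset; it assumes the `:all` before-hook runs in the engine's own context (hook execution lives in `HooksOperator`, which is not part of this excerpt), so treat that as an assumption:

    require 'browser_crawler'

    # Hypothetical credentials; pingfed_login reads these via ENV.fetch.
    ENV['username'] ||= 'crawler@example.com'
    ENV['password'] ||= 'not-a-real-password'

    crawler = BrowserCrawler::Engine.new
    # Assumption: the block is executed on the engine instance, so the
    # SignIn helpers (sign_in, pingfed_o365_login, ...) are available here.
    crawler.before(type: :all) { sign_in }
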
data/lib/browser_crawler/engine.rb ADDED
@@ -0,0 +1,156 @@
+ require 'capybara'
+ require 'capybara/dsl'
+ require 'logger'
+
+ require_relative 'dsl/sign_in'
+ require_relative 'dsl/js_helpers'
+ require_relative 'report_factory'
+ require_relative 'reports/store'
+ require_relative 'support/capybara'
+ require_relative 'screenshot_operator'
+ require_relative 'url_tools'
+ require_relative 'engine_utilities/crawl_manager'
+ require_relative 'hooks_operator'
+ require_relative 'hooks_container'
+
+ module BrowserCrawler
+   class Engine
+     include Capybara::DSL
+     include HooksOperator
+     include DSL::SignIn
+     include DSL::JsHelpers
+
+     class UnavailableCallBackMethod < StandardError
+     end
+
+     REPORT_SAVE_FOLDER_PATH = 'tmp'.freeze
+
+     CUPRITE_OPTIONS = {
+       window_size: [1280, 1600]
+     }.freeze
+
+     SCREENSHOT_OPERATOR_OPTIONS = {
+       save_screenshots: false,
+       save_screenshots_to: nil,
+       format: 'png',
+       filename: nil
+     }.freeze
+
+     attr_reader :report_store,
+                 :screenshot_operator,
+                 :crawl_manager,
+                 :logger
+
+     def initialize(browser_options: {},
+                    screenshots_options: {},
+                    max_pages: nil,
+                    deep_visit: false,
+                    logger: nil)
+       screenshots_operator_options = SCREENSHOT_OPERATOR_OPTIONS
+                                      .merge(screenshots_options)
+       @screenshot_operator = ScreenshotOperator.new(screenshots_operator_options)
+
+       cuprite_options = CUPRITE_OPTIONS.merge(browser_options)
+
+       @logger = logger || Logger.new(STDOUT)
+
+       register_chrome_driver(cuprite_options)
+       initialize_report_store(cuprite_options)
+       initialize_crawl_manager(max_pages, deep_visit)
+     end
+
+     def js_before_run(javascript: '')
+       return if javascript.empty?
+
+       @javascript_before_run = javascript
+     end
+
+     def extract_links(url:)
+       initialize_crawler(url)
+
+       begin
+         with_hooks_for(type: :all) do
+           crawl_manager.crawl(
+             target_url: url,
+             capybara_session: Capybara.current_session,
+             screenshot_operator: screenshot_operator
+           )
+         end
+       rescue StandardError => error
+         logger
+           .fatal("#{error.message} \n #{error.backtrace.join("\n")}")
+       ensure
+         @report_store.finish
+       end
+       self
+     end
+
+     def report_save(folder_path: '', type: :yaml)
+       save_folder_path = folder_path.empty? ? REPORT_SAVE_FOLDER_PATH : folder_path
+       ReportFactory.save(store: @report_store,
+                          type: type.to_sym,
+                          save_folder_path: save_folder_path)
+     end
+
+     def before(type: :all, &hook)
+       HooksContainer.instance.add_hook(method: :before, type: type, hook: hook)
+     end
+
+     def after(type: :all, &hook)
+       HooksContainer.instance.add_hook(method: :after, type: type, hook: hook)
+     end
+
+     def unvisited_links(&hook)
+       HooksContainer.instance.add_hook(type: :unvisited_links, hook: hook)
+     end
+
+     def change_page_scan_rules(&hook)
+       HooksContainer.instance.add_hook(type: :scan_rules, hook: hook)
+     end
+
+     private
+
+     def initialize_crawler(url)
+       Capybara.current_session.quit
+
+       uri = UrlTools.uri!(url: url)
+       Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"
+
+       @report_store.start(url: url)
+
+       return if @javascript_before_run.nil?
+
+       Capybara.current_session
+               .driver
+               .browser
+               .page
+               .command('Page.addScriptToEvaluateOnNewDocument',
+                        source: @javascript_before_run)
+     end
+
+     def initialize_report_store(cuprite_options)
+       @report_store = Reports::Store.new
+       @report_store.metadata[:screenshots_path] = screenshot_operator
+                                                   .screenshots_folder
+       @report_store.metadata[:window_width] = cuprite_options[:window_size][0]
+       @report_store.metadata[:window_height] = cuprite_options[:window_size][1]
+     end
+
+     def register_chrome_driver(cuprite_options)
+       Capybara.register_chrome_driver(:cuprite_chrome, options: cuprite_options)
+       Capybara.run_server = false
+       Capybara.default_driver = :cuprite_chrome
+       # a workaround for extracting data from inactive tabs, dialogs, etc.
+       Capybara.ignore_hidden_elements = false
+     end
+
+     def initialize_crawl_manager(max_pages, deep_visit)
+       @crawl_manager = EngineUtilities::CrawlManager.new(
+         report_store: report_store,
+         max_pages: max_pages.to_i,
+         deep_visit: deep_visit,
+         logger: @logger
+       )
+     end
+   end
+ end
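
Engine is the gem's public entry point. A minimal end-to-end sketch based on the methods defined above; the URL, limits and output folder are placeholders, and the report type follows the `type: :yaml` default shown in `report_save`:

    require 'browser_crawler'

    crawler = BrowserCrawler::Engine.new(
      browser_options: { window_size: [1280, 1600] },   # merged over CUPRITE_OPTIONS
      screenshots_options: { save_screenshots: false },
      max_pages: 50,        # converted with #to_i; 0 (or nil) means no limit downstream
      deep_visit: false     # stay on the target host
    )

    # Optional: JavaScript injected into every new document before crawling.
    crawler.js_before_run(javascript: 'window.localStorage.clear();')

    crawler.extract_links(url: 'https://example.com')
    crawler.report_save(folder_path: 'tmp', type: :yaml)

Since `extract_links` returns `self`, the last two calls can also be chained.
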
data/lib/browser_crawler/engine_utilities/crawl_manager.rb ADDED
@@ -0,0 +1,100 @@
+ require_relative '../url_tools'
+ require_relative 'link_inspector'
+ require_relative 'page_inspector'
+ require_relative 'inspect_page_process'
+
+ module BrowserCrawler
+   module EngineUtilities
+     # The main class that controls the queue of unvisited links.
+     class CrawlManager
+
+       attr_reader :target_url,
+                   :unvisited_links_queue,
+                   :report_store,
+                   :host_name,
+                   :deep_visit,
+                   :max_pages,
+                   :logger,
+                   :page_inspector
+
+       def initialize(report_store:,
+                      max_pages: 0,
+                      deep_visit: false,
+                      logger: nil)
+         @report_store = report_store
+         @max_pages = max_pages
+         @deep_visit = deep_visit
+         @logger = logger || Logger.new(STDOUT)
+       end
+
+       def crawl(target_url:, capybara_session:, screenshot_operator: nil)
+         @host_name = UrlTools.uri!(url: target_url).host
+         @unvisited_links_queue = [target_url]
+
+         loop do
+           break if unvisited_links_queue.empty? || limit_reached?
+
+           unvisited_link = unvisited_links_queue.shift
+
+           link_inspector = LinkInspector.new(raw_link: unvisited_link,
+                                              host_name: host_name)
+
+           unless link_valid?(link_inspector)
+             @logger.info("Skipped visited #{unvisited_link}")
+             report_store.record_unrecognized_link(unvisited_link)
+             next
+           end
+
+           inspect_page(link_inspector: link_inspector,
+                        capybara_session: capybara_session,
+                        screenshot_operator: screenshot_operator)
+         end
+       end
+
+       def link_valid?(link_inspector)
+         link_inspector.link_valid? &&
+           internal_resource?(link_inspector) &&
+           page_unvisited?(link_inspector)
+       end
+
+       private
+
+       def inspect_page(link_inspector:, capybara_session:, screenshot_operator:)
+         InspectPageProcess.new(link_inspector: link_inspector,
+                                capybara_session: capybara_session,
+                                screenshot_operator: screenshot_operator,
+                                report_store: report_store,
+                                logger: logger)
+                           .call(unvisited_links_queue: unvisited_links_queue)
+       rescue StandardError => error
+         error_handler(link: link_inspector.raw_link, error: error)
+       end
+
+       def internal_resource?(link_inspector)
+         link_inspector.internal_url? || deep_visit
+       end
+
+       def page_unvisited?(link_inspector)
+         !visited_pages.include?(link_inspector.full_url)
+       end
+
+       def limit_reached?
+         return false if max_pages.zero?
+
+         visited_pages.count >= max_pages
+       end
+
+       def visited_pages
+         report_store.visited_pages
+       end
+
+       def error_handler(link:, error:)
+         error_link = "visiting link - #{link};\n"
+         error_message = "error message: #{error.message};\n"
+         error_backtrace = "error backtrace: #{error.backtrace.join("\n")};\n"
+         logger.error("Error: #{error_link} #{error_message} #{error_backtrace}")
+         report_store.record_crawler_error(link: link, error: error)
+       end
+     end
+   end
+ end
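
`CrawlManager` drives a breadth-first queue seeded with the target URL: `max_pages: 0` disables the page limit (see `limit_reached?`), and `deep_visit: true` lets external links pass `internal_resource?`. Engine normally builds and calls it, but a direct-use sketch looks roughly like this; it assumes a Capybara driver is already registered (Engine does that in `register_chrome_driver`) and omits the `report_store.start`/`finish` bookkeeping that Engine performs around the crawl:

    require 'browser_crawler'

    store   = BrowserCrawler::Reports::Store.new
    manager = BrowserCrawler::EngineUtilities::CrawlManager.new(
      report_store: store,
      max_pages: 10,      # stop once 10 pages have been visited; 0 = unlimited
      deep_visit: false   # follow only links on the target host
    )

    manager.crawl(
      target_url: 'https://example.com',
      capybara_session: Capybara.current_session
    )
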
data/lib/browser_crawler/engine_utilities/inspect_page_process.rb ADDED
@@ -0,0 +1,74 @@
+ require_relative '../hooks_operator'
+
+ module BrowserCrawler
+   module EngineUtilities
+     # Inspects a passed link and updates the crawl queue when necessary.
+     class InspectPageProcess
+       include Capybara::DSL
+       include HooksOperator
+
+       attr_reader :page_inspector,
+                   :screenshot_operator,
+                   :link_inspector,
+                   :logger
+
+       def initialize(link_inspector:,
+                      capybara_session:,
+                      report_store:,
+                      screenshot_operator: nil,
+                      logger:)
+         @page_inspector = PageInspector.new(
+           link_inspector: link_inspector,
+           capybara_session: capybara_session,
+           report_store: report_store
+         )
+         @link_inspector = link_inspector
+         @screenshot_operator = screenshot_operator
+         @logger = logger
+       end
+
+       def call(unvisited_links_queue:)
+         visit_page
+         update_queue(unvisited_links_queue: unvisited_links_queue)
+       end
+
+       private
+
+       def add_to_queue?(links:)
+         links && !links.empty?
+       end
+
+       # Returns an array of unvisited links.
+       # If a hook is registered, it is executed instead of the base scan behaviour.
+       def find_unvisited_links
+         exchange_on_hooks(type: :unvisited_links) do
+           @page_inspector.scan_result
+         end
+       end
+
+       def visit_page
+         logger.info("Visiting #{link_inspector.raw_link}")
+
+         @page_inspector.visit_page
+
+         @page_inspector.save_to_report(screenshot_operator: screenshot_operator)
+
+         logger
+           .info("#{@page_inspector.scan_result.size} links found on the page.")
+       end
+
+       def update_queue(unvisited_links_queue:)
+         unvisited_links = find_unvisited_links
+
+         logger
+           .info("#{unvisited_links.size} will add to unvisited links queue.")
+
+         return unless add_to_queue?(links: unvisited_links)
+
+         unvisited_links_queue.push(*unvisited_links).uniq!
+
+         logger.info("#{unvisited_links_queue.size} - current state the queue.")
+       end
+     end
+   end
+ end
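
`find_unvisited_links` goes through `exchange_on_hooks(type: :unvisited_links)`, so a hook registered with `Engine#unvisited_links` replaces the default scan result when deciding what gets pushed onto the queue. A hedged sketch of such a hook; the block's calling context and arguments depend on `HooksOperator`, which is not part of this excerpt, so the availability of Capybara's `page` inside the block is an assumption:

    require 'browser_crawler'

    crawler = BrowserCrawler::Engine.new

    # Replace "queue every link found on the page" with a filtered list,
    # e.g. keep logout links out of the crawl queue.
    crawler.unvisited_links do
      page.all('a', visible: :all)
          .map { |a| a['href'] }
          .compact
          .reject { |href| href.include?('/logout') }
    end
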