browser_crawler 0.4.0
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +277 -0
- data/Rakefile +7 -0
- data/bin/console +10 -0
- data/bin/crawl +51 -0
- data/bin/setup +8 -0
- data/browser_crawler.gemspec +47 -0
- data/lib/browser_crawler.rb +12 -0
- data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
- data/lib/browser_crawler/dsl/sign_in.rb +37 -0
- data/lib/browser_crawler/engine.rb +156 -0
- data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
- data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
- data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
- data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
- data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
- data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
- data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
- data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
- data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
- data/lib/browser_crawler/hooks_container.rb +31 -0
- data/lib/browser_crawler/hooks_operator.rb +44 -0
- data/lib/browser_crawler/options.rb +86 -0
- data/lib/browser_crawler/report_factory.rb +22 -0
- data/lib/browser_crawler/reports/csv_report.rb +75 -0
- data/lib/browser_crawler/reports/store.rb +114 -0
- data/lib/browser_crawler/reports/yaml_report.rb +15 -0
- data/lib/browser_crawler/screenshot_operator.rb +47 -0
- data/lib/browser_crawler/support/capybara.rb +20 -0
- data/lib/browser_crawler/url_tools.rb +32 -0
- data/lib/browser_crawler/version.rb +3 -0
- metadata +244 -0
data/browser_crawler.gemspec
ADDED
@@ -0,0 +1,47 @@
# coding: utf-8
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'browser_crawler/version'

Gem::Specification.new do |spec|
  spec.name                  = 'browser_crawler'
  spec.version               = BrowserCrawler::VERSION
  spec.required_ruby_version = '>= 2.5.0'
  spec.authors               = ['Dmytro Samodurov',
                                'Artem Rumiantcev',
                                'Denys Ivanchuk',
                                'Sergiy Tyatin']
  spec.email                 = ['dimasamodurov@gmail.com', 'tema.place@gmail.com']
  spec.licenses              = ['MIT']

  spec.summary     = 'Simple site crawler using Capybara'
  spec.description = ''
  spec.homepage    = 'https://github.com/DimaSamodurov/browser_crawler'

  # Prevent pushing this gem to RubyGems.org.
  # To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host
  # or delete this section to allow pushing to any host.
  if spec.respond_to?(:metadata)
    spec.metadata['homepage_uri']    = spec.homepage
    spec.metadata['source_code_uri'] = spec.homepage
  else
    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
  end

  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = 'bin'
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_dependency 'activesupport', '~> 5.2', '>= 5.2.2'
  spec.add_dependency 'capybara', '~> 3.24', '>= 3.24.0'
  spec.add_dependency 'chromedriver-helper', '~> 2.1', '>= 2.1.0'
  spec.add_dependency 'cuprite', '~> 0.6.0'

  spec.add_development_dependency 'bundler', '~> 1.17.2', '>= 1.17.2'
  spec.add_development_dependency 'pry-byebug', '~> 3.6', '>= 3.6'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rubocop', '~> 0.66'
end
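The gemspec above targets Ruby 2.5+ and declares Capybara and Cuprite as runtime dependencies, so a Chrome or Chromium binary must be available at crawl time. A minimal sketch of a Gemfile entry pinned to this release (the version constraint is illustrative):

gem 'browser_crawler', '~> 0.4.0' # pulls in activesupport, capybara, and cuprite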
data/lib/browser_crawler.rb
ADDED
@@ -0,0 +1,12 @@
require 'browser_crawler/version'
require 'browser_crawler/options'
require 'browser_crawler/engine'

require 'browser_crawler/followups/screenshots_indexer'
require 'browser_crawler/followups/wraith_integrator'

# Crawls a web site and extracts the available links.
module BrowserCrawler
  # Your code goes here...
end
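The entry point above only loads the library; the public interface is BrowserCrawler::Engine, defined later in this diff. A minimal sketch of a crawl using that API (the URL is a placeholder, and Chrome must be available for the Cuprite driver):

require 'browser_crawler'

engine = BrowserCrawler::Engine.new(max_pages: 10)
engine.extract_links(url: 'https://example.com')
engine.report_save(type: :yaml) # writes under tmp/ by default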
data/lib/browser_crawler/dsl/sign_in.rb
ADDED
@@ -0,0 +1,37 @@
module BrowserCrawler
  module DSL
    module SignIn
      def sign_in
        visit '/'
        pingfed_o365_login
      end

      def pingfed_login(force: true)
        if force || page.has_content?('Enter your credentials')
          fill_in 'input_username', with: ENV.fetch('username')
          fill_in 'input_password', with: ENV.fetch('password')
          click_on 'Login'
        end
      end

      def o365_login(force: true)
        if force || page.has_content?('Stay signed in?')
          check 'DontShowAgain'
          click_on 'Yes'
        end
      end

      def o365_stay_signed_in(force: true)
        if force || page.has_content?('Stay signed in?')
          check 'DontShowAgain'
          click_on 'Yes'
        end
      end

      def pingfed_o365_login(force: true)
        pingfed_login(force: force)
        o365_stay_signed_in(force: force)
      end
    end
  end
end
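These helpers assume a PingFederate/Office 365 login flow and read credentials with ENV.fetch, so the username and password environment variables must be set or a KeyError is raised. A sketch of triggering the sign-in from an Engine before hook; note that the exact context a hook body runs in depends on HooksOperator, which is not shown in this excerpt:

# export username=... and password=... in the shell beforehand
engine = BrowserCrawler::Engine.new
engine.before(type: :all) { sign_in } # Engine includes DSL::SignIn
engine.extract_links(url: 'https://intranet.example.com') # placeholder URL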
data/lib/browser_crawler/engine.rb
ADDED
@@ -0,0 +1,156 @@
require 'capybara'
require 'capybara/dsl'
require 'logger'

require_relative 'dsl/sign_in'
require_relative 'dsl/js_helpers'
require_relative 'report_factory'
require_relative 'reports/store'
require_relative 'support/capybara'
require_relative 'screenshot_operator'
require_relative 'url_tools'
require_relative 'engine_utilities/crawl_manager'
require_relative 'hooks_operator'
require_relative 'hooks_container'

module BrowserCrawler
  class Engine
    include Capybara::DSL
    include HooksOperator
    include DSL::SignIn
    include DSL::JsHelpers

    class UnavailableCallBackMethod < StandardError
    end

    REPORT_SAVE_FOLDER_PATH = 'tmp'.freeze

    CUPRITE_OPTIONS = {
      window_size: [1280, 1600]
    }.freeze

    SCREENSHOT_OPERATOR_OPTIONS = {
      save_screenshots: false,
      save_screenshots_to: nil,
      format: 'png',
      filename: nil
    }.freeze

    attr_reader :report_store,
                :screenshot_operator,
                :crawl_manager,
                :logger

    def initialize(browser_options: {},
                   screenshots_options: {},
                   max_pages: nil,
                   deep_visit: false,
                   logger: nil)
      screenshots_operator_options = SCREENSHOT_OPERATOR_OPTIONS
                                     .merge(screenshots_options)
      @screenshot_operator = ScreenshotOperator.new(screenshots_operator_options)

      cuprite_options = CUPRITE_OPTIONS.merge(browser_options)

      @logger = logger || Logger.new(STDOUT)

      register_chrome_driver(cuprite_options)
      initialize_report_store(cuprite_options)
      initialize_crawl_manager(max_pages, deep_visit)
    end

    def js_before_run(javascript: '')
      return if javascript.empty?

      @javascript_before_run = javascript
    end

    def extract_links(url:)
      initialize_crawler(url)

      begin
        with_hooks_for(type: :all) do
          crawl_manager.crawl(
            target_url: url,
            capybara_session: Capybara.current_session,
            screenshot_operator: screenshot_operator
          )
        end
      rescue StandardError => error
        logger
          .fatal("#{error.message} \n #{error.backtrace.join("\n")}")
      ensure
        @report_store.finish
      end
      self
    end

    def report_save(folder_path: '', type: :yaml)
      save_folder_path = folder_path.empty? ? REPORT_SAVE_FOLDER_PATH : folder_path
      ReportFactory.save(store: @report_store,
                         type: type.to_sym,
                         save_folder_path: save_folder_path)
    end

    def before(type: :all, &hook)
      HooksContainer.instance.add_hook(method: :before, type: type, hook: hook)
    end

    def after(type: :all, &hook)
      HooksContainer.instance.add_hook(method: :after, type: type, hook: hook)
    end

    def unvisited_links(&hook)
      HooksContainer.instance.add_hook(type: :unvisited_links, hook: hook)
    end

    def change_page_scan_rules(&hook)
      HooksContainer.instance.add_hook(type: :scan_rules, hook: hook)
    end

    private

    def initialize_crawler(url)
      Capybara.current_session.quit

      uri = UrlTools.uri!(url: url)
      Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"

      @report_store.start(url: url)

      return if @javascript_before_run.nil?

      Capybara.current_session
              .driver
              .browser
              .page
              .command('Page.addScriptToEvaluateOnNewDocument',
                       source: @javascript_before_run)
    end

    def initialize_report_store(cuprite_options)
      @report_store = Reports::Store.new
      @report_store.metadata[:screenshots_path] = screenshot_operator
                                                  .screenshots_folder
      @report_store.metadata[:window_width] = cuprite_options[:window_size][0]
      @report_store.metadata[:window_height] = cuprite_options[:window_size][1]
    end

    def register_chrome_driver(cuprite_options)
      Capybara.register_chrome_driver(:cuprite_chrome, options: cuprite_options)
      Capybara.run_server = false
      Capybara.default_driver = :cuprite_chrome
      # A workaround to allow extracting data from inactive tabs, dialogs, etc.
      Capybara.ignore_hidden_elements = false
    end

    def initialize_crawl_manager(max_pages, deep_visit)
      @crawl_manager = EngineUtilities::CrawlManager.new(
        report_store: report_store,
        max_pages: max_pages.to_i,
        deep_visit: deep_visit,
        logger: @logger
      )
    end
  end
end
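Tying the Engine API together: a sketch of a crawl that caps the page count, enables screenshots, and injects a script before each page load via the CDP Page.addScriptToEvaluateOnNewDocument command shown above. The paths, URL, and script are placeholders, and type: :csv assumes ReportFactory maps that symbol to the CSV report listed in this release:

engine = BrowserCrawler::Engine.new(
  browser_options: { window_size: [1280, 1600] },
  screenshots_options: { save_screenshots: true,
                         save_screenshots_to: 'tmp/screenshots' },
  max_pages: 50,
  deep_visit: false # stay on the starting host
)
engine.js_before_run(javascript: 'console.log("injected before each page")')
engine.extract_links(url: 'https://example.com')
engine.report_save(folder_path: 'tmp/reports', type: :csv)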
data/lib/browser_crawler/engine_utilities/crawl_manager.rb
ADDED
@@ -0,0 +1,100 @@
require_relative '../url_tools'
require_relative 'link_inspector'
require_relative 'page_inspector'
require_relative 'inspect_page_process'

module BrowserCrawler
  module EngineUtilities
    # The main operating class: controls the queue of unvisited links.
    class CrawlManager
      attr_reader :target_url,
                  :unvisited_links_queue,
                  :report_store,
                  :host_name,
                  :deep_visit,
                  :max_pages,
                  :logger,
                  :page_inspector

      def initialize(report_store:,
                     max_pages: 0,
                     deep_visit: false,
                     logger: nil)
        @report_store = report_store
        @max_pages = max_pages
        @deep_visit = deep_visit
        @logger = logger || Logger.new(STDOUT)
      end

      def crawl(target_url:, capybara_session:, screenshot_operator: nil)
        @host_name = UrlTools.uri!(url: target_url).host
        @unvisited_links_queue = [target_url]

        loop do
          break if unvisited_links_queue.empty? || limit_reached?

          unvisited_link = unvisited_links_queue.shift

          link_inspector = LinkInspector.new(raw_link: unvisited_link,
                                             host_name: host_name)

          unless link_valid?(link_inspector)
            @logger.info("Skipped link: #{unvisited_link}")
            report_store.record_unrecognized_link(unvisited_link)
            next
          end

          inspect_page(link_inspector: link_inspector,
                       capybara_session: capybara_session,
                       screenshot_operator: screenshot_operator)
        end
      end

      def link_valid?(link_inspector)
        link_inspector.link_valid? &&
          internal_resource?(link_inspector) &&
          page_unvisited?(link_inspector)
      end

      private

      def inspect_page(link_inspector:, capybara_session:, screenshot_operator:)
        InspectPageProcess.new(link_inspector: link_inspector,
                               capybara_session: capybara_session,
                               screenshot_operator: screenshot_operator,
                               report_store: report_store,
                               logger: logger)
                          .call(unvisited_links_queue: unvisited_links_queue)
      rescue StandardError => error
        error_handler(link: link_inspector.raw_link, error: error)
      end

      def internal_resource?(link_inspector)
        link_inspector.internal_url? || deep_visit
      end

      def page_unvisited?(link_inspector)
        !visited_pages.include?(link_inspector.full_url)
      end

      def limit_reached?
        return false if max_pages.zero?

        visited_pages.count >= max_pages
      end

      def visited_pages
        report_store.visited_pages
      end

      def error_handler(link:, error:)
        error_link = "visiting link - #{link};\n"
        error_message = "error message: #{error.message};\n"
        error_backtrace = "error backtrace: #{error.backtrace.join("\n")};\n"
        logger.error("Error: #{error_link} #{error_message} #{error_backtrace}")
        report_store.record_crawler_error(link: link, error: error)
      end
    end
  end
end
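CrawlManager is normally driven by Engine#extract_links, which also brackets the crawl with report_store.start and report_store.finish; a direct invocation should do the same so the report is well-formed. A sketch, assuming Capybara is already configured with a driver:

store = BrowserCrawler::Reports::Store.new
manager = BrowserCrawler::EngineUtilities::CrawlManager.new(
  report_store: store,
  max_pages: 5 # 0 disables the limit, per limit_reached?
)
store.start(url: 'https://example.com')
manager.crawl(target_url: 'https://example.com',
              capybara_session: Capybara.current_session)
store.finish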
data/lib/browser_crawler/engine_utilities/inspect_page_process.rb
ADDED
@@ -0,0 +1,74 @@
require_relative '../hooks_operator'

module BrowserCrawler
  module EngineUtilities
    # Inspects the passed link and updates the crawl queue when necessary.
    class InspectPageProcess
      include Capybara::DSL
      include HooksOperator

      attr_reader :page_inspector,
                  :screenshot_operator,
                  :link_inspector,
                  :logger

      def initialize(link_inspector:,
                     capybara_session:,
                     report_store:,
                     screenshot_operator: nil,
                     logger:)
        @page_inspector = PageInspector.new(
          link_inspector: link_inspector,
          capybara_session: capybara_session,
          report_store: report_store
        )
        @link_inspector = link_inspector
        @screenshot_operator = screenshot_operator
        @logger = logger
      end

      def call(unvisited_links_queue:)
        visit_page
        update_queue(unvisited_links_queue: unvisited_links_queue)
      end

      private

      def add_to_queue?(links:)
        links && !links.empty?
      end

      # Returns an array of unvisited links.
      # If any hooks are registered, they are executed
      # instead of the base behavior.
      def find_unvisited_links
        exchange_on_hooks(type: :unvisited_links) do
          @page_inspector.scan_result
        end
      end

      def visit_page
        logger.info("Visiting #{link_inspector.raw_link}")

        @page_inspector.visit_page

        @page_inspector.save_to_report(screenshot_operator: screenshot_operator)

        logger
          .info("#{@page_inspector.scan_result.size} links found on the page.")
      end

      def update_queue(unvisited_links_queue:)
        unvisited_links = find_unvisited_links

        logger
          .info("#{unvisited_links.size} links will be added to the unvisited links queue.")

        return unless add_to_queue?(links: unvisited_links)

        unvisited_links_queue.push(*unvisited_links).uniq!

        logger.info("Queue size after update: #{unvisited_links_queue.size}.")
      end
    end
  end
end
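One detail worth noting in update_queue: Array#push returns the receiver, so push(*links).uniq! appends the new links and then de-duplicates the whole queue in place. A tiny standalone illustration of the idiom:

queue = ['https://example.com/a']
queue.push('https://example.com/b', 'https://example.com/a').uniq!
queue # => ["https://example.com/a", "https://example.com/b"]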