browser_crawler 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +277 -0
- data/Rakefile +7 -0
- data/bin/console +10 -0
- data/bin/crawl +51 -0
- data/bin/setup +8 -0
- data/browser_crawler.gemspec +47 -0
- data/lib/browser_crawler.rb +12 -0
- data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
- data/lib/browser_crawler/dsl/sign_in.rb +37 -0
- data/lib/browser_crawler/engine.rb +156 -0
- data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
- data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
- data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
- data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
- data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
- data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
- data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
- data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
- data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
- data/lib/browser_crawler/hooks_container.rb +31 -0
- data/lib/browser_crawler/hooks_operator.rb +44 -0
- data/lib/browser_crawler/options.rb +86 -0
- data/lib/browser_crawler/report_factory.rb +22 -0
- data/lib/browser_crawler/reports/csv_report.rb +75 -0
- data/lib/browser_crawler/reports/store.rb +114 -0
- data/lib/browser_crawler/reports/yaml_report.rb +15 -0
- data/lib/browser_crawler/screenshot_operator.rb +47 -0
- data/lib/browser_crawler/support/capybara.rb +20 -0
- data/lib/browser_crawler/url_tools.rb +32 -0
- data/lib/browser_crawler/version.rb +3 -0
- metadata +244 -0
data/browser_crawler.gemspec
ADDED
@@ -0,0 +1,47 @@
+# coding: utf-8
+lib = File.expand_path('lib', __dir__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'browser_crawler/version'
+
+Gem::Specification.new do |spec|
+  spec.name = 'browser_crawler'
+  spec.version = BrowserCrawler::VERSION
+  spec.required_ruby_version = '>= 2.5.0'
+  spec.authors = ['Dmytro Samodurov',
+                  'Artem Rumiantcev',
+                  'Denys Ivanchuk',
+                  'Sergiy Tyatin']
+  spec.email = ['dimasamodurov@gmail.com', 'tema.place@gmail.com']
+  spec.licenses = ['MIT']
+
+  spec.summary = 'Simple site crawler using Capybara'
+  spec.description = ''
+  spec.homepage = 'https://github.com/DimaSamodurov/browser_crawler'
+
+  # Prevent pushing this gem to RubyGems.org.
+  # To allow pushes either set the 'allowed_push_host'
+  # to allow pushing to a single host
+  # or delete this section to allow pushing to any host.
+  if spec.respond_to?(:metadata)
+    spec.metadata['homepage_uri'] = spec.homepage
+    spec.metadata['source_code_uri'] = spec.homepage
+  else
+    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
+  end
+
+  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir = 'bin'
+  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+
+  spec.add_dependency 'activesupport', '~> 5.2', '>= 5.2.2'
+  spec.add_dependency 'capybara', '~> 3.24', '>= 3.24.0'
+  spec.add_dependency 'chromedriver-helper', '~> 2.1', '>= 2.1.0'
+  spec.add_dependency 'cuprite', '~> 0.6.0'
+
+  spec.add_development_dependency 'bundler', '~> 1.17.2', '>= 1.17.2'
+  spec.add_development_dependency 'pry-byebug', '~> 3.6', '>= 3.6'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+  spec.add_development_dependency 'rubocop', '~> 0.66'
+end
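For orientation: given the runtime constraints declared above (Ruby >= 2.5, Capybara ~> 3.24, Cuprite ~> 0.6), a consuming project would declare the gem roughly like this. A minimal, illustrative Gemfile; the version pin is an assumption, not part of the release:

```ruby
# Illustrative Gemfile for a project consuming browser_crawler.
source 'https://rubygems.org'

gem 'browser_crawler', '~> 0.4' # hypothetical pin; adjust as needed
```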
data/lib/browser_crawler.rb
ADDED
@@ -0,0 +1,12 @@
+require 'browser_crawler/version'
+require 'browser_crawler/options'
+require 'browser_crawler/engine'
+
+require 'browser_crawler/followups/screenshots_indexer'
+require 'browser_crawler/followups/wraith_integrator'
+
+# Crawls a web site and extracts the links available.
+
+module BrowserCrawler
+  # Your code goes here...
+end
data/lib/browser_crawler/dsl/sign_in.rb
ADDED
@@ -0,0 +1,37 @@
+module BrowserCrawler
+  module DSL
+    module SignIn
+      def sign_in
+        visit '/'
+        pingfed_o365_login
+      end
+
+      def pingfed_login(force: true)
+        if force || page.has_content?('Enter your credentials')
+          fill_in 'input_username', with: ENV.fetch('username')
+          fill_in 'input_password', with: ENV.fetch('password')
+          click_on 'Login'
+        end
+      end
+
+      def o365_login(force: true)
+        if force || page.has_content?('Stay signed in?')
+          check 'DontShowAgain'
+          click_on 'Yes'
+        end
+      end
+
+      def o365_stay_signed_in(force: true)
+        if force || page.has_content?('Stay signed in?')
+          check 'DontShowAgain'
+          click_on 'Yes'
+        end
+      end
+
+      def pingfed_o365_login(force: true)
+        pingfed_login(force: force)
+        o365_stay_signed_in(force: force)
+      end
+    end
+  end
+end
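Since `Engine` mixes this module in (see `engine.rb` below), a sign-in flow can be triggered from a hook. A minimal sketch, assuming hooks are evaluated in the engine's Capybara context and that the target app actually uses the PingFed/O365 flow; the URL and credential values are placeholders:

```ruby
# Sketch only: sign_in reads credentials via ENV.fetch, so both variables
# must be set before the crawl starts (placeholder values shown).
ENV['username'] = 'crawler@example.com'
ENV['password'] = 'change-me'

crawler = BrowserCrawler::Engine.new
crawler.before { sign_in }  # visits '/' and runs pingfed_o365_login
crawler.extract_links(url: 'https://app.example.com')
```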
data/lib/browser_crawler/engine.rb
ADDED
@@ -0,0 +1,156 @@
+require 'capybara'
+require 'capybara/dsl'
+require 'logger'
+
+require_relative 'dsl/sign_in'
+require_relative 'dsl/js_helpers'
+require_relative 'report_factory'
+require_relative 'reports/store'
+require_relative 'support/capybara'
+require_relative 'screenshot_operator'
+require_relative 'url_tools'
+require_relative 'engine_utilities/crawl_manager'
+require_relative 'hooks_operator'
+require_relative 'hooks_container'
+
+module BrowserCrawler
+  class Engine
+    include Capybara::DSL
+    include HooksOperator
+    include DSL::SignIn
+    include DSL::JsHelpers
+
+    class UnavailableCallBackMethod < StandardError
+    end
+
+    REPORT_SAVE_FOLDER_PATH = 'tmp'.freeze
+
+    CUPRITE_OPTIONS = {
+      window_size: [1280, 1600]
+    }.freeze
+
+    SCREENSHOT_OPERATOR_OPTIONS = {
+      save_screenshots: false,
+      save_screenshots_to: nil,
+      format: 'png',
+      filename: nil
+    }.freeze
+
+    attr_reader :report_store,
+                :screenshot_operator,
+                :crawl_manager,
+                :logger
+
+    def initialize(browser_options: {},
+                   screenshots_options: {},
+                   max_pages: nil,
+                   deep_visit: false,
+                   logger: nil)
+      screenshots_operator_options = SCREENSHOT_OPERATOR_OPTIONS
+                                     .merge(screenshots_options)
+      @screenshot_operator = ScreenshotOperator.new(screenshots_operator_options)
+
+      cuprite_options = CUPRITE_OPTIONS.merge(browser_options)
+
+      @logger = logger || Logger.new(STDOUT)
+
+      register_chrome_driver(cuprite_options)
+      initialize_report_store(cuprite_options)
+      initialize_crawl_manager(max_pages, deep_visit)
+    end
+
+    def js_before_run(javascript: '')
+      return if javascript.empty?
+
+      @javascript_before_run = javascript
+    end
+
+    def extract_links(url:)
+      initialize_crawler(url)
+
+      begin
+        with_hooks_for(type: :all) do
+          crawl_manager.crawl(
+            target_url: url,
+            capybara_session: Capybara.current_session,
+            screenshot_operator: screenshot_operator
+          )
+        end
+      rescue StandardError => error
+        logger
+          .fatal("#{error.message} \n #{error.backtrace.join("\n")}")
+      ensure
+        @report_store.finish
+      end
+      self
+    end
+
+    def report_save(folder_path: '', type: :yaml)
+      save_folder_path = folder_path.empty? ? REPORT_SAVE_FOLDER_PATH : folder_path
+      ReportFactory.save(store: @report_store,
+                         type: type.to_sym,
+                         save_folder_path: save_folder_path)
+    end
+
+    def before(type: :all, &hook)
+      HooksContainer.instance.add_hook(method: :before, type: type, hook: hook)
+    end
+
+    def after(type: :all, &hook)
+      HooksContainer.instance.add_hook(method: :after, type: type, hook: hook)
+    end
+
+    def unvisited_links(&hook)
+      HooksContainer.instance.add_hook(type: :unvisited_links, hook: hook)
+    end
+
+    def change_page_scan_rules(&hook)
+      HooksContainer.instance.add_hook(type: :scan_rules, hook: hook)
+    end
+
+    private
+
+    def initialize_crawler(url)
+      Capybara.current_session.quit
+
+      uri = UrlTools.uri!(url: url)
+      Capybara.app_host = "#{uri.scheme}://#{uri.host}:#{uri.port}"
+
+      @report_store.start(url: url)
+
+      return if @javascript_before_run.nil?
+
+      Capybara.current_session
+              .driver
+              .browser
+              .page
+              .command('Page.addScriptToEvaluateOnNewDocument',
+                       source: @javascript_before_run)
+    end
+
+    def initialize_report_store(cuprite_options)
+      @report_store = Reports::Store.new
+      @report_store.metadata[:screenshots_path] = screenshot_operator
+                                                  .screenshots_folder
+      @report_store.metadata[:window_width] = cuprite_options[:window_size][0]
+      @report_store.metadata[:window_height] = cuprite_options[:window_size][1]
+    end
+
+    def register_chrome_driver(cuprite_options)
+      Capybara.register_chrome_driver(:cuprite_chrome, options: cuprite_options)
+      Capybara.run_server = false
+      Capybara.default_driver = :cuprite_chrome
+      # a workaround to allow extracting data from inactive tabs, dialogs, etc.
+      Capybara.ignore_hidden_elements = false
+    end
+
+    def initialize_crawl_manager(max_pages, deep_visit)
+      @crawl_manager = EngineUtilities::CrawlManager.new(
+        report_store: report_store,
+        max_pages: max_pages.to_i,
+        deep_visit: deep_visit,
+        logger: @logger
+      )
+    end
+  end
+end
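Putting the public API above together, a minimal end-to-end run might look like the sketch below. The option values are illustrative; `type: :csv` is an assumption based on `reports/csv_report.rb` shipping in this release:

```ruby
require 'browser_crawler'

crawler = BrowserCrawler::Engine.new(
  browser_options: { window_size: [1280, 1600] },            # merged into CUPRITE_OPTIONS
  screenshots_options: { save_screenshots: true,
                         save_screenshots_to: 'tmp/shots' },  # illustrative path
  max_pages: 50,     # nil/0 means "no page limit" (see CrawlManager below)
  deep_visit: false  # when true, external pages are inspected too
)

# Optional: run JavaScript on every new document before scanning.
crawler.js_before_run(javascript: 'window.confirm = () => true;')

crawler.extract_links(url: 'https://example.com')
crawler.report_save(folder_path: 'tmp/reports', type: :yaml)  # or, presumably, :csv
```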
data/lib/browser_crawler/engine_utilities/crawl_manager.rb
ADDED
@@ -0,0 +1,100 @@
+require_relative '../url_tools'
+require_relative 'link_inspector'
+require_relative 'page_inspector'
+require_relative 'inspect_page_process'
+
+module BrowserCrawler
+  module EngineUtilities
+    # The main operating class, which controls the queue of unvisited links.
+    class CrawlManager
+
+      attr_reader :target_url,
+                  :unvisited_links_queue,
+                  :report_store,
+                  :host_name,
+                  :deep_visit,
+                  :max_pages,
+                  :logger,
+                  :page_inspector
+
+      def initialize(report_store:,
+                     max_pages: 0,
+                     deep_visit: false,
+                     logger: nil)
+        @report_store = report_store
+        @max_pages = max_pages
+        @deep_visit = deep_visit
+        @logger = logger || Logger.new(STDOUT)
+      end
+
+      def crawl(target_url:, capybara_session:, screenshot_operator: nil)
+        @host_name = UrlTools.uri!(url: target_url).host
+        @unvisited_links_queue = [target_url]
+
+        loop do
+          break if unvisited_links_queue.empty? || limit_reached?
+
+          unvisited_link = unvisited_links_queue.shift
+
+          link_inspector = LinkInspector.new(raw_link: unvisited_link,
+                                             host_name: host_name)
+
+          unless link_valid?(link_inspector)
+            @logger.info("Skipped #{unvisited_link}")
+            report_store.record_unrecognized_link(unvisited_link)
+            next
+          end
+
+          inspect_page(link_inspector: link_inspector,
+                       capybara_session: capybara_session,
+                       screenshot_operator: screenshot_operator)
+        end
+      end
+
+      def link_valid?(link_inspector)
+        link_inspector.link_valid? &&
+          internal_resource?(link_inspector) &&
+          page_unvisited?(link_inspector)
+      end
+
+      private
+
+      def inspect_page(link_inspector:, capybara_session:, screenshot_operator:)
+        InspectPageProcess.new(link_inspector: link_inspector,
+                               capybara_session: capybara_session,
+                               screenshot_operator: screenshot_operator,
+                               report_store: report_store,
+                               logger: logger)
+                          .call(unvisited_links_queue: unvisited_links_queue)
+      rescue StandardError => error
+        error_handler(link: link_inspector.raw_link, error: error)
+      end
+
+      def internal_resource?(link_inspector)
+        link_inspector.internal_url? || deep_visit
+      end
+
+      def page_unvisited?(link_inspector)
+        !visited_pages.include?(link_inspector.full_url)
+      end
+
+      def limit_reached?
+        return false if max_pages.zero?
+
+        visited_pages.count >= max_pages
+      end
+
+      def visited_pages
+        report_store.visited_pages
+      end
+
+      def error_handler(link:, error:)
+        error_link = "visiting link - #{link};\n"
+        error_message = "error message: #{error.message};\n"
+        error_backtrace = "error backtrace: #{error.backtrace.join("\n")};\n"
+        logger.error("Error: #{error_link} #{error_message} #{error_backtrace}")
+        report_store.record_crawler_error(link: link, error: error)
+      end
+    end
+  end
+end
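Note how the page limit is encoded: `Engine` passes `max_pages.to_i` into this constructor, and `nil.to_i` is `0`, so the default `max_pages: nil` disables the limit through the `max_pages.zero?` guard. A standalone illustration of that guard (not library code):

```ruby
# Reimplements the limit_reached? guard above for demonstration purposes.
def limit_reached?(visited_count, max_pages)
  return false if max_pages.zero? # zero encodes "no page limit"

  visited_count >= max_pages
end

limit_reached?(10, nil.to_i) # => false (nil.to_i == 0, so the crawl never stops early)
limit_reached?(10, 5)        # => true
```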
data/lib/browser_crawler/engine_utilities/inspect_page_process.rb
ADDED
@@ -0,0 +1,74 @@
+require_relative '../hooks_operator'
+
+module BrowserCrawler
+  module EngineUtilities
+    # Inspects a passed link and updates the loop queue if necessary.
+    class InspectPageProcess
+      include Capybara::DSL
+      include HooksOperator
+
+      attr_reader :page_inspector,
+                  :screenshot_operator,
+                  :link_inspector,
+                  :logger
+
+      def initialize(link_inspector:,
+                     capybara_session:,
+                     report_store:,
+                     screenshot_operator: nil,
+                     logger:)
+        @page_inspector = PageInspector.new(
+          link_inspector: link_inspector,
+          capybara_session: capybara_session,
+          report_store: report_store
+        )
+        @link_inspector = link_inspector
+        @screenshot_operator = screenshot_operator
+        @logger = logger
+      end
+
+      def call(unvisited_links_queue:)
+        visit_page
+        update_queue(unvisited_links_queue: unvisited_links_queue)
+      end
+
+      private
+
+      def add_to_queue?(links:)
+        links && !links.empty?
+      end
+
+      # Returns an array of unvisited links; if any hooks exist,
+      # they are executed instead of the base behavior.
+      def find_unvisited_links
+        exchange_on_hooks(type: :unvisited_links) do
+          @page_inspector.scan_result
+        end
+      end
+
+      def visit_page
+        logger.info("Visiting #{link_inspector.raw_link}")
+
+        @page_inspector.visit_page
+
+        @page_inspector.save_to_report(screenshot_operator: screenshot_operator)
+
+        logger
+          .info("#{@page_inspector.scan_result.size} links found on the page.")
+      end
+
+      def update_queue(unvisited_links_queue:)
+        unvisited_links = find_unvisited_links
+
+        logger
+          .info("#{unvisited_links.size} links will be added to the unvisited links queue.")
+
+        return unless add_to_queue?(links: unvisited_links)
+
+        unvisited_links_queue.push(*unvisited_links).uniq!
+
+        logger.info("#{unvisited_links_queue.size} - current size of the queue.")
+      end
+    end
+  end
+end
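The queue update in `update_queue` works because `Array#push` returns the receiver, so `uniq!` deduplicates the shared queue in place, dropping both newly pushed duplicates and any that were already queued. In plain Ruby:

```ruby
# Plain-Ruby illustration of the push-then-uniq! idiom used in update_queue.
queue = ['https://example.com/a']
found = ['https://example.com/a', 'https://example.com/b'] # links from one page

queue.push(*found).uniq! # push returns `queue` itself; uniq! mutates it
queue # => ["https://example.com/a", "https://example.com/b"]
```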