browser_crawler 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
@@ -0,0 +1,22 @@
1
require 'fileutils'
require_relative 'reports/csv_report'
require_relative 'reports/yaml_report'

module BrowserCrawler
  # Saves store data to a YAML or CSV report file.
  module ReportFactory
    module_function

    # Maps a report type symbol to the class that renders it.
    REPORT_MATCHER = {
      yaml: Reports::YamlReport,
      csv: Reports::CsvReport
    }.freeze

    # Exports the store as a report file inside +save_folder_path+
    # (the folder is created when missing).
    #
    # @param store [Reports::Store] crawling results to serialize
    # @param type [Symbol] :yaml or :csv
    # @param save_folder_path [String] target folder for the report file
    # @raise [KeyError] when +type+ is not a supported report type
    #   (previously an unknown type surfaced as NoMethodError on nil)
    def save(store:, type:, save_folder_path:)
      FileUtils.mkdir_p(save_folder_path)
      REPORT_MATCHER
        .fetch(type)
        .new(store: store)
        .export(save_folder_path: save_folder_path)
    end
  end
end
@@ -0,0 +1,75 @@
1
require 'csv'
require 'uri'

module BrowserCrawler
  module Reports
    # Serializes a crawl store to a CSV file: one row per
    # (page, extracted http(s) link) pair, or a single row for pages
    # without any matching extracted links.
    class CsvReport
      # Matches only absolute http/https URLs, anchored over the whole
      # string. Hoisted to a frozen constant: the original rebuilt the
      # regexp for every link, and URI.regexp is deprecated —
      # URI::DEFAULT_PARSER.make_regexp is its supported equivalent.
      HTTP_LINK_PATTERN = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/.freeze

      def initialize(store:)
        @store = store
      end

      # Writes <save_folder_path>/crawler_report.csv.
      def export(save_folder_path:)
        CSV.open("#{save_folder_path}/crawler_report.csv", 'wb') do |csv|
          # Header now comes from #csv_header, which was previously dead
          # code that disagreed with the inline header ('external?').
          csv << csv_header

          @store.pages.each do |page, crawler_result|
            save_to_csv(csv, page, crawler_result)
          end
        end
      end

      private

      # Keeps only absolute http(s) links; nil stays nil so callers can
      # distinguish "no link data" from "no matching links".
      def filter_links(links)
        return nil if links.nil?

        links.select do |link|
          link =~ HTTP_LINK_PATTERN
        end
      end

      # Builds a single CSV row for +page+ (optionally one of its links).
      def save_to_row(page, crawler_result, link = nil)
        [page,
         link,
         crawler_result[:external],
         humanize_code(crawler_result[:code]),
         crawler_result[:code]]
      end

      # Emits one row per filtered link, or one link-less row when the
      # page has no usable extracted links.
      def save_to_csv(csv, page, crawler_result)
        extracted_links = filter_links(crawler_result[:extracted_links])

        if extracted_links.nil? || extracted_links.empty?
          csv << save_to_row(page, crawler_result)
          return
        end

        extracted_links.each do |link|
          csv << save_to_row(page, crawler_result, link)
        end
      end

      # Column names matching the rows produced by #save_to_row.
      def csv_header
        ['pages',
         'extracted links',
         'is external',
         'http status',
         'http code']
      end

      # Maps an HTTP status code to a coarse status symbol.
      # NOTE(review): 200..225 looks like it was meant to cover 2xx
      # success codes — confirm intent before widening the range.
      def humanize_code(code)
        case code.to_i
        when 200..225 then :active
        when 401 then :unauthorized
        when 301..308 then :redirect
        else
          :broken
        end
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
module BrowserCrawler
  module Reports
    # Mutable value object that accumulates crawling results.
    #
    # Shape of #to_h output (metadata keys are merged at the TOP level,
    # not nested under :metadata — see note on #to_h):
    #   {
    #     pages: {
    #       '/' => { screenshot: 'file1.png', error: nil,
    #                extracted_links: ['http://welcome[pdf]'],
    #                code: 200, external: false }
    #     },
    #     url: 'http://host',
    #     unrecognized_links: ['mailto://', 'javascript://'],
    #     crawler_error: {
    #       'http://welcome.page' => { message: '...', backtrace: [...] }
    #     },
    #     started_at: <Time>, finished_at: <Time>, links_count: 1
    #   }
    class Store
      attr_reader :pages, :metadata, :unrecognized_links, :crawler_error
      attr_accessor :error

      def initialize(pages: {},
                     metadata: {},
                     started_at: nil,
                     finished_at: nil)
        @pages = pages
        @metadata = metadata
        @started_at = started_at
        @finished_at = finished_at
        @crawler_error = {}
        @unrecognized_links = []
      end

      # Resets page data and marks the beginning of a crawl run.
      def start(url:)
        @pages.clear
        @started_at = Time.now
        @metadata[:url] = url
      end

      # Marks the end of a crawl run.
      def finish
        @finished_at = Time.now
      end

      # NOTE(review): metadata keys are merged into the top level of the
      # hash rather than nested under :metadata (the class comment example
      # suggests nesting) — kept as-is since report consumers may rely on it.
      def to_h
        {}.merge(pages: @pages)
          .merge(@metadata)
          .merge(
            unrecognized_links: @unrecognized_links,
            crawler_error: @crawler_error,
            started_at: @started_at,
            finished_at: @finished_at,
            links_count: count_all_links
          )
      end

      # Remembers a link the crawler could not classify, de-duplicated.
      # (The original performed the same membership check twice.)
      def record_unrecognized_link(link)
        @unrecognized_links << link unless @unrecognized_links.include?(link)
      end

      # Records the crawl result for a single page, overwriting any
      # previous entry for the same page key.
      def record_page_visit(page:,
                            extracted_links: nil,
                            screenshot_filename: nil,
                            error: nil,
                            external: false,
                            code: nil)
        @pages[page] = {
          screenshot: screenshot_filename,
          error: error,
          extracted_links: extracted_links,
          code: code,
          external: external
        }
      end

      # Stores the message/backtrace of an exception raised while
      # crawling +link+.
      def record_crawler_error(link:, error:)
        @crawler_error[link] = {
          message: error.message,
          backtrace: error.backtrace
        }
      end

      def visited_pages
        @pages.keys
      end

      private

      # Total extracted links across all pages; pages without links count
      # as zero. The previous inject-based version returned nil from the
      # block for such pages, which replaced the accumulator with nil and
      # broke the sum (NoMethodError or a nil links_count).
      def count_all_links
        @pages.sum do |_page, data|
          next 0 unless data && data[:extracted_links]

          data[:extracted_links].size
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
require 'yaml'

module BrowserCrawler
  module Reports
    # Serializes a store to a YAML file.
    class YamlReport
      def initialize(store:)
        @store = store
      end

      # Writes <save_folder_path>/crawler_report.yaml with the store's
      # hash dump. `require 'yaml'` was missing: Object#to_yaml only
      # exists once Psych is loaded, so this previously relied on some
      # caller having required it first.
      def export(save_folder_path:)
        File.write("#{save_folder_path}/crawler_report.yaml",
                   @store.to_h.to_yaml)
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
module BrowserCrawler
  # Decides whether, where, and under which file name screenshots are
  # saved during a crawl.
  class ScreenshotOperator
    DEFAULT_FILENAME_BASE = 'screenshot'.freeze

    attr_reader :format, :save_screenshots, :filename_base, :screenshots_folder

    # @param save_screenshots [Boolean] explicit on/off switch
    # @param save_screenshots_to [String, nil] target folder (implies saving)
    # @param format [String] image extension, e.g. 'png'
    # @param filename [String, nil] custom base name; default 'screenshot'
    def initialize(save_screenshots: false,
                   save_screenshots_to: nil,
                   format: 'png',
                   filename: nil)
      @screenshots_folder = save_screenshots_to
      @format = format
      @save_screenshots = save_screenshots
      @filename_base = filename || DEFAULT_FILENAME_BASE
    end

    # Saving is enabled when a folder was supplied or the flag is truthy.
    def save_screenshots?
      !!(screenshots_folder || save_screenshots)
    end

    # Full path the screenshot should be written to.
    def file_path(url: nil)
      "#{save_path}/#{filename(url: url)}"
    end

    # Derives the screenshot file name. A custom base name (or a missing
    # url) wins; otherwise the url's path is flattened into the name.
    def filename(url: nil)
      base =
        if filename_base_default? && url
          UrlTools.uri(url: url)&.path&.gsub('/', '%')&.gsub('.', '')
        else
          filename_base
        end
      "#{filename_prefix}_#{base}.#{format}"
    end

    private

    def filename_base_default?
      filename_base == DEFAULT_FILENAME_BASE
    end

    def save_path
      screenshots_folder || File.join(Dir.pwd, 'tmp', 'screenshots')
    end

    # UTC timestamp with spaces replaced by underscores.
    # NOTE(review): colons from the time string remain in the name, which
    # is invalid on Windows filesystems — confirm before changing format.
    def filename_prefix
      Time.now.getutc.to_s.tr(' ', '_')
    end
  end
end
@@ -0,0 +1,20 @@
1
require 'capybara'
require 'capybara/cuprite'

# Adds a convenience registrar for a headless-Chrome Cuprite driver.
module Capybara
  module_function

  # Registers a Cuprite driver under +name+. Headless-Chrome defaults are
  # supplied only when the caller did not provide :browser_options.
  def register_chrome_driver(name, options: {})
    options[:browser_options] ||= {
      '--headless' => nil, '--disable-gpu' => nil,
      '--disable-extensions' => nil, '--no-sandbox' => nil
    }

    Capybara.register_driver(name) do |app|
      ::Capybara::Cuprite::Driver.new(app, options)
    end
  end
end
@@ -0,0 +1,32 @@
1
require 'uri'

module BrowserCrawler
  # URI parsing and normalization helpers shared by the crawler.
  module UrlTools
    # Matches a full absolute http(s) URL (\A..\z anchors the whole
    # string). Hoisted to a frozen constant: the original rebuilt the
    # regexp on every call, and URI.regexp is deprecated —
    # URI::DEFAULT_PARSER.make_regexp is its supported equivalent.
    HTTP_URL_PATTERN = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/.freeze

    # Like #uri!, but returns nil instead of raising on invalid input.
    def uri(url:)
      uri!(url: url)
    rescue URI::InvalidURIError
      nil
    end

    # Parses +url+ into a URI.
    # @raise [URI::InvalidURIError] when +url+ is not an absolute
    #   http(s) URL
    def uri!(url:)
      string_url = url.to_s
      raise URI::InvalidURIError unless string_url.match?(HTTP_URL_PATTERN)

      URI(string_url)
    end

    # Rebuilds a canonical URL string: default ports (80/443) are
    # dropped, and trailing slashes are stripped.
    def full_url(uri:)
      path_query = get_path_query(uri: uri)
      if uri.port == 80 || uri.port == 443
        "#{uri.scheme}://#{uri.host}#{uri.path}#{path_query}"
      else
        "#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.path}#{path_query}"
      end.sub(%r{(/)+$}, '')
    end

    # Returns "?query" for a non-empty query string, nil otherwise.
    # (The local was misleadingly named uri_fragment; it holds the query.)
    def get_path_query(uri:)
      query = uri.query
      query.nil? || query.empty? ? nil : "?#{query}"
    end

    module_function :uri, :uri!, :full_url, :get_path_query
  end
end
@@ -0,0 +1,3 @@
1
module BrowserCrawler
  # Current gem release; referenced by the gemspec.
  VERSION = '0.4.0'.freeze
end
metadata ADDED
@@ -0,0 +1,244 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: browser_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - Dmytro Samodurov
8
+ - Artem Rumiantcev
9
+ - Denys Ivanchuk
10
+ - Sergiy Tyatin
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2019-08-23 00:00:00.000000000 Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: activesupport
18
+ requirement: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - "~>"
21
+ - !ruby/object:Gem::Version
22
+ version: '5.2'
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 5.2.2
26
+ type: :runtime
27
+ prerelease: false
28
+ version_requirements: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '5.2'
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 5.2.2
36
+ - !ruby/object:Gem::Dependency
37
+ name: capybara
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 3.24.0
43
+ - - "~>"
44
+ - !ruby/object:Gem::Version
45
+ version: '3.24'
46
+ type: :runtime
47
+ prerelease: false
48
+ version_requirements: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 3.24.0
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '3.24'
56
+ - !ruby/object:Gem::Dependency
57
+ name: chromedriver-helper
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 2.1.0
63
+ - - "~>"
64
+ - !ruby/object:Gem::Version
65
+ version: '2.1'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 2.1.0
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.1'
76
+ - !ruby/object:Gem::Dependency
77
+ name: cuprite
78
+ requirement: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.6.0
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.0
90
+ - !ruby/object:Gem::Dependency
91
+ name: bundler
92
+ requirement: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 1.17.2
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: 1.17.2
100
+ type: :development
101
+ prerelease: false
102
+ version_requirements: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 1.17.2
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: 1.17.2
110
+ - !ruby/object:Gem::Dependency
111
+ name: pry-byebug
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '3.6'
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '3.6'
120
+ type: :development
121
+ prerelease: false
122
+ version_requirements: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '3.6'
127
+ - - "~>"
128
+ - !ruby/object:Gem::Version
129
+ version: '3.6'
130
+ - !ruby/object:Gem::Dependency
131
+ name: rake
132
+ requirement: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - "~>"
135
+ - !ruby/object:Gem::Version
136
+ version: '10.0'
137
+ type: :development
138
+ prerelease: false
139
+ version_requirements: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - "~>"
142
+ - !ruby/object:Gem::Version
143
+ version: '10.0'
144
+ - !ruby/object:Gem::Dependency
145
+ name: rspec
146
+ requirement: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - "~>"
149
+ - !ruby/object:Gem::Version
150
+ version: '3.0'
151
+ type: :development
152
+ prerelease: false
153
+ version_requirements: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - "~>"
156
+ - !ruby/object:Gem::Version
157
+ version: '3.0'
158
+ - !ruby/object:Gem::Dependency
159
+ name: rubocop
160
+ requirement: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - "~>"
163
+ - !ruby/object:Gem::Version
164
+ version: '0.66'
165
+ type: :development
166
+ prerelease: false
167
+ version_requirements: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - "~>"
170
+ - !ruby/object:Gem::Version
171
+ version: '0.66'
172
+ description: ''
173
+ email:
174
+ - dimasamodurov@gmail.com
175
+ - tema.place@gmail.com
176
+ executables:
177
+ - console
178
+ - crawl
179
+ - setup
180
+ extensions: []
181
+ extra_rdoc_files: []
182
+ files:
183
+ - ".gitignore"
184
+ - ".rspec"
185
+ - ".rubocop.yml"
186
+ - ".travis.yml"
187
+ - Gemfile
188
+ - LICENSE.txt
189
+ - README.md
190
+ - Rakefile
191
+ - bin/console
192
+ - bin/crawl
193
+ - bin/setup
194
+ - browser_crawler.gemspec
195
+ - lib/browser_crawler.rb
196
+ - lib/browser_crawler/dsl/js_helpers.rb
197
+ - lib/browser_crawler/dsl/sign_in.rb
198
+ - lib/browser_crawler/engine.rb
199
+ - lib/browser_crawler/engine_utilities/crawl_manager.rb
200
+ - lib/browser_crawler/engine_utilities/inspect_page_process.rb
201
+ - lib/browser_crawler/engine_utilities/link_inspector.rb
202
+ - lib/browser_crawler/engine_utilities/link_scanner.rb
203
+ - lib/browser_crawler/engine_utilities/page_inspector.rb
204
+ - lib/browser_crawler/errors/invalid_hooks_type.rb
205
+ - lib/browser_crawler/followups/screenshots_indexer.rb
206
+ - lib/browser_crawler/followups/templates/index.html.erb
207
+ - lib/browser_crawler/followups/wraith_integrator.rb
208
+ - lib/browser_crawler/hooks_container.rb
209
+ - lib/browser_crawler/hooks_operator.rb
210
+ - lib/browser_crawler/options.rb
211
+ - lib/browser_crawler/report_factory.rb
212
+ - lib/browser_crawler/reports/csv_report.rb
213
+ - lib/browser_crawler/reports/store.rb
214
+ - lib/browser_crawler/reports/yaml_report.rb
215
+ - lib/browser_crawler/screenshot_operator.rb
216
+ - lib/browser_crawler/support/capybara.rb
217
+ - lib/browser_crawler/url_tools.rb
218
+ - lib/browser_crawler/version.rb
219
+ homepage: https://github.com/DimaSamodurov/browser_crawler
220
+ licenses:
221
+ - MIT
222
+ metadata:
223
+ homepage_uri: https://github.com/DimaSamodurov/browser_crawler
224
+ source_code_uri: https://github.com/DimaSamodurov/browser_crawler
225
+ post_install_message:
226
+ rdoc_options: []
227
+ require_paths:
228
+ - lib
229
+ required_ruby_version: !ruby/object:Gem::Requirement
230
+ requirements:
231
+ - - ">="
232
+ - !ruby/object:Gem::Version
233
+ version: 2.5.0
234
+ required_rubygems_version: !ruby/object:Gem::Requirement
235
+ requirements:
236
+ - - ">="
237
+ - !ruby/object:Gem::Version
238
+ version: '0'
239
+ requirements: []
240
+ rubygems_version: 3.0.1
241
+ signing_key:
242
+ specification_version: 4
243
+ summary: Simple site crawler using Capybara
244
+ test_files: []