browser_crawler 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
@@ -0,0 +1,22 @@
1
require 'fileutils'
require_relative 'reports/csv_report'
require_relative 'reports/yaml_report'

module BrowserCrawler
  # Persists crawl results from a store into a report file (YAML or CSV).
  module ReportFactory
    module_function

    # Maps a report type symbol to the reporter class that renders it.
    REPORT_MATCHER = {
      yaml: Reports::YamlReport,
      csv: Reports::CsvReport
    }.freeze

    # Creates the target folder when missing, then exports the store
    # through the reporter matching +type+ (:yaml or :csv).
    def save(store:, type:, save_folder_path:)
      FileUtils.mkdir_p(save_folder_path)
      reporter_class = REPORT_MATCHER[type]
      reporter = reporter_class.new(store: store)
      reporter.export(save_folder_path: save_folder_path)
    end
  end
end
@@ -0,0 +1,75 @@
1
require 'csv'

module BrowserCrawler
  module Reports
    # Exports a crawl store to <save_folder_path>/crawler_report.csv,
    # one row per (page, extracted http(s) link) pair.
    class CsvReport
      # @param store [#pages] object whose +pages+ hash maps a page URL to
      #   its crawl result hash (:extracted_links, :external, :code, ...)
      def initialize(store:)
        @store = store
      end

      # Writes the CSV report for every page recorded in the store.
      def export(save_folder_path:)
        CSV.open("#{save_folder_path}/crawler_report.csv", 'wb') do |csv|
          # Header comes from csv_header so it cannot drift from the code
          # again (it was previously duplicated inline and had diverged).
          csv << csv_header

          @store.pages.each do |page, crawler_result|
            save_to_csv(csv, page, crawler_result)
          end
        end
      end

      private

      # Single source of truth for the report's column titles.
      def csv_header
        ['pages',
         'extracted links',
         'is external',
         'http status',
         'http code']
      end

      # Keeps only absolute http/https links; returns nil for nil input.
      def filter_links(links)
        return nil if links.nil?

        links.select do |link|
          link =~ /\A#{URI.regexp(%w[http https])}\z/
        end
      end

      # Builds one CSV row; +link+ is nil when the page had no links.
      def save_to_row(page, crawler_result, link = nil)
        [page,
         link,
         crawler_result[:external],
         humanize_code(crawler_result[:code]),
         crawler_result[:code]]
      end

      # Appends the row(s) for a single crawled page: one row per link,
      # or a single link-less row when none matched the filter.
      def save_to_csv(csv, page, crawler_result)
        extracted_links = filter_links(crawler_result[:extracted_links])

        if extracted_links.nil? || extracted_links.empty?
          csv << save_to_row(page, crawler_result)
          return
        end

        extracted_links.each do |link|
          csv << save_to_row(page, crawler_result, link)
        end
      end

      # Maps an HTTP status code to a coarse human-readable symbol.
      def humanize_code(code)
        case code.to_i
        when 200..225 then :active
        when 401 then :unauthorized
        when 301..308 then :redirect
        else
          :broken
        end
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
module BrowserCrawler
  module Reports
    # Value object that accumulates crawling results.
    # Example of the serialized shape:
    # {
    #   pages: {
    #     '/' => {
    #       screenshot: 'file1.png',
    #       error: nil,
    #       extracted_links: ['http://welcome', 'http://support'],
    #       code: 200,
    #       external: false
    #     }
    #   },
    #   unrecognized_links: ['mailto://', 'javascript://'],
    #   crawler_error: {
    #     'http://welcome.page' => { message: '...', backtrace: [...] }
    #   },
    #   started_at: <Time>,
    #   finished_at: <Time>,
    #   links_count: 2
    # }
    class Store
      attr_reader :pages, :metadata, :unrecognized_links, :crawler_error
      attr_accessor :error

      # Collections may be injected (e.g. for tests); crawler errors and
      # unrecognized links always start empty.
      def initialize(pages: {},
                     metadata: {},
                     started_at: nil,
                     finished_at: nil)
        @pages = pages
        @metadata = metadata
        @started_at = started_at
        @finished_at = finished_at
        @crawler_error = {}
        @unrecognized_links = []
      end

      # Clears page data and stamps the crawl start time and root URL.
      def start(url:)
        @pages.clear
        @started_at = Time.now
        @metadata[:url] = url
      end

      # Stamps the crawl finish time.
      def finish
        @finished_at = Time.now
      end

      # Serializes the store to a plain hash. NOTE: metadata entries are
      # merged at the top level (not under a :metadata key), preserving
      # the report layout produced so far.
      def to_h
        {}.merge(pages: @pages)
          .merge(@metadata)
          .merge(
            unrecognized_links: @unrecognized_links,
            crawler_error: @crawler_error,
            started_at: @started_at,
            finished_at: @finished_at,
            links_count: count_all_links
          )
      end

      # Remembers a link the crawler could not classify; duplicates are
      # ignored (the membership test is performed exactly once now).
      def record_unrecognized_link(link)
        @unrecognized_links << link unless @unrecognized_links.include?(link)
      end

      # Records the crawl result for a single page, overwriting any
      # previous entry for the same page key.
      def record_page_visit(page:,
                            extracted_links: nil,
                            screenshot_filename: nil,
                            error: nil,
                            external: false,
                            code: nil)
        @pages[page] = {
          screenshot: screenshot_filename,
          error: error,
          extracted_links: extracted_links,
          code: code,
          external: external
        }
      end

      # Stores the message and backtrace of an exception raised while
      # crawling +link+.
      def record_crawler_error(link:, error:)
        @crawler_error[link] = {
          message: error.message,
          backtrace: error.backtrace
        }
      end

      # All page keys visited so far, in insertion order.
      def visited_pages
        @pages.keys
      end

      private

      # Total number of extracted links across all pages. Pages recorded
      # without links contribute zero. (The previous inject(0) version
      # returned nil from the block for such pages, so the next iteration
      # crashed with NoMethodError on `nil + ...`.)
      def count_all_links
        @pages.sum do |_, data|
          next 0 unless data && data[:extracted_links]

          data[:extracted_links].size
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ module BrowserCrawler
2
+ module Reports
3
+ # It involves methods which allow to save a store to an yaml file
4
+ class YamlReport
5
+ def initialize(store:)
6
+ @store = store
7
+ end
8
+
9
+ def export(save_folder_path:)
10
+ File.write("#{save_folder_path}/crawler_report.yaml",
11
+ @store.to_h.to_yaml)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,47 @@
1
module BrowserCrawler
  # Encapsulates screenshot naming and save-location decisions.
  class ScreenshotOperator
    attr_reader :format, :save_screenshots, :filename_base, :screenshots_folder

    def initialize(save_screenshots: false,
                   save_screenshots_to: nil,
                   format: 'png',
                   filename: nil)
      @screenshots_folder = save_screenshots_to
      @format = format
      @save_screenshots = save_screenshots
      @filename_base = filename || 'screenshot'
    end

    # True when a target folder was supplied or saving was enabled.
    def save_screenshots?
      [screenshots_folder, save_screenshots].any?
    end

    # Full path (folder + generated name) for the screenshot of +url+.
    def file_path(url: nil)
      "#{save_path}/#{filename(url: url)}"
    end

    # Timestamped file name. With a custom base name (or no URL) the base
    # is used verbatim; otherwise the name is derived from the URL path.
    def filename(url: nil)
      return "#{filename_prefix}_#{filename_base}.#{format}" if !filename_base_default? || url.nil?

      sanitized_path = UrlTools.uri(url: url)&.path&.gsub('/', '%')&.gsub('.', '')
      "#{filename_prefix}_#{sanitized_path}.#{format}"
    end

    private

    # Whether the caller kept the default 'screenshot' base name.
    def filename_base_default?
      filename_base == 'screenshot'
    end

    # Target folder, defaulting to ./tmp/screenshots under the cwd.
    def save_path
      screenshots_folder || File.join(Dir.pwd, 'tmp', 'screenshots')
    end

    # UTC timestamp with spaces replaced so it is filesystem-safe.
    def filename_prefix
      Time.now.getutc.to_s.tr(' ', '_')
    end
  end
end
@@ -0,0 +1,20 @@
1
require 'capybara'
require 'capybara/cuprite'

# Registers a Cuprite (headless Chrome) driver with Capybara.
module Capybara
  module_function

  # Registers a named Capybara driver backed by Cuprite.
  #
  # @param name [Symbol] driver name to register under Capybara
  # @param options [Hash] Cuprite driver options; when :browser_options is
  #   absent, sensible headless defaults are supplied.
  #
  # The caller's +options+ hash is no longer mutated: defaults are applied
  # to a shallow copy so repeated calls with a shared hash stay isolated.
  def register_chrome_driver(name, options: {})
    driver_options = options.dup
    driver_options[:browser_options] ||= {
      '--headless' => nil, '--disable-gpu' => nil,
      '--disable-extensions' => nil, '--no-sandbox' => nil
    }

    Capybara.register_driver name do |app|
      ::Capybara::Cuprite::Driver.new(app, driver_options)
    end
  end
end
@@ -0,0 +1,32 @@
1
module BrowserCrawler
  # URI parsing/normalization helpers used across the crawler.
  module UrlTools
    # Absolute http/https matcher, built once at load time instead of
    # recompiling via the deprecated URI.regexp on every call.
    HTTP_URL_PATTERN = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/.freeze

    # Parses +url+; returns nil instead of raising on invalid input.
    def uri(url:)
      uri!(url: url)
    rescue URI::InvalidURIError
      nil
    end

    # Parses +url+, raising URI::InvalidURIError unless it is an absolute
    # http/https URL.
    def uri!(url:)
      string_url = url.to_s
      raise URI::InvalidURIError unless string_url.match?(HTTP_URL_PATTERN)

      URI(string_url)
    end

    # Rebuilds a canonical URL string: default ports (80/443) are omitted
    # and trailing slashes are stripped.
    def full_url(uri:)
      path_query = get_path_query(uri: uri)
      url = if [80, 443].include?(uri.port)
              "#{uri.scheme}://#{uri.host}#{uri.path}#{path_query}"
            else
              "#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.path}#{path_query}"
            end
      url.sub(%r{(/)+$}, '')
    end

    # Returns "?query" when the URI carries a non-empty query, else nil.
    # (The previous local was misleadingly named uri_fragment although it
    # held the query component, not the fragment.)
    def get_path_query(uri:)
      query = uri.query
      query.nil? || query.empty? ? nil : "?#{query}"
    end

    module_function :uri, :uri!, :full_url, :get_path_query
  end
end
@@ -0,0 +1,3 @@
1
module BrowserCrawler
  # Gem release version, referenced by the gemspec.
  VERSION = '0.4.0'.freeze
end
metadata ADDED
@@ -0,0 +1,244 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: browser_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - Dmytro Samodurov
8
+ - Artem Rumiantcev
9
+ - Denys Ivanchuk
10
+ - Sergiy Tyatin
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2019-08-23 00:00:00.000000000 Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: activesupport
18
+ requirement: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - "~>"
21
+ - !ruby/object:Gem::Version
22
+ version: '5.2'
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 5.2.2
26
+ type: :runtime
27
+ prerelease: false
28
+ version_requirements: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '5.2'
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 5.2.2
36
+ - !ruby/object:Gem::Dependency
37
+ name: capybara
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 3.24.0
43
+ - - "~>"
44
+ - !ruby/object:Gem::Version
45
+ version: '3.24'
46
+ type: :runtime
47
+ prerelease: false
48
+ version_requirements: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 3.24.0
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '3.24'
56
+ - !ruby/object:Gem::Dependency
57
+ name: chromedriver-helper
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 2.1.0
63
+ - - "~>"
64
+ - !ruby/object:Gem::Version
65
+ version: '2.1'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 2.1.0
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.1'
76
+ - !ruby/object:Gem::Dependency
77
+ name: cuprite
78
+ requirement: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.6.0
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.0
90
+ - !ruby/object:Gem::Dependency
91
+ name: bundler
92
+ requirement: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 1.17.2
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: 1.17.2
100
+ type: :development
101
+ prerelease: false
102
+ version_requirements: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 1.17.2
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: 1.17.2
110
+ - !ruby/object:Gem::Dependency
111
+ name: pry-byebug
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '3.6'
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '3.6'
120
+ type: :development
121
+ prerelease: false
122
+ version_requirements: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '3.6'
127
+ - - "~>"
128
+ - !ruby/object:Gem::Version
129
+ version: '3.6'
130
+ - !ruby/object:Gem::Dependency
131
+ name: rake
132
+ requirement: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - "~>"
135
+ - !ruby/object:Gem::Version
136
+ version: '10.0'
137
+ type: :development
138
+ prerelease: false
139
+ version_requirements: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - "~>"
142
+ - !ruby/object:Gem::Version
143
+ version: '10.0'
144
+ - !ruby/object:Gem::Dependency
145
+ name: rspec
146
+ requirement: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - "~>"
149
+ - !ruby/object:Gem::Version
150
+ version: '3.0'
151
+ type: :development
152
+ prerelease: false
153
+ version_requirements: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - "~>"
156
+ - !ruby/object:Gem::Version
157
+ version: '3.0'
158
+ - !ruby/object:Gem::Dependency
159
+ name: rubocop
160
+ requirement: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - "~>"
163
+ - !ruby/object:Gem::Version
164
+ version: '0.66'
165
+ type: :development
166
+ prerelease: false
167
+ version_requirements: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - "~>"
170
+ - !ruby/object:Gem::Version
171
+ version: '0.66'
172
+ description: ''
173
+ email:
174
+ - dimasamodurov@gmail.com
175
+ - tema.place@gmail.com
176
+ executables:
177
+ - console
178
+ - crawl
179
+ - setup
180
+ extensions: []
181
+ extra_rdoc_files: []
182
+ files:
183
+ - ".gitignore"
184
+ - ".rspec"
185
+ - ".rubocop.yml"
186
+ - ".travis.yml"
187
+ - Gemfile
188
+ - LICENSE.txt
189
+ - README.md
190
+ - Rakefile
191
+ - bin/console
192
+ - bin/crawl
193
+ - bin/setup
194
+ - browser_crawler.gemspec
195
+ - lib/browser_crawler.rb
196
+ - lib/browser_crawler/dsl/js_helpers.rb
197
+ - lib/browser_crawler/dsl/sign_in.rb
198
+ - lib/browser_crawler/engine.rb
199
+ - lib/browser_crawler/engine_utilities/crawl_manager.rb
200
+ - lib/browser_crawler/engine_utilities/inspect_page_process.rb
201
+ - lib/browser_crawler/engine_utilities/link_inspector.rb
202
+ - lib/browser_crawler/engine_utilities/link_scanner.rb
203
+ - lib/browser_crawler/engine_utilities/page_inspector.rb
204
+ - lib/browser_crawler/errors/invalid_hooks_type.rb
205
+ - lib/browser_crawler/followups/screenshots_indexer.rb
206
+ - lib/browser_crawler/followups/templates/index.html.erb
207
+ - lib/browser_crawler/followups/wraith_integrator.rb
208
+ - lib/browser_crawler/hooks_container.rb
209
+ - lib/browser_crawler/hooks_operator.rb
210
+ - lib/browser_crawler/options.rb
211
+ - lib/browser_crawler/report_factory.rb
212
+ - lib/browser_crawler/reports/csv_report.rb
213
+ - lib/browser_crawler/reports/store.rb
214
+ - lib/browser_crawler/reports/yaml_report.rb
215
+ - lib/browser_crawler/screenshot_operator.rb
216
+ - lib/browser_crawler/support/capybara.rb
217
+ - lib/browser_crawler/url_tools.rb
218
+ - lib/browser_crawler/version.rb
219
+ homepage: https://github.com/DimaSamodurov/browser_crawler
220
+ licenses:
221
+ - MIT
222
+ metadata:
223
+ homepage_uri: https://github.com/DimaSamodurov/browser_crawler
224
+ source_code_uri: https://github.com/DimaSamodurov/browser_crawler
225
+ post_install_message:
226
+ rdoc_options: []
227
+ require_paths:
228
+ - lib
229
+ required_ruby_version: !ruby/object:Gem::Requirement
230
+ requirements:
231
+ - - ">="
232
+ - !ruby/object:Gem::Version
233
+ version: 2.5.0
234
+ required_rubygems_version: !ruby/object:Gem::Requirement
235
+ requirements:
236
+ - - ">="
237
+ - !ruby/object:Gem::Version
238
+ version: '0'
239
+ requirements: []
240
+ rubygems_version: 3.0.1
241
+ signing_key:
242
+ specification_version: 4
243
+ summary: Simple site crawler using Capybara
244
+ test_files: []