browser_crawler 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: aef40cd15ff0c5799f68906813b3b7161745d41d69ed96d18e67b963bc95595c
4
+ data.tar.gz: 8b4fef874f80bdeb74a960e76cbb77ff8a578fe99f8c65f6f32777dd767dd4dc
5
+ SHA512:
6
+ metadata.gz: e70ce11b3110c2967212ff504b1b0a541aaec7458d849048ec3df56e5806c9b0e5c9121f02314fcb3db86c981d5347979b8e22dee9aba3ea10745b1e4ccc7268
7
+ data.tar.gz: 8ee9f384dfb158009a334bf99e00c0239d64b942d41e5ba80532cd8cb80641bdb53676c04b3a909a2cf3e0547ec3faa8c9be9e4b1b0109f67d1694ae6ba50892
data/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/*
10
+ .ruby-version
11
+ /.idea/*
12
+ *.gem
13
+ .byebug_history
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.rubocop.yml ADDED
@@ -0,0 +1,10 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.5
3
+ Style/FrozenStringLiteralComment:
4
+ Enabled: false
5
+ Metrics/BlockLength:
6
+ Exclude:
7
+ - 'browser_crawler.gemspec'
8
+ - 'spec/**/*'
9
+ Metrics/LineLength:
10
+ Max: 100
data/.travis.yml ADDED
@@ -0,0 +1,29 @@
1
+ language: ruby
2
+
3
+ dist: trusty
4
+
5
+ sudo: required
6
+
7
+ before_install:
8
+ - gem install bundler -v 1.17.2
9
+
10
+ addons:
11
+ apt:
12
+ packages:
13
+ - google-chrome-stable
14
+
15
+ before_script:
16
+ - whoami
17
+ - wget https://chromedriver.storage.googleapis.com/2.46/chromedriver_linux64.zip
18
+ - unzip chromedriver_linux64.zip
19
+ - sudo mv chromedriver /usr/bin/chromedriver
20
+ - sudo chown root:root /usr/bin/chromedriver
21
+ - sudo chmod +x /usr/bin/chromedriver
22
+ - sudo ln -s /usr/bin/chromedriver ~/bin/chromedriver
23
+ - chromedriver -v
24
+
25
+ cache: bundler
26
+
27
+ rvm:
28
+ - 2.5.3
29
+ - 2.6.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in browser_crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Dima Samodurov
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,277 @@
1
+ # Browser Crawler
2
+
3
+ [![Build Status](https://travis-ci.org/DimaSamodurov/browser_crawler.svg?branch=master)](https://travis-ci.org/DimaSamodurov/browser_crawler)
4
+
5
+ Browser Crawler visits the pages available on a site and extracts useful information.
6
+
7
+ It can help with tasks such as maintaining lists of internal and external links,
8
+ creating sitemaps, visual testing via screenshots,
9
+ or preparing a list of URLs for a more sophisticated tool such as [Wraith](https://github.com/BBC-News/wraith).
10
+
11
+ Browser-based crawling is performed with the help of [Capybara](https://github.com/teamcapybara/capybara) and Chrome.
12
+ JavaScript is executed before a page is analyzed, which allows dynamic content to be crawled.
13
+ Browser-based crawling is essentially an alternative to Wraith's spider mode,
14
+ which parses only server-side rendered HTML.
15
+
16
+ By default the crawler visits pages by following the links it extracts.
17
+ No buttons are clicked other than during the optional authentication step,
18
+ so the crawler does not modify the site and can be treated as noninvasive.
19
+
20
+ ## Table of contents
21
+ - [Installation](#installation)
22
+ - [Usage from command line](#usage-from-command-line)
23
+ - [Usage with scripting](#usage-with-scripting)
24
+ - [Callback methods](#callback-methods)
25
+ - [Callback methods Before/After crawling](#callback-methods-before-or-after-crawling)
26
+ - [Callback methods Before/After for each crawled page](#callback-methods-before-or-after-for-each-page)
27
+ - [Callback to collect unvisited links](#callback-method-unvisited-links)
28
+ - [Callback to change page scan rules](#callback-method-page-scan-rules)
29
+ - [Setup folder to save report file](#setup-folder-for-report)
30
+ - [Save report to a YAML file](#save-report-to-yaml)
31
+ - [Save report to a CSV file](#save-report-to-csv)
32
+ - [Usage with Wraith](#usage-with-wraith)
33
+ - [Restrictions](#restrictions)
34
+ - [Ideas for enhancements](#ideas-for-enchancements)
35
+ - [Integration with test frameworks](#integration-with-test-frameworks)
36
+ - [Development](#development)
37
+ - [Contributing](#contributing)
38
+ - [License](#license)
39
+
40
+ ## <a name="installation"></a> Installation
41
+
42
+ Add this line to your application's Gemfile:
43
+
44
+ ```ruby
45
+ gem 'browser_crawler', github: 'DimaSamodurov/browser_crawler'
46
+ ```
47
+
48
+ And then execute:
49
+
50
+ $ bundle
51
+
52
+ Or install it yourself as:
53
+
54
+ $ gem install browser_crawler
55
+
56
+ ## <a name="usage-from-command-line"></a> Usage from command line
57
+
58
+ Without the authentication required:
59
+ ```
60
+ crawl http://localhost:3000
61
+ ```
62
+
63
+ With authentication, screenshots, and the number of visited pages limited to 1:
64
+ ```
65
+ crawl https://your.site.com/welcome -u username -p password -n 1 -s tmp/screenshots
66
+ # or
67
+ export username=dima
68
+ export password=secret
69
+ #...
70
+ crawl https://your.site.com/welcome -n 1 -s tmp/screenshots
71
+ ```
72
+
73
+ Generate an index from the captured screenshots. The index is saved to `tmp/screenshots/index.html`.
74
+ ```
75
+ bin/crawl -s tmp/screenshots
76
+ ```
77
+
78
+ See additional options with:
79
+
80
+ ```
81
+ bin/crawl -h
82
+ ```
83
+
84
+ When crawling finishes, the report is saved to `tmp/crawl_report.yml` by default.
85
+ You can specify a different file path using command-line options.
86
+
87
+ ## <a name="usage-with-scripting"></a> Usage with scripting
88
+
89
+ Below is an example script that configures the crawler, points it at the `github.com` site,
90
+ and then saves the resulting report as a YAML file.
91
+ ```ruby
92
+ crawler = BrowserCrawler::Engine.new({
93
+ browser_options: {
94
+ headless: true,
95
+ window_size: [1200, 1600],
96
+ timeout: 60,
97
+ browser_options: { 'no-sandbox': nil }
98
+ },
99
+ max_pages: 10,
100
+ deep_visit: true
101
+ })
102
+
103
+ crawler.extract_links(url: 'https://github.com')
104
+ crawler.report_save
105
+ ```
106
+
107
+ This gem uses `cuprite` as an external dependency. Cuprite drives the browser directly, without an intermediary such as chromedriver.
108
+ `browser_options` configures the headless Chrome browser through `cuprite`.
109
+
110
+ * `max_pages` - limits the number of pages to crawl.
111
+ By default it is `nil`, which lets the crawler browse all pages within the domain.
112
+ * `deep_visit` - a mode in which the crawler also inspects external resources without collecting links from them.
113
+
114
+
115
+ ### <a name="callback-methods"></a> Callback methods
116
+
117
+ All of them can be used together with the Capybara DSL.
118
+
119
+ #### <a name="callback-methods-before-or-after-crawling"></a> Callback methods Before/After crawling
120
+ ```ruby
121
+ crawler = BrowserCrawler::Engine.new()
122
+
123
+ # Scroll the page down before scanning.
124
+ crawler.before do
125
+ page.execute_script 'window.scrollBy(0,10000)'
126
+ end
127
+
128
+ crawler.after do
129
+ page.body
130
+ end
131
+
132
+ crawler.extract_links(url: 'https://github.com')
133
+ ```
134
+
135
+ #### <a name="callback-methods-before-or-after-for-each-page"></a> Callback methods Before/After for each crawling page
136
+ ```ruby
137
+ crawler = BrowserCrawler::Engine.new()
138
+
139
+ # Scroll the page down before scanning.
140
+ crawler.before type: :each do
141
+ page.execute_script 'window.scrollBy(0,10000)'
142
+ end
143
+
144
+ crawler.after type: :each do
145
+ page.body
146
+ end
147
+
148
+ crawler.extract_links(url: 'https://github.com')
149
+ ```
150
+
151
+ #### <a name="callback-method-unvisited-links"></a> Callback method is recorded unvisited links
152
+ Default behavior: the crawler adds every link found on a page to an `unvisited_links` array
153
+ and then browses each of them. This callback lets you change that behavior.
154
+ ```ruby
155
+ crawler = BrowserCrawler::Engine.new()
156
+
157
+ # scan_result is an array of links collected from the scanned page.
158
+ crawler.unvisited_links do
159
+ @page_inspector.scan_result
160
+ end
161
+
162
+ crawler.extract_links(url: 'https://github.com')
163
+ ```
164
+
165
+ Changed behavior: the crawler browses only links that contain `/best-links`.
166
+ ```ruby
167
+ crawler = BrowserCrawler::Engine.new()
168
+
169
+ crawler.unvisited_links do
170
+ @page_inspector.scan_result.select { |link| link.include?('/best-links') }
171
+ end
172
+
173
+ crawler.extract_links(url: 'https://github.com')
174
+ ```
175
+
176
+ #### <a name="callback-method-page-scan-rules"></a> Callback method is changed page scan rules
177
+ Default behavior: the crawler collects all links from a page and moves from one page to another.
178
+ ```ruby
179
+ crawler = BrowserCrawler::Engine.new()
180
+
181
+ crawler.change_page_scan_rules do
182
+ page.all('a').map { |a| a['href'] }
183
+ end
184
+
185
+ crawler.extract_links(url: 'https://github.com')
186
+ ```
187
+
188
+ Changed behavior: the crawler collects only links that match the `a.paginations` selector.
189
+ ```ruby
190
+ crawler = BrowserCrawler::Engine.new()
191
+
192
+ crawler.change_page_scan_rules do
193
+ if URI.parse(page.current_url).to_s.include?('/help/')
194
+ page.all('a.paginations').map { |a| a['href'] }
195
+ else
196
+ []
197
+ end
198
+ end
199
+
200
+ crawler.extract_links(url: 'https://github.com')
201
+ ```
202
+
203
+ #### <a name="setup-folder-for-report"></a> Setup folder to save report file
204
+ ```ruby
205
+ crawler = BrowserCrawler::Engine.new()
206
+ crawler.extract_links(url: 'https://github.com')
207
+
208
+ crawler.report_save(folder_path: './reports/')
209
+ ```
210
+ If the folder doesn't exist, `BrowserCrawler` creates it before saving the report.
211
+
212
+ #### <a name="save-report-to-yaml"></a> Save report to yaml file
213
+ ```ruby
214
+ crawler = BrowserCrawler::Engine.new()
215
+ crawler.extract_links(url: 'https://github.com')
216
+
217
+ crawler.report_save(type: :yaml)
218
+ ```
219
+
220
+ #### <a name="save-report-to-csv"></a> Save report to csv file
221
+ ```ruby
222
+ crawler = BrowserCrawler::Engine.new()
223
+ crawler.extract_links(url: 'https://github.com')
224
+
225
+ crawler.report_save(type: :csv)
226
+ ```
227
+
228
+ ### <a name="usage-with-wraith"></a> Usage with Wraith
229
+
230
+ Browser Crawler can be used to update the `paths:` section of Wraith's configs.
231
+
232
+ Provided the Wraith config is located at `wraith/configs/capture.yaml`, run:
233
+ ```
234
+ crawl https://your.site.com/welcome -c wraith/configs/capture.yaml
235
+ ```
236
+
237
+ Or, if you already have a crawling report available, use it without a URL to skip crawling:
238
+ ```
239
+ bin/crawl -c tmp/wraith_config.yml -r tmp/crawl_report.yml
240
+ ```
241
+
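+ The same integration can also be driven from Ruby. Below is a minimal sketch, modelled on the bundled `bin/crawl` script; the file paths here are only examples:
+
+ ```ruby
+ require 'browser_crawler'
+
+ # Feed a previously saved crawl report into an existing Wraith config.
+ report = File.read('tmp/crawl_report.yml')
+ followup = BrowserCrawler::Followups::WraithIntegrator.new(report: report)
+
+ # Append the crawled paths to the `paths:` section of the Wraith config,
+ # adding a suffix to each URL so the captured pages are easy to recognize.
+ followup.update_config('wraith/configs/capture.yaml', path_suffix: '?wraith')
+ ```
+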
242
+ ## <a name="restrictions"></a> Restrictions
243
+
244
+ The current version has the authentication process hardcoded:
245
+ the path to the login form and the field names used are specific to the project
246
+ the crawler was extracted from.
247
+ Configuration options may be added in a future version.
248
+
249
+ ## <a name="ideas-for-enchancements"></a> Ideas for enhancements
250
+ It should be easy to crawl the site as part of automated testing,
251
+ e.g. to verify the list of pages available on the site
252
+ or to generate a visual report (Wraith does this better).
253
+
254
+ ### <a name="integration-with-test-frameworks"></a> Integration with test frameworks
255
+
256
+ By integrating browser_crawler into the application test suite,
257
+ it would be possible to access pages and content not easily reachable on the real site,
258
+ e.g. when performing data modifications.
259
+
260
+ Integrating into the test suite would also make it
261
+ possible to use all the tools/mocks/helpers created to simulate user behavior,
262
+ e.g. mocking external requests with VCR. A sketch of such an integration follows.
263
+
264
+
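+ As an illustration only (nothing like this ships with the gem), a system spec along these lines could drive the crawler against the application under test; `Capybara.app_host` and the expectation are assumptions made for the sketch:
+
+ ```ruby
+ require 'browser_crawler'
+
+ RSpec.describe 'site crawl', type: :system do
+   it 'visits internal pages and records them in the report' do
+     # Crawl the app under test, capped at 50 pages to keep the suite fast.
+     crawler = BrowserCrawler::Engine.new(max_pages: 50)
+     crawler.extract_links(url: Capybara.app_host)
+
+     expect(crawler.report_store.visited_pages).not_to be_empty
+   end
+ end
+ ```
+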
265
+ ## <a name="development"></a> Development
266
+
267
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
268
+
269
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
270
+
271
+ ## <a name="contributing"></a> Contributing
272
+
273
+ Bug reports and pull requests are welcome on GitHub at https://github.com/dimasamodurov/browser_crawler.
274
+
275
+ ## <a name="license"></a> License
276
+
277
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+ require 'rubocop/rake_task'
4
+
5
+ RSpec::Core::RakeTask.new(:spec)
6
+
7
+ task default: %i[spec]
data/bin/console ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "browser_crawler"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ require "pry"
10
+ Pry.start
data/bin/crawl ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'browser_crawler'
5
+ require 'pry' if ENV['DEBUG'] == 'true'
6
+
7
+ options = BrowserCrawler::Options.parse_args
8
+ ENV.update(options.transform_keys(&:to_s).transform_values(&:to_s))
9
+
10
+ if options[:screenshots_path] && !File.directory?(options[:screenshots_path])
11
+ `mkdir -p #{options[:screenshots_path]}`
12
+ end
13
+
14
+ if options[:report_format] && !%w[yaml csv].include?(options[:report_format])
15
+ puts("Report format #{options[:report_format]} is not recognized." \
16
+ " Please choose the correct format from the list of available formats: 'csv', 'yaml'")
17
+ return
18
+ end
19
+
20
+ if options[:url]
21
+ engine = BrowserCrawler::Engine.new(
22
+ browser_options: {
23
+ window_size: [options[:window_width].to_i,
24
+ options[:window_height].to_i]
25
+ },
26
+ max_pages: options[:max_pages],
27
+ screenshots_options: { save_screenshots_to: options[:screenshots_path] }
28
+ )
29
+
30
+ engine.extract_links(url: options[:url]) if options[:url]
31
+
32
+ engine.report_save(folder_path: options[:report_folder],
33
+ type: options[:report_format])
34
+
35
+ puts "Report is saved to #{options[:report_folder]} as #{options[:report_format]} file."
36
+ puts "Total pages visited: #{engine.report_store.visited_pages.count}."
37
+ end
38
+
39
+ if options[:screenshots_path]
40
+ template = File.read(options[:index_template]) if options[:index_template]
41
+ indexer = BrowserCrawler::Followups::ScreenshotsIndexer
42
+ .new(template: template)
43
+ file = indexer.index_directory(options[:screenshots_path])
44
+ puts "Screenshots index is saved to '#{file}'."
45
+ end
46
+
47
+ if options[:wraith_config]
48
+ followup = BrowserCrawler::Followups::WraithIntegrator
49
+ .new(report: File.read("#{options[:report_folder]}/crawler_report.yaml"))
50
+ followup.update_config(options[:wraith_config], path_suffix: '?wraith')
51
+ end