browser_crawler 0.4.0

Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +10 -0
  5. data/.travis.yml +29 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +277 -0
  9. data/Rakefile +7 -0
  10. data/bin/console +10 -0
  11. data/bin/crawl +51 -0
  12. data/bin/setup +8 -0
  13. data/browser_crawler.gemspec +47 -0
  14. data/lib/browser_crawler.rb +12 -0
  15. data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
  16. data/lib/browser_crawler/dsl/sign_in.rb +37 -0
  17. data/lib/browser_crawler/engine.rb +156 -0
  18. data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
  19. data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
  20. data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
  21. data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
  22. data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
  23. data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
  24. data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
  25. data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
  26. data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
  27. data/lib/browser_crawler/hooks_container.rb +31 -0
  28. data/lib/browser_crawler/hooks_operator.rb +44 -0
  29. data/lib/browser_crawler/options.rb +86 -0
  30. data/lib/browser_crawler/report_factory.rb +22 -0
  31. data/lib/browser_crawler/reports/csv_report.rb +75 -0
  32. data/lib/browser_crawler/reports/store.rb +114 -0
  33. data/lib/browser_crawler/reports/yaml_report.rb +15 -0
  34. data/lib/browser_crawler/screenshot_operator.rb +47 -0
  35. data/lib/browser_crawler/support/capybara.rb +20 -0
  36. data/lib/browser_crawler/url_tools.rb +32 -0
  37. data/lib/browser_crawler/version.rb +3 -0
  38. metadata +244 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: aef40cd15ff0c5799f68906813b3b7161745d41d69ed96d18e67b963bc95595c
+   data.tar.gz: 8b4fef874f80bdeb74a960e76cbb77ff8a578fe99f8c65f6f32777dd767dd4dc
+ SHA512:
+   metadata.gz: e70ce11b3110c2967212ff504b1b0a541aaec7458d849048ec3df56e5806c9b0e5c9121f02314fcb3db86c981d5347979b8e22dee9aba3ea10745b1e4ccc7268
+   data.tar.gz: 8ee9f384dfb158009a334bf99e00c0239d64b942d41e5ba80532cd8cb80641bdb53676c04b3a909a2cf3e0547ec3faa8c9be9e4b1b0109f67d1694ae6ba50892
data/.gitignore ADDED
@@ -0,0 +1,13 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/*
+ .ruby-version
+ /.idea/*
+ *.gem
+ .byebug_history
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.rubocop.yml ADDED
@@ -0,0 +1,10 @@
+ AllCops:
+   TargetRubyVersion: 2.5
+ Style/FrozenStringLiteralComment:
+   Enabled: false
+ Metrics/BlockLength:
+   Exclude:
+     - 'browser_crawler.gemspec'
+     - 'spec/**/*'
+ Metrics/LineLength:
+   Max: 100
data/.travis.yml ADDED
@@ -0,0 +1,29 @@
+ language: ruby
+
+ dist: trusty
+
+ sudo: required
+
+ before_install:
+   - gem install bundler -v 1.17.2
+
+ addons:
+   apt:
+     packages:
+       - google-chrome-stable
+
+ before_script:
+   - whoami
+   - wget https://chromedriver.storage.googleapis.com/2.46/chromedriver_linux64.zip
+   - unzip chromedriver_linux64.zip
+   - sudo mv chromedriver /usr/bin/chromedriver
+   - sudo chown root:root /usr/bin/chromedriver
+   - sudo chmod +x /usr/bin/chromedriver
+   - sudo ln -s /usr/bin/chromedriver ~/bin/chromedriver
+   - chromedriver -v
+
+ cache: bundler
+
+ rvm:
+   - 2.5.3
+   - 2.6.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in browser_crawler.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Dima Samodurov
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,277 @@
+ # Browser Crawler
+
+ [![Build Status](https://travis-ci.org/DimaSamodurov/browser_crawler.svg?branch=master)](https://travis-ci.org/DimaSamodurov/browser_crawler)
+
+ Browser Crawler visits the pages available on a site and extracts useful information from them.
+
+ It can help with maintaining lists of internal and external links,
+ creating sitemaps, visual testing using screenshots,
+ or preparing a list of URLs for a more sophisticated tool such as [Wraith](https://github.com/BBC-News/wraith).
+
+ Browser-based crawling is performed with the help of [Capybara](https://github.com/teamcapybara/capybara) and Chrome.
+ JavaScript is executed before a page is analyzed, which allows crawling dynamic content.
+ Browser-based crawling is essentially an alternative to Wraith's spider mode,
+ which parses only server-side rendered HTML.
+
+ By default the crawler visits pages by following the extracted links.
+ No buttons are clicked other than during the optional authentication step,
+ so the crawler does not perform any updates to the site and can be treated as non-invasive.
+
+ ## Table of contents
+ - [Installation](#installation)
+ - [Usage from command line](#usage-from-command-line)
+ - [Usage with scripting](#usage-with-scripting)
+ - [Callback methods](#callback-methods)
+ - [Callback methods Before/After crawling](#callback-methods-before-or-after-crawling)
+ - [Callback methods Before/After for each crawled page](#callback-methods-before-or-after-for-each-page)
+ - [Callback method to record unvisited links](#callback-method-unvisited-links)
+ - [Callback method to change page scan rules](#callback-method-page-scan-rules)
+ - [Setup folder to save report file](#setup-folder-for-report)
+ - [Save report to yaml file](#save-report-to-yaml)
+ - [Save report to csv file](#save-report-to-csv)
+ - [Usage with Wraith](#usage-with-wraith)
+ - [Restrictions](#restrictions)
+ - [Ideas for enhancements](#ideas-for-enchancements)
+ - [Integration with test frameworks](#integration-with-test-frameworks)
+ - [Development](#development)
+ - [Contributing](#contributing)
+ - [License](#license)
+
+ ## <a name="installation"></a> Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'browser_crawler', github: 'DimaSamodurov/browser_crawler'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install browser_crawler
+
+ ## <a name="usage-from-command-line"></a> Usage from command line
+
+ Without authentication required:
+ ```
+ crawl http://localhost:3000
+ ```
+
+ With authentication, screenshots, and the number of visited pages limited to 1:
+ ```
+ crawl https://your.site.com/welcome -u username -p password -n 1 -s tmp/screenshots
+ # or
+ export username=dima
+ export password=secret
+ #...
+ crawl https://your.site.com/welcome -n 1 -s tmp/screenshots
+ ```
+
+ Generate an index from the captured screenshots. The index is saved to `tmp/screenshots/index.html`:
+ ```
+ bin/crawl -s tmp/screenshots
+ ```
+
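+ The screenshot indexer can also be driven from Ruby, the same way `bin/crawl` uses it. A minimal sketch; passing `template: nil` mirrors what `bin/crawl` passes when no custom ERB template is given, so the gem's bundled template is used:
+ ```ruby
+ require 'browser_crawler'
+
+ # Build an index.html for the screenshots found in the given directory.
+ # template: nil falls back to the gem's bundled ERB template (as in bin/crawl).
+ indexer = BrowserCrawler::Followups::ScreenshotsIndexer.new(template: nil)
+ index_file = indexer.index_directory('tmp/screenshots')
+ puts "Screenshots index is saved to '#{index_file}'."
+ ```
+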
+ See additional options with:
+
+ ```
+ bin/crawl -h
+ ```
+
+ When finished, the crawling report is saved to the `tmp/crawl_report.yml` file by default.
+ You can specify the file path using command line options.
+
+ ## <a name="usage-with-scripting"></a> Usage with scripting
+
+ Below is an example script which configures the crawler, targets the `github.com` site,
+ and then records the resulting report as a yaml file.
+ ```ruby
+ crawler = BrowserCrawler::Engine.new({
+   browser_options: {
+     headless: true,
+     window_size: [1200, 1600],
+     timeout: 60,
+     browser_options: { 'no-sandbox': nil }
+   },
+   max_pages: 10,
+   deep_visit: true
+ })
+
+ crawler.extract_links(url: 'https://github.com')
+ crawler.report_save
+ ```
+
+ This gem uses `cuprite` as an external dependency. Cuprite drives the browser directly, without an intermediary such as chromedriver.
+ The `browser_options` hash configures the headless Chrome browser through `cuprite`.
+
+ * `max_pages` - limits the number of pages to crawl.
+ By default it is `nil`, which lets the crawler browse all pages within the domain.
+ * `deep_visit` - a mode in which the crawler checks external resources without collecting links from them.
+
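+ After `extract_links` returns, the collected results are also available in memory before (or instead of) writing a report file. A minimal sketch, using the same `report_store` / `visited_pages` accessors that `bin/crawl` relies on:
+ ```ruby
+ require 'browser_crawler'
+
+ crawler = BrowserCrawler::Engine.new(max_pages: 5)
+ crawler.extract_links(url: 'https://github.com')
+
+ # Inspect the crawl result in memory, then persist it.
+ puts "Total pages visited: #{crawler.report_store.visited_pages.count}"
+ crawler.report_save(type: :yaml)
+ ```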
+
+ ### <a name="callback-methods"></a> Callback methods
+
+ All of the callbacks can use the Capybara DSL.
+
+ #### <a name="callback-methods-before-or-after-crawling"></a> Callback methods Before/After crawling
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+
+ # scroll down the page before scanning.
+ crawler.before do
+   page.execute_script 'window.scrollBy(0,10000)'
+ end
+
+ crawler.after do
+   page.body
+ end
+
+ crawler.extract_links(url: 'https://github.com')
+ ```
+
+ #### <a name="callback-methods-before-or-after-for-each-page"></a> Callback methods Before/After for each crawled page
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+
+ # scroll down the page before scanning.
+ crawler.before type: :each do
+   page.execute_script 'window.scrollBy(0,10000)'
+ end
+
+ crawler.after type: :each do
+   page.body
+ end
+
+ crawler.extract_links(url: 'https://github.com')
+ ```
+
+ #### <a name="callback-method-unvisited-links"></a> Callback method to record unvisited links
+ Default behavior: the crawler adds all links found on a page to an `unvisited_links` array
+ and then browses each of them. This callback allows changing that behavior.
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+
+ # scan_result is an array of links from the scanned page.
+ crawler.unvisited_links do
+   @page_inspector.scan_result
+ end
+
+ crawler.extract_links(url: 'https://github.com')
+ ```
+
+ Changed behavior: the crawler browses only links which contain `/best-links`.
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+
+ crawler.unvisited_links do
+   @page_inspector.scan_result.select { |link| link.include?('/best-links') }
+ end
+
+ crawler.extract_links(url: 'https://github.com')
+ ```
+
+ #### <a name="callback-method-page-scan-rules"></a> Callback method to change page scan rules
+ Default behavior: the crawler collects all links from a page and moves from one page to another.
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+
+ crawler.change_page_scan_rules do
+   page.all('a').map { |a| a['href'] }
+ end
+
+ crawler.extract_links(url: 'https://github.com')
+ ```
+
+ Changed behavior: the crawler collects only links matching the `a.paginations` selector, and only on `/help/` pages.
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+
+ crawler.change_page_scan_rules do
+   if URI.parse(page.current_url).to_s.include?('/help/')
+     page.all('a.paginations').map { |a| a['href'] }
+   else
+     []
+   end
+ end
+
+ crawler.extract_links(url: 'https://github.com')
+ ```
+
+ #### <a name="setup-folder-for-report"></a> Setup folder to save report file
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+ crawler.extract_links(url: 'https://github.com')
+
+ crawler.report_save(folder_path: './reports/')
+ ```
+ If the folder doesn't exist, `BrowserCrawler` creates it for the report.
+
+ #### <a name="save-report-to-yaml"></a> Save report to yaml file
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+ crawler.extract_links(url: 'https://github.com')
+
+ crawler.report_save(type: :yaml)
+ ```
+
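+ The saved report can be loaded back into Ruby for further processing. A small sketch, assuming the default file name `crawler_report.yaml` that `bin/crawl` reads from the report folder (adjust the path if you saved the report elsewhere):
+ ```ruby
+ require 'yaml'
+
+ # Load the crawl report produced by report_save(type: :yaml).
+ report = YAML.load_file('./reports/crawler_report.yaml')
+ pp report # inspect the recorded pages and links
+ ```
+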
+ #### <a name="save-report-to-csv"></a> Save report to csv file
+ ```ruby
+ crawler = BrowserCrawler::Engine.new()
+ crawler.extract_links(url: 'https://github.com')
+
+ crawler.report_save(type: :csv)
+ ```
+
+ ### <a name="usage-with-wraith"></a> Usage with Wraith
+
+ Browser Crawler can be used to update the `paths:` section of Wraith configs.
+
+ Provided the Wraith config is placed in the `wraith/configs/capture.yaml` file, do:
+ ```
+ crawl https://your.site.com/welcome -c wraith/configs/capture.yaml
+ ```
+
+ Or, if you already have a crawling report available, use it without the URL to skip crawling:
+ ```
+ bin/crawl -c tmp/wraith_config.yml -r tmp/crawl_report.yml
+ ```
+
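+ The same follow-up can be run from Ruby via the `WraithIntegrator` class, mirroring what `bin/crawl` does for the `-c` option. A minimal sketch; the report and config paths below are placeholders:
+ ```ruby
+ require 'browser_crawler'
+
+ # Feed a saved crawl report into a Wraith config's paths: section.
+ followup = BrowserCrawler::Followups::WraithIntegrator
+            .new(report: File.read('tmp/crawler_report.yaml'))
+ followup.update_config('wraith/configs/capture.yaml', path_suffix: '?wraith')
+ ```
+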
+ ## <a name="restrictions"></a> Restrictions
+
+ The current version has the authentication process hardcoded:
+ the path to the login form and the field names used are specific to the project
+ the crawler was extracted from.
+ Configuration may be added in a future version.
+
+ ## <a name="ideas-for-enchancements"></a> Ideas for enhancements
+ It should be easy to crawl the site as part of automated testing,
+ e.g. in order to verify the list of pages available on the site,
+ or in order to generate a visual report (Wraith does it better).
+
+ ### <a name="integration-with-test-frameworks"></a> Integration with test frameworks
+
+ By integrating browser_crawler into the application's test suite,
+ it would be possible to access pages and content not easily reachable on the real site,
+ e.g. after performing data modifications.
+
+ Integrating into the test suite would also make it possible to reuse
+ all the tools/mocks/helpers created to simulate user behavior,
+ e.g. mocking external requests with VCR.
+
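+ As an illustration only (nothing like this ships with the gem), such an integration could look roughly like the following RSpec sketch, assuming Capybara is already configured so that `Capybara.app_host` points at the application under test:
+ ```ruby
+ require 'browser_crawler'
+
+ # Hypothetical spec: drive the crawler from within the existing test suite
+ # so that its helpers (login steps, VCR cassettes, etc.) are available.
+ RSpec.describe 'site crawl', type: :feature do
+   it 'visits every reachable page' do
+     crawler = BrowserCrawler::Engine.new(max_pages: 20)
+     crawler.extract_links(url: Capybara.app_host)
+
+     expect(crawler.report_store.visited_pages).not_to be_empty
+   end
+ end
+ ```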
+
+ ## <a name="development"></a> Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## <a name="contributing"></a> Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/dimasamodurov/browser_crawler.
+
+ ## <a name="license"></a> License
+
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,7 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+ require 'rubocop/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: %i[spec]
data/bin/console ADDED
@@ -0,0 +1,10 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "browser_crawler"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ require "pry"
+ Pry.start
data/bin/crawl ADDED
@@ -0,0 +1,51 @@
+ #!/usr/bin/env ruby
+
+ require 'bundler/setup'
+ require 'browser_crawler'
+ require 'pry' if ENV['DEBUG'] == 'true'
+
+ options = BrowserCrawler::Options.parse_args
+ ENV.update(options.transform_keys(&:to_s).transform_values(&:to_s))
+
+ if options[:screenshots_path] && !File.directory?(options[:screenshots_path])
+   `mkdir -p #{options[:screenshots_path]}`
+ end
+
+ if options[:report_format] && !%w[yaml csv].include?(options[:report_format])
+   puts("Report format #{options[:report_format]} is not recognized." \
+        " Please choose the correct format from the list of available formats: 'csv', 'yaml'")
+   return
+ end
+
+ if options[:url]
+   engine = BrowserCrawler::Engine.new(
+     browser_options: {
+       windows_size: [options[:window_width].to_i,
+                      options[:window_height].to_i]
+     },
+     max_pages: options[:max_pages],
+     screenshots_options: { save_screenshots_to: options[:screenshots_path] }
+   )
+
+   engine.extract_links(url: options[:url]) if options[:url]
+
+   engine.report_save(folder_path: options[:report_folder],
+                      type: options[:report_format])
+
+   puts "Report is saved to #{options[:report_folder]} as #{options[:report_format]} file."
+   puts "Total pages visited: #{engine.report_store.visited_pages.count}."
+ end
+
+ if options[:screenshots_path]
+   template = File.read(options[:index_template]) if options[:index_template]
+   indexer = BrowserCrawler::Followups::ScreenshotsIndexer
+             .new(template: template)
+   file = indexer.index_directory(options[:screenshots_path])
+   puts "Screenshots index is saved to '#{file}'."
+ end
+
+ if options[:wraith_config]
+   followup = BrowserCrawler::Followups::WraithIntegrator
+              .new(report: File.read("#{options[:report_folder]}/crawler_report.yaml"))
+   followup.update_config(options[:wraith_config], path_suffix: '?wraith')
+ end