browser_crawler 0.4.0
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +277 -0
- data/Rakefile +7 -0
- data/bin/console +10 -0
- data/bin/crawl +51 -0
- data/bin/setup +8 -0
- data/browser_crawler.gemspec +47 -0
- data/lib/browser_crawler.rb +12 -0
- data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
- data/lib/browser_crawler/dsl/sign_in.rb +37 -0
- data/lib/browser_crawler/engine.rb +156 -0
- data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
- data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
- data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
- data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
- data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
- data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
- data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
- data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
- data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
- data/lib/browser_crawler/hooks_container.rb +31 -0
- data/lib/browser_crawler/hooks_operator.rb +44 -0
- data/lib/browser_crawler/options.rb +86 -0
- data/lib/browser_crawler/report_factory.rb +22 -0
- data/lib/browser_crawler/reports/csv_report.rb +75 -0
- data/lib/browser_crawler/reports/store.rb +114 -0
- data/lib/browser_crawler/reports/yaml_report.rb +15 -0
- data/lib/browser_crawler/screenshot_operator.rb +47 -0
- data/lib/browser_crawler/support/capybara.rb +20 -0
- data/lib/browser_crawler/url_tools.rb +32 -0
- data/lib/browser_crawler/version.rb +3 -0
- metadata +244 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: aef40cd15ff0c5799f68906813b3b7161745d41d69ed96d18e67b963bc95595c
  data.tar.gz: 8b4fef874f80bdeb74a960e76cbb77ff8a578fe99f8c65f6f32777dd767dd4dc
SHA512:
  metadata.gz: e70ce11b3110c2967212ff504b1b0a541aaec7458d849048ec3df56e5806c9b0e5c9121f02314fcb3db86c981d5347979b8e22dee9aba3ea10745b1e4ccc7268
  data.tar.gz: 8ee9f384dfb158009a334bf99e00c0239d64b942d41e5ba80532cd8cb80641bdb53676c04b3a909a2cf3e0547ec3faa8c9be9e4b1b0109f67d1694ae6ba50892
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,29 @@
language: ruby

dist: trusty

sudo: required

before_install:
  - gem install bundler -v 1.17.2

addons:
  apt:
    packages:
      - google-chrome-stable

before_script:
  - whoami
  - wget https://chromedriver.storage.googleapis.com/2.46/chromedriver_linux64.zip
  - unzip chromedriver_linux64.zip
  - sudo mv chromedriver /usr/bin/chromedriver
  - sudo chown root:root /usr/bin/chromedriver
  - sudo chmod +x /usr/bin/chromedriver
  - sudo ln -s /usr/bin/chromedriver ~/bin/chromedriver
  - chromedriver -v

cache: bundler

rvm:
  - 2.5.3
  - 2.6.2
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2015 Dima Samodurov

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,277 @@
# Browser Crawler

[![Build Status](https://travis-ci.org/DimaSamodurov/browser_crawler.svg?branch=master)](https://travis-ci.org/DimaSamodurov/browser_crawler)

Browser Crawler visits the pages available on a site and extracts useful information.

It can help with maintaining lists of internal and external links,
creating sitemaps, visual testing using screenshots,
or preparing a list of URLs for a more sophisticated tool such as [Wraith](https://github.com/BBC-News/wraith).

Browser-based crawling is performed with the help of [Capybara](https://github.com/teamcapybara/capybara) and Chrome.
JavaScript is executed before a page is analyzed, which allows crawling dynamic content.
Browser-based crawling is essentially an alternative to Wraith's spider mode,
which parses only server-side rendered HTML.

By default the crawler visits pages by following the extracted links.
No buttons are clicked other than during the optional authentication step,
so the crawler does not modify the site and can be treated as non-invasive.

## Table of contents
- [Installation](#installation)
- [Usage from command line](#usage-from-command-line)
- [Usage with scripting](#usage-with-scripting)
- [Callback methods](#callback-methods)
- [Callback methods Before/After crawling](#callback-methods-before-or-after-crawling)
- [Callback methods Before/After each crawled page](#callback-methods-before-or-after-for-each-page)
- [Callback method for recording unvisited links](#callback-method-unvisited-links)
- [Callback method for changing page scan rules](#callback-method-page-scan-rules)
- [Setup folder to save the report file](#setup-folder-for-report)
- [Save report to a YAML file](#save-report-to-yaml)
- [Save report to a CSV file](#save-report-to-csv)
- [Usage with Wraith](#usage-with-wraith)
- [Restrictions](#restrictions)
- [Ideas for enhancements](#ideas-for-enchancements)
- [Integration with test frameworks](#integration-with-test-frameworks)
- [Development](#development)
- [Contributing](#contributing)
- [License](#license)

## <a name="installation"></a> Installation

Add this line to your application's Gemfile:

```ruby
gem 'browser_crawler', github: 'DimaSamodurov/browser_crawler'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install browser_crawler

## <a name="usage-from-command-line"></a> Usage from command line

Without authentication required:
```
crawl http://localhost:3000
```

With authentication, screenshots, and the number of visited pages limited to 1:
```
crawl https://your.site.com/welcome -u username -p password -n 1 -s tmp/screenshots
# or
export username=dima
export password=secret
#...
crawl https://your.site.com/welcome -n 1 -s tmp/screenshots
```

Generate an index from the captured screenshots. The index is saved to `tmp/screenshots/index.html`:
```
bin/crawl -s tmp/screenshots
```

See additional options with:

```
bin/crawl -h
```

When crawling finishes, the report is saved to `tmp/crawl_report.yml` by default.
You can specify the file path using command line options.

## <a name="usage-with-scripting"></a> Usage with scripting

The example script below configures the crawler, targets the `github.com` site,
and then saves the resulting report as a YAML file.
```ruby
crawler = BrowserCrawler::Engine.new({
  browser_options: {
    headless: true,
    window_size: [1200, 1600],
    timeout: 60,
    browser_options: { 'no-sandbox': nil }
  },
  max_pages: 10,
  deep_visit: true
})

crawler.extract_links(url: 'https://github.com')
crawler.report_save
```

This gem relies on `cuprite` as an external dependency. `cuprite` drives the browser directly, without an intermediary such as chromedriver.
`browser_options` configures the headless Chrome browser through `cuprite`.

* `max_pages` - limits the number of pages to crawl.
By default it is `nil`, which lets the crawler browse all pages within the domain.
* `deep_visit` - a mode in which the crawler also checks external resources, without collecting links from them.

### <a name="callback-methods"></a> Callback methods

All of them can use the Capybara DSL.

#### <a name="callback-methods-before-or-after-crawling"></a> Callback methods Before/After crawling
```ruby
crawler = BrowserCrawler::Engine.new()

# scroll the page down before the scan.
crawler.before do
  page.execute_script 'window.scrollBy(0,10000)'
end

crawler.after do
  page.body
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="callback-methods-before-or-after-for-each-page"></a> Callback methods Before/After each crawled page
```ruby
crawler = BrowserCrawler::Engine.new()

# scroll the page down before the scan.
crawler.before type: :each do
  page.execute_script 'window.scrollBy(0,10000)'
end

crawler.after type: :each do
  page.body
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="callback-method-unvisited-links"></a> Callback method for recording unvisited links
Default behavior: the crawler sends all links found on a page to the `unvisited_links` array
and then browses each of them. This callback allows changing that behavior.
```ruby
crawler = BrowserCrawler::Engine.new()

# scan_result is an array of links from the scanned page.
crawler.unvisited_links do
  @page_inspector.scan_result
end

crawler.extract_links(url: 'https://github.com')
```

Changed behavior: the crawler browses only links that contain `/best-links`.
```ruby
crawler = BrowserCrawler::Engine.new()

crawler.unvisited_links do
  @page_inspector.scan_result.select { |link| link.include?('/best-links') }
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="callback-method-page-scan-rules"></a> Callback method for changing page scan rules
Default behavior: the crawler collects all links from a page and moves from one page to another.
```ruby
crawler = BrowserCrawler::Engine.new()

crawler.change_page_scan_rules do
  page.all('a').map { |a| a['href'] }
end

crawler.extract_links(url: 'https://github.com')
```

Changed behavior: the crawler collects only links matching the `paginations` selector.
```ruby
crawler = BrowserCrawler::Engine.new()

crawler.change_page_scan_rules do
  if URI.parse(page.current_url).to_s.include?('/help/')
    page.all('a.paginations').map { |a| a['href'] }
  else
    []
  end
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="setup-folder-for-report"></a> Setup folder to save the report file
```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

crawler.report_save(folder_path: './reports/')
```
If the folder doesn't exist, `BrowserCrawler` creates it.

#### <a name="save-report-to-yaml"></a> Save report to a YAML file
```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

crawler.report_save(type: :yaml)
```

#### <a name="save-report-to-csv"></a> Save report to a CSV file
```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

crawler.report_save(type: :csv)
```

### <a name="usage-with-wraith"></a> Usage with Wraith

Browser Crawler can be used to update the `paths:` section of Wraith's configs.

Provided the Wraith config is placed at `wraith/configs/capture.yaml`, run:
```
crawl https://your.site.com/welcome -c wraith/configs/capture.yaml
```

Or, if a crawl report is already available, use it without the URL to skip crawling:
```
bin/crawl -c tmp/wraith_config.yml -r tmp/crawl_report.yml
```
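
The same update can be scripted. Below is a minimal sketch mirroring what `bin/crawl` does with `BrowserCrawler::Followups::WraithIntegrator`; the report and config paths are illustrative and should point at your own files.

```ruby
# Sketch: programmatic equivalent of `bin/crawl -c wraith/configs/capture.yaml -r tmp/crawl_report.yml`.
require 'browser_crawler'

# Read a previously saved crawl report (path is an example).
report = File.read('tmp/crawl_report.yml')

# Feed the crawled paths into the Wraith config, as bin/crawl does.
followup = BrowserCrawler::Followups::WraithIntegrator.new(report: report)
followup.update_config('wraith/configs/capture.yaml', path_suffix: '?wraith')
```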

## <a name="restrictions"></a> Restrictions

The current version has the authentication process hardcoded:
the path to the login form and the field names used are specific to the project
the crawler was extracted from.
Configuration may be added in a future version.

## <a name="ideas-for-enchancements"></a> Ideas for enhancements
It should be easy to crawl the site as part of automated testing,
e.g. in order to verify the list of pages available on the site,
or in order to generate a visual report (Wraith does it better).

### <a name="integration-with-test-frameworks"></a> Integration with test frameworks

By integrating browser_crawler into the application test suite
it becomes possible to reach pages and content not easily accessible on the real site,
e.g. when performing data modifications.

Integrating into the test suite also makes it possible
to use all the tools/mocks/helpers created to simulate user behavior,
e.g. mocking external requests with VCR.
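
As an illustration of this idea only: the sketch below wraps a crawl in an ordinary spec. It assumes RSpec and an application running on `http://localhost:3000`, and uses only calls shown elsewhere in this README and in `bin/crawl` (`extract_links`, `report_store.visited_pages`); it is not an API shipped by the gem.

```ruby
# spec/site_crawl_spec.rb - hypothetical example, not part of the gem.
require 'browser_crawler'

RSpec.describe 'Site crawl' do
  it 'visits pages reachable from the home page' do
    crawler = BrowserCrawler::Engine.new(browser_options: { headless: true },
                                         max_pages: 20)
    crawler.extract_links(url: 'http://localhost:3000')

    # report_store.visited_pages is the same collection bin/crawl uses for its summary.
    expect(crawler.report_store.visited_pages).not_to be_empty
  end
end
```
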
## <a name="development"></a> Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).

## <a name="contributing"></a> Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/dimasamodurov/browser_crawler.

## <a name="license"></a> License

MIT
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "browser_crawler"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

require "pry"
Pry.start
data/bin/crawl
ADDED
@@ -0,0 +1,51 @@
#!/usr/bin/env ruby

require 'bundler/setup'
require 'browser_crawler'
require 'pry' if ENV['DEBUG'] == 'true'

options = BrowserCrawler::Options.parse_args
ENV.update(options.transform_keys(&:to_s).transform_values(&:to_s))

if options[:screenshots_path] && !File.directory?(options[:screenshots_path])
  `mkdir -p #{options[:screenshots_path]}`
end

if options[:report_format] && !%w[yaml csv].include?(options[:report_format])
  puts("Report format #{options[:report_format]} is not recognized." \
       " Please choose the correct format from the list of available formats: 'csv', 'yaml'")
  return
end

if options[:url]
  engine = BrowserCrawler::Engine.new(
    browser_options: {
      windows_size: [options[:window_width].to_i,
                     options[:window_height].to_i]
    },
    max_pages: options[:max_pages],
    screenshots_options: { save_screenshots_to: options[:screenshots_path] }
  )

  engine.extract_links(url: options[:url]) if options[:url]

  engine.report_save(folder_path: options[:report_folder],
                     type: options[:report_format])

  puts "Report is saved to #{options[:report_folder]} as #{options[:report_format]} file."
  puts "Total pages visited: #{engine.report_store.visited_pages.count}."
end

if options[:screenshots_path]
  template = File.read(options[:index_template]) if options[:index_template]
  indexer = BrowserCrawler::Followups::ScreenshotsIndexer
            .new(template: template)
  file = indexer.index_directory(options[:screenshots_path])
  puts "Screenshots index is saved to '#{file}'."
end

if options[:wraith_config]
  followup = BrowserCrawler::Followups::WraithIntegrator
             .new(report: File.read("#{options[:report_folder]}/crawler_report.yaml"))
  followup.update_config(options[:wraith_config], path_suffix: '?wraith')
end