browser_crawler 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +277 -0
- data/Rakefile +7 -0
- data/bin/console +10 -0
- data/bin/crawl +51 -0
- data/bin/setup +8 -0
- data/browser_crawler.gemspec +47 -0
- data/lib/browser_crawler.rb +12 -0
- data/lib/browser_crawler/dsl/js_helpers.rb +13 -0
- data/lib/browser_crawler/dsl/sign_in.rb +37 -0
- data/lib/browser_crawler/engine.rb +156 -0
- data/lib/browser_crawler/engine_utilities/crawl_manager.rb +100 -0
- data/lib/browser_crawler/engine_utilities/inspect_page_process.rb +74 -0
- data/lib/browser_crawler/engine_utilities/link_inspector.rb +31 -0
- data/lib/browser_crawler/engine_utilities/link_scanner.rb +38 -0
- data/lib/browser_crawler/engine_utilities/page_inspector.rb +65 -0
- data/lib/browser_crawler/errors/invalid_hooks_type.rb +12 -0
- data/lib/browser_crawler/followups/screenshots_indexer.rb +40 -0
- data/lib/browser_crawler/followups/templates/index.html.erb +69 -0
- data/lib/browser_crawler/followups/wraith_integrator.rb +41 -0
- data/lib/browser_crawler/hooks_container.rb +31 -0
- data/lib/browser_crawler/hooks_operator.rb +44 -0
- data/lib/browser_crawler/options.rb +86 -0
- data/lib/browser_crawler/report_factory.rb +22 -0
- data/lib/browser_crawler/reports/csv_report.rb +75 -0
- data/lib/browser_crawler/reports/store.rb +114 -0
- data/lib/browser_crawler/reports/yaml_report.rb +15 -0
- data/lib/browser_crawler/screenshot_operator.rb +47 -0
- data/lib/browser_crawler/support/capybara.rb +20 -0
- data/lib/browser_crawler/url_tools.rb +32 -0
- data/lib/browser_crawler/version.rb +3 -0
- metadata +244 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: aef40cd15ff0c5799f68906813b3b7161745d41d69ed96d18e67b963bc95595c
  data.tar.gz: 8b4fef874f80bdeb74a960e76cbb77ff8a578fe99f8c65f6f32777dd767dd4dc
SHA512:
  metadata.gz: e70ce11b3110c2967212ff504b1b0a541aaec7458d849048ec3df56e5806c9b0e5c9121f02314fcb3db86c981d5347979b8e22dee9aba3ea10745b1e4ccc7268
  data.tar.gz: 8ee9f384dfb158009a334bf99e00c0239d64b942d41e5ba80532cd8cb80641bdb53676c04b3a909a2cf3e0547ec3faa8c9be9e4b1b0109f67d1694ae6ba50892
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,29 @@
language: ruby

dist: trusty

sudo: required

before_install:
  - gem install bundler -v 1.17.2

addons:
  apt:
    packages:
      - google-chrome-stable

before_script:
  - whoami
  - wget https://chromedriver.storage.googleapis.com/2.46/chromedriver_linux64.zip
  - unzip chromedriver_linux64.zip
  - sudo mv chromedriver /usr/bin/chromedriver
  - sudo chown root:root /usr/bin/chromedriver
  - sudo chmod +x /usr/bin/chromedriver
  - sudo ln -s /usr/bin/chromedriver ~/bin/chromedriver
  - chromedriver -v

cache: bundler

rvm:
  - 2.5.3
  - 2.6.2
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2015 Dima Samodurov

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,277 @@
# Browser Crawler

[](https://travis-ci.org/DimaSamodurov/browser_crawler)

Browser Crawler visits the pages available on a site and extracts useful information.

It can help with maintaining lists of internal and external links,
creating sitemaps, doing visual testing with screenshots,
or preparing a list of URLs for a more sophisticated tool such as [Wraith](https://github.com/BBC-News/wraith).

Browser-based crawling is performed with the help of [Capybara](https://github.com/teamcapybara/capybara) and Chrome.
JavaScript is executed before a page is analyzed, which allows crawling dynamic content.
Browser-based crawling is essentially an alternative to Wraith's spider mode,
which parses only server-side rendered HTML.

By default the crawler visits pages by following the extracted links.
No buttons are clicked other than during the optional authentication step,
so the crawler does not perform any updates to the site and can be treated as non-invasive.

## Table of contents
- [Installation](#installation)
- [Usage from command line](#usage-from-command-line)
- [Usage with scripting](#usage-with-scripting)
- [Callback methods](#callback-methods)
- [Callback methods Before/After crawling](#callback-methods-before-or-after-crawling)
- [Callback methods Before/After for each crawled page](#callback-methods-before-or-after-for-each-page)
- [Callback method to record unvisited links](#callback-method-unvisited-links)
- [Callback method to change page scan rules](#callback-method-page-scan-rules)
- [Setup folder to save report file](#setup-folder-for-report)
- [Save report to yaml file](#save-report-to-yaml)
- [Save report to csv file](#save-report-to-csv)
- [Usage with Wraith](#usage-with-wraith)
- [Restrictions](#restrictions)
- [Ideas for enhancements](#ideas-for-enchancements)
- [Integration with test frameworks](#integration-with-test-frameworks)
- [Development](#development)
- [Contributing](#contributing)
- [License](#license)

## <a name="installation"></a> Installation

Add this line to your application's Gemfile:

```ruby
gem 'browser_crawler', github: 'DimaSamodurov/browser_crawler'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install browser_crawler

## <a name="usage-from-command-line"></a> Usage from command line

Without authentication:
```
crawl http://localhost:3000
```

With authentication, screenshots, and the number of visited pages limited to 1:
```
crawl https://your.site.com/welcome -u username -p password -n 1 -s tmp/screenshots
# or
export username=dima
export password=secret
#...
crawl https://your.site.com/welcome -n 1 -s tmp/screenshots
```

Generate an index from the captured screenshots. The index is saved to `tmp/screenshots/index.html`.
```
bin/crawl -s tmp/screenshots
```
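
The same index can also be produced from Ruby via the `ScreenshotsIndexer` followup that `bin/crawl` uses under the hood; a minimal sketch, assuming the screenshots were saved to `tmp/screenshots`:

```ruby
require 'browser_crawler'

# Build an index.html for every screenshot in the directory.
# Passing template: nil mirrors bin/crawl when no custom ERB template is given.
indexer = BrowserCrawler::Followups::ScreenshotsIndexer.new(template: nil)
index_file = indexer.index_directory('tmp/screenshots')
puts "Screenshots index saved to '#{index_file}'."
```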

See additional options with:

```
bin/crawl -h
```

When finished, the crawling report is saved to the `tmp/crawl_report.yml` file by default.
You can specify the file path using command line options.

## <a name="usage-with-scripting"></a> Usage with scripting

The example script below configures the crawler, points it at the `github.com` site,
and then records the resulting report as a YAML file.
```ruby
crawler = BrowserCrawler::Engine.new({
  browser_options: {
    headless: true,
    window_size: [1200, 1600],
    timeout: 60,
    browser_options: { 'no-sandbox': nil }
  },
  max_pages: 10,
  deep_visit: true
})

crawler.extract_links(url: 'https://github.com')
crawler.report_save
```

This gem uses `cuprite` as an external dependency. Cuprite drives the browser directly, without intermediaries such as chromedriver.
`browser_options` configures the headless Chrome browser through Cuprite.

* `max_pages` - limits the number of pages to crawl.
  By default it is `nil`, which lets the crawler browse all pages within the target domain.
* `deep_visit` - a mode in which the crawler also checks external resources without collecting links from them.

### <a name="callback-methods"></a> Callback methods

All of the callbacks can use the Capybara DSL.

#### <a name="callback-methods-before-or-after-crawling"></a> Callback methods Before/After crawling
```ruby
crawler = BrowserCrawler::Engine.new()

# scroll the page down before scanning.
crawler.before do
  page.execute_script 'window.scrollBy(0,10000)'
end

crawler.after do
  page.body
end

crawler.extract_links(url: 'https://github.com')
```
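
The `before` hook is also a natural place for the optional authentication step mentioned above. A hypothetical sketch using the Capybara DSL (the login URL, field labels, and button label below are invented and must be adapted to your application):

```ruby
crawler = BrowserCrawler::Engine.new()

# Sign in once before crawling starts; the block runs in the same
# Capybara session the crawler uses for scanning.
crawler.before do
  visit 'https://your.site.com/users/sign_in' # hypothetical login page
  fill_in 'Email', with: ENV['username']      # hypothetical field label
  fill_in 'Password', with: ENV['password']   # hypothetical field label
  click_button 'Log in'                       # hypothetical button label
end

crawler.extract_links(url: 'https://your.site.com/welcome')
```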

#### <a name="callback-methods-before-or-after-for-each-page"></a> Callback methods Before/After for each crawled page
```ruby
crawler = BrowserCrawler::Engine.new()

# scroll each page down before scanning.
crawler.before type: :each do
  page.execute_script 'window.scrollBy(0,10000)'
end

crawler.after type: :each do
  page.body
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="callback-method-unvisited-links"></a> Callback method to record unvisited links
Default behavior: the crawler puts all links found on a page into an `unvisited_links` array
and then visits each of them. This callback allows changing that behavior.
```ruby
crawler = BrowserCrawler::Engine.new()

# scan_result is an array of links from the scanned page.
crawler.unvisited_links do
  @page_inspector.scan_result
end

crawler.extract_links(url: 'https://github.com')
```

Changed behavior: the crawler visits only links that contain `/best-links`.
```ruby
crawler = BrowserCrawler::Engine.new()

crawler.unvisited_links do
  @page_inspector.scan_result.select { |link| link.include?('/best-links') }
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="callback-method-page-scan-rules"></a> Callback method to change page scan rules
Default behavior: the crawler collects all links from a page and moves from one page to the next.
```ruby
crawler = BrowserCrawler::Engine.new()

crawler.change_page_scan_rules do
  page.all('a').map { |a| a['href'] }
end

crawler.extract_links(url: 'https://github.com')
```

Changed behavior: the crawler collects links only from pages under `/help/`, and only those matching the `a.paginations` selector.
```ruby
crawler = BrowserCrawler::Engine.new()

crawler.change_page_scan_rules do
  if URI.parse(page.current_url).to_s.include?('/help/')
    page.all('a.paginations').map { |a| a['href'] }
  else
    []
  end
end

crawler.extract_links(url: 'https://github.com')
```

#### <a name="setup-folder-for-report"></a> Setup folder to save report file
```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

crawler.report_save(folder_path: './reports/')
```
If the folder doesn't exist, `BrowserCrawler` creates it for the report.

#### <a name="save-report-to-yaml"></a> Save report to yaml file
```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

crawler.report_save(type: :yaml)
```

#### <a name="save-report-to-csv"></a> Save report to csv file
```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

crawler.report_save(type: :csv)
```
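
The folder and format options can also be combined in a single call, as `bin/crawl` does (the folder path below is just an example):

```ruby
crawler = BrowserCrawler::Engine.new()
crawler.extract_links(url: 'https://github.com')

# Save a CSV report into ./reports/, creating the folder if needed.
crawler.report_save(folder_path: './reports/', type: :csv)
```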

### <a name="usage-with-wraith"></a> Usage with Wraith

Browser Crawler can be used to update the `paths:` section of a Wraith config.

Provided the Wraith config is located at `wraith/configs/capture.yaml`, run:
```
crawl https://your.site.com/welcome -c wraith/configs/capture.yaml
```

Or, if a crawling report is already available, use it without the URL to skip crawling:
```
bin/crawl -c tmp/wraith_config.yml -r tmp/crawl_report.yml
```
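
The same update can be scripted through the `WraithIntegrator` followup that `bin/crawl` relies on; a minimal sketch, assuming the report and config paths below match your setup:

```ruby
require 'browser_crawler'

# Feed an existing crawl report into the Wraith config update;
# update_config rewrites the paths: section of the given config file.
followup = BrowserCrawler::Followups::WraithIntegrator
           .new(report: File.read('tmp/crawl_report.yml'))
followup.update_config('wraith/configs/capture.yaml', path_suffix: '?wraith')
```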

## <a name="restrictions"></a> Restrictions

The current version has the authentication process hardcoded:
the path to the login form and the field names used are specific to the project
the crawler was extracted from.
Configuration may be added in a future version.

## <a name="ideas-for-enchancements"></a> Ideas for enhancements

It should be easy to crawl a site as part of automated testing,
e.g. in order to verify the list of pages available on the site,
or in order to generate a visual report (Wraith does that better).

### <a name="integration-with-test-frameworks"></a> Integration with test frameworks

By integrating browser_crawler into the application test suite
it would be possible to access pages and content not easily reachable on the real site,
e.g. when data modifications are involved.

Integration with the test suite would also make it possible to use all the tools/mocks/helpers
created to simulate user behavior, e.g. mocking external requests with VCR.
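
As a purely hypothetical illustration of that idea, a crawl could be wrapped in an ordinary spec; the URL, page limit, and expectation below are invented for the sketch:

```ruby
require 'browser_crawler'

RSpec.describe 'site crawl' do
  it 'visits reachable pages and records them in the report' do
    crawler = BrowserCrawler::Engine.new(max_pages: 50)
    crawler.extract_links(url: 'http://localhost:3000')

    # report_store holds the visited pages, as used by bin/crawl.
    expect(crawler.report_store.visited_pages.count).to be > 0
  end
end
```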

## <a name="development"></a> Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).

## <a name="contributing"></a> Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/dimasamodurov/browser_crawler.

## <a name="license"></a> License

MIT
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "browser_crawler"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

require "pry"
Pry.start
data/bin/crawl
ADDED
@@ -0,0 +1,51 @@
#!/usr/bin/env ruby

require 'bundler/setup'
require 'browser_crawler'
require 'pry' if ENV['DEBUG'] == 'true'

options = BrowserCrawler::Options.parse_args
ENV.update(options.transform_keys(&:to_s).transform_values(&:to_s))

# Make sure the screenshots directory exists before crawling.
if options[:screenshots_path] && !File.directory?(options[:screenshots_path])
  `mkdir -p #{options[:screenshots_path]}`
end

if options[:report_format] && !%w[yaml csv].include?(options[:report_format])
  puts("Report format #{options[:report_format]} is not recognized." \
       " Please choose the correct format from the list of available formats: 'csv', 'yaml'")
  return
end

# Crawl the given URL and save the report.
if options[:url]
  engine = BrowserCrawler::Engine.new(
    browser_options: {
      window_size: [options[:window_width].to_i,
                    options[:window_height].to_i]
    },
    max_pages: options[:max_pages],
    screenshots_options: { save_screenshots_to: options[:screenshots_path] }
  )

  engine.extract_links(url: options[:url]) if options[:url]

  engine.report_save(folder_path: options[:report_folder],
                     type: options[:report_format])

  puts "Report is saved to #{options[:report_folder]} as #{options[:report_format]} file."
  puts "Total pages visited: #{engine.report_store.visited_pages.count}."
end

# Build an HTML index for the captured screenshots.
if options[:screenshots_path]
  template = File.read(options[:index_template]) if options[:index_template]
  indexer = BrowserCrawler::Followups::ScreenshotsIndexer
            .new(template: template)
  file = indexer.index_directory(options[:screenshots_path])
  puts "Screenshots index is saved to '#{file}'."
end

# Update the paths: section of a Wraith config from the saved report.
if options[:wraith_config]
  followup = BrowserCrawler::Followups::WraithIntegrator
             .new(report: File.read("#{options[:report_folder]}/crawler_report.yaml"))
  followup.update_config(options[:wraith_config], path_suffix: '?wraith')
end