powerdlz23 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
package/grell/.travis.yml
ADDED
@@ -0,0 +1,28 @@
+language: ruby
+cache: bundler
+
+rvm:
+  - 2.2.4
+  - 2.3.0
+  - 2.4.2
+
+before_install:
+  - mkdir travis-phantomjs
+  - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
+    -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
+  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
+  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+
+install:
+  - bundle install --jobs=3 --retry=3
+
+script:
+  - bundle exec rspec
+
+deploy:
+  provider: rubygems
+  api_key:
+    secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc=
+  on:
+    tags: true
+    rvm: 2.4.2
package/grell/CHANGELOG.md
ADDED
@@ -0,0 +1,111 @@
+# 2.1.2
+* Change white/black lists to allow/deny lists
+
+# 2.1.1
+* Update phantomjs_options to use 'TLSv1.2'
+
+# 2.1.0
+* Delete `driver_options` configuration key as it was never used.
+* `cleanup_all_processes` is a self method as intended to.
+
+# 2.0.0
+* New configuration key `on_periodic_restart`.
+* CrawlerManager.cleanup_all_processes method destroy all instances of phantomjs in this machine.
+
+* Breaking changes
+  - Requires Ruby 2.1 or later.
+  - Crawler.start_crawling does not accept options anymore, all options are passed to Crawler.new.
+  - Crawler's methods `restart` and `quit` have been moved to CrawlerManager.
+  - Crawler gets whitelist and blacklist as configuration options instead of being set in specific methods.
+
+# 1.6.11
+* Ensure all links are loaded by waiting for Ajax requests to complete
+* Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');)
+
+# 1.6.10
+* Avoid following JS href links, add missing dependencies to fix Travis build
+
+# 1.6.9
+* Avoid following links when disabled by CSS (1.6.8 worked only for Javascript)
+
+# 1.6.8
+* Avoid following disabled links
+
+# 1.6.7
+* Increment '@times_visited' first to avoid infinite retries when rescuing errors
+
+# 1.6.6
+* Updated phantomjs_logger not to open '/dev/null'
+
+# 1.6.5
+* Added #quit to Crawler
+
+# 1.6.4
+* Added #quit to Capybara driver
+
+# 1.6.3
+* Only follow visible links
+
+# 1.6.2
+* Reset Capybara driver to Puffing Billy (used to rewrite URL requests in specs)
+* Use float timestamp for Poltergeist driver name to support fast test executions
+
+# 1.6.1
+* Use non-static name to support registering Poltergeist crawler multiple times
+* More exception handling, store redirected URLs in addition to original URL
+
+# 1.6
+* Support custom URL comparison when adding new pages during crawling
+* Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
+* Fail early if Capybara doesn't initialize properly
+
+# 1.5.1
+* Fixed deprecation warning (Thanks scott)
+* Updated Poltergeist dependency
+
+# 1.5.0
+* Grell will follow redirects.
+* Added #followed_redirects? #error? #current_url methods to the Page class
+
+# 1.4.0
+* Added crawler.restart to restart browser process
+* The block of code can make grell retry any given page.
+
+# 1.3.2
+* Rescue Timeout error and return an empty page when that happens
+
+# 1.3.1
+* Added whitelisting and blacklisting
+* Better info in gemspec
+
+# 1.3
+* The Crawler object allows you to provide an external logger object.
+* Clearer semantics when an error happens, special headers are returned so the user can inspect the error
+* Caveats:
+  - The 'debug' option in the crawler does not have any affect anymore. Provide an external logger with 'logger' instead
+  - The errors provided in the headers by grell has changed from 'grell_status' to 'grellStatus'.
+  - The 'visited' property in the page was never supposed to be accesible. Use 'visited?' instead.
+
+# 1.2.1
+* Solve bug: URLs are case insensitive
+
+# 1.2
+* Grell now will consider two links to point to the same page only when the whole URL is exactly the same.
+  Versions previously would only consider two links to be the same when they shared the path.
+
+# 1.1.2
+* Solve bug where we were adding links in heads as if there were normal links in the body
+
+# 1.1.1
+* Solve bug with the new data-href functionality
+
+# 1.1
+* Solve problem with randomly failing spec
+* Search for elements with 'href' or 'data-href' to find links
+
+# 1.0.1
+* Rescueing Javascript errors
+
+# 1.0
+* Initial implementation
+* Basic support to crawling pages.
package/grell/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2015 Medidata Solutions Worldwide
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/grell/README.md
ADDED
@@ -0,0 +1,213 @@
+# Grell
+
+[](https://travis-ci.org/mdsol/grell)
+
+Grell is a generic crawler for the web written in Ruby.
+It can be used to gather data, test pages in a given domain, etc.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'grell'
+```
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install grell
+
+Grell uses PhantomJS as a browser, you will need to download and install it in your
+system. Check for instructions in http://phantomjs.org/
+Grell has been tested with PhantomJS v2.1.x
+
+## Usage
+
+### Crawling an entire site
+
+The main entry point of the library is Grell::Crawler#start_crawling.
+Grell will yield to your code with each page it finds:
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new
+crawler.start_crawling('http://www.google.com') do |page|
+  #Grell will keep iterating this block which each unique page it finds
+  puts "yes we crawled #{page.url}"
+  puts "status: #{page.status}"
+  puts "headers: #{page.headers}"
+  puts "body: #{page.body}"
+  puts "We crawled it at #{page.timestamp}"
+  puts "We found #{page.links.size} links"
+  puts "page id and parent_id #{page.id}, #{page.parent_id}"
+end
+
+```
+
+Grell keeps a list of pages previously crawled and do not visit the same page twice.
+This list is indexed by the complete url, including query parameters.
+
+### Re-retrieving a page
+If you want Grell to revisit a page and return the data to you again,
+return the symbol :retry in your block for the start_crawling method.
+For instance
+```ruby
+require 'grell'
+crawler = Grell::Crawler.new
+crawler.start_crawling('http://www.google.com') do |current_page|
+  if current_page.status == 500 && current_page.retries == 0
+    crawler.manager.restart
+    :retry
+  end
+end
+```
+
+### Pages' id
+
+Each page has an unique id, accessed by the property `id`. Also each page stores the id of the page from which we found this page, accessed by the property `parent_id`.
+The page object generated by accessing the first URL passed to the start_crawling(the root) has a `parent_id` equal to `nil` and an `id` equal to 0.
+Using this information it is possible to construct a directed graph.
+
+
+### Restart and quit
+
+Grell can be restarted. The current list of visited and yet-to-visit pages list are not modified when restarting
+but the browser is destroyed and recreated, all cookies and local storage are lost. After restarting, crawling is resumed with a
+new browser.
+To destroy the crawler, call the `quit` method. This will free the memory taken in Ruby and destroys the PhantomJS process.
+```ruby
+require 'grell'
+crawler = Grell::Crawler.new
+crawler.manager.restart # restarts the browser
+crawler.manager.quit # quits and destroys the crawler
+```
+
+### Options
+
+The `Grell:Crawler` class can be passed options to customize its behavior:
+- `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)`
+- `on_periodic_restart`: Sets periodic restarts of the crawler each certain number of visits. Default: 100 pages.
+- `allowlist`: Sets a allowlist filter for URLs to be visited. Default: all URLs are allowlisted.
+- `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted.
+- `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs.
+- `evaluate_in_each_page`: Javascript block to be evaluated on each page visited. Default: Nothing evaluated.
+
+Grell by default will follow all the links it finds in the site being crawled.
+It will never follow links linking outside your site.
+If you want to further limit the amount of links crawled, you can use
+allowlisting, denylisting or manual filtering.
+Below further details on these and other options.
+
+
+#### Automatically restarting PhantomJS
+If you are doing a long crawling it is possible that phantomJS gets into an inconsistent state or it starts leaking memory.
+The crawler can be restarted manually by calling `crawler.manager.restart` or automatically by using the
+`on_periodic_restart` configuration key as follows:
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(on_periodic_restart: { do: my_restart_procedure, each: 200 })
+
+crawler.start_crawling('http://www.google.com') do |current_page|
+  ...
+endd
+```
+
+This code will setup the crawler to be restarted every 200 pages being crawled and to call `my_restart_procedure`
+between restarts. A restart will destroy the cookies so for instance this custom block can be used to relogin.
+
+
+#### Allowlisting
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun'])
+crawler.start_crawling('http://www.google.com')
+```
+
+Grell here will only follow links to games and '/fun' and ignore all
+other links. You can provide a regexp, strings (if any part of the
+string match is allowlisted) or an array with regexps and/or strings.
+
+#### Denylisting
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(denylist: /games\/.*/)
+crawler.start_crawling('http://www.google.com')
+```
+
+Similar to allowlisting. But now Grell will follow every other link in
+this site which does not go to /games/...
+
+If you call both allowlist and denylist then both will apply, a link
+has to fullfill both conditions to survive. If you do not call any, then
+all links on this site will be crawled. Think of these methods as
+filters.
+
+#### Manual link filtering
+
+If you have a more complex use-case, you can modify the list of links
+manually.
+Grell yields the page to you before it adds the links to the list of
+links to visit. So you can modify in your block of code "page.links" to
+add and delete links to instruct Grell to add them to the list of links
+to visit next.
+
+#### Custom URL Comparison
+By default, Grell will detect new URLs to visit by comparing the full URL
+with the URLs of the discovered and visited links. This functionality can
+be changed by passing a block of code to Grells `start_crawling` method.
+In the below example, the path of the URLs (instead of the full URL) will
+be compared.
+
+```ruby
+require 'grell'
+
+add_match_block = Proc.new do |collection_page, page|
+  collection_page.path == page.path
+end
+
+crawler = Grell::Crawler.new(add_match_block: add_match_block)
+
+crawler.start_crawling('http://www.google.com') do |current_page|
+  ...
+end
+```
+
+#### Evaluate script
+
+You can evalute a JavaScript snippet in each page before extracting links by passing the snippet to the 'evaluate_in_each_page' option:
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');")
+
+```
+
+### Errors
+When there is an error in the page or an internal error in the crawler (Javascript crashed the browser, etc). Grell will return with status 404 and the headers will have the following keys:
+- grellStatus: 'Error'
+- errorClass: The class of the error which broke this page.
+- errorMessage: A descriptive message with the information Grell could gather about the error.
+
+## Tests
+
+Run the tests with
+```ruby
+bundle exec rake ci
+```
+
+## Contributors
+Grell is (c) Medidata Solutions Worldwide and owned by its major contributors:
+* [Teruhide Hoshikawa](https://github.com/thoshikawa-mdsol)
+* [Jordi Polo Carres](https://github.com/jcarres-mdsol)
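The README's "Manual link filtering" section above explains that the crawl block may add or delete entries in `page.links` before Grell queues them, but gives no example. Below is a minimal illustrative sketch, not taken from the package: the start URL and the `/logout` and `/orphan-page` paths are invented, and it assumes `page.links` returns the mutable Array that Grell later iterates when queuing new pages, as the README implies.

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.start_crawling('http://example.com') do |page|
  # Drop links we never want Grell to queue (hypothetical logout endpoint).
  page.links.delete_if { |link| link.include?('/logout') }

  # Queue an extra URL that exists on the site but is not linked from anywhere.
  page.links << 'http://example.com/orphan-page'
end
```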
package/grell/grell.gemspec
ADDED
@@ -0,0 +1,36 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'grell/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "grell"
+  spec.version       = Grell::VERSION
+  spec.platform      = Gem::Platform::RUBY
+  spec.authors       = ["Jordi Polo Carres"]
+  spec.email         = ["jcarres@mdsol.com"]
+  spec.summary       = %q{Ruby web crawler}
+  spec.description   = %q{Ruby web crawler using PhantomJS}
+  spec.homepage      = "https://github.com/mdsol/grell"
+  spec.license       = 'MIT'
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.required_ruby_version = '>= 2.1.8'
+
+  spec.add_dependency 'capybara', '~> 2.10'
+  spec.add_dependency 'poltergeist', '~> 1.11'
+
+  # spec.add_development_dependency 'bundler', '~> 1.6'
+  spec.add_development_dependency 'byebug', '~> 4.0'
+  spec.add_development_dependency 'kender', '~> 0.2'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'webmock', '~> 1.18'
+  spec.add_development_dependency 'rspec', '~> 3.5'
+  spec.add_development_dependency 'puffing-billy', '~> 0.9'
+  spec.add_development_dependency 'timecop', '~> 0.8'
+  spec.add_development_dependency 'selenium-webdriver', '~> 2.53.4'
+end
package/grell/lib/grell/capybara_driver.rb
ADDED
@@ -0,0 +1,44 @@
+module Grell
+  # This class setups the driver for capybara. Used internally by the CrawlerManager
+  # It uses Portelgeist to control PhantomJS
+  class CapybaraDriver
+    USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze
+
+    # Returns a poltergeist driver
+    def setup_capybara
+      @poltergeist_driver = nil
+
+      # Capybara will not re-run the block if the driver name already exists, so the driver name
+      # will have a time integer appended to ensure uniqueness.
+      driver_name = "poltergeist_crawler_#{Time.now.to_f}".to_sym
+      Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
+
+      Capybara.register_driver driver_name do |app|
+        @poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
+          js_errors: false,
+          inspector: false,
+          phantomjs_logger: FakePoltergeistLogger,
+          phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1.2'])
+      end
+
+      Capybara.default_max_wait_time = 3
+      Capybara.run_server = false
+      Capybara.default_driver = driver_name
+      Capybara.current_session.driver.headers = { # The driver gets initialized when modified here
+        "DNT" => 1,
+        "User-Agent" => USER_AGENT
+      }
+
+      raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver
+
+      @poltergeist_driver
+    end
+
+    # Poltergeist driver needs a class with this signature. The javascript console.log is sent here.
+    # We just discard that information.
+    module FakePoltergeistLogger
+      def self.puts(*)
+      end
+    end
+  end
+end
package/grell/lib/grell/crawler.rb
ADDED
@@ -0,0 +1,83 @@
+module Grell
+  # This is the class that starts and controls the crawling
+  class Crawler
+    attr_reader :collection, :manager
+
+    # Creates a crawler
+    # evaluate_in_each_page: javascript block to evaluate in each page we crawl
+    # add_match_block: block to evaluate to consider if a page is part of the collection
+    # manager_options: options passed to the manager class
+    # allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched.
+    # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
+    def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options)
+      @collection = nil
+      @manager = CrawlerManager.new(manager_options)
+      @evaluate_in_each_page = evaluate_in_each_page
+      @add_match_block = add_match_block
+      @allowlist_regexp = Regexp.union(allowlist)
+      @denylist_regexp = Regexp.union(denylist)
+    end
+
+    # Main method, it starts crawling on the given URL and calls a block for each of the pages found.
+    def start_crawling(url, &block)
+      Grell.logger.info "GRELL Started crawling"
+      @collection = PageCollection.new(@add_match_block)
+      @collection.create_page(url, nil)
+
+      while !@collection.discovered_pages.empty?
+        crawl(@collection.next_page, block)
+        @manager.check_periodic_restart(@collection)
+      end
+
+      Grell.logger.info "GRELL finished crawling"
+    end
+
+    def crawl(site, block)
+      Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
+      crawl_site(site)
+
+      if block # The user of this block can send us a :retry to retry accessing the page
+        while crawl_block(block, site) == :retry
+          Grell.logger.info "Retrying our visit to #{site.url}"
+          crawl_site(site)
+        end
+      end
+
+      site.links.each do |url|
+        @collection.create_page(url, site.id)
+      end
+    end
+
+    private
+
+    def crawl_site(site)
+      site.navigate
+      site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page
+      filter!(site.links)
+      add_redirect_url(site)
+    end
+
+    # Treat any exceptions from the block as an unavailable page
+    def crawl_block(block, site)
+      block.call(site)
+    rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+           Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+           Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
+      site.unavailable_page(404, e)
+    end
+
+    def filter!(links)
+      links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp
+      links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp
+    end
+
+    # Store the resulting redirected URL along with the original URL
+    def add_redirect_url(site)
+      if site.url != site.current_url
+        @collection.create_page(site.current_url, site.id)
+      end
+    end
+
+  end
+
+end
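As `Crawler#filter!` above shows, the allowlist and denylist options are each turned into a single pattern with `Regexp.union`, and both filters are applied to every discovered link: a link survives only if it matches the allowlist and does not match the denylist. A small sketch of that filtering in isolation, with invented URLs (plain Ruby for illustration, not package code):

```ruby
# Same composition Crawler#initialize applies to the allowlist/denylist options.
allowlist_regexp = Regexp.union([%r{/games/}, '/fun'])
denylist_regexp  = Regexp.union(%r{/games/secret})

links = [
  'http://example.com/games/chess',
  'http://example.com/games/secret/level2',
  'http://example.com/fun',
  'http://example.com/about'
]

links.select!   { |link| link =~ allowlist_regexp } # keep only allowlisted links
links.delete_if { |link| link =~ denylist_regexp }  # then drop denylisted ones
# links is now ["http://example.com/games/chess", "http://example.com/fun"]
```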
package/grell/lib/grell/crawler_manager.rb
ADDED
@@ -0,0 +1,84 @@
+module Grell
+  # Manages the state of the process crawling, does not care about individual pages but about logging,
+  # restarting and quiting the crawler correctly.
+  class CrawlerManager
+    # logger: logger to use for Grell's messages
+    # on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block
+    # driver_options: Any extra options for the Capybara driver
+    def initialize(logger: nil, on_periodic_restart: {}, driver: nil)
+      Grell.logger = logger ? logger : Logger.new(STDOUT)
+      @periodic_restart_block = on_periodic_restart[:do]
+      @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART
+      @driver = driver || CapybaraDriver.new.setup_capybara
+      if @periodic_restart_period <= 0
+        Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option."
+      end
+    end
+
+    # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
+    def restart
+      Grell.logger.info "GRELL. Driver restarting"
+      @driver.restart
+      Grell.logger.info "GRELL. Driver restarted"
+    end
+
+    # Quits the poltergeist driver.
+    def quit
+      Grell.logger.info "GRELL. Driver quitting"
+      @driver.quit
+    end
+
+    # PhantomJS seems to consume memory increasingly as it crawls, periodic restart allows to restart
+    # the driver, potentially calling a block.
+    def check_periodic_restart(collection)
+      return unless @periodic_restart_block
+      return unless @periodic_restart_period > 0
+      return unless (collection.visited_pages.size % @periodic_restart_period).zero?
+      restart
+      @periodic_restart_block.call
+    end
+
+    def self.cleanup_all_processes
+      PhantomJSManager.new.cleanup_all_processes
+    end
+
+    private
+
+    PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver.
+    KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process.
+
+    # Manages the PhantomJS process
+    class PhantomJSManager
+      def cleanup_all_processes
+        pids = running_phantomjs_pids
+        return if pids.empty?
+        Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
+        pids.each do |pid|
+          Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}"
+          kill_process(pid.to_i)
+        end
+      end
+
+      def running_phantomjs_pids
+        list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
+        `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
+      end
+
+      def kill_process(pid)
+        Process.kill('TERM', pid)
+        force_kill(pid)
+      rescue Errno::ESRCH, Errno::ECHILD
+        # successfully terminated
+      rescue => e
+        Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/)
+      end
+
+      def force_kill(pid)
+        Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
+      rescue Timeout::Error
+        Process.kill('KILL', pid)
+        Process.wait(pid)
+      end
+    end
+  end
+end
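Tying the pieces above together, a hedged usage sketch of the periodic-restart and process-cleanup hooks documented in the README and CHANGELOG; the URL, the printed output and the `relogin` proc are placeholders, not package code:

```ruby
require 'grell'

# Restart PhantomJS every 200 visited pages and run this block after each restart
# (a restart drops cookies, so the README suggests using it to log in again).
relogin = proc { puts 'browser restarted - session cookies are gone, re-authenticate here' }

crawler = Grell::Crawler.new(on_periodic_restart: { do: relogin, each: 200 })
crawler.start_crawling('http://example.com') { |page| puts "#{page.status} #{page.url}" }
crawler.manager.quit

# Kill any PhantomJS processes still running on this machine (see CHANGELOG 2.0.0).
Grell::CrawlerManager.cleanup_all_processes
```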