grell 2.1.1 → 2.1.2
- checksums.yaml +4 -4
- data/.travis.yml +18 -10
- data/CHANGELOG.md +3 -0
- data/README.md +10 -10
- data/grell.gemspec +1 -1
- data/lib/grell/crawler.rb +7 -7
- data/lib/grell/crawler_manager.rb +1 -1
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +28 -28
- metadata +3 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c17856255ff1e871cc5e12cc2a9f0f4870156923ab924ea11db16b053a6742fb
+  data.tar.gz: d619076b40cbb4b057015a8bbcb8a07f555c282aa0ec971aa36b4e867fbfbd86
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 28860f331fc02f6976bcfd8717bf8c33ca89984ae5d2ce9eede6abb31b5f06b44e2135468c6d75374dd649378cc3d719474979c2f27e67a5a7e5301fc561113f
+  data.tar.gz: 77f68dbdb006803c517de4e0b72a11ac9eba265781703f1b03f98af52b147cab7ba02429371038d426fed1073e1a5f3dcdc0a6838cbf93c64c0c4307f605eea6
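The checksums above cover the archives inside the gem package, not the `.gem` file itself. A minimal sketch (not part of this diff) of checking a locally downloaded `grell-2.1.2.gem` against the SHA256 recorded for `data.tar.gz`; the local file name is an assumption:

```ruby
# A .gem file is a plain tar archive whose members include metadata.gz and
# data.tar.gz; hash the data.tar.gz member and compare it with the value
# recorded in checksums.yaml above.
require 'digest'
require 'rubygems/package'

EXPECTED_SHA256 = 'd619076b40cbb4b057015a8bbcb8a07f555c282aa0ec971aa36b4e867fbfbd86'

File.open('grell-2.1.2.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless entry.full_name == 'data.tar.gz'
    actual = Digest::SHA256.hexdigest(entry.read)
    puts(actual == EXPECTED_SHA256 ? 'data.tar.gz: checksum OK' : 'data.tar.gz: checksum MISMATCH')
  end
end
```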
data/.travis.yml
CHANGED
@@ -1,20 +1,28 @@
 language: ruby
 cache: bundler
-
+
 rvm:
-  - 2.2.4
-  - 2.3.0
-  - 2.4.2
-
+  - 2.2.4
+  - 2.3.0
+  - 2.4.2
+
 before_install:
-  - mkdir travis-phantomjs
-  - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
-    -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
-  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
-  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+  - mkdir travis-phantomjs
+  - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
+    -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
+  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
+  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+
+install:
+  - bundle install --jobs=3 --retry=3
+
+script:
+  - bundle exec rspec
+
 deploy:
   provider: rubygems
   api_key:
     secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc=
   on:
     tags: true
+    rvm: 2.4.2
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -92,15 +92,15 @@ crawler.manager.quit # quits and destroys the crawler
 The `Grell:Crawler` class can be passed options to customize its behavior:
 - `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)`
 - `on_periodic_restart`: Sets periodic restarts of the crawler each certain number of visits. Default: 100 pages.
-- `whitelist`: Sets a whitelist filter for URLs to be visited. Default: all URLs are whitelisted.
-- `blacklist`: Sets a blacklist filter for URLs to be avoided. Default: no URL is blacklisted.
+- `allowlist`: Sets a allowlist filter for URLs to be visited. Default: all URLs are allowlisted.
+- `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted.
 - `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs.
 - `evaluate_in_each_page`: Javascript block to be evaluated on each page visited. Default: Nothing evaluated.
 
 Grell by default will follow all the links it finds in the site being crawled.
 It will never follow links linking outside your site.
 If you want to further limit the amount of links crawled, you can use
-whitelisting, blacklisting or manual filtering.
+allowlisting, denylisting or manual filtering.
 Below further details on these and other options.
 
 
@@ -123,32 +123,32 @@ The crawler can be restarted manually by calling `crawler.manager.restart` or au
 between restarts. A restart will destroy the cookies so for instance this custom block can be used to relogin.
 
 
-#### Whitelisting
+#### Allowlisting
 
 ```ruby
 require 'grell'
 
-crawler = Grell::Crawler.new(whitelist: [/games\/.*/, '/fun'])
+crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun'])
 crawler.start_crawling('http://www.google.com')
 ```
 
 Grell here will only follow links to games and '/fun' and ignore all
 other links. You can provide a regexp, strings (if any part of the
-string match is whitelisted) or an array with regexps and/or strings.
+string match is allowlisted) or an array with regexps and/or strings.
 
-#### Blacklisting
+#### Denylisting
 
 ```ruby
 require 'grell'
 
-crawler = Grell::Crawler.new(blacklist: /games\/.*/)
+crawler = Grell::Crawler.new(denylist: /games\/.*/)
 crawler.start_crawling('http://www.google.com')
 ```
 
-Similar to whitelisting. But now Grell will follow every other link in
+Similar to allowlisting. But now Grell will follow every other link in
 this site which does not go to /games/...
 
-If you call both whitelist and blacklist then both will apply, a link
+If you call both allowlist and denylist then both will apply, a link
 has to fullfill both conditions to survive. If you do not call any, then
 all links on this site will be crawled. Think of these methods as
 filters.
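The README text above says the two filters compose: a link must match the allowlist and must not match the denylist to be followed. A short sketch combining both, reusing the README's own example site; the poker path and the block form of `start_crawling` with `page.url`/`page.status` are illustrative assumptions, not part of this diff:

```ruby
require 'grell'

# The allowlist admits /games/... and '/fun'; the denylist then carves the
# (hypothetical) poker section back out. A link is followed only if it
# passes both filters.
crawler = Grell::Crawler.new(
  allowlist: [/games\/.*/, '/fun'],
  denylist:  /games\/poker/
)
crawler.start_crawling('http://www.google.com') do |page|
  puts "(#{page.status}) #{page.url}"
end
```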
data/grell.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'capybara', '~> 2.10'
   spec.add_dependency 'poltergeist', '~> 1.11'
 
-  spec.add_development_dependency 'bundler', '~> 1.6'
+  # spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'byebug', '~> 4.0'
   spec.add_development_dependency 'kender', '~> 0.2'
   spec.add_development_dependency 'rake', '~> 10.0'
data/lib/grell/crawler.rb
CHANGED
@@ -7,15 +7,15 @@ module Grell
   # evaluate_in_each_page: javascript block to evaluate in each page we crawl
   # add_match_block: block to evaluate to consider if a page is part of the collection
   # manager_options: options passed to the manager class
-  # whitelist: Sets a whitelist filter, allows a regexp, string or array of either to be matched.
-  # blacklist: Sets a blacklist filter, allows a regexp, string or array of either to be matched.
-  def initialize(evaluate_in_each_page: nil, add_match_block: nil, whitelist: /.*/, blacklist: /a^/, **manager_options)
+  # allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched.
+  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
+  def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options)
     @collection = nil
     @manager = CrawlerManager.new(manager_options)
     @evaluate_in_each_page = evaluate_in_each_page
     @add_match_block = add_match_block
-    @whitelist_regexp = Regexp.union(whitelist)
-    @blacklist_regexp = Regexp.union(blacklist)
+    @allowlist_regexp = Regexp.union(allowlist)
+    @denylist_regexp = Regexp.union(denylist)
   end
 
   # Main method, it starts crawling on the given URL and calls a block for each of the pages found.
@@ -67,8 +67,8 @@ module Grell
     end
 
     def filter!(links)
-      links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
-      links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
+      links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp
+      links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp
     end
 
     # Store the resulting redirected URL along with the original URL
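The defaults in the constructor above lean on two small pieces of Ruby behavior: `/a^/` can never match (after consuming the `a`, the start-of-line anchor `^` cannot succeed), and `Regexp.union` folds a regexp, a string or an array of either into a single pattern, which is what lets both options accept all three forms. A plain-Ruby sketch, independent of grell:

```ruby
# Allowlist default: /.*/ matches every link, so nothing is filtered out.
Regexp.union(/.*/).match?('/any/link')          #=> true

# Denylist default: /a^/ never matches, so nothing is filtered away.
Regexp.union(/a^/).match?('nothing matches a^') #=> false

# Strings are escaped literally, regexps are OR-ed in as-is:
filter = Regexp.union(['/trusmis.html', /help/])
filter.match?('/trusmis.html') #=> true  (string branch)
filter.match?('/help.html')    #=> true  (regexp branch)

# An empty array yields /(?!)/, which also never matches; this is why the
# specs below expect an empty allowlist array to visit only the start page.
Regexp.union([]).match?('/trusmis.html') #=> false
```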
data/lib/grell/crawler_manager.rb
CHANGED
@@ -70,7 +70,7 @@ module Grell
     rescue Errno::ESRCH, Errno::ECHILD
       # successfully terminated
     rescue => e
-      Grell.logger.
+      Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/)
     end
 
     def force_kill(pid)
data/lib/grell/version.rb
CHANGED
data/spec/lib/crawler_spec.rb
CHANGED
@@ -6,16 +6,16 @@ RSpec.describe Grell::Crawler do
   let(:host) { 'http://www.example.com' }
   let(:url) { 'http://www.example.com/test' }
   let(:add_match_block) { nil }
-  let(:blacklist) { /a^/ }
-  let(:whitelist) { /.*/ }
+  let(:denylist) { /a^/ }
+  let(:allowlist) { /.*/ }
   let(:crawler) do
     Grell::Crawler.new(
       logger: Logger.new(nil),
       driver: double(nil),
       evaluate_in_each_page: script,
       add_match_block: add_match_block,
-      blacklist: blacklist,
-      whitelist: whitelist)
+      denylist: denylist,
+      allowlist: allowlist)
   end
   let(:script) { nil }
   let(:body) { 'body' }
@@ -128,7 +128,7 @@ RSpec.describe Grell::Crawler do
       expect(crawler.collection.discovered_pages.size).to eq(0)
     end
 
-    it 'contains the whitelisted page and the base page only' do
+    it 'contains the allowlisted page and the base page only' do
      crawler.start_crawling(url)
      expect(crawler.collection.visited_pages.map(&:url)).
        to eq(visited_pages)
@@ -168,7 +168,7 @@ RSpec.describe Grell::Crawler do
     it_behaves_like 'visits all available pages'
   end
 
-  describe '#whitelist' do
+  describe '#allowlist' do
     let(:body) do
       "<html><head></head><body>
       <a href=\"/trusmis.html\">trusmis</a>
@@ -183,7 +183,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a single string' do
-      let(:whitelist) { '/trusmis.html' }
+      let(:allowlist) { '/trusmis.html' }
       let(:visited_pages_count) { 2 } # my own page + trusmis
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -193,7 +193,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of strings' do
-      let(:whitelist) { ['/trusmis.html', '/nothere', 'another.html'] }
+      let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -203,7 +203,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a regexp' do
-      let(:whitelist) { /\/trusmis\.html/ }
+      let(:allowlist) { /\/trusmis\.html/ }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -213,7 +213,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of regexps' do
-      let(:whitelist) { [/\/trusmis\.html/] }
+      let(:allowlist) { [/\/trusmis\.html/] }
      let(:visited_pages_count) { 2 }
      let(:visited_pages) do
        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -223,7 +223,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an empty array' do
-      let(:whitelist) { [] }
+      let(:allowlist) { [] }
       let(:visited_pages_count) { 1 } # my own page only
       let(:visited_pages) do
         ['http://www.example.com/test']
@@ -232,8 +232,8 @@ RSpec.describe Grell::Crawler do
       it_behaves_like 'visits all available pages'
     end
 
-    context 'adding all links to the whitelist' do
-      let(:whitelist) { ['/trusmis', '/help'] }
+    context 'adding all links to the allowlist' do
+      let(:allowlist) { ['/trusmis', '/help'] }
       let(:visited_pages_count) { 3 } # all links
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
@@ -244,7 +244,7 @@ RSpec.describe Grell::Crawler do
   end
 
 
-  describe '#blacklist' do
+  describe '#denylist' do
     let(:body) do
       "<html><head></head><body>
       <a href=\"/trusmis.html\">trusmis</a>
@@ -259,7 +259,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a single string' do
-      let(:blacklist) { '/trusmis.html' }
+      let(:denylist) { '/trusmis.html' }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']
@@ -269,7 +269,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of strings' do
-      let(:blacklist) { ['/trusmis.html', '/nothere', 'another.html'] }
+      let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']
@@ -279,7 +279,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a regexp' do
-      let(:blacklist) { /\/trusmis\.html/ }
+      let(:denylist) { /\/trusmis\.html/ }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']
@@ -289,7 +289,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of regexps' do
-      let(:blacklist) { [/\/trusmis\.html/] }
+      let(:denylist) { [/\/trusmis\.html/] }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']
@@ -299,7 +299,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an empty array' do
-      let(:blacklist) { [] }
+      let(:denylist) { [] }
       let(:visited_pages_count) { 3 } # all links
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
@@ -308,8 +308,8 @@ RSpec.describe Grell::Crawler do
       it_behaves_like 'visits all available pages'
     end
 
-    context 'adding all links to the blacklist' do
-      let(:blacklist) { ['/trusmis', '/help'] }
+    context 'adding all links to the denylist' do
+      let(:denylist) { ['/trusmis', '/help'] }
       let(:visited_pages_count) { 1 }
       let(:visited_pages) do
         ['http://www.example.com/test']
@@ -320,7 +320,7 @@ RSpec.describe Grell::Crawler do
   end
 
 
-  describe 'whitelisting and blacklisting' do
+  describe 'allowlisting and denylisting' do
     let(:body) do
       "<html><head></head><body>
       <a href=\"/trusmis.html\">trusmis</a>
@@ -334,9 +334,9 @@ RSpec.describe Grell::Crawler do
       proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
     end
 
-    context 'we blacklist the only whitelisted page' do
-      let(:whitelist) { '/trusmis.html' }
-      let(:blacklist) { '/trusmis.html' }
+    context 'we denylist the only allowlisted page' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:denylist) { '/trusmis.html' }
       let(:visited_pages_count) { 1 }
       let(:visited_pages) do
         ['http://www.example.com/test']
@@ -345,9 +345,9 @@ RSpec.describe Grell::Crawler do
       it_behaves_like 'visits all available pages'
     end
 
-    context 'we blacklist none of the whitelisted pages' do
-      let(:whitelist) { '/trusmis.html' }
-      let(:blacklist) { '/raistlin.html' }
+    context 'we denylist none of the allowlisted pages' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:denylist) { '/raistlin.html' }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: grell
 version: !ruby/object:Gem::Version
-  version: 2.1.1
+  version: 2.1.2
 platform: ruby
 authors:
 - Jordi Polo Carres
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-02-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara
@@ -38,20 +38,6 @@ dependencies:
     - - "~>"
     - !ruby/object:Gem::Version
       version: '1.11'
-- !ruby/object:Gem::Dependency
-  name: bundler
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-    - !ruby/object:Gem::Version
-      version: '1.6'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-    - !ruby/object:Gem::Version
-      version: '1.6'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
@@ -215,8 +201,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.1
+rubygems_version: 3.0.8
 signing_key:
 specification_version: 4
 summary: Ruby web crawler