grell 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +18 -10
- data/CHANGELOG.md +3 -0
- data/README.md +10 -10
- data/grell.gemspec +1 -1
- data/lib/grell/crawler.rb +7 -7
- data/lib/grell/crawler_manager.rb +1 -1
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +28 -28
- metadata +3 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c17856255ff1e871cc5e12cc2a9f0f4870156923ab924ea11db16b053a6742fb
+  data.tar.gz: d619076b40cbb4b057015a8bbcb8a07f555c282aa0ec971aa36b4e867fbfbd86
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 28860f331fc02f6976bcfd8717bf8c33ca89984ae5d2ce9eede6abb31b5f06b44e2135468c6d75374dd649378cc3d719474979c2f27e67a5a7e5301fc561113f
+  data.tar.gz: 77f68dbdb006803c517de4e0b72a11ac9eba265781703f1b03f98af52b147cab7ba02429371038d426fed1073e1a5f3dcdc0a6838cbf93c64c0c4307f605eea6
data/.travis.yml
CHANGED
@@ -1,20 +1,28 @@
 language: ruby
 cache: bundler
-
+
 rvm:
-  - 2.2.4
-  - 2.3.0
-  - 2.4.2
-
+  - 2.2.4
+  - 2.3.0
+  - 2.4.2
+
 before_install:
-  - mkdir travis-phantomjs
-  - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
-
-  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
-  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+  - mkdir travis-phantomjs
+  - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
+    -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
+  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
+  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+
+install:
+  - bundle install --jobs=3 --retry=3
+
+script:
+  - bundle exec rspec
+
 deploy:
   provider: rubygems
   api_key:
     secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc=
     on:
       tags: true
+      rvm: 2.4.2
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -92,15 +92,15 @@ crawler.manager.quit # quits and destroys the crawler
 The `Grell:Crawler` class can be passed options to customize its behavior:
 - `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)`
 - `on_periodic_restart`: Sets periodic restarts of the crawler each certain number of visits. Default: 100 pages.
-- `
-- `
+- `allowlist`: Sets a allowlist filter for URLs to be visited. Default: all URLs are allowlisted.
+- `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted.
 - `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs.
 - `evaluate_in_each_page`: Javascript block to be evaluated on each page visited. Default: Nothing evaluated.
 
 Grell by default will follow all the links it finds in the site being crawled.
 It will never follow links linking outside your site.
 If you want to further limit the amount of links crawled, you can use
-
+allowlisting, denylisting or manual filtering.
 Below further details on these and other options.
 
 
@@ -123,32 +123,32 @@ The crawler can be restarted manually by calling `crawler.manager.restart` or au
 between restarts. A restart will destroy the cookies so for instance this custom block can be used to relogin.
 
 
-####
+#### Allowlisting
 
 ```ruby
 require 'grell'
 
-crawler = Grell::Crawler.new(
+crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun'])
 crawler.start_crawling('http://www.google.com')
 ```
 
 Grell here will only follow links to games and '/fun' and ignore all
 other links. You can provide a regexp, strings (if any part of the
-string match is
+string match is allowlisted) or an array with regexps and/or strings.
 
-####
+#### Denylisting
 
 ```ruby
 require 'grell'
 
-crawler = Grell::Crawler.new(
+crawler = Grell::Crawler.new(denylist: /games\/.*/)
 crawler.start_crawling('http://www.google.com')
 ```
 
-Similar to
+Similar to allowlisting. But now Grell will follow every other link in
 this site which does not go to /games/...
 
-If you call both
+If you call both allowlist and denylist then both will apply, a link
 has to fullfill both conditions to survive. If you do not call any, then
 all links on this site will be crawled. Think of these methods as
 filters.
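
Read together with the option list above, the renamed keywords slot into the existing constructor. A minimal sketch combining both filters (the site URL and patterns are illustrative, not taken from the gem's docs; `page.url`/`page.status` follow the page API shown elsewhere in the README):

```ruby
require 'grell'

# Follow only links under /games or /fun, but never /games/archive (illustrative patterns).
crawler = Grell::Crawler.new(allowlist: [%r{/games/.*}, '/fun'],
                             denylist:  %r{/games/archive})
crawler.start_crawling('http://www.example.com') do |page|
  puts "(#{page.status}) #{page.url}" # each visited page is yielded to the block
end
crawler.manager.quit # quits and destroys the crawler
```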
data/grell.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'capybara', '~> 2.10'
   spec.add_dependency 'poltergeist', '~> 1.11'
 
-  spec.add_development_dependency 'bundler', '~> 1.6'
+  # spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'byebug', '~> 4.0'
   spec.add_development_dependency 'kender', '~> 0.2'
   spec.add_development_dependency 'rake', '~> 10.0'
data/lib/grell/crawler.rb
CHANGED
@@ -7,15 +7,15 @@ module Grell
 # evaluate_in_each_page: javascript block to evaluate in each page we crawl
 # add_match_block: block to evaluate to consider if a page is part of the collection
 # manager_options: options passed to the manager class
-#
-#
-def initialize(evaluate_in_each_page: nil, add_match_block: nil,
+# allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched.
+# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
+def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options)
   @collection = nil
   @manager = CrawlerManager.new(manager_options)
   @evaluate_in_each_page = evaluate_in_each_page
   @add_match_block = add_match_block
-  @
-  @
+  @allowlist_regexp = Regexp.union(allowlist)
+  @denylist_regexp = Regexp.union(denylist)
 end
 
 # Main method, it starts crawling on the given URL and calls a block for each of the pages found.

@@ -67,8 +67,8 @@ module Grell
 end
 
 def filter!(links)
-  links.select! { |link| link =~ @
-  links.delete_if { |link| link =~ @
+  links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp
+  links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp
 end
 
 # Store the resulting redirected URL along with the original URL
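
The new defaults are worth a note: `/.*/` matches every URL (everything allowlisted) and `/a^/` can never match (nothing denylisted), while `Regexp.union` collapses a string, regexp, or array of either into a single pattern. A standalone sketch of the same filtering idea, not Grell's code verbatim:

```ruby
# Mirrors the filter! logic above with plain Ruby data (illustrative links).
allowlist_regexp = Regexp.union([%r{/games/.*}, '/fun']) # matches if any entry matches
denylist_regexp  = Regexp.union(/a^/)                    # default: never matches anything

links = ['/games/chess', '/fun', '/about']
links.select!   { |link| link =~ allowlist_regexp } if allowlist_regexp
links.delete_if { |link| link =~ denylist_regexp }  if denylist_regexp
links # => ["/games/chess", "/fun"]
```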
data/lib/grell/crawler_manager.rb
CHANGED
@@ -70,7 +70,7 @@ module Grell
 rescue Errno::ESRCH, Errno::ECHILD
   # successfully terminated
 rescue => e
-  Grell.logger.
+  Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/)
 end
 
 def force_kill(pid)
data/lib/grell/version.rb
CHANGED
data/spec/lib/crawler_spec.rb
CHANGED
@@ -6,16 +6,16 @@ RSpec.describe Grell::Crawler do
 let(:host) { 'http://www.example.com' }
 let(:url) { 'http://www.example.com/test' }
 let(:add_match_block) { nil }
-let(:
-let(:
+let(:denylist) { /a^/ }
+let(:allowlist) { /.*/ }
 let(:crawler) do
   Grell::Crawler.new(
     logger: Logger.new(nil),
     driver: double(nil),
     evaluate_in_each_page: script,
     add_match_block: add_match_block,
-
-
+    denylist: denylist,
+    allowlist: allowlist)
 end
 let(:script) { nil }
 let(:body) { 'body' }

@@ -128,7 +128,7 @@ RSpec.describe Grell::Crawler do
 expect(crawler.collection.discovered_pages.size).to eq(0)
 end
 
-it 'contains the
+it 'contains the allowlisted page and the base page only' do
   crawler.start_crawling(url)
   expect(crawler.collection.visited_pages.map(&:url)).
     to eq(visited_pages)

@@ -168,7 +168,7 @@ RSpec.describe Grell::Crawler do
 it_behaves_like 'visits all available pages'
 end
 
-describe '#
+describe '#allowlist' do
   let(:body) do
     "<html><head></head><body>
     <a href=\"/trusmis.html\">trusmis</a>

@@ -183,7 +183,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using a single string' do
-  let(:
+  let(:allowlist) { '/trusmis.html' }
   let(:visited_pages_count) { 2 } # my own page + trusmis
   let(:visited_pages) do
     ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -193,7 +193,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using an array of strings' do
-  let(:
+  let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] }
   let(:visited_pages_count) { 2 }
   let(:visited_pages) do
     ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -203,7 +203,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using a regexp' do
-  let(:
+  let(:allowlist) { /\/trusmis\.html/ }
   let(:visited_pages_count) { 2 }
   let(:visited_pages) do
     ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -213,7 +213,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using an array of regexps' do
-  let(:
+  let(:allowlist) { [/\/trusmis\.html/] }
   let(:visited_pages_count) { 2 }
   let(:visited_pages) do
     ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -223,7 +223,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using an empty array' do
-  let(:
+  let(:allowlist) { [] }
   let(:visited_pages_count) { 1 } # my own page only
   let(:visited_pages) do
     ['http://www.example.com/test']

@@ -232,8 +232,8 @@ RSpec.describe Grell::Crawler do
 it_behaves_like 'visits all available pages'
 end
 
-context 'adding all links to the
-  let(:
+context 'adding all links to the allowlist' do
+  let(:allowlist) { ['/trusmis', '/help'] }
   let(:visited_pages_count) { 3 } # all links
   let(:visited_pages) do
     ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']

@@ -244,7 +244,7 @@ RSpec.describe Grell::Crawler do
 end
 
 
-describe '#
+describe '#denylist' do
   let(:body) do
     "<html><head></head><body>
     <a href=\"/trusmis.html\">trusmis</a>

@@ -259,7 +259,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using a single string' do
-  let(:
+  let(:denylist) { '/trusmis.html' }
   let(:visited_pages_count) {2}
   let(:visited_pages) do
     ['http://www.example.com/test','http://www.example.com/help.html']

@@ -269,7 +269,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using an array of strings' do
-  let(:
+  let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] }
   let(:visited_pages_count) {2}
   let(:visited_pages) do
     ['http://www.example.com/test','http://www.example.com/help.html']

@@ -279,7 +279,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using a regexp' do
-  let(:
+  let(:denylist) { /\/trusmis\.html/ }
   let(:visited_pages_count) {2}
   let(:visited_pages) do
     ['http://www.example.com/test','http://www.example.com/help.html']

@@ -289,7 +289,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using an array of regexps' do
-  let(:
+  let(:denylist) { [/\/trusmis\.html/] }
   let(:visited_pages_count) {2}
   let(:visited_pages) do
     ['http://www.example.com/test','http://www.example.com/help.html']

@@ -299,7 +299,7 @@ RSpec.describe Grell::Crawler do
 end
 
 context 'using an empty array' do
-  let(:
+  let(:denylist) { [] }
   let(:visited_pages_count) { 3 } # all links
   let(:visited_pages) do
     ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']

@@ -308,8 +308,8 @@ RSpec.describe Grell::Crawler do
 it_behaves_like 'visits all available pages'
 end
 
-context 'adding all links to the
-  let(:
+context 'adding all links to the denylist' do
+  let(:denylist) { ['/trusmis', '/help'] }
   let(:visited_pages_count) { 1 }
   let(:visited_pages) do
     ['http://www.example.com/test']

@@ -320,7 +320,7 @@ RSpec.describe Grell::Crawler do
 end
 
 
-describe '
+describe 'allowlisting and denylisting' do
   let(:body) do
     "<html><head></head><body>
     <a href=\"/trusmis.html\">trusmis</a>

@@ -334,9 +334,9 @@ RSpec.describe Grell::Crawler do
   proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
 end
 
-context 'we
-  let(:
-  let(:
+context 'we denylist the only allowlisted page' do
+  let(:allowlist) { '/trusmis.html' }
+  let(:denylist) { '/trusmis.html' }
   let(:visited_pages_count) { 1 }
   let(:visited_pages) do
     ['http://www.example.com/test']

@@ -345,9 +345,9 @@ RSpec.describe Grell::Crawler do
 it_behaves_like 'visits all available pages'
 end
 
-context 'we
-  let(:
-  let(:
+context 'we denylist none of the allowlisted pages' do
+  let(:allowlist) { '/trusmis.html' }
+  let(:denylist) { '/raistlin.html' }
   let(:visited_pages_count) { 2 }
   let(:visited_pages) do
     ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
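
The spec changes lean on RSpec's `let` overriding: the outer `allowlist`/`denylist` defaults are redefined per `context`. A small standalone sketch of that pattern (not the gem's spec verbatim):

```ruby
RSpec.describe 'allowlist/denylist defaults' do
  let(:allowlist) { /.*/ }  # outer default, used unless a context overrides it
  let(:denylist)  { /a^/ }

  it 'allows everything and denies nothing by default' do
    expect('/trusmis.html').to match(allowlist)
    expect('/trusmis.html').not_to match(denylist)
  end

  context 'using a single string' do
    let(:allowlist) { '/trusmis.html' } # override applies only inside this context

    it 'narrows what is allowed' do
      expect(allowlist).to eq('/trusmis.html')
    end
  end
end
```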
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: grell
 version: !ruby/object:Gem::Version
-  version: 2.1.1
+  version: 2.1.2
 platform: ruby
 authors:
 - Jordi Polo Carres
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-02-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara

@@ -38,20 +38,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.11'
-- !ruby/object:Gem::Dependency
-  name: bundler
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.6'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.6'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement

@@ -215,8 +201,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-
-rubygems_version: 2.7.1
+rubygems_version: 3.0.8
 signing_key:
 specification_version: 4
 summary: Ruby web crawler