grell 1.5.1 → 1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -38
- data/README.md +20 -0
- data/lib/grell/capybara_driver.rb +3 -0
- data/lib/grell/crawler.rb +16 -7
- data/lib/grell/page.rb +7 -2
- data/lib/grell/page_collection.rb +6 -2
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +59 -28
- data/spec/lib/page_collection_spec.rb +28 -18
- data/spec/lib/page_spec.rb +92 -61
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2be8992c96b83e9b1a98474ada3b49ea7e5adb69
|
4
|
+
data.tar.gz: 3eed1bea205812e8e9ab7dc8678da57efea1fea1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: baa6e37b2ce80491b05688618b6ad0576149236c2367b2f6c52a84dfeae25edb6d340abfdcae4e3b6f7363072db0dc0c8c052cd83410e1f28e1725305db99993
|
7
|
+
data.tar.gz: 7c246e8b2a02494d5e44dc6fc4b0029ab254e63764b46791e9135ed9ec1657627d4b6f7e5cd921a951c062cfe815ac1fd7b4e7d87ffb11f786e0989d44c3083a
|
data/CHANGELOG.md
CHANGED
@@ -1,51 +1,55 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# 1.6
|
2
|
+
* Support custom URL comparison when adding new pages during crawling
|
3
|
+
* Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
|
4
|
+
* Fail early if Capybara doesn't initialize properly
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
6
|
+
# 1.5.1
|
7
|
+
* Fixed deprecation warning (Thanks scott)
|
8
|
+
* Updated Poltergeist dependency
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
# 1.5.0
|
11
|
+
* Grell will follow redirects.
|
12
|
+
* Added #followed_redirects? #error? #current_url methods to the Page class
|
12
13
|
|
13
|
-
|
14
|
-
|
14
|
+
# 1.4.0
|
15
|
+
* Added crawler.restart to restart browser process
|
16
|
+
* The block of code can make grell retry any given page.
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
Better info in gemspec
|
18
|
+
# 1.3.2
|
19
|
+
* Rescue Timeout error and return an empty page when that happens
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
# 1.3.1
|
22
|
+
* Added whitelisting and blacklisting
|
23
|
+
* Better info in gemspec
|
23
24
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
# 1.3
|
26
|
+
* The Crawler object allows you to provide an external logger object.
|
27
|
+
* Clearer semantics when an error happens, special headers are returned so the user can inspect the error
|
28
|
+
* Caveats:
|
29
|
+
- The 'debug' option in the crawler does not have any effect anymore. Provide an external logger with 'logger' instead
|
30
|
+
- The errors provided in the headers by grell have changed from 'grell_status' to 'grellStatus'.
|
31
|
+
- The 'visited' property in the page was never supposed to be accessible. Use 'visited?' instead.
|
28
32
|
|
29
|
-
|
30
|
-
Solve bug: URLs are case insensitive
|
33
|
+
# 1.2.1
|
34
|
+
* Solve bug: URLs are case insensitive
|
31
35
|
|
32
|
-
|
33
|
-
Grell now will consider two links to point to the same page only when the whole URL is exactly the same.
|
34
|
-
|
36
|
+
# 1.2
|
37
|
+
* Grell now will consider two links to point to the same page only when the whole URL is exactly the same.
|
38
|
+
Versions previously would only consider two links to be the same when they shared the path.
|
35
39
|
|
36
|
-
|
37
|
-
Solve bug where we were adding links in heads as if there were normal links in the body
|
40
|
+
# 1.1.2
|
41
|
+
* Solve bug where we were adding links in heads as if there were normal links in the body
|
38
42
|
|
39
|
-
|
40
|
-
Solve bug with the new data-href functionality
|
43
|
+
# 1.1.1
|
44
|
+
* Solve bug with the new data-href functionality
|
41
45
|
|
42
|
-
|
43
|
-
Solve problem with randomly failing spec
|
44
|
-
Search for elements with 'href' or 'data-href' to find links
|
46
|
+
# 1.1
|
47
|
+
* Solve problem with randomly failing spec
|
48
|
+
* Search for elements with 'href' or 'data-href' to find links
|
45
49
|
|
46
|
-
|
47
|
-
Rescueing Javascript errors
|
50
|
+
# 1.0.1
|
51
|
+
* Rescuing JavaScript errors
|
48
52
|
|
49
|
-
|
50
|
-
Initial implementation
|
51
|
-
Basic support to crawling pages.
|
53
|
+
# 1.0
|
54
|
+
* Initial implementation
|
55
|
+
* Basic support to crawling pages.
|
data/README.md
CHANGED
@@ -80,6 +80,26 @@ your are crawling. It will never follow links linking outside your site.
|
|
80
80
|
If you want to further limit the amount of links crawled, you can use
|
81
81
|
whitelisting, blacklisting or manual filtering.
|
82
82
|
|
83
|
+
#### Custom URL Comparison
|
84
|
+
By default, Grell will detect new URLs to visit by comparing the full URL
|
85
|
+
with the URLs of the discovered and visited links. This functionality can
|
86
|
+
be changed by passing a block of code to Grell's `start_crawling` method.
|
87
|
+
In the below example, the path of the URLs (instead of the full URL) will
|
88
|
+
be compared.
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
require 'grell'
|
92
|
+
|
93
|
+
crawler = Grell::Crawler.new
|
94
|
+
|
95
|
+
add_match_block = Proc.new do |collection_page, page|
|
96
|
+
collection_page.path == page.path
|
97
|
+
end
|
98
|
+
|
99
|
+
crawler.start_crawling('http://www.google.com', add_match_block: add_match_block) do |current_page|
|
100
|
+
...
|
101
|
+
end
|
102
|
+
```
|
83
103
|
|
84
104
|
#### Whitelisting
|
85
105
|
|
data/lib/grell/crawler.rb
CHANGED
@@ -15,8 +15,6 @@ module Grell
|
|
15
15
|
else
|
16
16
|
Grell.logger = Logger.new(STDOUT)
|
17
17
|
end
|
18
|
-
|
19
|
-
@collection = PageCollection.new
|
20
18
|
end
|
21
19
|
|
22
20
|
# Restarts the PhantomJS process without modifying the state of visited and discovered pages.
|
@@ -37,13 +35,15 @@ module Grell
|
|
37
35
|
end
|
38
36
|
|
39
37
|
# Main method, it starts crawling on the given URL and calls a block for each of the pages found.
|
40
|
-
def start_crawling(url, &block)
|
38
|
+
def start_crawling(url, options = {}, &block)
|
41
39
|
Grell.logger.info "GRELL Started crawling"
|
42
|
-
@collection = PageCollection.new
|
40
|
+
@collection = PageCollection.new(options[:add_match_block] || default_add_match)
|
43
41
|
@collection.create_page(url, nil)
|
42
|
+
|
44
43
|
while !@collection.discovered_pages.empty?
|
45
44
|
crawl(@collection.next_page, block)
|
46
45
|
end
|
46
|
+
|
47
47
|
Grell.logger.info "GRELL finished crawling"
|
48
48
|
end
|
49
49
|
|
@@ -53,7 +53,7 @@ module Grell
|
|
53
53
|
filter!(site.links)
|
54
54
|
|
55
55
|
if block #The user of this block can send us a :retry to retry accessing the page
|
56
|
-
while
|
56
|
+
while block.call(site) == :retry
|
57
57
|
Grell.logger.info "Retrying our visit to #{site.url}"
|
58
58
|
site.navigate
|
59
59
|
filter!(site.links)
|
@@ -66,9 +66,18 @@ module Grell
|
|
66
66
|
end
|
67
67
|
|
68
68
|
private
|
69
|
+
|
69
70
|
def filter!(links)
|
70
|
-
links.select!{ |link| link =~ @whitelist_regexp } if @whitelist_regexp
|
71
|
-
links.delete_if{ |link| link =~ @blacklist_regexp } if @blacklist_regexp
|
71
|
+
links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
|
72
|
+
links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
|
73
|
+
end
|
74
|
+
|
75
|
+
# If options[:add_match_block] is not provided, url matching to determine if a
|
76
|
+
# new page should be added to the page collection will default to this proc
|
77
|
+
def default_add_match
|
78
|
+
Proc.new do |collection_page, page|
|
79
|
+
collection_page.url.downcase == page.url.downcase
|
80
|
+
end
|
72
81
|
end
|
73
82
|
|
74
83
|
end
|
data/lib/grell/page.rb
CHANGED
@@ -44,8 +44,6 @@ module Grell
|
|
44
44
|
unavailable_page(404, e)
|
45
45
|
rescue Capybara::Poltergeist::StatusFailError => e
|
46
46
|
unavailable_page(404, e)
|
47
|
-
rescue Timeout::Error => e #This error inherits from Interruption, do not inherit from StandardError
|
48
|
-
unavailable_page(404, e)
|
49
47
|
end
|
50
48
|
|
51
49
|
# Number of times we have retried the current page
|
@@ -68,6 +66,13 @@ module Grell
|
|
68
66
|
!!(status.to_s =~ /[4|5]\d\d/)
|
69
67
|
end
|
70
68
|
|
69
|
+
# Extracts the path (e.g. /actions/test_action) from the URL
|
70
|
+
def path
|
71
|
+
URI.parse(@url).path
|
72
|
+
rescue URI::InvalidURIError # Invalid URLs will be added and caught when we try to navigate to them
|
73
|
+
@url
|
74
|
+
end
|
75
|
+
|
71
76
|
private
|
72
77
|
def unavailable_page(status, exception)
|
73
78
|
Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
|
@@ -6,8 +6,11 @@ module Grell
|
|
6
6
|
class PageCollection
|
7
7
|
attr_reader :collection
|
8
8
|
|
9
|
-
|
9
|
+
# A block containing the logic that determines if a new URL should be added
|
10
|
+
# to the collection or if it is already present will be passed to the initializer.
|
11
|
+
def initialize(add_match_block)
|
10
12
|
@collection = []
|
13
|
+
@add_match_block = add_match_block
|
11
14
|
end
|
12
15
|
|
13
16
|
def create_page(url, parent_id)
|
@@ -39,8 +42,9 @@ module Grell
|
|
39
42
|
# Although finding unique pages based on URL will add pages with different query parameters,
|
40
43
|
# in some cases we do link to different pages depending on the query parameters like when using proxies
|
41
44
|
new_url = @collection.none? do |collection_page|
|
42
|
-
collection_page
|
45
|
+
@add_match_block.call(collection_page, page)
|
43
46
|
end
|
47
|
+
|
44
48
|
if new_url
|
45
49
|
@collection.push page
|
46
50
|
end
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/crawler_spec.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
|
2
2
|
RSpec.describe Grell::Crawler do
|
3
|
-
let(:page_id) { rand(10).floor + 10}
|
4
|
-
let(:parent_page_id) {rand(10).floor}
|
5
|
-
let(:page) {Grell::Page.new(url, page_id, parent_page_id)}
|
6
|
-
let(:host) {
|
7
|
-
let(:url) {
|
8
|
-
let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true)}
|
9
|
-
let(:body) {'body'}
|
3
|
+
let(:page_id) { rand(10).floor + 10 }
|
4
|
+
let(:parent_page_id) { rand(10).floor }
|
5
|
+
let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
|
6
|
+
let(:host) { 'http://www.example.com' }
|
7
|
+
let(:url) { 'http://www.example.com/test' }
|
8
|
+
let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true) }
|
9
|
+
let(:body) { 'body' }
|
10
|
+
let(:custom_add_match) do
|
11
|
+
Proc.new do |collection_page, page|
|
12
|
+
collection_page.path == page.path
|
13
|
+
end
|
14
|
+
end
|
10
15
|
|
11
16
|
before do
|
12
17
|
proxy.stub(url).and_return(body: body, code: 200)
|
@@ -17,6 +22,7 @@ RSpec.describe Grell::Crawler do
|
|
17
22
|
Grell::Crawler.new(external_driver: true, logger: 33)
|
18
23
|
expect(Grell.logger).to eq(33)
|
19
24
|
end
|
25
|
+
|
20
26
|
it 'provides a stdout logger if nothing provided' do
|
21
27
|
crawler
|
22
28
|
expect(Grell.logger).to be_instance_of(Logger)
|
@@ -24,6 +30,10 @@ RSpec.describe Grell::Crawler do
|
|
24
30
|
end
|
25
31
|
|
26
32
|
describe '#crawl' do
|
33
|
+
before do
|
34
|
+
crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match))
|
35
|
+
end
|
36
|
+
|
27
37
|
it 'yields the result if a block is given' do
|
28
38
|
result = []
|
29
39
|
block = Proc.new {|n| result.push(n) }
|
@@ -62,7 +72,8 @@ RSpec.describe Grell::Crawler do
|
|
62
72
|
</body></html>
|
63
73
|
EOS
|
64
74
|
end
|
65
|
-
let(:url_visited) {"http://www.example.com/musmis.html"}
|
75
|
+
let(:url_visited) { "http://www.example.com/musmis.html" }
|
76
|
+
|
66
77
|
before do
|
67
78
|
proxy.stub(url_visited).and_return(body: 'body', code: 200)
|
68
79
|
end
|
@@ -75,6 +86,16 @@ RSpec.describe Grell::Crawler do
|
|
75
86
|
expect(result[0].url).to eq(url)
|
76
87
|
expect(result[1].url).to eq(url_visited)
|
77
88
|
end
|
89
|
+
|
90
|
+
it 'can use a custom url add matcher block' do
|
91
|
+
expect(crawler).to_not receive(:default_add_match)
|
92
|
+
crawler.start_crawling(url, add_match_block: custom_add_match)
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'uses a default url add matched if not provided' do
|
96
|
+
expect(crawler).to receive(:default_add_match).and_return(custom_add_match)
|
97
|
+
crawler.start_crawling(url)
|
98
|
+
end
|
78
99
|
end
|
79
100
|
|
80
101
|
shared_examples_for 'visits all available pages' do
|
@@ -82,6 +103,7 @@ RSpec.describe Grell::Crawler do
|
|
82
103
|
crawler.start_crawling(url)
|
83
104
|
expect(crawler.collection.visited_pages.size).to eq(visited_pages_count)
|
84
105
|
end
|
106
|
+
|
85
107
|
it 'has no more pages to discover' do
|
86
108
|
crawler.start_crawling(url)
|
87
109
|
expect(crawler.collection.discovered_pages.size).to eq(0)
|
@@ -100,13 +122,17 @@ RSpec.describe Grell::Crawler do
|
|
100
122
|
Hello world!
|
101
123
|
</body></html>"
|
102
124
|
end
|
103
|
-
let(:visited_pages_count) {1}
|
104
|
-
let(:visited_pages) {['http://www.example.com/test']}
|
125
|
+
let(:visited_pages_count) { 1 }
|
126
|
+
let(:visited_pages) { ['http://www.example.com/test'] }
|
105
127
|
|
106
128
|
it_behaves_like 'visits all available pages'
|
107
129
|
end
|
108
130
|
|
109
131
|
context 'the url has several links' do
|
132
|
+
let(:visited_pages_count) { 3 }
|
133
|
+
let(:visited_pages) do
|
134
|
+
['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
|
135
|
+
end
|
110
136
|
let(:body) do
|
111
137
|
"<html><head></head><body>
|
112
138
|
<a href=\"/trusmis.html\">trusmis</a>
|
@@ -114,14 +140,11 @@ RSpec.describe Grell::Crawler do
|
|
114
140
|
Hello world!
|
115
141
|
</body></html>"
|
116
142
|
end
|
143
|
+
|
117
144
|
before do
|
118
145
|
proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
|
119
146
|
proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
|
120
147
|
end
|
121
|
-
let(:visited_pages_count) {3}
|
122
|
-
let(:visited_pages) do
|
123
|
-
['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
|
124
|
-
end
|
125
148
|
|
126
149
|
it_behaves_like 'visits all available pages'
|
127
150
|
end
|
@@ -144,9 +167,10 @@ RSpec.describe Grell::Crawler do
|
|
144
167
|
before do
|
145
168
|
crawler.whitelist('/trusmis.html')
|
146
169
|
end
|
147
|
-
|
170
|
+
|
171
|
+
let(:visited_pages_count) { 2 } # my own page + trusmis
|
148
172
|
let(:visited_pages) do
|
149
|
-
['http://www.example.com/test','http://www.example.com/trusmis.html']
|
173
|
+
['http://www.example.com/test', 'http://www.example.com/trusmis.html']
|
150
174
|
end
|
151
175
|
|
152
176
|
it_behaves_like 'visits all available pages'
|
@@ -156,9 +180,10 @@ RSpec.describe Grell::Crawler do
|
|
156
180
|
before do
|
157
181
|
crawler.whitelist(['/trusmis.html', '/nothere', 'another.html'])
|
158
182
|
end
|
159
|
-
|
183
|
+
|
184
|
+
let(:visited_pages_count) { 2 }
|
160
185
|
let(:visited_pages) do
|
161
|
-
['http://www.example.com/test','http://www.example.com/trusmis.html']
|
186
|
+
['http://www.example.com/test', 'http://www.example.com/trusmis.html']
|
162
187
|
end
|
163
188
|
|
164
189
|
it_behaves_like 'visits all available pages'
|
@@ -168,9 +193,10 @@ RSpec.describe Grell::Crawler do
|
|
168
193
|
before do
|
169
194
|
crawler.whitelist(/\/trusmis\.html/)
|
170
195
|
end
|
171
|
-
|
196
|
+
|
197
|
+
let(:visited_pages_count) { 2 }
|
172
198
|
let(:visited_pages) do
|
173
|
-
['http://www.example.com/test','http://www.example.com/trusmis.html']
|
199
|
+
['http://www.example.com/test', 'http://www.example.com/trusmis.html']
|
174
200
|
end
|
175
201
|
|
176
202
|
it_behaves_like 'visits all available pages'
|
@@ -180,9 +206,10 @@ RSpec.describe Grell::Crawler do
|
|
180
206
|
before do
|
181
207
|
crawler.whitelist([/\/trusmis\.html/])
|
182
208
|
end
|
183
|
-
|
209
|
+
|
210
|
+
let(:visited_pages_count) { 2 }
|
184
211
|
let(:visited_pages) do
|
185
|
-
['http://www.example.com/test','http://www.example.com/trusmis.html']
|
212
|
+
['http://www.example.com/test', 'http://www.example.com/trusmis.html']
|
186
213
|
end
|
187
214
|
|
188
215
|
it_behaves_like 'visits all available pages'
|
@@ -192,7 +219,8 @@ RSpec.describe Grell::Crawler do
|
|
192
219
|
before do
|
193
220
|
crawler.whitelist([])
|
194
221
|
end
|
195
|
-
|
222
|
+
|
223
|
+
let(:visited_pages_count) { 1 } # my own page only
|
196
224
|
let(:visited_pages) do
|
197
225
|
['http://www.example.com/test']
|
198
226
|
end
|
@@ -204,7 +232,8 @@ RSpec.describe Grell::Crawler do
|
|
204
232
|
before do
|
205
233
|
crawler.whitelist(['/trusmis', '/help'])
|
206
234
|
end
|
207
|
-
|
235
|
+
|
236
|
+
let(:visited_pages_count) { 3 } # all links
|
208
237
|
let(:visited_pages) do
|
209
238
|
['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
|
210
239
|
end
|
@@ -280,7 +309,7 @@ RSpec.describe Grell::Crawler do
|
|
280
309
|
before do
|
281
310
|
crawler.blacklist([])
|
282
311
|
end
|
283
|
-
let(:visited_pages_count) {3} #all links
|
312
|
+
let(:visited_pages_count) { 3 } # all links
|
284
313
|
let(:visited_pages) do
|
285
314
|
['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
|
286
315
|
end
|
@@ -292,7 +321,7 @@ RSpec.describe Grell::Crawler do
|
|
292
321
|
before do
|
293
322
|
crawler.blacklist(['/trusmis', '/help'])
|
294
323
|
end
|
295
|
-
let(:visited_pages_count) {1}
|
324
|
+
let(:visited_pages_count) { 1 }
|
296
325
|
let(:visited_pages) do
|
297
326
|
['http://www.example.com/test']
|
298
327
|
end
|
@@ -321,7 +350,8 @@ RSpec.describe Grell::Crawler do
|
|
321
350
|
crawler.whitelist('/trusmis.html')
|
322
351
|
crawler.blacklist('/trusmis.html')
|
323
352
|
end
|
324
|
-
|
353
|
+
|
354
|
+
let(:visited_pages_count) { 1 }
|
325
355
|
let(:visited_pages) do
|
326
356
|
['http://www.example.com/test']
|
327
357
|
end
|
@@ -334,7 +364,8 @@ RSpec.describe Grell::Crawler do
|
|
334
364
|
crawler.whitelist('/trusmis.html')
|
335
365
|
crawler.blacklist('/raistlin.html')
|
336
366
|
end
|
337
|
-
|
367
|
+
|
368
|
+
let(:visited_pages_count) { 2 }
|
338
369
|
let(:visited_pages) do
|
339
370
|
['http://www.example.com/test', 'http://www.example.com/trusmis.html']
|
340
371
|
end
|
@@ -1,8 +1,14 @@
|
|
1
1
|
|
2
2
|
RSpec.describe Grell::PageCollection do
|
3
|
-
let(:
|
4
|
-
|
5
|
-
|
3
|
+
let(:add_match_block) do
|
4
|
+
Proc.new do |collection_page, page|
|
5
|
+
collection_page.url.downcase == page.url.downcase
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
let(:collection) { Grell::PageCollection.new(add_match_block) }
|
10
|
+
let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' }
|
11
|
+
let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' }
|
6
12
|
|
7
13
|
context 'empty collection' do
|
8
14
|
|
@@ -20,7 +26,8 @@ RSpec.describe Grell::PageCollection do
|
|
20
26
|
end
|
21
27
|
|
22
28
|
context 'one unvisited page' do
|
23
|
-
let(:page) {collection.create_page(url, 0)}
|
29
|
+
let(:page) { collection.create_page(url, 0) }
|
30
|
+
|
24
31
|
before do
|
25
32
|
allow(page).to receive(:visited?).and_return(false)
|
26
33
|
end
|
@@ -40,7 +47,8 @@ RSpec.describe Grell::PageCollection do
|
|
40
47
|
end
|
41
48
|
|
42
49
|
context 'one visited page' do
|
43
|
-
let(:page) {collection.create_page(url, 0)}
|
50
|
+
let(:page) { collection.create_page(url, 0) }
|
51
|
+
|
44
52
|
before do
|
45
53
|
allow(page).to receive(:visited?).and_return(true)
|
46
54
|
end
|
@@ -59,8 +67,9 @@ RSpec.describe Grell::PageCollection do
|
|
59
67
|
end
|
60
68
|
|
61
69
|
context 'one visited and one unvisited page with the same url' do
|
62
|
-
let(:page) {collection.create_page(url, 0)}
|
63
|
-
let(:unvisited)
|
70
|
+
let(:page) { collection.create_page(url, 0) }
|
71
|
+
let(:unvisited) { collection.create_page(url.upcase, 0) }
|
72
|
+
|
64
73
|
before do
|
65
74
|
allow(page).to receive(:visited?).and_return(true)
|
66
75
|
allow(unvisited).to receive(:visited?).and_return(false)
|
@@ -88,8 +97,9 @@ RSpec.describe Grell::PageCollection do
|
|
88
97
|
end
|
89
98
|
|
90
99
|
context 'one visited and one unvisited page with different URLs' do
|
91
|
-
let(:page) {collection.create_page(url, 0)}
|
92
|
-
let(:unvisited)
|
100
|
+
let(:page) { collection.create_page(url, 0) }
|
101
|
+
let(:unvisited) { collection.create_page(url2, 0) }
|
102
|
+
|
93
103
|
before do
|
94
104
|
allow(page).to receive(:visited?).and_return(true)
|
95
105
|
allow(unvisited).to receive(:visited?).and_return(false)
|
@@ -109,9 +119,10 @@ RSpec.describe Grell::PageCollection do
|
|
109
119
|
end
|
110
120
|
|
111
121
|
context 'one visited and one unvisited page with different URLs only different by the query' do
|
112
|
-
let(:page) {collection.create_page(url, 0)}
|
113
|
-
let(:url3) {'http://www.github.com/SomeUser/dragonlance?search=true'}
|
114
|
-
let(:unvisited)
|
122
|
+
let(:page) { collection.create_page(url, 0) }
|
123
|
+
let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' }
|
124
|
+
let(:unvisited) { collection.create_page(url3, 0) }
|
125
|
+
|
115
126
|
before do
|
116
127
|
allow(page).to receive(:visited?).and_return(true)
|
117
128
|
allow(unvisited).to receive(:visited?).and_return(false)
|
@@ -131,19 +142,18 @@ RSpec.describe Grell::PageCollection do
|
|
131
142
|
end
|
132
143
|
|
133
144
|
context 'several unvisited pages' do
|
134
|
-
let(:page) {collection.create_page(url, 2)}
|
135
|
-
let(:page2) {collection.create_page(url2, 0)}
|
145
|
+
let(:page) { collection.create_page(url, 2) }
|
146
|
+
let(:page2) { collection.create_page(url2, 0) }
|
147
|
+
|
136
148
|
before do
|
137
149
|
allow(page).to receive(:visited?).and_return(true)
|
138
150
|
allow(page2).to receive(:visited?).and_return(false)
|
139
151
|
end
|
140
152
|
|
141
|
-
it
|
153
|
+
it 'returns the page which has an earlier parent' do
|
142
154
|
expect(collection.next_page).to eq(page2)
|
143
155
|
end
|
144
156
|
|
145
157
|
end
|
146
158
|
|
147
|
-
|
148
|
-
|
149
|
-
end
|
159
|
+
end
|
data/spec/lib/page_spec.rb
CHANGED
@@ -1,26 +1,31 @@
|
|
1
1
|
RSpec.describe Grell::Page do
|
2
2
|
|
3
|
-
let(:page_id) { rand(10).floor + 10}
|
4
|
-
let(:parent_page_id) {rand(10).floor}
|
5
|
-
let(:page) {Grell::Page.new(url, page_id, parent_page_id)}
|
6
|
-
let(:host) {
|
7
|
-
let(:url) {
|
3
|
+
let(:page_id) { rand(10).floor + 10 }
|
4
|
+
let(:parent_page_id) { rand(10).floor }
|
5
|
+
let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
|
6
|
+
let(:host) { 'http://www.example.com' }
|
7
|
+
let(:url) { 'http://www.example.com/test' }
|
8
8
|
let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' }}
|
9
|
-
let(:now) {Time.now}
|
9
|
+
let(:now) { Time.now }
|
10
|
+
|
10
11
|
before do
|
11
12
|
allow(Time).to receive(:now).and_return(now)
|
12
|
-
Grell.logger = Logger.new(nil) #avoids noise in rspec output
|
13
|
+
Grell.logger = Logger.new(nil) # avoids noise in rspec output
|
13
14
|
end
|
14
15
|
|
15
|
-
it
|
16
|
+
it 'gives access to the url' do
|
16
17
|
expect(page.url).to eq(url)
|
17
18
|
end
|
18
19
|
|
19
|
-
it
|
20
|
+
it 'gives access to the path' do
|
21
|
+
expect(page.path).to eq('/test')
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'gives access to the page id' do
|
20
25
|
expect(page.id).to eq(page_id)
|
21
26
|
end
|
22
27
|
|
23
|
-
it
|
28
|
+
it 'gives access to the parent page id' do
|
24
29
|
expect(page.parent_id).to eq(parent_page_id)
|
25
30
|
end
|
26
31
|
|
@@ -68,6 +73,7 @@ RSpec.describe Grell::Page do
|
|
68
73
|
proxy.stub(url).and_return(body: '', code: 200, headers: {})
|
69
74
|
page.navigate
|
70
75
|
end
|
76
|
+
|
71
77
|
it '#retries return 0' do
|
72
78
|
expect(page.retries).to eq(0)
|
73
79
|
end
|
@@ -79,6 +85,7 @@ RSpec.describe Grell::Page do
|
|
79
85
|
page.navigate
|
80
86
|
page.navigate
|
81
87
|
end
|
88
|
+
|
82
89
|
it '#retries return 1' do
|
83
90
|
expect(page.retries).to eq(1)
|
84
91
|
end
|
@@ -98,8 +105,8 @@ RSpec.describe Grell::Page do
|
|
98
105
|
end
|
99
106
|
end
|
100
107
|
|
101
|
-
[Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
|
102
|
-
|
108
|
+
[ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
|
109
|
+
Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError ].each do |error_type|
|
103
110
|
|
104
111
|
context "#{error_type}" do
|
105
112
|
let(:headers) do
|
@@ -109,25 +116,27 @@ RSpec.describe Grell::Page do
|
|
109
116
|
errorMessage: error_message
|
110
117
|
}
|
111
118
|
end
|
112
|
-
let(:error_message) {'Trusmis broke it again'}
|
113
|
-
let(:now) {Time.now}
|
119
|
+
let(:error_message) { 'Trusmis broke it again' }
|
120
|
+
let(:now) { Time.now }
|
121
|
+
|
114
122
|
before do
|
115
123
|
allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error')
|
116
124
|
allow_any_instance_of(error_type).to receive(:message).and_return(error_message)
|
117
125
|
page.navigate
|
118
126
|
end
|
127
|
+
|
119
128
|
it_behaves_like 'an errored grell page'
|
120
129
|
end
|
121
130
|
end
|
122
131
|
|
123
132
|
|
124
133
|
context 'we have not yet navigated to the page' do
|
125
|
-
let(:visited) {false}
|
126
|
-
let(:status) {nil}
|
127
|
-
let(:body) {''}
|
128
|
-
let(:links) {[]}
|
129
|
-
let(:expected_headers) {{}}
|
130
|
-
let(:now) {nil}
|
134
|
+
let(:visited) { false }
|
135
|
+
let(:status) { nil }
|
136
|
+
let(:body) { '' }
|
137
|
+
let(:links) { [] }
|
138
|
+
let(:expected_headers) { {} }
|
139
|
+
let(:now) { nil }
|
131
140
|
|
132
141
|
before do
|
133
142
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -138,11 +147,11 @@ RSpec.describe Grell::Page do
|
|
138
147
|
end
|
139
148
|
|
140
149
|
context 'navigating to the URL we get a 404' do
|
141
|
-
let(:visited) {true}
|
142
|
-
let(:status) { 404}
|
143
|
-
let(:body) {'<html><head></head><body>nothing cool</body></html>'}
|
144
|
-
let(:links) {[]}
|
145
|
-
let(:expected_headers) {returned_headers}
|
150
|
+
let(:visited) { true }
|
151
|
+
let(:status) { 404 }
|
152
|
+
let(:body) { '<html><head></head><body>nothing cool</body></html>' }
|
153
|
+
let(:links) { [] }
|
154
|
+
let(:expected_headers) { returned_headers }
|
146
155
|
|
147
156
|
before do
|
148
157
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -154,17 +163,19 @@ RSpec.describe Grell::Page do
|
|
154
163
|
end
|
155
164
|
|
156
165
|
context 'navigating to an URL with redirects, follows them transparently' do
|
157
|
-
let(:visited) {true}
|
158
|
-
let(:status) { 200}
|
159
|
-
let(:body) {'<html><head></head><body>nothing cool</body></html>'}
|
160
|
-
let(:links) {[]}
|
161
|
-
let(:expected_headers) {returned_headers}
|
162
|
-
let(:real_url) {'http://example.com/other'}
|
166
|
+
let(:visited) { true }
|
167
|
+
let(:status) { 200 }
|
168
|
+
let(:body) { '<html><head></head><body>nothing cool</body></html>' }
|
169
|
+
let(:links) { [] }
|
170
|
+
let(:expected_headers) { returned_headers }
|
171
|
+
let(:real_url) { 'http://example.com/other' }
|
172
|
+
|
163
173
|
before do
|
164
174
|
proxy.stub(url).and_return(:redirect_to => real_url)
|
165
175
|
proxy.stub(real_url).and_return(body: body, code: status, headers: returned_headers.dup)
|
166
176
|
page.navigate
|
167
177
|
end
|
178
|
+
|
168
179
|
it_behaves_like 'a grell page'
|
169
180
|
|
170
181
|
it 'followed_redirects? is true' do
|
@@ -178,11 +189,11 @@ RSpec.describe Grell::Page do
|
|
178
189
|
|
179
190
|
#Here also add examples that may happen for almost all pages (no errors, no redirects)
|
180
191
|
context 'navigating to the URL we get page with no links' do
|
181
|
-
let(:visited) {true}
|
182
|
-
let(:status) { 200}
|
183
|
-
let(:body) {'<html><head></head><body>nothing cool</body></html>'}
|
184
|
-
let(:links) {[]}
|
185
|
-
let(:expected_headers) {returned_headers}
|
192
|
+
let(:visited) { true }
|
193
|
+
let(:status) { 200 }
|
194
|
+
let(:body) { '<html><head></head><body>nothing cool</body></html>' }
|
195
|
+
let(:links) { [] }
|
196
|
+
let(:expected_headers) { returned_headers }
|
186
197
|
|
187
198
|
before do
|
188
199
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -205,8 +216,8 @@ RSpec.describe Grell::Page do
|
|
205
216
|
end
|
206
217
|
|
207
218
|
context 'navigating to the URL we get page with links using a elements' do
|
208
|
-
let(:visited) {true}
|
209
|
-
let(:status) { 200}
|
219
|
+
let(:visited) { true }
|
220
|
+
let(:status) { 200 }
|
210
221
|
let(:body) do
|
211
222
|
"<html><head></head><body>
|
212
223
|
Hello world!
|
@@ -215,8 +226,8 @@ RSpec.describe Grell::Page do
|
|
215
226
|
<a href=\"http://www.outsidewebsite.com/help.html\">help</a>
|
216
227
|
</body></html>"
|
217
228
|
end
|
218
|
-
let(:links) {[
|
219
|
-
let(:expected_headers) {returned_headers}
|
229
|
+
let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] }
|
230
|
+
let(:expected_headers) { returned_headers }
|
220
231
|
|
221
232
|
before do
|
222
233
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -231,8 +242,8 @@ RSpec.describe Grell::Page do
|
|
231
242
|
end
|
232
243
|
|
233
244
|
context 'navigating to the URL we get page with links with absolute links' do
|
234
|
-
let(:visited) {true}
|
235
|
-
let(:status) { 200}
|
245
|
+
let(:visited) { true }
|
246
|
+
let(:status) { 200 }
|
236
247
|
let(:body) do
|
237
248
|
"<html><head></head><body>
|
238
249
|
Hello world!
|
@@ -241,8 +252,8 @@ RSpec.describe Grell::Page do
|
|
241
252
|
<a href=\"http://www.outsidewebsite.com/help.html\">help</a>
|
242
253
|
</body></html>"
|
243
254
|
end
|
244
|
-
let(:links) {[
|
245
|
-
let(:expected_headers) {returned_headers}
|
255
|
+
let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] }
|
256
|
+
let(:expected_headers) { returned_headers }
|
246
257
|
|
247
258
|
before do
|
248
259
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -257,8 +268,8 @@ RSpec.describe Grell::Page do
|
|
257
268
|
end
|
258
269
|
|
259
270
|
context 'navigating to the URL we get page with links using a mix of elements' do
|
260
|
-
let(:visited) {true}
|
261
|
-
let(:status) { 200}
|
271
|
+
let(:visited) { true }
|
272
|
+
let(:status) { 200 }
|
262
273
|
let(:body) do
|
263
274
|
"<html><head></head><body>
|
264
275
|
Hello world!
|
@@ -274,11 +285,10 @@ RSpec.describe Grell::Page do
|
|
274
285
|
</body></html>"
|
275
286
|
end
|
276
287
|
let(:links) do
|
277
|
-
[
|
278
|
-
|
279
|
-
]
|
288
|
+
[ 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html',
|
289
|
+
'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html' ]
|
280
290
|
end
|
281
|
-
let(:expected_headers) {returned_headers}
|
291
|
+
let(:expected_headers) { returned_headers }
|
282
292
|
|
283
293
|
before do
|
284
294
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -287,16 +297,36 @@ RSpec.describe Grell::Page do
|
|
287
297
|
|
288
298
|
it_behaves_like 'a grell page'
|
289
299
|
|
300
|
+
describe '#path' do
|
301
|
+
context 'proper url' do
|
302
|
+
let(:url) { 'http://www.anyurl.com/path' }
|
303
|
+
let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
|
304
|
+
|
305
|
+
it 'returns the path' do
|
306
|
+
expect(page.path).to eq('/path')
|
307
|
+
end
|
308
|
+
end
|
309
|
+
|
310
|
+
context 'broken url' do
|
311
|
+
let(:url) { 'www.an.asda.fasfasf.yurl.com/path' }
|
312
|
+
let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
|
313
|
+
|
314
|
+
it 'returns the path' do
|
315
|
+
expect(page.path).to eq(url)
|
316
|
+
end
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
290
320
|
it 'do not return links to external websites' do
|
291
321
|
expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
|
292
322
|
end
|
293
323
|
end
|
294
324
|
|
295
325
|
context 'navigating to the URL we get page with links inside the header section of the code' do
|
296
|
-
let(:visited) {true}
|
297
|
-
let(:status) { 200}
|
298
|
-
let(:css) {'/application.css'}
|
299
|
-
let(:favicon) {'/favicon.ico'}
|
326
|
+
let(:visited) { true }
|
327
|
+
let(:status) { 200 }
|
328
|
+
let(:css) { '/application.css' }
|
329
|
+
let(:favicon) { '/favicon.ico' }
|
300
330
|
let(:body) do
|
301
331
|
"<html><head>
|
302
332
|
<title>mimi</title>
|
@@ -309,9 +339,9 @@ RSpec.describe Grell::Page do
|
|
309
339
|
</body></html>"
|
310
340
|
end
|
311
341
|
let(:links) do
|
312
|
-
[
|
342
|
+
['http://www.example.com/trusmis.html']
|
313
343
|
end
|
314
|
-
let(:expected_headers) {returned_headers}
|
344
|
+
let(:expected_headers) { returned_headers }
|
315
345
|
|
316
346
|
before do
|
317
347
|
proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
|
@@ -338,11 +368,12 @@ RSpec.describe Grell::Page do
|
|
338
368
|
proxy.stub(url).and_return(body: body, code: nil, headers: {})
|
339
369
|
page.navigate
|
340
370
|
end
|
341
|
-
|
342
|
-
let(:
|
343
|
-
let(:
|
344
|
-
let(:
|
345
|
-
let(:
|
371
|
+
|
372
|
+
let(:visited) { true }
|
373
|
+
let(:status) { nil }
|
374
|
+
let(:body) { '' }
|
375
|
+
let(:links) { [] }
|
376
|
+
let(:expected_headers) { {} }
|
346
377
|
|
347
378
|
it_behaves_like 'a grell page'
|
348
379
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: '1.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|