grell 1.5.1 → 1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -38
- data/README.md +20 -0
- data/lib/grell/capybara_driver.rb +3 -0
- data/lib/grell/crawler.rb +16 -7
- data/lib/grell/page.rb +7 -2
- data/lib/grell/page_collection.rb +6 -2
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +59 -28
- data/spec/lib/page_collection_spec.rb +28 -18
- data/spec/lib/page_spec.rb +92 -61
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2be8992c96b83e9b1a98474ada3b49ea7e5adb69
+  data.tar.gz: 3eed1bea205812e8e9ab7dc8678da57efea1fea1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: baa6e37b2ce80491b05688618b6ad0576149236c2367b2f6c52a84dfeae25edb6d340abfdcae4e3b6f7363072db0dc0c8c052cd83410e1f28e1725305db99993
+  data.tar.gz: 7c246e8b2a02494d5e44dc6fc4b0029ab254e63764b46791e9135ed9ec1657627d4b6f7e5cd921a951c062cfe815ac1fd7b4e7d87ffb11f786e0989d44c3083a
data/CHANGELOG.md CHANGED

@@ -1,51 +1,55 @@
-
-
-
+# 1.6
+* Support custom URL comparison when adding new pages during crawling
+* Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
+* Fail early if Capybara doesn't initialize properly
 
-
-
-
+# 1.5.1
+* Fixed deprecation warning (Thanks scott)
+* Updated Poltergeist dependency
 
-
-
-
+# 1.5.0
+* Grell will follow redirects.
+* Added #followed_redirects? #error? #current_url methods to the Page class
 
-
-
+# 1.4.0
+* Added crawler.restart to restart browser process
+* The block of code can make grell retry any given page.
 
-
-
-Better info in gemspec
+# 1.3.2
+* Rescue Timeout error and return an empty page when that happens
 
-
-
-
+# 1.3.1
+* Added whitelisting and blacklisting
+* Better info in gemspec
 
-
-
-
-
+# 1.3
+* The Crawler object allows you to provide an external logger object.
+* Clearer semantics when an error happens, special headers are returned so the user can inspect the error
+* Caveats:
+  - The 'debug' option in the crawler does not have any effect anymore. Provide an external logger with 'logger' instead
+  - The errors provided in the headers by grell have changed from 'grell_status' to 'grellStatus'.
+  - The 'visited' property in the page was never supposed to be accessible. Use 'visited?' instead.
 
-
-Solve bug: URLs are case insensitive
+# 1.2.1
+* Solve bug: URLs are case insensitive
 
-
-Grell now will consider two links to point to the same page only when the whole URL is exactly the same.
-
+# 1.2
+* Grell now will consider two links to point to the same page only when the whole URL is exactly the same.
+Versions previously would only consider two links to be the same when they shared the path.
 
-
-Solve bug where we were adding links in heads as if there were normal links in the body
+# 1.1.2
+* Solve bug where we were adding links in heads as if there were normal links in the body
 
-
-Solve bug with the new data-href functionality
+# 1.1.1
+* Solve bug with the new data-href functionality
 
-
-Solve problem with randomly failing spec
-Search for elements with 'href' or 'data-href' to find links
+# 1.1
+* Solve problem with randomly failing spec
+* Search for elements with 'href' or 'data-href' to find links
 
-
-Rescuing Javascript errors
+# 1.0.1
+* Rescuing Javascript errors
 
-
-Initial implementation
-Basic support to crawling pages.
+# 1.0
+* Initial implementation
+* Basic support to crawling pages.
data/README.md CHANGED

@@ -80,6 +80,26 @@ your are crawling. It will never follow links linking outside your site.
 If you want to further limit the amount of links crawled, you can use
 whitelisting, blacklisting or manual filtering.
 
+#### Custom URL Comparison
+By default, Grell will detect new URLs to visit by comparing the full URL
+with the URLs of the discovered and visited links. This functionality can
+be changed by passing a block of code to Grell's `start_crawling` method.
+In the below example, the path of the URLs (instead of the full URL) will
+be compared.
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new
+
+add_match_block = Proc.new do |collection_page, page|
+  collection_page.path == page.path
+end
+
+crawler.start_crawling('http://www.google.com', add_match_block: add_match_block) do |current_page|
+  ...
+end
+```
 
 #### Whitelisting
 
data/lib/grell/crawler.rb CHANGED

@@ -15,8 +15,6 @@ module Grell
       else
         Grell.logger = Logger.new(STDOUT)
       end
-
-      @collection = PageCollection.new
     end
 
     # Restarts the PhantomJS process without modifying the state of visited and discovered pages.

@@ -37,13 +35,15 @@ module Grell
     end
 
     # Main method, it starts crawling on the given URL and calls a block for each of the pages found.
-    def start_crawling(url, &block)
+    def start_crawling(url, options = {}, &block)
       Grell.logger.info "GRELL Started crawling"
-      @collection = PageCollection.new
+      @collection = PageCollection.new(options[:add_match_block] || default_add_match)
       @collection.create_page(url, nil)
+
       while !@collection.discovered_pages.empty?
         crawl(@collection.next_page, block)
       end
+
       Grell.logger.info "GRELL finished crawling"
     end

@@ -53,7 +53,7 @@ module Grell
       filter!(site.links)
 
       if block #The user of this block can send us a :retry to retry accessing the page
-        while
+        while block.call(site) == :retry
           Grell.logger.info "Retrying our visit to #{site.url}"
           site.navigate
           filter!(site.links)

@@ -66,9 +66,18 @@ module Grell
     end
 
     private
+
     def filter!(links)
-      links.select!{ |link| link =~ @whitelist_regexp } if @whitelist_regexp
-      links.delete_if{ |link| link =~ @blacklist_regexp } if @blacklist_regexp
+      links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
+      links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
+    end
+
+    # If options[:add_match_block] is not provided, url matching to determine if a
+    # new page should be added to the page collection will default to this proc
+    def default_add_match
+      Proc.new do |collection_page, page|
+        collection_page.url.downcase == page.url.downcase
+      end
     end
 
   end
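Taken together, these changes thread an optional `add_match_block` through `start_crawling` into `PageCollection`, falling back to `default_add_match` (case-insensitive full-URL comparison). A minimal sketch of both modes, assuming a working PhantomJS/Poltergeist setup; the URLs here are illustrative:

```ruby
require 'grell'

crawler = Grell::Crawler.new(logger: Logger.new(STDOUT))

# Default matching: two pages are the same only if their full URLs match
# case-insensitively, so /list?page=1 and /list?page=2 crawl as two pages.
crawler.start_crawling('http://example.com/list?page=1') do |page|
  puts page.url
end

# Custom matching: compare paths only, so query-string variants of the
# same path collapse into a single page.
by_path = Proc.new { |collection_page, page| collection_page.path == page.path }
crawler.start_crawling('http://example.com/list?page=1', add_match_block: by_path) do |page|
  puts page.url
end
```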
data/lib/grell/page.rb CHANGED

@@ -44,8 +44,6 @@ module Grell
       unavailable_page(404, e)
     rescue Capybara::Poltergeist::StatusFailError => e
       unavailable_page(404, e)
-    rescue Timeout::Error => e #This error inherits from Interruption, do not inherit from StandardError
-      unavailable_page(404, e)
     end
 
     # Number of times we have retried the current page

@@ -68,6 +66,13 @@ module Grell
       !!(status.to_s =~ /[4|5]\d\d/)
     end
 
+    # Extracts the path (e.g. /actions/test_action) from the URL
+    def path
+      URI.parse(@url).path
+    rescue URI::InvalidURIError # Invalid URLs will be added and caught when we try to navigate to them
+      @url
+    end
+
     private
     def unavailable_page(status, exception)
       Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
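The new `Page#path` is what makes the README's path-based matcher possible. A quick illustration of the stdlib behavior it wraps (URLs here are illustrative):

```ruby
require 'uri'

URI.parse('http://www.example.com/actions/test_action?q=1').path
# => "/actions/test_action"

# Malformed URLs raise URI::InvalidURIError; Page#path rescues this and
# returns the raw URL, letting the failure surface later during navigation.
begin
  URI.parse('http://exa mple.com/bad').path
rescue URI::InvalidURIError
  'http://exa mple.com/bad' # what Page#path would return
end
```

The removed `rescue Timeout::Error` matches the 1.6 changelog entry: timeouts now propagate, so a job runner such as Delayed Job can terminate a hanging crawl instead of receiving an empty 404 page.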
data/lib/grell/page_collection.rb CHANGED

@@ -6,8 +6,11 @@ module Grell
   class PageCollection
     attr_reader :collection
 
-    def initialize
+    # A block containing the logic that determines if a new URL should be added
+    # to the collection or if it is already present will be passed to the initializer.
+    def initialize(add_match_block)
       @collection = []
+      @add_match_block = add_match_block
     end
 
     def create_page(url, parent_id)

@@ -39,8 +42,9 @@ module Grell
       # Although finding unique pages based on URL will add pages with different query parameters,
       # in some cases we do link to different pages depending on the query parameters like when using proxies
       new_url = @collection.none? do |collection_page|
-        collection_page.url.downcase == page.url.downcase
+        @add_match_block.call(collection_page, page)
       end
+
       if new_url
         @collection.push page
       end
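With the matcher injected, `create_page` adds a candidate only when no page already in the collection satisfies the block. A standalone sketch of that `none?` check, using a hypothetical stub instead of Grell's real `Page`:

```ruby
require 'uri'

# Minimal stand-in exposing just the fields the matchers need.
StubPage = Struct.new(:url) do
  def path
    URI.parse(url).path
  end
end

pages     = [StubPage.new('http://example.com/a?x=1')]
candidate = StubPage.new('http://example.com/a?x=2')

full_url_match = Proc.new { |a, b| a.url.downcase == b.url.downcase }
path_match     = Proc.new { |a, b| a.path == b.path }

# Mirrors PageCollection#create_page: push only if nothing matches.
pages.none? { |p| full_url_match.call(p, candidate) } # => true  (would be added)
pages.none? { |p| path_match.call(p, candidate) }     # => false (duplicate)
```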
data/lib/grell/version.rb CHANGED
data/spec/lib/crawler_spec.rb CHANGED

@@ -1,12 +1,17 @@
 
 RSpec.describe Grell::Crawler do
-  let(:page_id) { rand(10).floor + 10}
-  let(:parent_page_id) {rand(10).floor}
-  let(:page) {Grell::Page.new(url, page_id, parent_page_id)}
-  let(:host) {'http://www.example.com'}
-  let(:url) {'http://www.example.com/test'}
-  let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true)}
-  let(:body) {'body'}
+  let(:page_id) { rand(10).floor + 10 }
+  let(:parent_page_id) { rand(10).floor }
+  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+  let(:host) { 'http://www.example.com' }
+  let(:url) { 'http://www.example.com/test' }
+  let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true) }
+  let(:body) { 'body' }
+  let(:custom_add_match) do
+    Proc.new do |collection_page, page|
+      collection_page.path == page.path
+    end
+  end
 
   before do
     proxy.stub(url).and_return(body: body, code: 200)

@@ -17,6 +22,7 @@ RSpec.describe Grell::Crawler do
     Grell::Crawler.new(external_driver: true, logger: 33)
     expect(Grell.logger).to eq(33)
   end
+
   it 'provides a stdout logger if nothing provided' do
     crawler
     expect(Grell.logger).to be_instance_of(Logger)

@@ -24,6 +30,10 @@ RSpec.describe Grell::Crawler do
   end
 
   describe '#crawl' do
+    before do
+      crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match))
+    end
+
     it 'yields the result if a block is given' do
       result = []
       block = Proc.new {|n| result.push(n) }

@@ -62,7 +72,8 @@ RSpec.describe Grell::Crawler do
       </body></html>
       EOS
     end
-    let(:url_visited) {"http://www.example.com/musmis.html"}
+    let(:url_visited) { "http://www.example.com/musmis.html" }
+
     before do
       proxy.stub(url_visited).and_return(body: 'body', code: 200)
     end

@@ -75,6 +86,16 @@ RSpec.describe Grell::Crawler do
       expect(result[0].url).to eq(url)
       expect(result[1].url).to eq(url_visited)
     end
+
+    it 'can use a custom url add matcher block' do
+      expect(crawler).to_not receive(:default_add_match)
+      crawler.start_crawling(url, add_match_block: custom_add_match)
+    end
+
+    it 'uses a default url add matcher if not provided' do
+      expect(crawler).to receive(:default_add_match).and_return(custom_add_match)
+      crawler.start_crawling(url)
+    end
   end
 
   shared_examples_for 'visits all available pages' do

@@ -82,6 +103,7 @@ RSpec.describe Grell::Crawler do
     crawler.start_crawling(url)
     expect(crawler.collection.visited_pages.size).to eq(visited_pages_count)
   end
+
   it 'has no more pages to discover' do
     crawler.start_crawling(url)
     expect(crawler.collection.discovered_pages.size).to eq(0)

@@ -100,13 +122,17 @@ RSpec.describe Grell::Crawler do
       Hello world!
       </body></html>"
     end
-    let(:visited_pages_count) {1}
-    let(:visited_pages) {['http://www.example.com/test']}
+    let(:visited_pages_count) { 1 }
+    let(:visited_pages) { ['http://www.example.com/test'] }
 
     it_behaves_like 'visits all available pages'
   end
 
   context 'the url has several links' do
+    let(:visited_pages_count) { 3 }
+    let(:visited_pages) do
+      ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
+    end
     let(:body) do
       "<html><head></head><body>
       <a href=\"/trusmis.html\">trusmis</a>

@@ -114,14 +140,11 @@ RSpec.describe Grell::Crawler do
       Hello world!
       </body></html>"
     end
+
     before do
       proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
       proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
     end
-    let(:visited_pages_count) {3}
-    let(:visited_pages) do
-      ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
-    end
 
     it_behaves_like 'visits all available pages'
   end

@@ -144,9 +167,10 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.whitelist('/trusmis.html')
     end
-
+
+    let(:visited_pages_count) { 2 } # my own page + trusmis
     let(:visited_pages) do
-      ['http://www.example.com/test','http://www.example.com/trusmis.html']
+      ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
     end
 
     it_behaves_like 'visits all available pages'

@@ -156,9 +180,10 @@ RSpec.describe Grell::Crawler do
     before do
      crawler.whitelist(['/trusmis.html', '/nothere', 'another.html'])
     end
-
+
+    let(:visited_pages_count) { 2 }
     let(:visited_pages) do
-      ['http://www.example.com/test','http://www.example.com/trusmis.html']
+      ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
     end
 
     it_behaves_like 'visits all available pages'

@@ -168,9 +193,10 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.whitelist(/\/trusmis\.html/)
     end
-
+
+    let(:visited_pages_count) { 2 }
     let(:visited_pages) do
-      ['http://www.example.com/test','http://www.example.com/trusmis.html']
+      ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
     end
 
     it_behaves_like 'visits all available pages'

@@ -180,9 +206,10 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.whitelist([/\/trusmis\.html/])
     end
-
+
+    let(:visited_pages_count) { 2 }
     let(:visited_pages) do
-      ['http://www.example.com/test','http://www.example.com/trusmis.html']
+      ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
     end
 
     it_behaves_like 'visits all available pages'

@@ -192,7 +219,8 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.whitelist([])
     end
-
+
+    let(:visited_pages_count) { 1 } # my own page only
     let(:visited_pages) do
       ['http://www.example.com/test']
     end

@@ -204,7 +232,8 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.whitelist(['/trusmis', '/help'])
     end
-
+
+    let(:visited_pages_count) { 3 } # all links
     let(:visited_pages) do
       ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
     end

@@ -280,7 +309,7 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.blacklist([])
     end
-    let(:visited_pages_count) {3} #all links
+    let(:visited_pages_count) { 3 } # all links
     let(:visited_pages) do
       ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
     end

@@ -292,7 +321,7 @@ RSpec.describe Grell::Crawler do
     before do
       crawler.blacklist(['/trusmis', '/help'])
     end
-    let(:visited_pages_count) {1}
+    let(:visited_pages_count) { 1 }
     let(:visited_pages) do
       ['http://www.example.com/test']
     end

@@ -321,7 +350,8 @@ RSpec.describe Grell::Crawler do
       crawler.whitelist('/trusmis.html')
       crawler.blacklist('/trusmis.html')
     end
-
+
+    let(:visited_pages_count) { 1 }
     let(:visited_pages) do
       ['http://www.example.com/test']
     end

@@ -334,7 +364,8 @@ RSpec.describe Grell::Crawler do
       crawler.whitelist('/trusmis.html')
       crawler.blacklist('/raistlin.html')
     end
-
+
+    let(:visited_pages_count) { 2 }
     let(:visited_pages) do
       ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
     end
data/spec/lib/page_collection_spec.rb CHANGED

@@ -1,8 +1,14 @@
 
 RSpec.describe Grell::PageCollection do
-  let(:collection) {Grell::PageCollection.new}
-  let(:url) {'http://www.github.com/SomeUser/dragonlance?search=false'}
-  let(:url2) {'http://www.github.com/OtherUser/forgotten?search=false'}
+  let(:add_match_block) do
+    Proc.new do |collection_page, page|
+      collection_page.url.downcase == page.url.downcase
+    end
+  end
+
+  let(:collection) { Grell::PageCollection.new(add_match_block) }
+  let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' }
+  let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' }
 
   context 'empty collection' do
 
@@ -20,7 +26,8 @@ RSpec.describe Grell::PageCollection do
   end
 
   context 'one unvisited page' do
-    let(:page) {collection.create_page(url, 0)}
+    let(:page) { collection.create_page(url, 0) }
+
     before do
       allow(page).to receive(:visited?).and_return(false)
     end

@@ -40,7 +47,8 @@ RSpec.describe Grell::PageCollection do
   end
 
   context 'one visited page' do
-    let(:page) {collection.create_page(url, 0)}
+    let(:page) { collection.create_page(url, 0) }
+
     before do
       allow(page).to receive(:visited?).and_return(true)
     end

@@ -59,8 +67,9 @@ RSpec.describe Grell::PageCollection do
   end
 
   context 'one visited and one unvisited page with the same url' do
-    let(:page) {collection.create_page(url, 0)}
-    let(:unvisited)
+    let(:page) { collection.create_page(url, 0) }
+    let(:unvisited) { collection.create_page(url.upcase, 0) }
+
     before do
       allow(page).to receive(:visited?).and_return(true)
       allow(unvisited).to receive(:visited?).and_return(false)

@@ -88,8 +97,9 @@ RSpec.describe Grell::PageCollection do
   end
 
   context 'one visited and one unvisited page with different URLs' do
-    let(:page) {collection.create_page(url, 0)}
-    let(:unvisited)
+    let(:page) { collection.create_page(url, 0) }
+    let(:unvisited) { collection.create_page(url2, 0) }
+
     before do
       allow(page).to receive(:visited?).and_return(true)
       allow(unvisited).to receive(:visited?).and_return(false)

@@ -109,9 +119,10 @@ RSpec.describe Grell::PageCollection do
   end
 
   context 'one visited and one unvisited page with different URLs only different by the query' do
-    let(:page) {collection.create_page(url, 0)}
-    let(:url3) {'http://www.github.com/SomeUser/dragonlance?search=true'}
-    let(:unvisited)
+    let(:page) { collection.create_page(url, 0) }
+    let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' }
+    let(:unvisited) { collection.create_page(url3, 0) }
+
     before do
       allow(page).to receive(:visited?).and_return(true)
       allow(unvisited).to receive(:visited?).and_return(false)

@@ -131,19 +142,18 @@ RSpec.describe Grell::PageCollection do
   end
 
   context 'several unvisited pages' do
-    let(:page) {collection.create_page(url, 2)}
-    let(:page2) {collection.create_page(url2, 0)}
+    let(:page) { collection.create_page(url, 2) }
+    let(:page2) { collection.create_page(url2, 0) }
+
     before do
       allow(page).to receive(:visited?).and_return(true)
      allow(page2).to receive(:visited?).and_return(false)
     end
 
-    it
+    it 'returns the page which has an earlier parent' do
       expect(collection.next_page).to eq(page2)
     end
 
   end
 
-
-
-end
+end
data/spec/lib/page_spec.rb CHANGED

@@ -1,26 +1,31 @@
 RSpec.describe Grell::Page do
 
-  let(:page_id) { rand(10).floor + 10}
-  let(:parent_page_id) {rand(10).floor}
-  let(:page) {Grell::Page.new(url, page_id, parent_page_id)}
-  let(:host) {'http://www.example.com'}
-  let(:url) {'http://www.example.com/test'}
+  let(:page_id) { rand(10).floor + 10 }
+  let(:parent_page_id) { rand(10).floor }
+  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+  let(:host) { 'http://www.example.com' }
+  let(:url) { 'http://www.example.com/test' }
   let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' }}
-  let(:now) {Time.now}
+  let(:now) { Time.now }
+
   before do
     allow(Time).to receive(:now).and_return(now)
-    Grell.logger = Logger.new(nil) #avoids noise in rspec output
+    Grell.logger = Logger.new(nil) # avoids noise in rspec output
   end
 
-  it
+  it 'gives access to the url' do
     expect(page.url).to eq(url)
   end
 
-  it
+  it 'gives access to the path' do
+    expect(page.path).to eq('/test')
+  end
+
+  it 'gives access to the page id' do
     expect(page.id).to eq(page_id)
   end
 
-  it
+  it 'gives access to the parent page id' do
     expect(page.parent_id).to eq(parent_page_id)
   end
 
@@ -68,6 +73,7 @@ RSpec.describe Grell::Page do
     proxy.stub(url).and_return(body: '', code: 200, headers: {})
     page.navigate
   end
+
   it '#retries return 0' do
     expect(page.retries).to eq(0)
   end

@@ -79,6 +85,7 @@ RSpec.describe Grell::Page do
     page.navigate
     page.navigate
   end
+
   it '#retries return 1' do
     expect(page.retries).to eq(1)
   end

@@ -98,8 +105,8 @@ RSpec.describe Grell::Page do
     end
   end
 
-  [Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
-
+  [ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
+    Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError ].each do |error_type|
 
   context "#{error_type}" do
     let(:headers) do

@@ -109,25 +116,27 @@ RSpec.describe Grell::Page do
       errorMessage: error_message
       }
     end
-    let(:error_message) {'Trusmis broke it again'}
-    let(:now) {Time.now}
+    let(:error_message) { 'Trusmis broke it again' }
+    let(:now) { Time.now }
+
     before do
       allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error')
       allow_any_instance_of(error_type).to receive(:message).and_return(error_message)
       page.navigate
     end
+
     it_behaves_like 'an errored grell page'
   end
 end
 
 
 context 'we have not yet navigated to the page' do
-  let(:visited) {false}
-  let(:status) {nil}
-  let(:body) {''}
-  let(:links) {[]}
-  let(:expected_headers) {{}}
-  let(:now) {nil}
+  let(:visited) { false }
+  let(:status) { nil }
+  let(:body) { '' }
+  let(:links) { [] }
+  let(:expected_headers) { {} }
+  let(:now) { nil }
 
   before do
     proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -138,11 +147,11 @@ RSpec.describe Grell::Page do
   end
 
   context 'navigating to the URL we get a 404' do
-    let(:visited) {true}
-    let(:status) { 404}
-    let(:body) {'<html><head></head><body>nothing cool</body></html>'}
-    let(:links) {[]}
-    let(:expected_headers) {returned_headers}
+    let(:visited) { true }
+    let(:status) { 404 }
+    let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+    let(:links) { [] }
+    let(:expected_headers) { returned_headers }
 
     before do
       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -154,17 +163,19 @@ RSpec.describe Grell::Page do
   end
 
   context 'navigating to an URL with redirects, follows them transparently' do
-    let(:visited) {true}
-    let(:status) { 200}
-    let(:body) {'<html><head></head><body>nothing cool</body></html>'}
-    let(:links) {[]}
-    let(:expected_headers) {returned_headers}
-    let(:real_url) {'http://example.com/other'}
+    let(:visited) { true }
+    let(:status) { 200 }
+    let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+    let(:links) { [] }
+    let(:expected_headers) { returned_headers }
+    let(:real_url) { 'http://example.com/other' }
+
     before do
       proxy.stub(url).and_return(:redirect_to => real_url)
       proxy.stub(real_url).and_return(body: body, code: status, headers: returned_headers.dup)
       page.navigate
     end
+
     it_behaves_like 'a grell page'
 
     it 'followed_redirects? is true' do

@@ -178,11 +189,11 @@ RSpec.describe Grell::Page do
 
   #Here also add examples that may happen for almost all pages (no errors, no redirects)
   context 'navigating to the URL we get page with no links' do
-    let(:visited) {true}
-    let(:status) { 200}
-    let(:body) {'<html><head></head><body>nothing cool</body></html>'}
-    let(:links) {[]}
-    let(:expected_headers) {returned_headers}
+    let(:visited) { true }
+    let(:status) { 200 }
+    let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+    let(:links) { [] }
+    let(:expected_headers) { returned_headers }
 
     before do
       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -205,8 +216,8 @@ RSpec.describe Grell::Page do
   end
 
   context 'navigating to the URL we get page with links using a elements' do
-    let(:visited) {true}
-    let(:status) { 200}
+    let(:visited) { true }
+    let(:status) { 200 }
     let(:body) do
       "<html><head></head><body>
       Hello world!

@@ -215,8 +226,8 @@ RSpec.describe Grell::Page do
       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
       </body></html>"
     end
-    let(:links) {[
-    let(:expected_headers) {returned_headers}
+    let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] }
+    let(:expected_headers) { returned_headers }
 
     before do
       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -231,8 +242,8 @@ RSpec.describe Grell::Page do
   end
 
   context 'navigating to the URL we get page with links with absolute links' do
-    let(:visited) {true}
-    let(:status) { 200}
+    let(:visited) { true }
+    let(:status) { 200 }
     let(:body) do
       "<html><head></head><body>
       Hello world!

@@ -241,8 +252,8 @@ RSpec.describe Grell::Page do
       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
       </body></html>"
     end
-    let(:links) {[
-    let(:expected_headers) {returned_headers}
+    let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] }
+    let(:expected_headers) { returned_headers }
 
     before do
       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -257,8 +268,8 @@ RSpec.describe Grell::Page do
   end
 
   context 'navigating to the URL we get page with links using a mix of elements' do
-    let(:visited) {true}
-    let(:status) { 200}
+    let(:visited) { true }
+    let(:status) { 200 }
     let(:body) do
       "<html><head></head><body>
       Hello world!

@@ -274,11 +285,10 @@ RSpec.describe Grell::Page do
       </body></html>"
     end
     let(:links) do
-      [
-
-      ]
+      [ 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html',
+        'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html' ]
     end
-    let(:expected_headers) {returned_headers}
+    let(:expected_headers) { returned_headers }
 
     before do
       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -287,16 +297,36 @@ RSpec.describe Grell::Page do
 
     it_behaves_like 'a grell page'
 
+    describe '#path' do
+      context 'proper url' do
+        let(:url) { 'http://www.anyurl.com/path' }
+        let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+
+        it 'returns the path' do
+          expect(page.path).to eq('/path')
+        end
+      end
+
+      context 'broken url' do
+        let(:url) { 'www.an.asda.fasfasf.yurl.com/path' }
+        let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+
+        it 'returns the path' do
+          expect(page.path).to eq(url)
+        end
+      end
+    end
+
     it 'do not return links to external websites' do
       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
     end
   end
 
   context 'navigating to the URL we get page with links inside the header section of the code' do
-    let(:visited) {true}
-    let(:status) { 200}
-    let(:css) {'/application.css'}
-    let(:favicon) {'/favicon.ico'}
+    let(:visited) { true }
+    let(:status) { 200 }
+    let(:css) { '/application.css' }
+    let(:favicon) { '/favicon.ico' }
     let(:body) do
       "<html><head>
       <title>mimi</title>

@@ -309,9 +339,9 @@ RSpec.describe Grell::Page do
       </body></html>"
     end
     let(:links) do
-      [
+      ['http://www.example.com/trusmis.html']
     end
-    let(:expected_headers) {returned_headers}
+    let(:expected_headers) { returned_headers }
 
     before do
       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)

@@ -338,11 +368,12 @@ RSpec.describe Grell::Page do
     proxy.stub(url).and_return(body: body, code: nil, headers: {})
     page.navigate
   end
-
-  let(:
-  let(:
-  let(:
-  let(:
+
+  let(:visited) { true }
+  let(:status) { nil }
+  let(:body) { '' }
+  let(:links) { [] }
+  let(:expected_headers) { {} }
 
   it_behaves_like 'a grell page'
 end
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: grell
 version: !ruby/object:Gem::Version
-  version: 1.5.1
+  version: '1.6'
 platform: ruby
 authors:
 - Jordi Polo Carres
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-02-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara