grell 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
+ module Grell
+   class PageCollection
+     attr_reader :collection
+
+     def initialize
+       @collection = []
+     end
+
+     def create_page(url, parent_id)
+       page_id = next_id
+       page = Page.new(url, page_id, parent_id)
+       add(page)
+       page
+     end
+
+     def visited_pages
+       @collection.select { |page| page.visited? }
+     end
+
+     def discovered_pages
+       @collection - visited_pages
+     end
+
+     def next_page
+       discovered_pages.sort_by { |page| page.parent_id }.first
+     end
+
+     private
+
+     def next_id
+       @collection.size
+     end
+
+     def add(page)
+       # Deduplication is by full URL, compared case-insensitively. Pages differing only in
+       # their query parameters stay distinct: in some cases, such as behind proxies, the query selects a different page.
+       new_url = @collection.none? do |collection_page|
+         collection_page.url.downcase == page.url.downcase
+       end
+       if new_url
+         @collection.push page
+       end
+     end
+
+   end
+ end
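A minimal usage sketch of PageCollection (hypothetical URLs; it assumes grell is loaded and, as the specs below do, that pages can be created without navigating): create_page deduplicates case-insensitively on the full URL, and next_page picks the discovered page with the lowest parent_id.

    collection = Grell::PageCollection.new
    first  = collection.create_page('http://example.com/a', 2)
    second = collection.create_page('http://example.com/b', 0)
    collection.create_page('HTTP://EXAMPLE.COM/B', 0)  # case-insensitive duplicate, not stored

    collection.collection.size  # => 2
    collection.next_page        # => second: lowest parent_id among discovered pages

Note that a rejected duplicate is still assigned an id (next_id is simply the collection size at creation time); the 'second page has id 1' spec below depends on this.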
@@ -0,0 +1,37 @@
+ module Grell
+   # This class depends heavily on Capybara but contains no logic.
+   class RawPage
+     include Capybara::DSL
+
+     def navigate(url)
+       visit(url)
+     end
+
+     def headers
+       page.response_headers
+     end
+
+     def status
+       page.status_code
+     end
+
+     def body
+       page.body
+     end
+
+     def all_anchors
+       # Elements other than "a" can also act as links: JavaScript is often used to make
+       # non-anchor elements clickable, so we collect anything with an href or data-href.
+       all('[href]', visible: false).to_a + all('[data-href]', visible: false).to_a
+     end
+
+     def host
+       page.current_host
+     end
+
+     def has_selector?(selector)
+       page.has_selector?(selector)
+     end
+   end
+ end
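Because RawPage is a thin wrapper with no logic of its own, it can be driven directly once a Capybara driver is registered. A hedged sketch (Poltergeist setup shown because this gem version targets PhantomJS; the URL is a placeholder):

    require 'grell'
    require 'capybara/poltergeist'

    Capybara.register_driver(:poltergeist) do |app|
      Capybara::Poltergeist::Driver.new(app, js_errors: false)
    end
    Capybara.default_driver = :poltergeist

    raw = Grell::RawPage.new
    raw.navigate('http://example.com/')
    raw.status       # => 200
    raw.all_anchors  # => every node carrying href or data-href, visible or not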
@@ -0,0 +1,15 @@
+ module Grell
+   class Reader
+     def self.wait_for(action, max_waiting, sleeping_time)
+       time_start = Time.now
+       action.call
+       return if yield
+       while Time.now < time_start + max_waiting
+         action.call
+         break if yield
+         sleep(sleeping_time)
+       end
+     end
+   end
+ end
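Reader.wait_for is a small polling helper: it runs the action once, returns immediately if the block is already true, and otherwise keeps re-running the action every sleeping_time seconds until the block turns true or max_waiting seconds elapse. A sketch with illustrative values (raw_page and url are assumed from the RawPage example above):

    # Retry navigation for up to 10 seconds, polling every 0.5 seconds,
    # until the page reports an HTTP status.
    Grell::Reader.wait_for(-> { raw_page.navigate(url) }, 10, 0.5) do
      !raw_page.status.nil?
    end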
@@ -0,0 +1,3 @@
+ module Grell
+   VERSION = "1.3.0"
+ end
@@ -0,0 +1,108 @@
+ RSpec.describe Grell::Crawler do
+   let(:page_id) { rand(10).floor + 10 }
+   let(:parent_page_id) { rand(10).floor }
+   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+   let(:host) { "http://www.example.com" }
+   let(:url) { "http://www.example.com/test" }
+   let(:crawler) { Grell::Crawler.new(external_driver: true) }
+   let(:body) { 'body' }
+
+   before do
+     proxy.stub(url).and_return(body: body, code: 200)
+   end
+
+   describe 'initialize' do
+     it 'can provide your own logger' do
+       Grell::Crawler.new(external_driver: true, logger: 33)
+       expect(Grell.logger).to eq(33)
+     end
+
+     it 'provides a stdout logger if nothing provided' do
+       crawler
+       expect(Grell.logger).to be_instance_of(Logger)
+     end
+   end
+
+   context '#crawl' do
+     it 'yields the result if a block is given' do
+       result = []
+       block = Proc.new { |n| result.push(n) }
+       crawler.crawl(page, block)
+       expect(result.size).to eq(1)
+       expect(result.first.url).to eq(url)
+       expect(result.first.visited?).to eq(true)
+     end
+
+     it 'logs interesting information' do
+       crawler
+       expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
+       crawler.crawl(page, nil)
+     end
+   end
+
+   context '#start_crawling' do
+     let(:body) do
+       <<-EOS
+       <html><head></head><body>
+       <a href="/musmis.html">trusmis</a>
+       Hello world!
+       </body></html>
+       EOS
+     end
+     let(:url_visited) { "http://www.example.com/musmis.html" }
+
+     before do
+       proxy.stub(url_visited).and_return(body: 'body', code: 200)
+     end
+
+     it 'calls the block we used to start_crawling' do
+       result = []
+       block = Proc.new { |n| result.push(n) }
+       crawler.start_crawling(url, &block)
+       expect(result.size).to eq(2)
+       expect(result[0].url).to eq(url)
+       expect(result[1].url).to eq(url_visited)
+     end
+   end
+
+   context 'the url has no links' do
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       </body></html>"
+     end
+
+     before do
+       crawler.start_crawling(url)
+     end
+
+     it 'visits all the pages' do
+       expect(crawler.collection.visited_pages.size).to eq(1)
+     end
+
+     it 'has no more pages to discover' do
+       expect(crawler.collection.discovered_pages.size).to eq(0)
+     end
+   end
+
+   context 'the url has several links' do
+     let(:body) do
+       "<html><head></head><body>
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"/help.html\">help</a>
+       Hello world!
+       </body></html>"
+     end
+
+     before do
+       proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+       proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+     end
+
+     it 'visits all the pages' do
+       crawler.start_crawling(url)
+       expect(crawler.collection.visited_pages.size).to eq(3)
+     end
+
+     it 'has no more pages to discover' do
+       crawler.start_crawling(url)
+       expect(crawler.collection.discovered_pages.size).to eq(0)
+     end
+   end
+ end
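Outside the specs, the crawl loop exercised above is normally driven through start_crawling with a block; a minimal sketch (the target URL is a placeholder, and the gem's default driver setup is assumed):

    crawler = Grell::Crawler.new
    crawler.start_crawling('http://example.com') do |page|
      puts "#{page.status} #{page.url} (#{page.links.size} links)"
    end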
@@ -0,0 +1,149 @@
+ RSpec.describe Grell::PageCollection do
+   let(:collection) { Grell::PageCollection.new }
+   let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' }
+   let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' }
+
+   context 'empty collection' do
+     it 'has no visited pages' do
+       expect(collection.visited_pages).to be_empty
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one unvisited page' do
+     let(:page) { collection.create_page(url, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(false)
+     end
+
+     it 'has no visited pages' do
+       expect(collection.visited_pages).to be_empty
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([page])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(page)
+     end
+   end
+
+   context 'one visited page' do
+     let(:page) { collection.create_page(url, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one visited and one unvisited page with the same url' do
+     let(:page) { collection.create_page(url, 0) }
+     let(:unvisited) { collection.create_page(url.upcase, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'first page has id 0' do
+       expect(page.id).to eq(0)
+     end
+
+     it 'second page has id 1' do
+       expect(unvisited.id).to eq(1)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one visited and one unvisited page with different URLs' do
+     let(:page) { collection.create_page(url, 0) }
+     let(:unvisited) { collection.create_page(url2, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([unvisited])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(unvisited)
+     end
+   end
+
+   context 'one visited and one unvisited page with URLs differing only in the query string' do
+     let(:page) { collection.create_page(url, 0) }
+     let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' }
+     let(:unvisited) { collection.create_page(url3, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([unvisited])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(unvisited)
+     end
+   end
+
+   context 'one visited and one unvisited page with different parents' do
+     let(:page) { collection.create_page(url, 2) }
+     let(:page2) { collection.create_page(url2, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(page2).to receive(:visited?).and_return(false)
+     end
+
+     it 'returns the page which has an earlier parent' do
+       expect(collection.next_page).to eq(page2)
+     end
+   end
+ end
@@ -0,0 +1,284 @@
+ RSpec.describe Grell::Page do
+   let(:page_id) { rand(10).floor + 10 }
+   let(:parent_page_id) { rand(10).floor }
+   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+   let(:host) { "http://www.example.com" }
+   let(:url) { "http://www.example.com/test" }
+   let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' } }
+   let(:now) { Time.now }
+
+   before do
+     allow(Time).to receive(:now).and_return(now)
+   end
+
+   it "gives access to the url" do
+     expect(page.url).to eq(url)
+   end
+
+   it "gives access to the page id" do
+     expect(page.id).to eq(page_id)
+   end
+
+   it "gives access to the parent page id" do
+     expect(page.parent_id).to eq(parent_page_id)
+   end
+
+   it 'newly created page does not have a status yet' do
+     expect(page.status).to eq(nil)
+   end
+
+   shared_examples_for 'a grell page' do
+     it 'returns the correct status' do
+       expect(page.status).to eq(status)
+     end
+
+     it 'has the correct body' do
+       expect(page.body).to eq(body)
+     end
+
+     it 'has the correct headers' do
+       expect(page.headers).to include(expected_headers)
+     end
+
+     it 'has the correct links' do
+       expect(page.links.sort).to eq(links.sort)
+     end
+
+     it '#visited? returns the correct value' do
+       expect(page.visited?).to eq(visited)
+     end
+
+     it 'has the correct timestamp' do
+       expect(page.timestamp).to eq(now)
+     end
+   end
+
+   shared_examples_for 'an errored grell page' do
+     it 'returns an empty 404 page after navigating' do
+       expect(page.status).to eq(404)
+       expect(page.links).to eq([])
+       expect(page.headers).to eq(headers)
+       expect(page.body).to eq('')
+       expect(page.has_selector?('html')).to eq(false)
+       expect(page).to be_visited
+       expect(page.timestamp).to eq(now)
+       # expect_any_instance_of(Logger).to receive(:warn) #.with(/The page with the URL #{url} was not available"/)
+     end
+   end
+
+   [Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
+    Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError].each do |error_type|
+     context "#{error_type}" do
+       let(:headers) do
+         {
+           grellStatus: 'Error',
+           errorClass: "#{error_type}",
+           errorMessage: error_message
+         }
+       end
+       let(:error_message) { 'Trusmis broke it again' }
+       let(:now) { Time.now }
+
+       before do
+         allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error')
+         allow_any_instance_of(error_type).to receive(:message).and_return(error_message)
+         page.navigate
+       end
+
+       it_behaves_like 'an errored grell page'
+     end
+   end
+
+   context 'we have not yet navigated to the page' do
+     let(:visited) { false }
+     let(:status) { nil }
+     let(:body) { '' }
+     let(:links) { [] }
+     let(:expected_headers) { {} }
+     let(:now) { nil }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get a 404' do
+     let(:visited) { true }
+     let(:status) { 404 }
+     let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+     let(:links) { [] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get a page with no links' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+     let(:links) { [] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get a page with links in anchor elements' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"/help.html\">help</a>
+       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
+       </body></html>"
+     end
+     let(:links) { ["http://www.example.com/trusmis.html", "http://www.example.com/help.html"] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get a page with absolute links' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"http://www.example.com/help.html\">help</a>
+       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
+       </body></html>"
+     end
+     let(:links) { ["http://www.example.com/trusmis.html", "http://www.example.com/help.html"] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get a page with links in a mix of elements' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <table>
+         <tbody>
+           <tr href=\"/help_me.html\"><td>help</td></tr>
+           <tr data-href=\"/help.html\"><td>help</td></tr>
+         </tbody>
+       </table>
+       <div data-href=\"http://www.example.com/more_help.html\">help</div>
+       <div data-href=\"http://www.outsidewebsite.com/help.html\">help</div>
+       </body></html>"
+     end
+     let(:links) do
+       ["http://www.example.com/trusmis.html", "http://www.example.com/help.html",
+        'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html']
+     end
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get a page with links inside the head section' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:css) { '/application.css' }
+     let(:favicon) { '/favicon.ico' }
+     let(:body) do
+       "<html><head>
+       <title>mimi</title>
+       <link href=\"#{css}\" rel=\"stylesheet\">
+       <link href=\"#{favicon}\" rel=\"shortcut icon\" type=\"image/vnd.microsoft.icon\">
+       </head>
+       <body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       </body></html>"
+     end
+     let(:links) { ["http://www.example.com/trusmis.html"] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       # We need to stub these or PhantomJS will get stuck trying to retrieve the resources
+       proxy.stub(host + css).and_return(body: '', code: status)
+       proxy.stub(host + favicon).and_return(body: '', code: status)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to resources in the head section' do
+       expect(page.links).to_not include('http://www.example.com/application.css')
+     end
+   end
+
+   context 'status is never set' do # this may happen when nothing at all comes back from the site
+     let(:visited) { true }
+     let(:status) { nil }
+     let(:body) { '' }
+     let(:links) { [] }
+     let(:expected_headers) { {} }
+
+     before do
+       stub_const('Grell::Page::WAIT_TIME', 0)
+       allow_any_instance_of(Grell::RawPage).to receive(:status).and_return(nil)
+       allow_any_instance_of(Grell::RawPage).to receive(:headers).and_return({})
+       allow_any_instance_of(Grell::RawPage).to receive(:body).and_return('')
+       proxy.stub(url).and_return(body: body, code: nil, headers: {})
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+ end