grell 1.3.0

@@ -0,0 +1,46 @@
+ module Grell
+   class PageCollection
+     attr_reader :collection
+
+     def initialize
+       @collection = []
+     end
+
+     def create_page(url, parent_id)
+       page_id = next_id
+       page = Page.new(url, page_id, parent_id)
+       add(page)
+       page
+     end
+
+     def visited_pages
+       @collection.select {|page| page.visited?}
+     end
+
+     def discovered_pages
+       @collection - visited_pages
+     end
+
+     def next_page
+       discovered_pages.sort_by{|page| page.parent_id}.first
+     end
+
+     private
+
+     def next_id
+       @collection.size
+     end
+
+     def add(page)
+       # Finding unique pages by full URL still adds pages that differ only in their query parameters.
+       # That is intentional: in some cases, such as when using proxies, different query parameters do link to different pages.
+       new_url = @collection.none? do |collection_page|
+         collection_page.url.downcase == page.url.downcase
+       end
+       if new_url
+         @collection.push page
+       end
+     end
+
+   end
+ end
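
To make the intent of PageCollection above concrete, here is a minimal usage sketch (the URLs are hypothetical; it assumes Grell::Page is loaded and that pages count as unvisited until they have been navigated):

    collection = Grell::PageCollection.new
    page_a = collection.create_page('http://example.com/a', 0)   # discovered from the root page (parent_id 0)
    page_b = collection.create_page('http://example.com/b', 3)   # discovered from a deeper page (parent_id 3)
    collection.create_page('HTTP://EXAMPLE.COM/A', 0)            # dropped: duplicate URL, compared case-insensitively
    collection.visited_pages     # => []
    collection.discovered_pages  # => [page_a, page_b]
    collection.next_page         # => page_a, the discovered page with the lowest parent_id
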
@@ -0,0 +1,37 @@
+ module Grell
+   # This class depends heavily on Capybara but contains no logic.
+   class RawPage
+     include Capybara::DSL
+
+     def navigate(url)
+       visit(url)
+     end
+
+     def headers
+       page.response_headers
+     end
+
+     def status
+       page.status_code
+     end
+
+     def body
+       page.body
+     end
+
+     def all_anchors
+       # Some elements are not "a" elements but still provide a link; JavaScript is commonly used
+       # to make such non-link elements clickable as if they were regular links.
+       all('[href]', visible: false).to_a + all('[data-href]', visible: false).to_a
+     end
+
+
+     def host
+       page.current_host
+     end
+
+     def has_selector?(selector)
+       page.has_selector?(selector)
+     end
+   end
+ end
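
RawPage is only a thin wrapper over a Capybara session; a hedged sketch of how it might be driven (it assumes a Capybara driver such as Poltergeist has already been registered and configured elsewhere):

    raw = Grell::RawPage.new
    raw.navigate('http://example.com/')                       # visits the URL with the current Capybara session
    raw.status                                                 # => e.g. 200
    raw.all_anchors.map { |a| a['href'] || a['data-href'] }    # collects both href and data-href targets
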
@@ -0,0 +1,15 @@
+ module Grell
+   class Reader
+     def self.wait_for(action, max_waiting, sleeping_time)
+       time_start = Time.now
+       action.call()
+       return if yield
+       while (Time.now < time_start + max_waiting)
+         action.call()
+         break if yield
+         sleep(sleeping_time)
+       end
+     end
+
+   end
+ end
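
Reader.wait_for runs the action once, then keeps retrying it every sleeping_time seconds until the block returns true or max_waiting seconds have elapsed. A minimal sketch (the names are hypothetical):

    attempts = 0
    Grell::Reader.wait_for(-> { attempts += 1 }, 2, 0.5) do
      attempts >= 3   # checked after every call of the action; stops the polling once true
    end
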
@@ -0,0 +1,3 @@
+ module Grell
+   VERSION = "1.3.0"
+ end
@@ -0,0 +1,108 @@
+
+ RSpec.describe Grell::Crawler do
+   let(:page_id) { rand(10).floor + 10}
+   let(:parent_page_id) {rand(10).floor}
+   let(:page) {Grell::Page.new(url, page_id, parent_page_id)}
+   let(:host) {"http://www.example.com"}
+   let(:url) {"http://www.example.com/test"}
+   let(:crawler) { Grell::Crawler.new(external_driver: true)}
+   let(:body) {'body'}
+
+   before do
+     proxy.stub(url).and_return(body: body, code: 200)
+   end
+
+   describe 'initialize' do
+     it 'can provide your own logger' do
+       Grell::Crawler.new(external_driver: true, logger: 33)
+       expect(Grell.logger).to eq(33)
+     end
+     it 'provides a stdout logger if nothing provided' do
+       crawler
+       expect(Grell.logger).to be_instance_of(Logger)
+     end
+   end
+
+   context '#crawl' do
+     it 'yields the result if a block is given' do
+       result = []
+       block = Proc.new {|n| result.push(n) }
+       crawler.crawl(page, block)
+       expect(result.size).to eq(1)
+       expect(result.first.url).to eq(url)
+       expect(result.first.visited?).to eq(true)
+     end
+
+     it 'logs interesting information' do
+       crawler
+       expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
+       crawler.crawl(page, nil)
+     end
+   end
+
+   context '#start_crawling' do
+     let(:body) do
+       <<-EOS
+       <html><head></head><body>
+       <a href="/musmis.html">trusmis</a>
+       Hello world!
+       </body></html>
+       EOS
+     end
+     let(:url_visited) {"http://www.example.com/musmis.html"}
+     before do
+       proxy.stub(url_visited).and_return(body: 'body', code: 200)
+     end
+
+     it 'calls the block we used to start_crawling' do
+       result = []
+       block = Proc.new {|n| result.push(n) }
+       crawler.start_crawling(url, &block)
+       expect(result.size).to eq(2)
+       expect(result[0].url).to eq(url)
+       expect(result[1].url).to eq(url_visited)
+     end
+   end
+
+   context 'the url has no links' do
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       </body></html>"
+     end
+     before do
+       crawler.start_crawling(url)
+     end
+     it 'visits all the pages' do
+       expect(crawler.collection.visited_pages.size).to eq(1)
+     end
+     it 'has no more pages to discover' do
+       expect(crawler.collection.discovered_pages.size).to eq(0)
+     end
+   end
+
+   context 'the url has several links' do
+     let(:body) do
+       "<html><head></head><body>
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"/help.html\">help</a>
+       Hello world!
+       </body></html>"
+     end
+     before do
+       proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+       proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+     end
+
+     it 'visits all the pages' do
+       crawler.start_crawling(url)
+       expect(crawler.collection.visited_pages.size).to eq(3)
+     end
+     it 'has no more pages to discover' do
+       crawler.start_crawling(url)
+       expect(crawler.collection.discovered_pages.size).to eq(0)
+     end
+   end
+
+
+ end
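
The crawler spec above exercises the public API end to end; a hedged sketch of the same flow outside the specs (it assumes PhantomJS/Poltergeist is installed, and the URL is hypothetical):

    crawler = Grell::Crawler.new
    crawler.start_crawling('http://example.com/') do |page|
      puts "#{page.status} #{page.url} (#{page.links.size} links)"
    end
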
@@ -0,0 +1,149 @@
+
+ RSpec.describe Grell::PageCollection do
+   let(:collection) {Grell::PageCollection.new}
+   let(:url) {'http://www.github.com/SomeUser/dragonlance?search=false'}
+   let(:url2) {'http://www.github.com/OtherUser/forgotten?search=false'}
+
+   context 'empty collection' do
+
+     it 'has no visited pages' do
+       expect(collection.visited_pages).to be_empty
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one unvisited page' do
+     let(:page) {collection.create_page(url, 0)}
+     before do
+       allow(page).to receive(:visited?).and_return(false)
+     end
+
+     it 'has no visited pages' do
+       expect(collection.visited_pages).to be_empty
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([page])
+
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(page)
+     end
+   end
+
+   context 'one visited page' do
+     let(:page) {collection.create_page(url, 0)}
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one visited and one unvisited page with the same url' do
+     let(:page) {collection.create_page(url, 0)}
+     let(:unvisited) {collection.create_page(url.upcase, 0)}
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'first page has id 0' do
+       expect(page.id).to eq(0)
+     end
+
+     it 'second page has id 1' do
+       expect(unvisited.id).to eq(1)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one visited and one unvisited page with different URLs' do
+     let(:page) {collection.create_page(url, 0)}
+     let(:unvisited) {collection.create_page(url2, 0)}
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([unvisited])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(unvisited)
+     end
+   end
+
+   context 'one visited and one unvisited page with different URLs only different by the query' do
+     let(:page) {collection.create_page(url, 0)}
+     let(:url3) {'http://www.github.com/SomeUser/dragonlance?search=true'}
+     let(:unvisited) {collection.create_page(url3, 0)}
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([unvisited])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(unvisited)
+     end
+   end
+
+   context 'several unvisited pages' do
+     let(:page) {collection.create_page(url, 2)}
+     let(:page2) {collection.create_page(url2, 0)}
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(page2).to receive(:visited?).and_return(false)
+     end
+
+     it "returns the page which has an earlier parent" do
+       expect(collection.next_page).to eq(page2)
+     end
+
+   end
+
+
+
+ end
@@ -0,0 +1,284 @@
+ RSpec.describe Grell::Page do
+
+   let(:page_id) { rand(10).floor + 10}
+   let(:parent_page_id) {rand(10).floor}
+   let(:page) {Grell::Page.new(url, page_id, parent_page_id)}
+   let(:host) {"http://www.example.com"}
+   let(:url) {"http://www.example.com/test"}
+   let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' }}
+   let(:now) {Time.now}
+   before do
+     allow(Time).to receive(:now).and_return(now)
+   end
+
+   it "gives access to the url" do
+     expect(page.url).to eq(url)
+   end
+
+   it "gives access to the page id" do
+     expect(page.id).to eq(page_id)
+   end
+
+   it "gives access to the parent page id" do
+     expect(page.parent_id).to eq(parent_page_id)
+   end
+
+   it 'newly created page does not have status yet' do
+     expect(page.status).to eq(nil)
+   end
+
+   shared_examples_for 'a grell page' do
+
+     it 'returns the correct status' do
+       expect(page.status).to eq(status)
+     end
+
+     it 'has the correct body' do
+       expect(page.body).to eq(body)
+     end
+
+     it 'has correct headers' do
+       expect(page.headers).to include(expected_headers)
+     end
+
+     it 'has the correct links' do
+       expect(page.links.sort).to eq(links.sort)
+     end
+
+     it '#visited? returns the correct value' do
+       expect(page.visited?).to eq(visited)
+     end
+
+     it 'has correct timestamp' do
+       expect(page.timestamp).to eq(now)
+     end
+
+   end
+
+   shared_examples_for 'an errored grell page' do
+     it 'returns empty status 404 page after navigating' do
+       expect(page.status).to eq(404)
+       expect(page.links).to eq([])
+       expect(page.headers).to eq(headers)
+       expect(page.body).to eq('')
+       expect(page.has_selector?('html')).to eq(false)
+       expect(page).to be_visited
+       expect(page.timestamp).to eq(now)
+       #expect_any_instance_of(Logger).to receive(:warn) #.with(/The page with the URL #{url} was not available"/)
+     end
+   end
+
+   [Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
+    Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError ].each do |error_type|
+
+     context "#{error_type}" do
+       let(:headers) do
+         {
+           grellStatus: 'Error',
+           errorClass: "#{error_type}",
+           errorMessage: error_message
+         }
+       end
+       let(:error_message) {'Trusmis broke it again'}
+       let(:now) {Time.now}
+       before do
+         allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error')
+         allow_any_instance_of(error_type).to receive(:message).and_return(error_message)
+         page.navigate
+       end
+       it_behaves_like 'an errored grell page'
+     end
+   end
+
+
+   context 'we have not yet navigated to the page' do
+     let(:visited) {false}
+     let(:status) {nil}
+     let(:body) {''}
+     let(:links) {[]}
+     let(:expected_headers) {{}}
+     let(:now) {nil}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+     end
+
+     it_behaves_like 'a grell page'
+
+   end
+
+   context 'navigating to the URL we get a 404' do
+     let(:visited) {true}
+     let(:status) { 404}
+     let(:body) {'<html><head></head><body>nothing cool</body></html>'}
+     let(:links) {[]}
+     let(:expected_headers) {returned_headers}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+   end
+
+   context 'navigating to the URL we get page with no links' do
+     let(:visited) {true}
+     let(:status) { 200}
+     let(:body) {'<html><head></head><body>nothing cool</body></html>'}
+     let(:links) {[]}
+     let(:expected_headers) {returned_headers}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get page with links using a elements' do
+     let(:visited) {true}
+     let(:status) { 200}
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"/help.html\">help</a>
+       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
+       </body></html>"
+     end
+     let(:links) {["http://www.example.com/trusmis.html", "http://www.example.com/help.html"]}
+     let(:expected_headers) {returned_headers}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get page with links with absolute links' do
+     let(:visited) {true}
+     let(:status) { 200}
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"http://www.example.com/help.html\">help</a>
+       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
+       </body></html>"
+     end
+     let(:links) {["http://www.example.com/trusmis.html", "http://www.example.com/help.html"]}
+     let(:expected_headers) {returned_headers}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get page with links using a mix of elements' do
+     let(:visited) {true}
+     let(:status) { 200}
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <table>
+       <tbody>
+       <tr href=\"/help_me.html\"><td>help</td></tr>
+       <tr data-href=\"/help.html\"><td>help</td></tr>
+       </tbody>
+       </table>
+       <div data-href=\"http://www.example.com/more_help.html\">help</div>
+       <div data-href=\"http://www.outsidewebsite.com/help.html\">help</div>
+       </body></html>"
+     end
+     let(:links) do
+       ["http://www.example.com/trusmis.html", "http://www.example.com/help.html",
+        'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html'
+       ]
+     end
+     let(:expected_headers) {returned_headers}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get page with links inside the header section of the code' do
+     let(:visited) {true}
+     let(:status) { 200}
+     let(:css) {'/application.css'}
+     let(:favicon) {'/favicon.ico'}
+     let(:body) do
+       "<html><head>
+       <title>mimi</title>
+       <link href=\"#{css}\" rel=\"stylesheet\">
+       <link href=\"#{favicon}\" rel=\"shortcut icon\" type=\"image/vnd.microsoft.icon\">
+       </head>
+       <body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       </body></html>"
+     end
+     let(:links) do
+       ["http://www.example.com/trusmis.html"]
+     end
+     let(:expected_headers) {returned_headers}
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       # We need to stub these or PhantomJS will get stuck trying to retrieve the resources
+       proxy.stub(host + css).and_return(body: '', code: status)
+       proxy.stub(host + favicon).and_return(body: '', code: status)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to resources in the header' do
+       expect(page.links).to_not include('http://www.example.com/application.css')
+     end
+
+   end
+
+   context 'status is never set' do # this may happen when there is nothing coming from the site
+     before do
+       stub_const('Grell::Page::WAIT_TIME', 0)
+       allow_any_instance_of(Grell::RawPage).to receive(:status).and_return(nil)
+       allow_any_instance_of(Grell::RawPage).to receive(:headers).and_return({})
+       allow_any_instance_of(Grell::RawPage).to receive(:body).and_return('')
+       proxy.stub(url).and_return(body: body, code: nil, headers: {})
+       page.navigate
+     end
+     let(:visited) {true}
+     let(:status) { nil}
+     let(:body) {''}
+     let(:links) {[]}
+     let(:expected_headers) {{}}
+
+     it_behaves_like 'a grell page'
+   end
+
+ end
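
For reference, the behaviour pinned down by the page spec above can be summarised in a short sketch (the URL is hypothetical; navigation needs a working Poltergeist driver, and the Poltergeist/URI errors listed in the spec are rescued into an empty 404-style page):

    page = Grell::Page.new('http://example.com/', 0, 0)
    page.visited?   # => false until navigate has been called
    page.navigate   # drives RawPage/Capybara and records status, headers, body and links
    page.links      # => only links on the same host, ignoring <link> tags in the head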