grell 1.6.10 → 1.6.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/CHANGELOG.md +4 -0
- data/README.md +11 -0
- data/lib/grell/crawler.rb +10 -6
- data/lib/grell/page.rb +2 -1
- data/lib/grell/rawpage.rb +11 -1
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +17 -1
- data/spec/lib/page_spec.rb +11 -0
- data/spec/spec_helper.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a3668b60b187d3b4dab183bf6b386738813a1ec
|
4
|
+
data.tar.gz: 32ddba97d8dca3abca1cffb4d44760d0679eea0b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ba545ee45bbdc4a43b85045d6e09c0a1cfd7f0d7605dd5abee455434426b84cf11fe706a46a0128cc2b35ab04ff0f0d2106482a7548989a1aaa2bdb4928cf90
|
7
|
+
data.tar.gz: 0d1e2eb5dbcd688d00a39e7ff39781b577647eb6db47ac2155709d81af7e004d435521a38067258026960982f97f60cbf991993366dc61b89ae8f2d08345c4d1
|
data/.travis.yml
CHANGED
@@ -9,6 +9,6 @@ script: bundle exec rspec
|
|
9
9
|
|
10
10
|
before_install:
|
11
11
|
- mkdir travis-phantomjs
|
12
|
-
- wget https://
|
12
|
+
- wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
|
13
13
|
- tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
|
14
14
|
- export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# 1.6.11
|
2
|
+
* Ensure all links are loaded by waiting for Ajax requests to complete
|
3
|
+
* Add 'evaluate_in_each_page' option to evaluate a JavaScript snippet before extracting links (e.g. $('.dropdown').addClass('open');)
|
4
|
+
|
1
5
|
# 1.6.10
|
2
6
|
* Avoid following JS href links, add missing dependencies to fix Travis build
|
3
7
|
|
data/README.md
CHANGED
@@ -151,6 +151,17 @@ The page object generated by accessing the first URL passed to the start_crawlin
|
|
151
151
|
Using this information it is possible to construct a directed graph.
|
152
152
|
|
153
153
|
|
154
|
+
### Evaluate script
|
155
|
+
|
156
|
+
You can evaluate a JavaScript snippet in each page before extracting links by passing the snippet to the 'evaluate_in_each_page' option:
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
require 'grell'
|
160
|
+
|
161
|
+
crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');")
|
162
|
+
|
163
|
+
```
|
164
|
+
|
154
165
|
### Errors
|
155
166
|
When there is an error in the page or an internal error in the crawler (JavaScript crashed the browser, etc.), Grell will return with status 404 and the headers will have the following keys:
|
156
167
|
- grellStatus: 'Error'
|
data/lib/grell/crawler.rb
CHANGED
@@ -15,6 +15,7 @@ module Grell
|
|
15
15
|
end
|
16
16
|
|
17
17
|
@driver = CapybaraDriver.setup(options)
|
18
|
+
@evaluate_in_each_page = options[:evaluate_in_each_page]
|
18
19
|
end
|
19
20
|
|
20
21
|
# Restarts the PhantomJS process without modifying the state of visited and discovered pages.
|
@@ -55,16 +56,12 @@ module Grell
|
|
55
56
|
|
56
57
|
def crawl(site, block)
|
57
58
|
Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
|
58
|
-
site
|
59
|
-
filter!(site.links)
|
60
|
-
add_redirect_url(site)
|
59
|
+
crawl_site(site)
|
61
60
|
|
62
61
|
if block # The user of this block can send us a :retry to retry accessing the page
|
63
62
|
while crawl_block(block, site) == :retry
|
64
63
|
Grell.logger.info "Retrying our visit to #{site.url}"
|
65
|
-
site
|
66
|
-
filter!(site.links)
|
67
|
-
add_redirect_url(site)
|
64
|
+
crawl_site(site)
|
68
65
|
end
|
69
66
|
end
|
70
67
|
|
@@ -75,6 +72,13 @@ module Grell
|
|
75
72
|
|
76
73
|
private
|
77
74
|
|
75
|
+
def crawl_site(site)
|
76
|
+
site.navigate
|
77
|
+
site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page
|
78
|
+
filter!(site.links)
|
79
|
+
add_redirect_url(site)
|
80
|
+
end
|
81
|
+
|
78
82
|
# Treat any exceptions from the block as an unavailable page
|
79
83
|
def crawl_block(block, site)
|
80
84
|
block.call(site)
|
data/lib/grell/page.rb
CHANGED
@@ -26,11 +26,12 @@ module Grell
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def navigate
|
29
|
-
# We wait a maximum of WAIT_TIME seconds to get an HTML page. We try
|
29
|
+
# We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist
|
30
30
|
Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
|
31
31
|
@rawpage.status && !@rawpage.headers.empty? &&
|
32
32
|
@rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
|
33
33
|
end
|
34
|
+
@rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
|
34
35
|
@result_page = VisitedPage.new(@rawpage)
|
35
36
|
@timestamp = Time.now
|
36
37
|
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
|
data/lib/grell/rawpage.rb
CHANGED
@@ -27,7 +27,6 @@ module Grell
|
|
27
27
|
all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a
|
28
28
|
end
|
29
29
|
|
30
|
-
|
31
30
|
def host
|
32
31
|
page.current_host
|
33
32
|
end
|
@@ -36,6 +35,17 @@ module Grell
|
|
36
35
|
page.has_selector?(selector)
|
37
36
|
end
|
38
37
|
|
38
|
+
def wait_for_all_ajax_requests(timeout, interval)
|
39
|
+
Timeout::timeout(timeout) do
|
40
|
+
(timeout / interval).ceil.times do
|
41
|
+
jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;")
|
42
|
+
break if (!jquery_active || jquery_active.zero?)
|
43
|
+
sleep(interval)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
true
|
47
|
+
end
|
48
|
+
|
39
49
|
private
|
40
50
|
|
41
51
|
def follow_redirects!
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/crawler_spec.rb
CHANGED
@@ -5,7 +5,8 @@ RSpec.describe Grell::Crawler do
|
|
5
5
|
let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
|
6
6
|
let(:host) { 'http://www.example.com' }
|
7
7
|
let(:url) { 'http://www.example.com/test' }
|
8
|
-
let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true) }
|
8
|
+
let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true, evaluate_in_each_page: script) }
|
9
|
+
let(:script) { nil }
|
9
10
|
let(:body) { 'body' }
|
10
11
|
let(:custom_add_match) do
|
11
12
|
Proc.new do |collection_page, page|
|
@@ -85,6 +86,21 @@ RSpec.describe Grell::Crawler do
|
|
85
86
|
expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
|
86
87
|
crawler.crawl(page, nil)
|
87
88
|
end
|
89
|
+
|
90
|
+
context 'without script' do
|
91
|
+
it 'does not evaluate a script' do
|
92
|
+
expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script)
|
93
|
+
crawler.crawl(page, nil)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
context 'with script' do
|
98
|
+
let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" }
|
99
|
+
it 'evaluates a script' do
|
100
|
+
expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script)
|
101
|
+
crawler.crawl(page, nil)
|
102
|
+
end
|
103
|
+
end
|
88
104
|
end
|
89
105
|
|
90
106
|
context '#start_crawling' do
|
data/spec/lib/page_spec.rb
CHANGED
@@ -92,6 +92,17 @@ RSpec.describe Grell::Page do
|
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
+
describe '#navigate' do
|
96
|
+
before do
|
97
|
+
proxy.stub(url).and_return(body: '', code: 200, headers: {})
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'waits for all ajax requests' do
|
101
|
+
expect_any_instance_of(Grell::RawPage).to receive(:wait_for_all_ajax_requests).with(0, 0.5)
|
102
|
+
page.navigate
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
95
106
|
shared_examples_for 'an errored grell page' do
|
96
107
|
it 'returns empty status 404 page after navigating' do
|
97
108
|
expect(page.status).to eq(404)
|
data/spec/spec_helper.rb
CHANGED
@@ -23,6 +23,7 @@ RSpec.configure do |config|
|
|
23
23
|
# We do not need to wait for pages to return all the data
|
24
24
|
config.before do
|
25
25
|
stub_const("Grell::Page::WAIT_TIME", 0)
|
26
|
+
allow_any_instance_of(Capybara::Session).to receive(:evaluate_script).and_return(nil)
|
26
27
|
end
|
27
28
|
|
28
29
|
config.expect_with :rspec do |expectations|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.6.10
|
4
|
+
version: 1.6.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|