grell 1.6.10 → 1.6.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 866f1b7117624455b79791bacba549710eb7dc2b
4
- data.tar.gz: 4a3646c053bb7b4884fa8b82ebb24809d2d37f97
3
+ metadata.gz: 9a3668b60b187d3b4dab183bf6b386738813a1ec
4
+ data.tar.gz: 32ddba97d8dca3abca1cffb4d44760d0679eea0b
5
5
  SHA512:
6
- metadata.gz: dfb07e7c0a6a7fb2fe53a40a3da248a24ba683972c696ab7064481448cb4067403dc7350d34cbba21ff4f240fb09b3571adbb36b38bfecd28aeee1ea1551638e
7
- data.tar.gz: 7f91a206fd4cb264d73b05468dce66d858637c6d675b92088b800a85b67599201f957f879d618a0f4fc7a538de79c3211f82ffc3d73b063fec760da09209578a
6
+ metadata.gz: 6ba545ee45bbdc4a43b85045d6e09c0a1cfd7f0d7605dd5abee455434426b84cf11fe706a46a0128cc2b35ab04ff0f0d2106482a7548989a1aaa2bdb4928cf90
7
+ data.tar.gz: 0d1e2eb5dbcd688d00a39e7ff39781b577647eb6db47ac2155709d81af7e004d435521a38067258026960982f97f60cbf991993366dc61b89ae8f2d08345c4d1
@@ -9,6 +9,6 @@ script: bundle exec rspec
9
9
 
10
10
  before_install:
11
11
  - mkdir travis-phantomjs
12
- - wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
12
+ - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
13
13
  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
14
14
  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
@@ -1,3 +1,7 @@
1
+ # 1.6.11
2
+ * Ensure all links are loaded by waiting for Ajax requests to complete
3
+ * Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');)
4
+
1
5
  # 1.6.10
2
6
  * Avoid following JS href links, add missing dependencies to fix Travis build
3
7
 
data/README.md CHANGED
@@ -151,6 +151,17 @@ The page object generated by accessing the first URL passed to the start_crawlin
151
151
  Using this information it is possible to construct a directed graph.
152
152
 
153
153
 
154
+ ### Evaluate script
155
+
156
+ You can evaluate a JavaScript snippet in each page before extracting links by passing the snippet to the 'evaluate_in_each_page' option:
157
+
158
+ ```ruby
159
+ require 'grell'
160
+
161
+ crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');")
162
+
163
+ ```
164
+
154
165
  ### Errors
155
166
  When there is an error in the page or an internal error in the crawler (JavaScript crashed the browser, etc.), Grell will return with status 404 and the headers will have the following keys:
156
167
  - grellStatus: 'Error'
@@ -15,6 +15,7 @@ module Grell
15
15
  end
16
16
 
17
17
  @driver = CapybaraDriver.setup(options)
18
+ @evaluate_in_each_page = options[:evaluate_in_each_page]
18
19
  end
19
20
 
20
21
  # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
@@ -55,16 +56,12 @@ module Grell
55
56
 
56
57
  def crawl(site, block)
57
58
  Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
58
- site.navigate
59
- filter!(site.links)
60
- add_redirect_url(site)
59
+ crawl_site(site)
61
60
 
62
61
  if block # The user of this block can send us a :retry to retry accessing the page
63
62
  while crawl_block(block, site) == :retry
64
63
  Grell.logger.info "Retrying our visit to #{site.url}"
65
- site.navigate
66
- filter!(site.links)
67
- add_redirect_url(site)
64
+ crawl_site(site)
68
65
  end
69
66
  end
70
67
 
@@ -75,6 +72,13 @@ module Grell
75
72
 
76
73
  private
77
74
 
75
+ def crawl_site(site)
76
+ site.navigate
77
+ site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page
78
+ filter!(site.links)
79
+ add_redirect_url(site)
80
+ end
81
+
78
82
  # Treat any exceptions from the block as an unavailable page
79
83
  def crawl_block(block, site)
80
84
  block.call(site)
@@ -26,11 +26,12 @@ module Grell
26
26
  end
27
27
 
28
28
  def navigate
29
- # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try or best to workaround inconsistencies on poltergeist
29
+ # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist
30
30
  Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
31
31
  @rawpage.status && !@rawpage.headers.empty? &&
32
32
  @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
33
33
  end
34
+ @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
34
35
  @result_page = VisitedPage.new(@rawpage)
35
36
  @timestamp = Time.now
36
37
  rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
@@ -27,7 +27,6 @@ module Grell
27
27
  all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a
28
28
  end
29
29
 
30
-
31
30
  def host
32
31
  page.current_host
33
32
  end
@@ -36,6 +35,17 @@ module Grell
36
35
  page.has_selector?(selector)
37
36
  end
38
37
 
38
+ def wait_for_all_ajax_requests(timeout, interval)
39
+ Timeout::timeout(timeout) do
40
+ (timeout / interval).ceil.times do
41
+ jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;")
42
+ break if (!jquery_active || jquery_active.zero?)
43
+ sleep(interval)
44
+ end
45
+ end
46
+ true
47
+ end
48
+
39
49
  private
40
50
 
41
51
  def follow_redirects!
@@ -1,3 +1,3 @@
1
1
  module Grell
2
- VERSION = "1.6.10".freeze
2
+ VERSION = "1.6.11".freeze
3
3
  end
@@ -5,7 +5,8 @@ RSpec.describe Grell::Crawler do
5
5
  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
6
6
  let(:host) { 'http://www.example.com' }
7
7
  let(:url) { 'http://www.example.com/test' }
8
- let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true) }
8
+ let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true, evaluate_in_each_page: script) }
9
+ let(:script) { nil }
9
10
  let(:body) { 'body' }
10
11
  let(:custom_add_match) do
11
12
  Proc.new do |collection_page, page|
@@ -85,6 +86,21 @@ RSpec.describe Grell::Crawler do
85
86
  expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
86
87
  crawler.crawl(page, nil)
87
88
  end
89
+
90
+ context 'without script' do
91
+ it 'does not evaluate a script' do
92
+ expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script)
93
+ crawler.crawl(page, nil)
94
+ end
95
+ end
96
+
97
+ context 'with script' do
98
+ let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" }
99
+ it 'evaluates a script' do
100
+ expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script)
101
+ crawler.crawl(page, nil)
102
+ end
103
+ end
88
104
  end
89
105
 
90
106
  context '#start_crawling' do
@@ -92,6 +92,17 @@ RSpec.describe Grell::Page do
92
92
  end
93
93
  end
94
94
 
95
+ describe '#navigate' do
96
+ before do
97
+ proxy.stub(url).and_return(body: '', code: 200, headers: {})
98
+ end
99
+
100
+ it 'waits for all ajax requests' do
101
+ expect_any_instance_of(Grell::RawPage).to receive(:wait_for_all_ajax_requests).with(0, 0.5)
102
+ page.navigate
103
+ end
104
+ end
105
+
95
106
  shared_examples_for 'an errored grell page' do
96
107
  it 'returns empty status 404 page after navigating' do
97
108
  expect(page.status).to eq(404)
@@ -23,6 +23,7 @@ RSpec.configure do |config|
23
23
  # We do not need to wait for pages to return all the data
24
24
  config.before do
25
25
  stub_const("Grell::Page::WAIT_TIME", 0)
26
+ allow_any_instance_of(Capybara::Session).to receive(:evaluate_script).and_return(nil)
26
27
  end
27
28
 
28
29
  config.expect_with :rspec do |expectations|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.10
4
+ version: 1.6.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jordi Polo Carres
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-27 00:00:00.000000000 Z
11
+ date: 2016-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara