google_ajax_crawler 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- Nzk1MTI4M2Y1ZTgyZmEyYzIyNWVlM2ExYjlhNmQxMTYwYzI3MGU0OQ==
4
+ YTU0N2ZkNGQ0YzJiMDlkYmYyMTI4YWNkMDY0ZmQ2MmM1MzBkZGNkNQ==
5
5
  data.tar.gz: !binary |-
6
- N2Y0ZDhjZmFiOGJkYTQwNDA4Y2JlYzQyNGY3OGU1OGYwNzM3YThkZA==
6
+ ODdkODI5OGI3NGIzYTdhOWNlOTEwMGE5NzRkZjgxYzUzOTY0NzZjZg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- YjYwY2Q0Y2I4NzQwNmYzYjBkYTc0NDljNDJjY2E2MDJlYjRkZTFjNzg5NjBk
10
- ZmFjZjQxMGYzNTgyYjBmNDIzMTQ5ODlkM2U4NDhlMTk1YWQ2NzZjODAyYWQx
11
- M2VmNTRjZGI2ODhmYjY4MDNiZTdlNDY1MDQyN2FkMTA0MTJlZWM=
9
+ ZTBlNjdlOWI3NDU2ZDI3OTBkYmU5MDE4MTJlZWIxZDZmODMzYTg1NmUyMjE1
10
+ OTExMDNjY2UyOWZiZjljOGUwYzhhZTJiZjFmODY2OTE3NmFkYzEyNWZhY2E3
11
+ M2E1MzlmZTljYjVmMTBmOWQ0ZDIzMzQyMWY1NTA4NWY3MjgwYzE=
12
12
  data.tar.gz: !binary |-
13
- MDg0MWM3ZmRiODk1NjM1MTk3MWRmYjFhNDhhOThlOGJlN2FjMTJjZjY0OTNi
14
- ZjQwYjAzNWI5MTI3N2Q2MTJkNGZhMjcyN2YwYTU2ZGQ4ZjM0MmVjMmVkZmJi
15
- Nzg5MDY0ZDJlYTlhNTRlZmQzODUxYmFiZmIwNWQ2Y2NlNzY3NzM=
13
+ ODc3YzBjYzg0MzYyYTA5NzVkM2M2OWRkYWFjNjNlMDQ2MzI2OTI4NjE5MDc2
14
+ YTUzNmJkNzFlMTE4ZjJiNmRlYzcyMDNhZTUxN2YwOTQ4Y2U0M2MyZDRjNzU5
15
+ MGNjZDhlZTllY2NjYzUzODljOWJjYjliOGUzNWM0NThkZTE1NjY=
@@ -10,7 +10,6 @@ env:
10
10
 
11
11
  before_install:
12
12
  - sh -e /etc/init.d/xvfb start
13
- - echo "Started xvfb..."
14
13
 
15
14
  script:
16
15
  - DISPLAY=:99.0 bundle exec rake
data/README.md CHANGED
@@ -9,7 +9,7 @@ Details of the scheme can be found at: https://developers.google.com/webmasters/
9
9
 
10
10
  ## Using
11
11
 
12
- install
12
+ ### install
13
13
 
14
14
  ``` ruby
15
15
  gem install google_ajax_crawler
@@ -21,57 +21,114 @@ In your config.ru
21
21
  require 'google_ajax_crawler'
22
22
 
23
23
  use GoogleAjaxCrawler::Crawler do |config|
24
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
24
+ config.page_loaded_js = "MyApp.isPageLoaded()"
25
25
  end
26
26
 
27
- app = lambda {|env| [200, {'Content-Type' => 'text/plain'}, "b" ] }
27
+ app = -> env { [200, {'Content-Type' => 'text/plain'}, "b" ] }
28
28
  run app
29
29
 
30
30
  ```
31
31
 
32
+ ### rails usage
33
+
34
+ create in the initializer folder :
35
+ ``` ruby
36
+ google_ajax_crawler_middleware.rb
37
+ ```
38
+
39
+ with
40
+ ``` ruby
41
+ if defined?(Rails.configuration) && Rails.configuration.respond_to?(:middleware)
42
+ require 'google_ajax_crawler'
43
+ Rails.configuration.middleware.use GoogleAjaxCrawler::Crawler do |config|
44
+ config.page_loaded_test = -> driver { driver.page.evaluate_script('document.getElementById("loading") == null') }
45
+ end
46
+ end
47
+ ```
48
+
49
+ #### Important
50
+
51
+ Concurrent requests must be enabled to allow your site to snapshot itself. If concurrent requests are not allowed, the site will simply hang on a crawler request.
52
+
53
+ In config/application.rb :
54
+
55
+ ``` ruby
56
+ config.threadsafe!
57
+ ```
58
+
32
59
  ## Examples
33
60
 
34
- In the examples folder, each driver has a rackup file, which can be launched:
61
+ In the examples folder, each driver has a rackup file (at the moment only one driver, capybara-webkit, exists), which can be launched:
62
+
63
+ `rackup examples/capybara_webkit.ru`
35
64
 
36
- `rackup examples/[driver_name].ru`
65
+ Examples for how to use the crawler with Backbone.JS, Angular.JS and plain ol javascript are accessible via:
66
+ - http://localhost:9292/backbone
67
+ - http://localhost:9292/angular
68
+ - http://localhost:9292/
37
69
 
38
- then open a browser to http://localhost:9292/#!test and view source.... This is how a search engine will see your page. *NOTE:* don't look at the markup through a web inspector as it will most likely display dom elements rendered on the fly by js.
70
+ Curl, or open a browser to http://localhost:9292/[framework]#!test and view source.... This is how a search engine will see your page before snapshotting. *NOTE:* don't look at the markup through a web inspector as it will most likely display dom elements rendered on the fly by js.
39
71
 
40
- Change the url to http://localhost:9292/?_escaped_fragment_=test , and then again view source to see how the DOM state has been captured
72
+ Change the url to http://localhost:9292/[framework]?_escaped_fragment_=test , and then again curl or view source to see how the DOM state has been captured
41
73
 
42
74
  ## Configuration Options
43
75
 
44
- ### page_loaded_test
76
+ ### Page Loaded Tests
45
77
 
46
- Tell the crawler when your page has finished loading / rendering. As determining when a page has completed rendering can depend on a number of qualitative factors (i.e. all ajax requests have responses, certain content has been displayed, or even when there are no loaders / spinners visible on the page), the page loaded test allows you to specify when the crawler should decide that your page has finished loading / rendering and to return a snapshot of the rendered dom at that time.
78
+ As determining when a page has completed rendering can depend on a number of qualitative factors (i.e. all ajax requests have responses, certain content has been displayed, or even when there are no loaders / spinners visible on the page), you can specify one of two ways to tell the crawler that your page has finished loading / rendering and to return a snapshot of the rendered dom at that time.
47
79
 
48
- The current crawler driver is passed to the lambda to allow querying of the current page's dom state.
80
+ #### page_loaded_js (client side test)
49
81
 
50
- A good pattern is to test your page state in a js function returning a boolean, accessible from the window context.. i.e.
82
+ Tell the crawler the client side javascript function (returning true/false) you have created, that determines when your page has finished loading / rendering.
51
83
 
52
84
  ```ruby
53
85
 
54
86
  use GoogleAjaxCrawler::Crawler do |config|
55
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('myApp.isPageLoaded()') }
87
+ config.page_loaded_js = "MyApp.isPageLoaded()"
88
+ end
89
+
90
+ ```
91
+
92
+ #### page_loaded_test (server side test)
93
+
94
+ A server side test determining when your page has finished loading / rendering.
95
+ The configured crawler driver is passed to the lambda to allow querying of the current page's dom state from the server side.
96
+
97
+ ```ruby
98
+
99
+ use GoogleAjaxCrawler::Crawler do |config|
100
+ config.page_loaded_test = -> driver { driver.page.has_css?('.loading') == false }
56
101
  end
57
102
 
58
103
  ```
59
104
 
60
105
  ### timeout
61
106
 
62
- The max time the crawler should wait before returning a response
107
+ The max time (in seconds) the crawler should wait before returning a response. After the timeout has been reached,
108
+ a snapshot of the DOM in its current state is returned. Defaults to 30 seconds.
63
109
 
64
110
  ### driver
65
111
 
66
- The configured google ajax crawler driver used to query the current page state. Presently there is only one driver (now taking pull requests!); CapybaraWebkit
112
+ The configured google ajax crawler driver used to query the current page state. Defaults to capybara_webkit.
67
113
 
68
114
  ### poll_interval
69
115
 
70
- How often (in seconds) to test the page state with the configured page_loaded_test
116
+ How often (in seconds) to test the page state with the configured page_loaded_test. Defaults to 0.5 seconds.
71
117
 
72
118
  ### response_headers
73
119
 
74
- What response headers shoudl be returned with the dom snapshot. Default headers specify the content-type text/html
120
+ What response headers should be returned with the dom snapshot. Default headers specify the content-type text/html.
121
+
122
+ ### requested_route_key
123
+
124
+ The parameter name used by a search bot to identify which client side route to snapshot. Defaults to _escaped_fragment_.
125
+
126
+
127
+
128
+ ### Identifying Search Engine Requests
129
+
130
+ Snapshot requests are passed an additional query string param (?search_engine=true), allowing you to optionally execute client side code.
131
+ This is particularly handy should you have stats tracking code (i.e. Google Analytics), which you don't want executed / included when search engines are trawling your site.
75
132
 
76
133
  ## License
77
134
 
@@ -2,6 +2,8 @@
2
2
  # to run:
3
3
  # $ rackup examples/capybara_webkit.ru -p 3000
4
4
  # open browser to http://localhost:3000/#!test
5
+ # or http://localhost:3000/backbone/#!test
6
+ # or http://localhost:3000/angular/#!test
5
7
  #
6
8
  require 'bundler/setup'
7
9
  require './lib/google_ajax_crawler'
@@ -12,19 +14,25 @@ use GoogleAjaxCrawler::Crawler do |config|
12
14
  config.timeout = 5
13
15
 
14
16
  #
15
- # for the demo - the page is considered loaded when the loading mask has been removed from the DOM
16
- # this could evaluate something like $.active == 0 to ensure no jquery ajax calls are pending
17
+ # for the demo - in each example (simple, backbone and angular) there is a page loaded function signaling
18
+ # when the page has completed loading. If neither page_loaded_js nor page_loaded_test has been configured, the crawler will default to
19
+ # executing $.active == 0 to ensure no jquery ajax calls are pending
17
20
  #
18
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
21
+ config.page_loaded_js = 'app.pageHasLoaded()'
22
+
23
+ # alternative server side test for the simple_javascript example:
24
+ # config.page_loaded_test = -> driver { !driver.page.has_css?('#loading') }
19
25
  end
20
26
 
21
27
  #
22
28
  # a sample page using #! url fragments to seed page state
23
29
  #
24
- app = lambda do |env|
30
+ app = -> env do
25
31
  page_content = case env['PATH_INFO']
26
32
  when /\/backbone(\/)?/
27
33
  File.read('./spec/fixtures/backbone.html')
34
+ when /\/angular(\/)?/
35
+ File.read('./spec/fixtures/angular.html')
28
36
  else
29
37
  File.read('./spec/fixtures/simple_javascript.html')
30
38
  end
@@ -5,7 +5,7 @@ module GoogleAjaxCrawler
5
5
  end
6
6
 
7
7
  def version
8
- "0.1.3"
8
+ "0.2.0"
9
9
  end
10
10
  end
11
11
  end
@@ -12,6 +12,10 @@ module GoogleAjaxCrawler
12
12
  raise "Driver Not Specified"
13
13
  end
14
14
 
15
+ def evaluate_script(javascript)
16
+ raise "Driver Not Specified"
17
+ end
18
+
15
19
  def default_page_loaded_test
16
20
  raise "Driver Not Specified"
17
21
  end
@@ -35,19 +39,15 @@ module GoogleAjaxCrawler
35
39
  end
36
40
 
37
41
  def is_page_loaded?
38
- if options.page_loaded_test.nil?
39
- default_page_loaded_test
40
- else
41
- options.page_loaded_test.call self
42
- end
42
+ return evaluate_script(options.page_loaded_js) unless options.page_loaded_js.nil?
43
+ return options.page_loaded_test.call(self) unless options.page_loaded_test.nil?
44
+ default_page_loaded_test
43
45
  end
44
46
 
45
47
  def wait_until_page_is_fully_loaded
46
48
  Timeout::timeout(options.timeout) do
47
49
  begin
48
- while !is_page_loaded?
49
- sleep options.poll_interval
50
- end
50
+ sleep(options.poll_interval) while !is_page_loaded?
51
51
  rescue
52
52
  #...squelch
53
53
  puts "Exception: #{$!}"
@@ -1,6 +1,6 @@
1
1
  module GoogleAjaxCrawler
2
2
  class Options
3
- attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers
3
+ attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :page_loaded_js, :poll_interval, :response_headers
4
4
 
5
5
  def initialize(app, &block)
6
6
  @driver = Drivers::CapybaraWebkit.new(self)
@@ -0,0 +1,32 @@
1
+ <!doctype html>
2
+ <html ng-app>
3
+ <head>
4
+ <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.0.6/angular.min.js"></script>
5
+ </head>
6
+ <body>
7
+
8
+ <div ng-controller="renderTSCtrl">
9
+ <h1 id='title'>Angular tested route: {{route()}}</h1>
10
+ <p id='ts'>{{renderTime()}}</p>
11
+ </div>
12
+
13
+ <script type='text/javascript'>
14
+ function renderTSCtrl ($scope) {
15
+ $scope.route = function() {
16
+ return document.location.hash;
17
+ };
18
+
19
+ $scope.renderTime = function() {
20
+ return "This rendered at " + new Date().toString() +"!";
21
+ };
22
+ };
23
+
24
+ window.app = {
25
+ pageHasLoaded: function () {
26
+ return !!document.getElementById('title').innerText && !!document.getElementById('ts').innerText;
27
+ }
28
+ };
29
+
30
+ </script>
31
+ </body>
32
+ </html>
@@ -5,31 +5,55 @@ describe 'CapybaraWebkit driver' do
5
5
  let(:browser_route) { "#{host}#!test" }
6
6
  let(:snapshot_route) { "#{host}?_escaped_fragment_=test" }
7
7
 
8
- before(:all) do
9
- RackApp.configure_crawler do |config|
10
- config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
11
- config.poll_interval = 0.25
12
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
8
+ shared_examples 'google ajax crawler' do
9
+ describe 'when a browser requests a client side route (i.e.: /#my_route)' do
10
+ it 'should not serve a snapshot of the dom' do
11
+ response = Faraday.get browser_route
12
+ response.body.should_not =~ /Javascript rendering complete for client-side route #!test/
13
+ end
13
14
  end
14
15
 
15
- RackApp.start
16
+ describe 'when an ajax crawler requests a snapshot of a client side route' do
17
+ it 'should serve a snapshot of the dom that includes js rendered components' do
18
+ response = Faraday.get snapshot_route
19
+ response.body.should =~ /Javascript rendering complete for client-side route #!test/
20
+ end
21
+ end
16
22
  end
17
23
 
18
- after(:all) do
19
- RackApp.stop
20
- end
24
+ describe 'with page_loaded_test' do
25
+ before(:all) do
26
+ RackApp.configure_crawler do |config|
27
+ config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
28
+ config.poll_interval = 0.25
29
+ config.page_loaded_test = -> driver { driver.page.evaluate_script('app.pageHasLoaded()') }
30
+ end
21
31
 
22
- describe 'when a browser requests a client side route (i.e.: /#my_route)' do
23
- it 'should not serve a snapshot of the dom' do
24
- response = Faraday.get browser_route
25
- response.body.should_not =~ /Javascript rendering complete for client-side route #!test/
32
+ RackApp.start
26
33
  end
34
+
35
+ after(:all) do
36
+ RackApp.stop
37
+ end
38
+
39
+ it_should_behave_like 'google ajax crawler'
27
40
  end
28
41
 
29
- describe 'when an ajax crawler requests a snapshot of a client side route' do
30
- it 'should serve a snapshot of the dom that includes js rendered components' do
31
- response = Faraday.get snapshot_route
32
- response.body.should =~ /Javascript rendering complete for client-side route #!test/
42
+ describe 'with page_loaded_js' do
43
+ before(:all) do
44
+ RackApp.configure_crawler do |config|
45
+ config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
46
+ config.poll_interval = 0.25
47
+ config.page_loaded_js = 'app.pageHasLoaded()'
48
+ end
49
+
50
+ RackApp.start
51
+ end
52
+
53
+ after(:all) do
54
+ RackApp.stop
33
55
  end
56
+
57
+ it_should_behave_like 'google ajax crawler'
34
58
  end
35
59
  end
@@ -1,12 +1,6 @@
1
1
  require './spec/spec_helper'
2
2
 
3
3
  describe GoogleAjaxCrawler::Crawler do
4
- before(:each) do
5
- GoogleAjaxCrawler::Crawler.configure do |config|
6
- config.page_loaded_test = lambda{ page.find('.loading', count: 0) }
7
- end
8
- end
9
-
10
4
  shared_examples 'a crawler configurer' do |method, *args|
11
5
  it 'and facilitate the setting of crawler options' do
12
6
  GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
@@ -1,17 +1,59 @@
1
1
  require './spec/spec_helper'
2
2
 
3
3
  describe GoogleAjaxCrawler::Drivers::Driver do
4
- let(:options) do
5
- GoogleAjaxCrawler::Options.new(nil) do |o|
6
- o.timeout = 0.05
7
- o.page_loaded_test = lambda {|d| false }
4
+ let(:options) { GoogleAjaxCrawler::Options.new(nil) { |o| o.timeout = 0.01 } }
5
+ let(:driver) { GoogleAjaxCrawler::Drivers::Driver.new(options) }
6
+
7
+ describe '#mandatory overrides' do
8
+ shared_examples 'an enforced override method' do |method, *args|
9
+ it 'should throw an exception if not overridden' do
10
+ expect { driver.send(method, *args) }.to raise_error(RuntimeError, "Driver Not Specified")
11
+ end
12
+ end
13
+
14
+ it_should_behave_like 'an enforced override method', :visit, 'http://test.com'
15
+ it_should_behave_like 'an enforced override method', :evaluate_script, 'myApp.isPageLoaded()'
16
+ it_should_behave_like 'an enforced override method', :default_page_loaded_test
17
+ it_should_behave_like 'an enforced override method', :html
18
+ end
19
+
20
+ describe '#is_page_loaded?' do
21
+
22
+ describe 'when page_loaded_test optioned' do
23
+ it 'should be called' do
24
+ driver.options.page_loaded_test = double
25
+ driver.options.page_loaded_test.should_receive(:call).with(driver)
26
+ driver.is_page_loaded?
27
+ end
28
+ end
29
+
30
+ describe 'when page_loaded_js optioned' do
31
+ it 'should call evaluate_script with the page_loaded_js' do
32
+ driver.options.page_loaded_js = 'MyApp.isPageLoaded()'
33
+ driver.stub :evaluate_script
34
+ driver.should_receive(:evaluate_script).with('MyApp.isPageLoaded()').once
35
+ driver.is_page_loaded?
36
+ end
37
+ end
38
+
39
+ describe 'when no loaded tests optioned' do
40
+ it 'should execute the default_page_loaded_test' do
41
+ driver.options.page_loaded_test = driver.options.page_loaded_js = nil
42
+ driver.stub :default_page_loaded_test
43
+ driver.should_receive(:default_page_loaded_test).once
44
+ driver.is_page_loaded?
45
+ end
8
46
  end
9
47
  end
10
48
 
11
49
  describe '#wait_until_page_is_fully_loaded' do
50
+ before do
51
+ driver.options.page_loaded_test = double
52
+ driver.options.page_loaded_test.should_receive(:call).with(driver)
53
+ end
54
+
12
55
  it 'should raise a Timeout Exception when timeout limit reached' do
13
56
  expect do
14
- driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
15
57
  driver.wait_until_page_is_fully_loaded
16
58
  end.to raise_error(Timeout::Error)
17
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_ajax_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Kitzelman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-16 00:00:00.000000000 Z
11
+ date: 2013-07-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara-webkit
@@ -66,6 +66,8 @@ files:
66
66
  - releases/google_ajax_crawler-0.1.0.gem
67
67
  - releases/google_ajax_crawler-0.1.1.gem
68
68
  - releases/google_ajax_crawler-0.1.2.gem
69
+ - releases/google_ajax_crawler-0.1.3.gem
70
+ - spec/fixtures/angular.html
69
71
  - spec/fixtures/backbone.html
70
72
  - spec/fixtures/simple_javascript.html
71
73
  - spec/integration/capybara_webkit_spec.rb
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
96
  version: '0'
95
97
  requirements: []
96
98
  rubyforge_project:
97
- rubygems_version: 2.0.3
99
+ rubygems_version: 2.0.5
98
100
  signing_key:
99
101
  specification_version: 4
100
102
  summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your