google_ajax_crawler 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NzMwMTEyOWEwMzg3OWNiMTE1MzU1NzdjNDdkZGE5ZDE1YzNiMDk5NQ==
4
+ Nzk1MTI4M2Y1ZTgyZmEyYzIyNWVlM2ExYjlhNmQxMTYwYzI3MGU0OQ==
5
5
  data.tar.gz: !binary |-
6
- MGNkNzYyMDg4ZWZjMmRjMDJhM2JiMmE2NTg0MTkyOWY2MDRmMWYyMg==
6
+ N2Y0ZDhjZmFiOGJkYTQwNDA4Y2JlYzQyNGY3OGU1OGYwNzM3YThkZA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- OGEyYTY5MWQwYzhkY2NiNjlmZWFmM2JiMjYwY2ZkNDQ0M2VmMjM2ZTMyNzhl
10
- NjFjNmRkYWE2M2JkMjhhYTZjZmIyY2RkMzI2MzQ5NGNjOGQ5NWUyOTdjMDg3
11
- OTAzNDE3NDgwYTAyMWM1ZDgzMGM0YWZlYTIwMzBiNTk5M2VjMGE=
9
+ YjYwY2Q0Y2I4NzQwNmYzYjBkYTc0NDljNDJjY2E2MDJlYjRkZTFjNzg5NjBk
10
+ ZmFjZjQxMGYzNTgyYjBmNDIzMTQ5ODlkM2U4NDhlMTk1YWQ2NzZjODAyYWQx
11
+ M2VmNTRjZGI2ODhmYjY4MDNiZTdlNDY1MDQyN2FkMTA0MTJlZWM=
12
12
  data.tar.gz: !binary |-
13
- MDc1ODY4MTI2YzRhNTNiNmQ2YzcwNjliMjA2NTFmMzczM2ZmYmI0NzQxNzI1
14
- NGVlODA0NjQ4YmQ4ZmIwYTNkZGZlYzQzNmMwNDNhYjhmOTgyM2NkMjQ5N2Ew
15
- YzRmNTU2ZDRhYTE2MWRkNjk0MTg1OTU2OTM2MzY5ODA5ODIwMDM=
13
+ MDg0MWM3ZmRiODk1NjM1MTk3MWRmYjFhNDhhOThlOGJlN2FjMTJjZjY0OTNi
14
+ ZjQwYjAzNWI5MTI3N2Q2MTJkNGZhMjcyN2YwYTU2ZGQ4ZjM0MmVjMmVkZmJi
15
+ Nzg5MDY0ZDJlYTlhNTRlZmQzODUxYmFiZmIwNWQ2Y2NlNzY3NzM=
data/.travis.yml ADDED
@@ -0,0 +1,16 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - "1.9.2"
5
+ - "1.9.3"
6
+ - "2.0.0"
7
+
8
+ env:
9
+ - RACK_ENV=ci
10
+
11
+ before_install:
12
+ - sh -e /etc/init.d/xvfb start
13
+ - echo "Started xvfb..."
14
+
15
+ script:
16
+ - DISPLAY=:99.0 bundle exec rake
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Google Ajax Crawler
2
2
 
3
+ [![Build Status](https://travis-ci.org/benkitzelman/google-ajax-crawler.png)](https://travis-ci.org/benkitzelman/google-ajax-crawler)
4
+ [![Gem Version](https://badge.fury.io/rb/google_ajax_crawler.png)](http://badge.fury.io/rb/google_ajax_crawler)
5
+
3
6
  Rack middleware adhering to the Google Ajax Crawling Scheme. It uses a headless browser to render JS-heavy pages and serves a DOM snapshot of the rendered state to the requesting search engine.
4
7
 
5
8
  Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
@@ -44,6 +47,16 @@ Tell the crawler when your page has finished loading / rendering. As determining
44
47
 
45
48
  The current crawler driver is passed to the lambda to allow querying of the current page's DOM state.
46
49
 
50
+ A good pattern is to test your page state in a JS function that returns a boolean and is accessible from the window context, e.g.:
51
+
52
+ ```ruby
53
+
54
+ use GoogleAjaxCrawler::Crawler do |config|
55
+ config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('myApp.isPageLoaded()') }
56
+ end
57
+
58
+ ```
59
+
47
60
  ### timeout
48
61
 
49
62
  The max time the crawler should wait before returning a response
data/Rakefile CHANGED
@@ -1,12 +1,15 @@
1
- # encoding: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
1
+ $:.unshift File.join(File.dirname(__FILE__), 'lib')
2
+
3
+ require 'bundler/setup'
4
+ require 'rspec/core/rake_task'
5
+
6
+ desc "Run specifications"
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task :ci do
10
+ puts 'running tests on CI Server....'
11
+ system("export DISPLAY=:99.0 && bundle exec rake spec")
12
+ raise "rake spec failed!" unless $?.exitstatus == 0
11
13
  end
12
- require 'rake'
14
+
15
+ task :default => :ci
@@ -1,11 +1,34 @@
1
+ #
2
+ # to run:
3
+ # $ rackup examples/capybara_webkit.ru -p 3000
4
+ # open browser to http://localhost:3000/#!test
5
+ #
1
6
  require 'bundler/setup'
2
7
  require './lib/google_ajax_crawler'
3
8
 
4
9
  use GoogleAjaxCrawler::Crawler do |config|
5
- config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
6
- config.poll_interval = 0.25
7
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
10
+ config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
11
+ config.poll_interval = 0.25
12
+ config.timeout = 5
13
+
14
+ #
15
+ # for the demo - the page is considered loaded when the loading mask has been removed from the DOM
16
+ # this could evaluate something like $.active == 0 to ensure no jquery ajax calls are pending
17
+ #
18
+ config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
8
19
  end
9
20
 
10
- page_content = File.read('./spec/support/page.html')
11
- run lambda {|env| [200, { 'Content-Type' => 'text/html' }, [page_content]] }
21
+ #
22
+ # a sample page using #! url fragments to seed page state
23
+ #
24
+ app = lambda do |env|
25
+ page_content = case env['PATH_INFO']
26
+ when /\/backbone(\/)?/
27
+ File.read('./spec/fixtures/backbone.html')
28
+ else
29
+ File.read('./spec/fixtures/simple_javascript.html')
30
+ end
31
+
32
+ [200, { 'Content-Type' => 'text/html' }, [page_content]]
33
+ end
34
+ run app
@@ -1,5 +1,4 @@
1
- $LOAD_PATH << './lib'
2
- require 'google_ajax_crawler'
1
+ require './lib/google_ajax_crawler'
3
2
  Gem::Specification.new do |s|
4
3
  s.name = 'google_ajax_crawler'
5
4
  s.version = GoogleAjaxCrawler.version
@@ -13,4 +12,4 @@ Gem::Specification.new do |s|
13
12
 
14
13
  s.add_dependency 'capybara-webkit', '>= 0.10.0'
15
14
  s.add_dependency 'rack'
16
- end
15
+ end
@@ -1,14 +1,19 @@
1
1
  module GoogleAjaxCrawler
2
2
  class << self
3
+ def env
4
+ (ENV['RACK_ENV'] || 'development').to_sym
5
+ end
6
+
3
7
  def version
4
- "0.1.2"
8
+ "0.1.3"
5
9
  end
6
10
  end
7
11
  end
8
12
 
13
+ here = File.dirname(__FILE__)
9
14
  require 'uri'
10
- require 'google_ajax_crawler/crawler'
11
- require 'google_ajax_crawler/options'
12
- require 'google_ajax_crawler/page'
13
- require 'google_ajax_crawler/drivers/driver'
14
- require 'google_ajax_crawler/drivers/capybara_webkit'
15
+ require "#{here}/google_ajax_crawler/crawler"
16
+ require "#{here}/google_ajax_crawler/options"
17
+ require "#{here}/google_ajax_crawler/page"
18
+ require "#{here}/google_ajax_crawler/drivers/driver"
19
+ require "#{here}/google_ajax_crawler/drivers/capybara_webkit"
@@ -24,11 +24,9 @@ module GoogleAjaxCrawler
24
24
 
25
25
  def call(env)
26
26
  request = Rack::Request.new(env)
27
- if is_search_engine?(request)
28
- serve_crawlable_content_for request
29
- else
30
- @app.call(env)
31
- end
27
+ return serve_crawlable_content_for(request) if is_search_engine?(request)
28
+
29
+ @app.call(env)
32
30
  end
33
31
 
34
32
  protected
@@ -1,3 +1,4 @@
1
+ require 'timeout'
1
2
  module GoogleAjaxCrawler
2
3
  module Drivers
3
4
  class Driver
@@ -20,10 +21,16 @@ module GoogleAjaxCrawler
20
21
  end
21
22
 
22
23
  def get_content(uri)
23
- puts "requesting: #{uri}"
24
- visit uri.to_s
25
-
26
- wait_until_page_is_fully_loaded
24
+ begin
25
+ puts "::requesting: #{uri}"
26
+ visit uri.to_s
27
+ wait_until_page_is_fully_loaded
28
+ rescue Timeout::Error
29
+ puts "-- Page Rendering Timed out: --\n"\
30
+ "Either your page_loaded_test didn't successfully detect when your page had loaded, \n"\
31
+ "or your page took longer than #{options.timeout} seconds to load \n"\
32
+ "-- Returning page snapshot in its present state --"
33
+ end
27
34
  html
28
35
  end
29
36
 
@@ -31,18 +38,20 @@ module GoogleAjaxCrawler
31
38
  if options.page_loaded_test.nil?
32
39
  default_page_loaded_test
33
40
  else
34
- options.page_loaded_test.call(self)
41
+ options.page_loaded_test.call self
35
42
  end
36
43
  end
37
44
 
38
45
  def wait_until_page_is_fully_loaded
39
- begin
40
- while !is_page_loaded?
41
- sleep options.poll_interval
46
+ Timeout::timeout(options.timeout) do
47
+ begin
48
+ while !is_page_loaded?
49
+ sleep options.poll_interval
50
+ end
51
+ rescue
52
+ #...squelch
53
+ puts "Exception: #{$!}"
42
54
  end
43
- rescue
44
- #...squelch
45
- puts "Timeout: #{$!}"
46
55
  end
47
56
  end
48
57
  end
@@ -0,0 +1,61 @@
1
+ <html>
2
+ <head>
3
+ <script src='http://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.4.4/underscore-min.js'></script>
4
+ <script src='http://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js'></script>
5
+ <script src='http://cdnjs.cloudflare.com/ajax/libs/backbone.js/1.0.0/backbone-min.js'></script>
6
+ </head>
7
+ <body>
8
+
9
+ <div id='app'>
10
+ <div id='loading'>Loading...</div>
11
+ </div>
12
+
13
+ <script type='text/javascript'>
14
+
15
+ var View = Backbone.View.extend({
16
+ className: 'backbone-view',
17
+ template: _.template(
18
+ '<h1>Backbone Example</h1><p>This template was rendered by backbone js</p>'
19
+ ),
20
+
21
+ render: function(){
22
+ var app = document.getElementById('app');
23
+ var _this = this;
24
+ this.el.innerHTML = _this.template();
25
+ return this;
26
+ }
27
+ });
28
+
29
+ var Router = Backbone.Router.extend({
30
+ routes: {
31
+ "*anything": "home"
32
+ },
33
+
34
+ home: function() {
35
+ el = document.getElementById('app');
36
+ var view = new View;
37
+
38
+ // simulate fetching data / async operation
39
+ _.delay(function() {
40
+ el.replaceChild(view.render().el, el.children[0]);
41
+ }, 1500);
42
+ }
43
+ });
44
+
45
+ function App() {
46
+ new Router();
47
+ Backbone.history.start();
48
+ };
49
+
50
+ App.prototype.pageHasLoaded = function() {
51
+ return !!document.getElementsByClassName('backbone-view').length;
52
+ };
53
+
54
+
55
+ $(function() {
56
+ window.app = new App();
57
+ });
58
+
59
+ </script>
60
+ </body>
61
+ </html>
@@ -7,7 +7,8 @@
7
7
 
8
8
  <script type='text/javascript'>
9
9
 
10
- var init = function() {
10
+
11
+ function App() {
11
12
  var writeHash = function() {
12
13
  document.getElementById('page_state').innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;
13
14
  var loadingMask = document.getElementById('loading');
@@ -19,10 +20,14 @@
19
20
  setTimeout(writeHash, 500);
20
21
  };
21
22
 
23
+ App.prototype.pageHasLoaded = function() {
24
+ return !document.getElementById('loading');
25
+ };
26
+
22
27
  //
23
28
  // Only execute js if loading the page using an unescaped url
24
29
  //
25
- if(/#.*$/.test(document.location.href)) init();
30
+ if(/#.*$/.test(document.location.href)) window.app = new App();
26
31
 
27
32
  </script>
28
33
  </body>
@@ -9,7 +9,7 @@ describe 'CapybaraWebkit driver' do
9
9
  RackApp.configure_crawler do |config|
10
10
  config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
11
11
  config.poll_interval = 0.25
12
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
12
+ config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
13
13
  end
14
14
 
15
15
  RackApp.start
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,5 @@
1
+ ENV["RACK_ENV"] ||= "test"
2
+
1
3
  require 'rubygems'
2
4
  require 'bundler/setup'
3
5
  require './lib/google_ajax_crawler'
@@ -3,7 +3,7 @@ require 'rack'
3
3
  class RackApp
4
4
 
5
5
  def app
6
- page_content = File.read('./spec/support/page.html')
6
+ page_content = File.read('./spec/fixtures/simple_javascript.html')
7
7
  Rack::Builder.new do
8
8
 
9
9
  use GoogleAjaxCrawler::Crawler do |c|
@@ -0,0 +1,19 @@
1
+ require './spec/spec_helper'
2
+
3
+ describe GoogleAjaxCrawler::Drivers::Driver do
4
+ let(:options) do
5
+ GoogleAjaxCrawler::Options.new(nil) do |o|
6
+ o.timeout = 0.05
7
+ o.page_loaded_test = lambda {|d| false }
8
+ end
9
+ end
10
+
11
+ describe '#wait_until_page_is_fully_loaded' do
12
+ it 'should raise a Timeout Exception when timeout limit reached' do
13
+ expect do
14
+ driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
15
+ driver.wait_until_page_is_fully_loaded
16
+ end.to raise_error(Timeout::Error)
17
+ end
18
+ end
19
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_ajax_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Kitzelman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-16 00:00:00.000000000 Z
11
+ date: 2013-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara-webkit
@@ -49,6 +49,7 @@ extra_rdoc_files: []
49
49
  files:
50
50
  - .gitignore
51
51
  - .rspec
52
+ - .travis.yml
52
53
  - Gemfile
53
54
  - Gemfile.lock
54
55
  - LICENSE.txt
@@ -64,11 +65,14 @@ files:
64
65
  - lib/google_ajax_crawler/page.rb
65
66
  - releases/google_ajax_crawler-0.1.0.gem
66
67
  - releases/google_ajax_crawler-0.1.1.gem
68
+ - releases/google_ajax_crawler-0.1.2.gem
69
+ - spec/fixtures/backbone.html
70
+ - spec/fixtures/simple_javascript.html
67
71
  - spec/integration/capybara_webkit_spec.rb
68
72
  - spec/spec_helper.rb
69
- - spec/support/page.html
70
73
  - spec/support/rack_app.rb
71
74
  - spec/unit/crawler_spec.rb
75
+ - spec/unit/drivers/driver_spec.rb
72
76
  - spec/unit/options_spec.rb
73
77
  - spec/unit/page_spec.rb
74
78
  homepage: http://github.com/benkitzelman/google-ajax-crawler