google_ajax_crawler 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NzMwMTEyOWEwMzg3OWNiMTE1MzU1NzdjNDdkZGE5ZDE1YzNiMDk5NQ==
4
+ Nzk1MTI4M2Y1ZTgyZmEyYzIyNWVlM2ExYjlhNmQxMTYwYzI3MGU0OQ==
5
5
  data.tar.gz: !binary |-
6
- MGNkNzYyMDg4ZWZjMmRjMDJhM2JiMmE2NTg0MTkyOWY2MDRmMWYyMg==
6
+ N2Y0ZDhjZmFiOGJkYTQwNDA4Y2JlYzQyNGY3OGU1OGYwNzM3YThkZA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- OGEyYTY5MWQwYzhkY2NiNjlmZWFmM2JiMjYwY2ZkNDQ0M2VmMjM2ZTMyNzhl
10
- NjFjNmRkYWE2M2JkMjhhYTZjZmIyY2RkMzI2MzQ5NGNjOGQ5NWUyOTdjMDg3
11
- OTAzNDE3NDgwYTAyMWM1ZDgzMGM0YWZlYTIwMzBiNTk5M2VjMGE=
9
+ YjYwY2Q0Y2I4NzQwNmYzYjBkYTc0NDljNDJjY2E2MDJlYjRkZTFjNzg5NjBk
10
+ ZmFjZjQxMGYzNTgyYjBmNDIzMTQ5ODlkM2U4NDhlMTk1YWQ2NzZjODAyYWQx
11
+ M2VmNTRjZGI2ODhmYjY4MDNiZTdlNDY1MDQyN2FkMTA0MTJlZWM=
12
12
  data.tar.gz: !binary |-
13
- MDc1ODY4MTI2YzRhNTNiNmQ2YzcwNjliMjA2NTFmMzczM2ZmYmI0NzQxNzI1
14
- NGVlODA0NjQ4YmQ4ZmIwYTNkZGZlYzQzNmMwNDNhYjhmOTgyM2NkMjQ5N2Ew
15
- YzRmNTU2ZDRhYTE2MWRkNjk0MTg1OTU2OTM2MzY5ODA5ODIwMDM=
13
+ MDg0MWM3ZmRiODk1NjM1MTk3MWRmYjFhNDhhOThlOGJlN2FjMTJjZjY0OTNi
14
+ ZjQwYjAzNWI5MTI3N2Q2MTJkNGZhMjcyN2YwYTU2ZGQ4ZjM0MmVjMmVkZmJi
15
+ Nzg5MDY0ZDJlYTlhNTRlZmQzODUxYmFiZmIwNWQ2Y2NlNzY3NzM=
data/.travis.yml ADDED
@@ -0,0 +1,16 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - "1.9.2"
5
+ - "1.9.3"
6
+ - "2.0.0"
7
+
8
+ env:
9
+ - RACK_ENV=ci
10
+
11
+ before_install:
12
+ - sh -e /etc/init.d/xvfb start
13
+ - echo "Started xvfb..."
14
+
15
+ script:
16
+ - DISPLAY=:99.0 bundle exec rake
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Google Ajax Crawler
2
2
 
3
+ [![Build Status](https://travis-ci.org/benkitzelman/google-ajax-crawler.png)](https://travis-ci.org/benkitzelman/google-ajax-crawler)
4
+ [![Gem Version](https://badge.fury.io/rb/google_ajax_crawler.png)](http://badge.fury.io/rb/google_ajax_crawler)
5
+
3
6
  Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS-heavy pages and serve a DOM snapshot of the rendered state to a requesting search engine.
4
7
 
5
8
  Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
@@ -44,6 +47,16 @@ Tell the crawler when your page has finished loading / rendering. As determining
44
47
 
45
48
  The current crawler driver is passed to the lambda to allow querying of the current page's dom state.
46
49
 
50
+ A good pattern is to test your page state in a JS function that returns a boolean and is accessible from the window context, e.g.:
51
+
52
+ ```ruby
53
+
54
+ use GoogleAjaxCrawler::Crawler do |config|
55
+ config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('myApp.isPageLoaded()') }
56
+ end
57
+
58
+ ```
59
+
47
60
  ### timeout
48
61
 
49
62
  The max time the crawler should wait before returning a response
data/Rakefile CHANGED
@@ -1,12 +1,15 @@
1
- # encoding: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
1
+ $:.unshift File.join(File.dirname(__FILE__), 'lib')
2
+
3
+ require 'bundler/setup'
4
+ require 'rspec/core/rake_task'
5
+
6
+ desc "Run specifications"
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task :ci do
10
+ puts 'running tests on CI Server....'
11
+ system("export DISPLAY=:99.0 && bundle exec rake spec")
12
+ raise "rake spec failed!" unless $?.exitstatus == 0
11
13
  end
12
- require 'rake'
14
+
15
+ task :default => :ci
@@ -1,11 +1,34 @@
1
+ #
2
+ # to run:
3
+ # $ rackup examples/capybara_webkit.ru -p 3000
4
+ # open browser to http://localhost:3000/#!test
5
+ #
1
6
  require 'bundler/setup'
2
7
  require './lib/google_ajax_crawler'
3
8
 
4
9
  use GoogleAjaxCrawler::Crawler do |config|
5
- config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
6
- config.poll_interval = 0.25
7
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
10
+ config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
11
+ config.poll_interval = 0.25
12
+ config.timeout = 5
13
+
14
+ #
15
+ # for the demo - the page is considered loaded when the loading mask has been removed from the DOM
16
+ # this could evaluate something like $.active == 0 to ensure no jquery ajax calls are pending
17
+ #
18
+ config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
8
19
  end
9
20
 
10
- page_content = File.read('./spec/support/page.html')
11
- run lambda {|env| [200, { 'Content-Type' => 'text/html' }, [page_content]] }
21
+ #
22
+ # a sample page using #! url fragments to seed page state
23
+ #
24
+ app = lambda do |env|
25
+ page_content = case env['PATH_INFO']
26
+ when /\/backbone(\/)?/
27
+ File.read('./spec/fixtures/backbone.html')
28
+ else
29
+ File.read('./spec/fixtures/simple_javascript.html')
30
+ end
31
+
32
+ [200, { 'Content-Type' => 'text/html' }, [page_content]]
33
+ end
34
+ run app
@@ -1,5 +1,4 @@
1
- $LOAD_PATH << './lib'
2
- require 'google_ajax_crawler'
1
+ require './lib/google_ajax_crawler'
3
2
  Gem::Specification.new do |s|
4
3
  s.name = 'google_ajax_crawler'
5
4
  s.version = GoogleAjaxCrawler.version
@@ -13,4 +12,4 @@ Gem::Specification.new do |s|
13
12
 
14
13
  s.add_dependency 'capybara-webkit', '>= 0.10.0'
15
14
  s.add_dependency 'rack'
16
- end
15
+ end
@@ -1,14 +1,19 @@
1
1
  module GoogleAjaxCrawler
2
2
  class << self
3
+ def env
4
+ (ENV['RACK_ENV'] || 'development').to_sym
5
+ end
6
+
3
7
  def version
4
- "0.1.2"
8
+ "0.1.3"
5
9
  end
6
10
  end
7
11
  end
8
12
 
13
+ here = File.dirname(__FILE__)
9
14
  require 'uri'
10
- require 'google_ajax_crawler/crawler'
11
- require 'google_ajax_crawler/options'
12
- require 'google_ajax_crawler/page'
13
- require 'google_ajax_crawler/drivers/driver'
14
- require 'google_ajax_crawler/drivers/capybara_webkit'
15
+ require "#{here}/google_ajax_crawler/crawler"
16
+ require "#{here}/google_ajax_crawler/options"
17
+ require "#{here}/google_ajax_crawler/page"
18
+ require "#{here}/google_ajax_crawler/drivers/driver"
19
+ require "#{here}/google_ajax_crawler/drivers/capybara_webkit"
@@ -24,11 +24,9 @@ module GoogleAjaxCrawler
24
24
 
25
25
  def call(env)
26
26
  request = Rack::Request.new(env)
27
- if is_search_engine?(request)
28
- serve_crawlable_content_for request
29
- else
30
- @app.call(env)
31
- end
27
+ return serve_crawlable_content_for(request) if is_search_engine?(request)
28
+
29
+ @app.call(env)
32
30
  end
33
31
 
34
32
  protected
@@ -1,3 +1,4 @@
1
+ require 'timeout'
1
2
  module GoogleAjaxCrawler
2
3
  module Drivers
3
4
  class Driver
@@ -20,10 +21,16 @@ module GoogleAjaxCrawler
20
21
  end
21
22
 
22
23
  def get_content(uri)
23
- puts "requesting: #{uri}"
24
- visit uri.to_s
25
-
26
- wait_until_page_is_fully_loaded
24
+ begin
25
+ puts "::requesting: #{uri}"
26
+ visit uri.to_s
27
+ wait_until_page_is_fully_loaded
28
+ rescue Timeout::Error
29
+ puts "-- Page Rendering Timed out: --\n"\
30
+ "Either your page_loaded_test didn't successfully detect when your page had loaded, \n"\
31
+ "or your page took longer than #{options.timeout} seconds to load \n"\
32
+ "-- Returning page snapshot in its present state --"
33
+ end
27
34
  html
28
35
  end
29
36
 
@@ -31,18 +38,20 @@ module GoogleAjaxCrawler
31
38
  if options.page_loaded_test.nil?
32
39
  default_page_loaded_test
33
40
  else
34
- options.page_loaded_test.call(self)
41
+ options.page_loaded_test.call self
35
42
  end
36
43
  end
37
44
 
38
45
  def wait_until_page_is_fully_loaded
39
- begin
40
- while !is_page_loaded?
41
- sleep options.poll_interval
46
+ Timeout::timeout(options.timeout) do
47
+ begin
48
+ while !is_page_loaded?
49
+ sleep options.poll_interval
50
+ end
51
+ rescue
52
+ #...squelch
53
+ puts "Exception: #{$!}"
42
54
  end
43
- rescue
44
- #...squelch
45
- puts "Timeout: #{$!}"
46
55
  end
47
56
  end
48
57
  end
@@ -0,0 +1,61 @@
1
+ <html>
2
+ <head>
3
+ <script src='http://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.4.4/underscore-min.js'></script>
4
+ <script src='http://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js'></script>
5
+ <script src='http://cdnjs.cloudflare.com/ajax/libs/backbone.js/1.0.0/backbone-min.js'></script>
6
+ </head>
7
+ <body>
8
+
9
+ <div id='app'>
10
+ <div id='loading'>Loading...</div>
11
+ </div>
12
+
13
+ <script type='text/javascript'>
14
+
15
+ var View = Backbone.View.extend({
16
+ className: 'backbone-view',
17
+ template: _.template(
18
+ '<h1>Backbone Example</h1><p>This template was rendered by backbone js</p>'
19
+ ),
20
+
21
+ render: function(){
22
+ var app = document.getElementById('app');
23
+ var _this = this;
24
+ this.el.innerHTML = _this.template();
25
+ return this;
26
+ }
27
+ });
28
+
29
+ var Router = Backbone.Router.extend({
30
+ routes: {
31
+ "*anything": "home"
32
+ },
33
+
34
+ home: function() {
35
+ el = document.getElementById('app');
36
+ var view = new View;
37
+
38
+ // simulate fetching data / async operation
39
+ _.delay(function() {
40
+ el.replaceChild(view.render().el, el.children[0]);
41
+ }, 1500);
42
+ }
43
+ });
44
+
45
+ function App() {
46
+ new Router();
47
+ Backbone.history.start();
48
+ };
49
+
50
+ App.prototype.pageHasLoaded = function() {
51
+ return !!document.getElementsByClassName('backbone-view').length;
52
+ };
53
+
54
+
55
+ $(function() {
56
+ window.app = new App();
57
+ });
58
+
59
+ </script>
60
+ </body>
61
+ </html>
@@ -7,7 +7,8 @@
7
7
 
8
8
  <script type='text/javascript'>
9
9
 
10
- var init = function() {
10
+
11
+ function App() {
11
12
  var writeHash = function() {
12
13
  document.getElementById('page_state').innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;
13
14
  var loadingMask = document.getElementById('loading');
@@ -19,10 +20,14 @@
19
20
  setTimeout(writeHash, 500);
20
21
  };
21
22
 
23
+ App.prototype.pageHasLoaded = function() {
24
+ return !document.getElementById('loading');
25
+ };
26
+
22
27
  //
23
28
  // Only execute js if loading the page using an unescaped url
24
29
  //
25
- if(/#.*$/.test(document.location.href)) init();
30
+ if(/#.*$/.test(document.location.href)) window.app = new App();
26
31
 
27
32
  </script>
28
33
  </body>
@@ -9,7 +9,7 @@ describe 'CapybaraWebkit driver' do
9
9
  RackApp.configure_crawler do |config|
10
10
  config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
11
11
  config.poll_interval = 0.25
12
- config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
12
+ config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
13
13
  end
14
14
 
15
15
  RackApp.start
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,5 @@
1
+ ENV["RACK_ENV"] ||= "test"
2
+
1
3
  require 'rubygems'
2
4
  require 'bundler/setup'
3
5
  require './lib/google_ajax_crawler'
@@ -3,7 +3,7 @@ require 'rack'
3
3
  class RackApp
4
4
 
5
5
  def app
6
- page_content = File.read('./spec/support/page.html')
6
+ page_content = File.read('./spec/fixtures/simple_javascript.html')
7
7
  Rack::Builder.new do
8
8
 
9
9
  use GoogleAjaxCrawler::Crawler do |c|
@@ -0,0 +1,19 @@
1
+ require './spec/spec_helper'
2
+
3
+ describe GoogleAjaxCrawler::Drivers::Driver do
4
+ let(:options) do
5
+ GoogleAjaxCrawler::Options.new(nil) do |o|
6
+ o.timeout = 0.05
7
+ o.page_loaded_test = lambda {|d| false }
8
+ end
9
+ end
10
+
11
+ describe '#wait_until_page_is_fully_loaded' do
12
+ it 'should raise a Timeout Exception when timeout limit reached' do
13
+ expect do
14
+ driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
15
+ driver.wait_until_page_is_fully_loaded
16
+ end.to raise_error(Timeout::Error)
17
+ end
18
+ end
19
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_ajax_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Kitzelman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-16 00:00:00.000000000 Z
11
+ date: 2013-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara-webkit
@@ -49,6 +49,7 @@ extra_rdoc_files: []
49
49
  files:
50
50
  - .gitignore
51
51
  - .rspec
52
+ - .travis.yml
52
53
  - Gemfile
53
54
  - Gemfile.lock
54
55
  - LICENSE.txt
@@ -64,11 +65,14 @@ files:
64
65
  - lib/google_ajax_crawler/page.rb
65
66
  - releases/google_ajax_crawler-0.1.0.gem
66
67
  - releases/google_ajax_crawler-0.1.1.gem
68
+ - releases/google_ajax_crawler-0.1.2.gem
69
+ - spec/fixtures/backbone.html
70
+ - spec/fixtures/simple_javascript.html
67
71
  - spec/integration/capybara_webkit_spec.rb
68
72
  - spec/spec_helper.rb
69
- - spec/support/page.html
70
73
  - spec/support/rack_app.rb
71
74
  - spec/unit/crawler_spec.rb
75
+ - spec/unit/drivers/driver_spec.rb
72
76
  - spec/unit/options_spec.rb
73
77
  - spec/unit/page_spec.rb
74
78
  homepage: http://github.com/benkitzelman/google-ajax-crawler