google_ajax_crawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     M2QxYzU1Yjc3YmEzYzRmYjg5MTdlMWViN2JiYWUyNjdiYWRmZmQ4Yg==
+   data.tar.gz: !binary |-
+     MDBmOTllZDkzYzg2NjE1ZmVjY2RiYzE4NzA1YjVhNzA4ZTNiMjE4ZA==
+ !binary "U0hBNTEy":
+   metadata.gz: !binary |-
+     OTg5Y2E4MGZjZTRlOWU0ZmRkNDY3OGRmMTMwYzkzZTc3MzIyNDViMTdiMGQy
+     ZmQ5Yjc1ZGM2ZDUwYTk4YjliYzYwYzg1MmY0MzY0ODliNGI5MWZlMTlhYjhm
+     Njg1MTI2MzAzOTY1Mzg2MGYyMmExNDM5YmI4Y2ZkYzBkYmU5MjI=
+   data.tar.gz: !binary |-
+     ZGYzZDVlMmZkOTY2YTU5MWZhNDRlNWQ5MzllMmFmMjVlMjBhYjk2MjM0NDZi
+     YmFkMzE4MDBlMTliYTUzOWE4MWZkNjRmNjA3NTZmOTg5NWE2M2RjNzRmMDA4
+     ZGYyOTc5ZjI2N2NjMTFhZTA4NTdjMDVjZGU2MzIxNTRlNDlkM2M=
data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,38 @@
+ .rvmrc
+ tmp
+ # rcov generated
+ coverage
+ coverage.data
+ 
+ # rdoc generated
+ rdoc
+ 
+ # yard generated
+ doc
+ .yardoc
+ 
+ # bundler
+ .bundle
+ 
+ # jeweler generated
+ pkg
+ 
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
+ #
+ # * Create a file at ~/.gitignore
+ # * Include files you want ignored
+ # * Run: git config --global core.excludesfile ~/.gitignore
+ #
+ # After doing this, these files will be ignored in all your git projects,
+ # saving you from having to 'pollute' every project you touch with them
+ #
+ # Not sure what needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line)
+ #
+ # For MacOS:
+ #
+ .DS_Store
+ 
+ # For TextMate
+ *.tmproj
+ tmtags
+ 
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,11 @@
+ source "https://rubygems.org"
+ 
+ gem 'capybara-webkit', '>= 0.10.0'
+ gem 'rack'
+ 
+ group :development, :test do
+   gem "simplecov"
+   gem 'rake'
+   gem 'rspec'
+   gem 'faraday'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,60 @@
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     capybara (2.0.2)
+       mime-types (>= 1.16)
+       nokogiri (>= 1.3.3)
+       rack (>= 1.0.0)
+       rack-test (>= 0.5.4)
+       selenium-webdriver (~> 2.0)
+       xpath (~> 1.0.0)
+     capybara-webkit (0.14.2)
+       capybara (~> 2.0, >= 2.0.2)
+       json
+     childprocess (0.3.9)
+       ffi (~> 1.0, >= 1.0.11)
+     diff-lcs (1.2.1)
+     faraday (0.8.6)
+       multipart-post (~> 1.1)
+     ffi (1.4.0)
+     json (1.7.7)
+     mime-types (1.21)
+     multi_json (1.6.1)
+     multipart-post (1.2.0)
+     nokogiri (1.5.6)
+     rack (1.5.2)
+     rack-test (0.6.2)
+       rack (>= 1.0)
+     rake (10.0.3)
+     rspec (2.13.0)
+       rspec-core (~> 2.13.0)
+       rspec-expectations (~> 2.13.0)
+       rspec-mocks (~> 2.13.0)
+     rspec-core (2.13.0)
+     rspec-expectations (2.13.0)
+       diff-lcs (>= 1.1.3, < 2.0)
+     rspec-mocks (2.13.0)
+     rubyzip (0.9.9)
+     selenium-webdriver (2.31.0)
+       childprocess (>= 0.2.5)
+       multi_json (~> 1.0)
+       rubyzip
+       websocket (~> 1.0.4)
+     simplecov (0.7.1)
+       multi_json (~> 1.0)
+       simplecov-html (~> 0.7.1)
+     simplecov-html (0.7.1)
+     websocket (1.0.7)
+     xpath (1.0.0)
+       nokogiri (~> 1.3)
+ 
+ PLATFORMS
+   ruby
+ 
+ DEPENDENCIES
+   capybara-webkit (>= 0.10.0)
+   faraday
+   rack
+   rake
+   rspec
+   simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 Ben Kitzelman
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,57 @@
+ # Google Ajax Crawler
+ 
+ Rack middleware adhering to the Google Ajax Crawling Scheme. It uses a headless browser to render JS-heavy pages and serves a DOM snapshot of the rendered state to a requesting search engine.
+ 
+ Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+ 
+ ## Using
+ 
+ Install
+ 
+ ``` sh
+ gem install google_ajax_crawler
+ ```
+ 
+ In your config.ru
+ 
+ ``` ruby
+ require 'google_ajax_crawler'
+ 
+ use GoogleAjaxCrawler::Crawler do |config|
+   config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
+ end
+ 
+ app = lambda {|env| [200, {'Content-Type' => 'text/plain'}, ["b"] ] }
+ run app
+ 
+ ```
+ 
+ ## Configuration Options
+ 
+ ### page_loaded_test
+ 
+ Tells the crawler when your page has finished loading / rendering. As determining when a page has completed rendering can depend on a number of qualitative factors (e.g. all ajax requests have responses, certain content has been displayed, or no loaders / spinners remain visible on the page), the page loaded test lets you specify when the crawler should decide that your page has finished rendering and return a snapshot of the rendered DOM at that point.
+ 
+ The current crawler driver is passed to the lambda to allow querying of the current page's DOM state.
+ 
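For illustration, a page loaded test can check more than one condition at once. This is a sketch only; the `jQuery.active` check and the `content` element id are assumptions about the host page, not something the gem provides:

``` ruby
use GoogleAjaxCrawler::Crawler do |config|
  # consider the page rendered once jQuery has no requests in flight
  # and a (hypothetical) #content element has appeared in the dom
  config.page_loaded_test = lambda do |driver|
    driver.page.evaluate_script('jQuery.active == 0 && document.getElementById("content") != null')
  end
end
```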
+ ### timeout
+ 
+ The maximum time (in seconds) the crawler should wait before returning a response.
+ 
+ ### driver
+ 
+ The configured driver used to query the current page state. Presently there is only one driver, CapybaraWebkit (pull requests welcome!).
+ 
+ ### poll_interval
+ 
+ How often (in seconds) to test the page state with the configured page_loaded_test.
+ 
+ ### response_headers
+ 
+ The response headers returned with the DOM snapshot. The default headers specify a Content-Type of text/html.
+ 
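Pulling the options together, a fuller configuration might look like the following. The values are illustrative; apart from the page-specific page_loaded_test they simply restate the defaults defined in lib/google_ajax_crawler/options.rb:

``` ruby
use GoogleAjaxCrawler::Crawler do |config|
  config.timeout          = 30                                           # max seconds to wait before returning a response
  config.poll_interval    = 0.5                                          # seconds between page_loaded_test evaluations
  config.driver           = GoogleAjaxCrawler::Drivers::CapybaraWebkit   # pass the class; it is instantiated for you
  config.response_headers = { 'Content-Type' => 'text/html' }            # headers returned with the dom snapshot
  config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
end
```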
+ ## License
+ 
+ All free - use, modify, and fork to your heart's content.
+ See LICENSE.txt for further details.
+ 
data/Rakefile ADDED
@@ -0,0 +1,12 @@
+ # encoding: utf-8
+ 
+ require 'rubygems'
+ require 'bundler'
+ begin
+   Bundler.setup(:default, :development)
+ rescue Bundler::BundlerError => e
+   $stderr.puts e.message
+   $stderr.puts "Run `bundle install` to install missing gems"
+   exit e.status_code
+ end
+ require 'rake'
data/google_ajax_crawler.gemspec ADDED
@@ -0,0 +1,15 @@
+ require './lib/google_ajax_crawler'
+ Gem::Specification.new do |s|
+   s.name = 'google_ajax_crawler'
+   s.version = GoogleAjaxCrawler.version
+   s.summary = 'Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search engines.'
+   s.description = 'Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS heavy pages and serve a dom snapshot of the rendered state to a requesting search engine.'
+   s.authors = ['Ben Kitzelman']
+   s.email = ['benkitzelman@gmail.com']
+   s.homepage = 'http://github.com/benkitzelman/google-ajax-crawler'
+   s.files = `git ls-files`.strip.split("\n")
+   s.executables = []
+ 
+   s.add_dependency 'capybara-webkit', '>= 0.10.0'
+   s.add_dependency 'rack'
+ end
data/lib/google_ajax_crawler.rb ADDED
@@ -0,0 +1,15 @@
+ require 'rack/utils'
+ require 'uri'
+ 
+ base_path = './lib/google_ajax_crawler'
+ 
+ require "#{base_path}/drivers/driver"
+ [base_path, "#{base_path}/drivers"].each do |folder|
+   Dir["#{folder}/*.rb"].each {|file| require file }
+ end
+ 
+ module GoogleAjaxCrawler
+   def self.version
+     "0.1.0"
+   end
+ end
data/lib/google_ajax_crawler/crawler.rb ADDED
@@ -0,0 +1,56 @@
+ 
+ module GoogleAjaxCrawler
+   class Crawler
+ 
+     class << self
+       def options
+         configure if @options.nil?
+         @options
+       end
+ 
+       def configure(&block)
+         @options = Options.new(self, &block)
+       end
+     end
+ 
+     def initialize(app = nil, &block)
+       @app = app
+       self.class.configure &block
+     end
+ 
+     def options
+       self.class.options
+     end
+ 
+     def call(env)
+       request = Rack::Request.new(env)
+       if is_search_engine?(request)
+         serve_crawlable_content_for request
+       else
+         @app.call(env)
+       end
+     end
+ 
+     protected
+ 
+     def is_search_engine?(request)
+       request.params.include? options.requested_route_key
+     end
+ 
+     def as_uri_with_fragment(url)
+       uri = URI.parse(url)
+       params = Rack::Utils.parse_query(uri.query).merge(search_engine: true)
+       uri.fragment = params.delete options.requested_route_key
+       uri.query = Rack::Utils.build_query params
+       uri
+     end
+ 
+     def serve_crawlable_content_for(request)
+       puts ' -- GOOGLE Ajax Web Crawler Request --'
+       html = GoogleAjaxCrawler::Page.read as_uri_with_fragment(request.url), options
+ 
+       [200, options.response_headers, [html]]
+     end
+ 
+   end
+ end
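For context, `as_uri_with_fragment` above is what turns a search engine's `_escaped_fragment_` request back into the fragment URL a browser would have used, before handing it to the driver. The following standalone sketch reproduces the same rewrite; the example URL and params are made up, and string keys stand in for the middleware's `search_engine: true` symbol key:

``` ruby
require 'uri'
require 'rack/utils'

# a search engine asks for the escaped form of a client side route...
crawler_url = 'http://example.com/?_escaped_fragment_=/products&page=2'

uri    = URI.parse(crawler_url)
params = Rack::Utils.parse_query(uri.query).merge('search_engine' => 'true')

# ...the escaped fragment becomes the url fragment again, and the remaining
# params (plus the search_engine marker) are rebuilt into the query string
uri.fragment = params.delete('_escaped_fragment_')
uri.query    = Rack::Utils.build_query(params)

puts uri.to_s  # => http://example.com/?page=2&search_engine=true#/products
```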
data/lib/google_ajax_crawler/drivers/capybara_webkit.rb ADDED
@@ -0,0 +1,28 @@
+ require "capybara"
+ require "capybara/dsl"
+ require "capybara-webkit"
+ 
+ module GoogleAjaxCrawler
+   module Drivers
+     class CapybaraWebkit < Driver
+       include Capybara::DSL
+ 
+       def initialize *args
+         super *args
+         configure
+       end
+ 
+       def default_page_loaded_test
+         (page.evaluate_script('$.active') == 0)
+       end
+ 
+       protected
+ 
+       def configure
+         Capybara.run_server = false
+         Capybara.current_driver = :webkit
+         Capybara.default_wait_time = options.timeout
+       end
+     end
+   end
+ end
data/lib/google_ajax_crawler/drivers/driver.rb ADDED
@@ -0,0 +1,50 @@
+ module GoogleAjaxCrawler
+   module Drivers
+     class Driver
+       attr_reader :options
+ 
+       def initialize(options)
+         @options = options
+       end
+ 
+       def visit(url)
+         raise "Driver Not Specified"
+       end
+ 
+       def default_page_loaded_test
+         raise "Driver Not Specified"
+       end
+ 
+       def html
+         raise "Driver Not Specified"
+       end
+ 
+       def get_content(uri)
+         puts "requesting: #{uri}"
+         visit uri.to_s
+ 
+         wait_until_page_is_fully_loaded
+         html
+       end
+ 
+       def is_page_loaded?
+         if options.page_loaded_test.nil?
+           default_page_loaded_test
+         else
+           options.page_loaded_test.call(self)
+         end
+       end
+ 
+       def wait_until_page_is_fully_loaded
+         begin
+           while !is_page_loaded?
+             sleep options.poll_interval
+           end
+         rescue
+           #...squelch
+           puts "Timeout: #{$!}"
+         end
+       end
+     end
+   end
+ end
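The base class above spells out the driver contract: `visit`, `html` and `default_page_loaded_test` must be supplied by a subclass, while `get_content` drives the polling loop. As a hypothetical illustration (this driver is not part of the gem, and without a headless browser it cannot see javascript-rendered content), a minimal subclass could look like:

``` ruby
require 'net/http'
require './lib/google_ajax_crawler'

class PlainHttpDriver < GoogleAjaxCrawler::Drivers::Driver
  # fetch the raw html for the given url
  def visit(url)
    @html = Net::HTTP.get(URI.parse(url))
  end

  # the snapshot handed back to the search engine
  def html
    @html
  end

  # no js executes here, so the page counts as loaded as soon as it is fetched
  def default_page_loaded_test
    true
  end
end

# wired in via the options block - note that the class, not an instance, is assigned:
#   use GoogleAjaxCrawler::Crawler do |config|
#     config.driver = PlainHttpDriver
#   end
```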
data/lib/google_ajax_crawler/options.rb ADDED
@@ -0,0 +1,22 @@
+ module GoogleAjaxCrawler
+   class Options
+     attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers
+ 
+     def initialize(app, &block)
+       @driver = Drivers::CapybaraWebkit.new(self)
+       @timeout = 30
+       @requested_route_key = '_escaped_fragment_'
+       @response_headers = { 'Content-Type' => 'text/html' }
+       @poll_interval = 0.5
+ 
+       instance_exec(self, &block) unless block.nil?
+ 
+       @app = app
+     end
+ 
+     def driver=(klass)
+       @driver = klass.new(self)
+     end
+ 
+   end
+ end
data/lib/google_ajax_crawler/page.rb ADDED
@@ -0,0 +1,18 @@
+ module GoogleAjaxCrawler
+   class Page
+     attr_reader :options
+ 
+     def self.read(uri, options)
+       page = Page.new(options)
+       page.get_page_content(uri)
+     end
+ 
+     def initialize(options)
+       @options = options
+     end
+ 
+     def get_page_content(uri = URI.parse("http://localhost"))
+       options.driver.get_content uri
+     end
+   end
+ end
data/spec/integration/capybara_webkit_spec.rb ADDED
@@ -0,0 +1,35 @@
+ require './spec/spec_helper'
+ 
+ describe 'CapybaraWebkit driver' do
+   let(:host) { "http://localhost:#{RackApp.port}/"}
+   let(:browser_route) { "#{host}#test" }
+   let(:snapshot_route) { "#{host}?_escaped_fragment_=test" }
+ 
+   before(:all) do
+     RackApp.configure_crawler do |config|
+       config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
+       config.poll_interval = 0.25
+       config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
+     end
+ 
+     RackApp.start
+   end
+ 
+   after(:all) do
+     RackApp.stop
+   end
+ 
+   describe 'when a browser requests a client side route (i.e.: /#my_route)' do
+     it 'should not serve a snapshot of the dom' do
+       response = Faraday.get browser_route
+       response.body.should_not =~ /Javascript rendering complete for client-side route #test/
+     end
+   end
+ 
+   describe 'when an ajax crawler requests a snapshot of a client side route' do
+     it 'should serve a snapshot of the dom that includes js rendered components' do
+       response = Faraday.get snapshot_route
+       response.body.should =~ /Javascript rendering complete for client-side route #test/
+     end
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,10 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ require './lib/google_ajax_crawler'
+ require 'faraday'
+ 
+ here = File.dirname __FILE__
+ Dir["#{here}/support/*.rb"].each {|file| require file }
+ 
+ class MockDriver < GoogleAjaxCrawler::Drivers::Driver; end
+ 
data/spec/support/page.html ADDED
@@ -0,0 +1,28 @@
+ <html>
+   <head></head>
+   <body>
+     <h1>A Simple State Test</h1>
+     <p>State: <span id='page_state'></span></p>
+     <div class='loading' id='loading'>Loading....</div>
+ 
+     <script type='text/javascript'>
+ 
+       var init = function() {
+         var writeHash = function() {
+           document.getElementById('page_state').innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;
+           var loadingMask = document.getElementById('loading');
+           loadingMask.parentNode.removeChild(loadingMask);
+           console.log('done...');
+         };
+ 
+         setTimeout(writeHash, 500);
+       };
+ 
+       //
+       // Only execute js if loading the page using an unescaped url
+       //
+       if(/#.*$/.test(document.location.href)) init();
+ 
+     </script>
+   </body>
+ </html>
data/spec/support/rack_app.rb ADDED
@@ -0,0 +1,87 @@
+ require 'rack'
+ 
+ class RackApp
+ 
+   def app
+     page_content = File.read('./spec/support/page.html')
+     Rack::Builder.new do
+ 
+       use GoogleAjaxCrawler::Crawler do |c|
+         RackApp.crawler_configuration.call(c)
+       end
+ 
+       map '/' do
+         run lambda {|env| [200, { 'Content-Type' => 'text/html' }, [page_content]] }
+       end
+     end
+   end
+ 
+   class << self
+     attr_reader :crawler_configuration
+ 
+     def app
+       (@app ||= RackApp.new).app
+     end
+ 
+     def configure_crawler(&block)
+       @crawler_configuration = block
+     end
+ 
+     def port
+       9999
+     end
+ 
+     def start
+       setup!
+       pid = Process.fork
+       if pid.nil?
+         Rack::Server.start(:app => app, :Port => port)
+         sleep 1
+       else
+         File.open(pidfile, 'w') { |f| f.write pid }
+         trap("SIGINT") { stop }
+         Process.detach pid
+       end
+     end
+ 
+     def stop
+       return unless running?
+ 
+       Process.kill 9, pid
+       File.delete pidfile
+       puts "[Stopped rack app...]"
+     end
+ 
+     def setup!
+       FileUtils.mkpath(File.dirname(pidfile))
+       FileUtils.mkpath(File.dirname(logfile))
+     end
+ 
+     def pidfile
+       "tmp/server.pid"
+     end
+ 
+     def logfile
+       "tmp/server.log"
+     end
+ 
+     def pid
+       running? ? File.read(pidfile).to_i : 0
+     end
+ 
+     def running?
+       File.exists?(pidfile)
+     end
+ 
+     def restart
+       stop if running?
+       start
+     end
+ 
+     def log_to_file
+       log = File.new RackApp.logfile, "a"
+       STDOUT.reopen log
+       STDERR.reopen log
+     end
+   end
+ end
data/spec/unit/crawler_spec.rb ADDED
@@ -0,0 +1,54 @@
+ require './spec/spec_helper'
+ 
+ describe GoogleAjaxCrawler::Crawler do
+   before(:each) do
+     GoogleAjaxCrawler::Crawler.configure do |config|
+       config.page_loaded_test = lambda{ page.find('.loading', count: 0) }
+     end
+   end
+ 
+   shared_examples 'a crawler configurer' do |method, *args|
+     it 'and facilitate the setting of crawler options' do
+       GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
+         config.timeout = 10
+         config.driver = MockDriver
+       end
+ 
+       GoogleAjaxCrawler::Crawler.options.timeout.should == 10
+       GoogleAjaxCrawler::Crawler.options.driver.should be_instance_of(MockDriver)
+     end
+   end
+ 
+   context 'configure' do
+     it_should_behave_like 'a crawler configurer', :configure
+   end
+ 
+   context 'initialize' do
+     it_should_behave_like 'a crawler configurer', :new, nil
+   end
+ 
+   context 'call' do
+     let(:app) { double(:app) }
+     let(:request) { {
+       'HTTP_METHOD' => 'GET',
+       'HTTP_HOST' => 'test.com',
+       'PATH_INFO' => '/test',
+       'QUERY_STRING' => 'some_key=some_val',
+       'rack.url_scheme' => 'http',
+       "rack.input" => :blah
+     } }
+     let(:search_engine_request) { request.merge('QUERY_STRING' => '_escaped_fragment_=test') }
+     let(:crawler) { GoogleAjaxCrawler::Crawler.new app }
+ 
+     it 'should delegate non snapshot requests to the configured app' do
+       app.should_receive(:call).with request
+       crawler.call request
+     end
+ 
+     it 'should generate a rendered snapshot on search engine requests' do
+       GoogleAjaxCrawler::Page.stub(:read).and_return :wibble
+       response = crawler.call search_engine_request
+       response.should == [200, GoogleAjaxCrawler::Crawler.options.response_headers, [:wibble]]
+     end
+   end
+ end
data/spec/unit/options_spec.rb ADDED
@@ -0,0 +1,35 @@
+ require './spec/spec_helper'
+ describe GoogleAjaxCrawler::Options do
+   context 'initialize' do
+     let(:app) { Class.new }
+ 
+     it 'should set default values' do
+       options = GoogleAjaxCrawler::Options.new(app)
+ 
+       options.timeout.should == 30
+       options.requested_route_key.should == '_escaped_fragment_'
+       options.response_headers.should == { 'Content-Type' => 'text/html' }
+       options.poll_interval.should == 0.5
+       options.driver.should be_a(GoogleAjaxCrawler::Drivers::CapybaraWebkit)
+       options.page_loaded_test.should be_nil
+     end
+ 
+     it 'should allow default overrides within block scope' do
+       options = GoogleAjaxCrawler::Options.new(app) do |config|
+         config.requested_route_key = :some_other_key
+         config.page_loaded_test = :some_other_page_loaded_test
+         config.poll_interval = 7000
+         config.response_headers = :some_other_headers
+         config.timeout = 20
+         config.driver = MockDriver
+       end
+ 
+       options.requested_route_key.should == :some_other_key
+       options.page_loaded_test.should == :some_other_page_loaded_test
+       options.poll_interval.should == 7000
+       options.response_headers.should == :some_other_headers
+       options.timeout.should == 20
+       options.driver.should be_instance_of(MockDriver)
+     end
+   end
+ end
data/spec/unit/page_spec.rb ADDED
@@ -0,0 +1,14 @@
+ require './spec/spec_helper'
+ describe GoogleAjaxCrawler::Page do
+   context 'read' do
+     let(:uri) { URI.parse('http://www.test.com') }
+     let(:options) { double(:options) }
+ 
+     it 'should ask the driver to fetch content from a given uri' do
+       options.stub_chain(:driver, :get_content).with(uri).and_return :wibble
+       content = GoogleAjaxCrawler::Page.read(uri, options)
+       content.should == :wibble
+     end
+ 
+   end
+ end
metadata ADDED
@@ -0,0 +1,97 @@
+ --- !ruby/object:Gem::Specification
+ name: google_ajax_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Ben Kitzelman
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-03-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: capybara-webkit
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.10.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.10.0
+ - !ruby/object:Gem::Dependency
+   name: rack
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Rack Middleware adhering to the Google Ajax Crawling Scheme, using a
+   headless browser to render JS heavy pages and serve a dom snapshot of the rendered
+   state to a requesting search engine.
+ email:
+ - benkitzelman@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .document
+ - .gitignore
+ - .rspec
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - google_ajax_crawler.gemspec
+ - lib/google_ajax_crawler.rb
+ - lib/google_ajax_crawler/crawler.rb
+ - lib/google_ajax_crawler/drivers/capybara_webkit.rb
+ - lib/google_ajax_crawler/drivers/driver.rb
+ - lib/google_ajax_crawler/options.rb
+ - lib/google_ajax_crawler/page.rb
+ - spec/integration/capybara_webkit_spec.rb
+ - spec/spec_helper.rb
+ - spec/support/page.html
+ - spec/support/rack_app.rb
+ - spec/unit/crawler_spec.rb
+ - spec/unit/options_spec.rb
+ - spec/unit/page_spec.rb
+ homepage: http://github.com/benkitzelman/google-ajax-crawler
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.3
+ signing_key:
+ specification_version: 4
+ summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your
+   JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search
+   engines.
+ test_files: []