google_ajax_crawler 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     M2QxYzU1Yjc3YmEzYzRmYjg5MTdlMWViN2JiYWUyNjdiYWRmZmQ4Yg==
+   data.tar.gz: !binary |-
+     MDBmOTllZDkzYzg2NjE1ZmVjY2RiYzE4NzA1YjVhNzA4ZTNiMjE4ZA==
+ !binary "U0hBNTEy":
+   metadata.gz: !binary |-
+     OTg5Y2E4MGZjZTRlOWU0ZmRkNDY3OGRmMTMwYzkzZTc3MzIyNDViMTdiMGQy
+     ZmQ5Yjc1ZGM2ZDUwYTk4YjliYzYwYzg1MmY0MzY0ODliNGI5MWZlMTlhYjhm
+     Njg1MTI2MzAzOTY1Mzg2MGYyMmExNDM5YmI4Y2ZkYzBkYmU5MjI=
+   data.tar.gz: !binary |-
+     ZGYzZDVlMmZkOTY2YTU5MWZhNDRlNWQ5MzllMmFmMjVlMjBhYjk2MjM0NDZi
+     YmFkMzE4MDBlMTliYTUzOWE4MWZkNjRmNjA3NTZmOTg5NWE2M2RjNzRmMDA4
+     ZGYyOTc5ZjI2N2NjMTFhZTA4NTdjMDVjZGU2MzIxNTRlNDlkM2M=
data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,38 @@
+ .rvmrc
+ tmp
+ # rcov generated
+ coverage
+ coverage.data
+
+ # rdoc generated
+ rdoc
+
+ # yard generated
+ doc
+ .yardoc
+
+ # bundler
+ .bundle
+
+ # jeweler generated
+ pkg
+
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
+ #
+ # * Create a file at ~/.gitignore
+ # * Include files you want ignored
+ # * Run: git config --global core.excludesfile ~/.gitignore
+ #
+ # After doing this, these files will be ignored in all your git projects,
+ # saving you from having to 'pollute' every project you touch with them
+ #
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
+ #
+ # For MacOS:
+ #
+ .DS_Store
+
+ # For TextMate
+ *.tmproj
+ tmtags
+
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,11 @@
+ source "https://rubygems.org"
+
+ gem 'capybara-webkit', '>= 0.10.0'
+ gem 'rack'
+
+ group :development, :test do
+   gem "simplecov"
+   gem 'rake'
+   gem 'rspec'
+   gem 'faraday'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,60 @@
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     capybara (2.0.2)
+       mime-types (>= 1.16)
+       nokogiri (>= 1.3.3)
+       rack (>= 1.0.0)
+       rack-test (>= 0.5.4)
+       selenium-webdriver (~> 2.0)
+       xpath (~> 1.0.0)
+     capybara-webkit (0.14.2)
+       capybara (~> 2.0, >= 2.0.2)
+       json
+     childprocess (0.3.9)
+       ffi (~> 1.0, >= 1.0.11)
+     diff-lcs (1.2.1)
+     faraday (0.8.6)
+       multipart-post (~> 1.1)
+     ffi (1.4.0)
+     json (1.7.7)
+     mime-types (1.21)
+     multi_json (1.6.1)
+     multipart-post (1.2.0)
+     nokogiri (1.5.6)
+     rack (1.5.2)
+     rack-test (0.6.2)
+       rack (>= 1.0)
+     rake (10.0.3)
+     rspec (2.13.0)
+       rspec-core (~> 2.13.0)
+       rspec-expectations (~> 2.13.0)
+       rspec-mocks (~> 2.13.0)
+     rspec-core (2.13.0)
+     rspec-expectations (2.13.0)
+       diff-lcs (>= 1.1.3, < 2.0)
+     rspec-mocks (2.13.0)
+     rubyzip (0.9.9)
+     selenium-webdriver (2.31.0)
+       childprocess (>= 0.2.5)
+       multi_json (~> 1.0)
+       rubyzip
+       websocket (~> 1.0.4)
+     simplecov (0.7.1)
+       multi_json (~> 1.0)
+       simplecov-html (~> 0.7.1)
+     simplecov-html (0.7.1)
+     websocket (1.0.7)
+     xpath (1.0.0)
+       nokogiri (~> 1.3)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   capybara-webkit (>= 0.10.0)
+   faraday
+   rack
+   rake
+   rspec
+   simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 Ben Kitzelman
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,57 @@
+ # Google Ajax Crawler
+
+ Rack middleware adhering to the Google Ajax Crawling Scheme: it uses a headless browser to render JS-heavy pages and serves a DOM snapshot of the rendered state to a requesting search engine.
+
+ Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+
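(An illustrative example of the scheme, not part of the gem's README: a browser renders a client-side route such as http://localhost:9999/#test, while a conforming crawler requests http://localhost:9999/?_escaped_fragment_=test instead; this middleware intercepts the latter form and responds with the rendered DOM, as exercised in the integration spec further down.)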
+ ## Using
+
+ Install:
+
+ ```
+ gem install google_ajax_crawler
+ ```
+
+ In your config.ru:
+
+ ``` ruby
+ require 'google_ajax_crawler'
+
+ use GoogleAjaxCrawler::Crawler do |config|
+   config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
+ end
+
+ app = lambda {|env| [200, {'Content-Type' => 'text/plain'}, ["b"] ] }
+ run app
+
+ ```
+
+ ## Configuration Options
+
+ ### page_loaded_test
+
+ Tells the crawler when your page has finished loading / rendering. Because completion can depend on a number of qualitative factors (e.g. all ajax requests have responses, certain content has been displayed, or no loaders / spinners remain visible on the page), the page loaded test lets you specify exactly when the crawler should consider your page rendered and return a snapshot of the DOM at that time.
+
+ The current crawler driver is passed to the lambda to allow querying of the current page's DOM state.
+
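The lambda receives the configured driver; with the bundled CapybaraWebkit driver, `driver.page` exposes the Capybara session. A couple of illustrative tests follow (sketches for this document, not taken from the gem's README; the `#page_state` element and jQuery check are assumptions about the page being crawled):

``` ruby
use GoogleAjaxCrawler::Crawler do |config|
  # Treat the page as rendered once a marker element has appeared...
  config.page_loaded_test = lambda { |driver| driver.page.has_css?('#page_state') }

  # ...or, if the page uses jQuery, once no ajax requests remain in flight:
  # config.page_loaded_test = lambda { |driver| driver.page.evaluate_script('jQuery.active') == 0 }
end
```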
+ ### timeout
+
+ The maximum time (in seconds) the crawler should wait for the page to render before returning a response.
+
+ ### driver
+
+ The configured driver used to query the current page state. Presently there is only one driver (pull requests welcome!): CapybaraWebkit.
+
+ ### poll_interval
+
+ How often (in seconds) to test the page state with the configured page_loaded_test.
+
+ ### response_headers
+
+ Which response headers should be returned with the DOM snapshot. The default headers specify the content type text/html.
+
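Putting the options together, a config.ru using every documented setting might look like the sketch below. This is illustrative only (not part of the gem's README); the values shown match the defaults defined in lib/google_ajax_crawler/options.rb further down:

``` ruby
require 'google_ajax_crawler'

use GoogleAjaxCrawler::Crawler do |config|
  config.driver           = GoogleAjaxCrawler::Drivers::CapybaraWebkit   # currently the only bundled driver
  config.timeout          = 30                                           # seconds to wait for the page to render
  config.poll_interval    = 0.5                                          # seconds between page_loaded_test polls
  config.response_headers = { 'Content-Type' => 'text/html' }            # headers returned with the snapshot
  config.page_loaded_test = lambda { |driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
end

run lambda { |env| [200, { 'Content-Type' => 'text/html' }, ['<html>...</html>']] }
```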
+ ## License
+
+ All free - use, modify, and fork to your heart's content.
+ See LICENSE.txt for further details.
+
data/Rakefile ADDED
@@ -0,0 +1,12 @@
+ # encoding: utf-8
+
+ require 'rubygems'
+ require 'bundler'
+ begin
+   Bundler.setup(:default, :development)
+ rescue Bundler::BundlerError => e
+   $stderr.puts e.message
+   $stderr.puts "Run `bundle install` to install missing gems"
+   exit e.status_code
+ end
+ require 'rake'
data/google_ajax_crawler.gemspec ADDED
@@ -0,0 +1,15 @@
+ require './lib/google_ajax_crawler'
+ Gem::Specification.new do |s|
+   s.name = 'google_ajax_crawler'
+   s.version = GoogleAjaxCrawler.version
+   s.summary = 'Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search engines.'
+   s.description = 'Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS heavy pages and serve a dom snapshot of the rendered state to a requesting search engine.'
+   s.authors = ['Ben Kitzelman']
+   s.email = ['benkitzelman@gmail.com']
+   s.homepage = 'http://github.com/benkitzelman/google-ajax-crawler'
+   s.files = `git ls-files`.strip.split("\n")
+   s.executables = []
+
+   s.add_dependency 'capybara-webkit', '>= 0.10.0'
+   s.add_dependency 'rack'
+ end
data/lib/google_ajax_crawler.rb ADDED
@@ -0,0 +1,15 @@
+ require 'rack/utils'
+ require 'uri'
+
+ base_path = './lib/google_ajax_crawler'
+
+ require "#{base_path}/drivers/driver"
+ [base_path, "#{base_path}/drivers"].each do |folder|
+   Dir["#{folder}/*.rb"].each {|file| require file }
+ end
+
+ module GoogleAjaxCrawler
+   def self.version
+     "0.1.0"
+   end
+ end
data/lib/google_ajax_crawler/crawler.rb ADDED
@@ -0,0 +1,56 @@
+
+ module GoogleAjaxCrawler
+   class Crawler
+
+     class << self
+       def options
+         configure if @options.nil?
+         @options
+       end
+
+       def configure(&block)
+         @options = Options.new(self, &block)
+       end
+     end
+
+     def initialize(app = nil, &block)
+       @app = app
+       self.class.configure &block
+     end
+
+     def options
+       self.class.options
+     end
+
+     def call(env)
+       request = Rack::Request.new(env)
+       if is_search_engine?(request)
+         serve_crawlable_content_for request
+       else
+         @app.call(env)
+       end
+     end
+
+     protected
+
+     def is_search_engine?(request)
+       request.params.include? options.requested_route_key
+     end
+
+     def as_uri_with_fragment(url)
+       uri = URI.parse(url)
+       params = Rack::Utils.parse_query(uri.query).merge(search_engine: true)
+       uri.fragment = params.delete options.requested_route_key
+       uri.query = Rack::Utils.build_query params
+       uri
+     end
+
+     def serve_crawlable_content_for(request)
+       puts ' -- GOOGLE Ajax Web Crawler Request --'
+       html = GoogleAjaxCrawler::Page.read as_uri_with_fragment(request.url), options
+
+       [200, options.response_headers, [html]]
+     end
+
+   end
+ end
data/lib/google_ajax_crawler/drivers/capybara_webkit.rb ADDED
@@ -0,0 +1,28 @@
+ require "capybara"
+ require "capybara/dsl"
+ require "capybara-webkit"
+
+ module GoogleAjaxCrawler
+   module Drivers
+     class CapybaraWebkit < Driver
+       include Capybara::DSL
+
+       def initialize *args
+         super *args
+         configure
+       end
+
+       def default_page_loaded_test
+         (page.evaluate_script('$.active') == 0)
+       end
+
+       protected
+
+       def configure
+         Capybara.run_server = false
+         Capybara.current_driver = :webkit
+         Capybara.default_wait_time = options.timeout
+       end
+     end
+   end
+ end
data/lib/google_ajax_crawler/drivers/driver.rb ADDED
@@ -0,0 +1,50 @@
+ module GoogleAjaxCrawler
+   module Drivers
+     class Driver
+       attr_reader :options
+
+       def initialize(options)
+         @options = options
+       end
+
+       def visit(url)
+         raise "Driver Not Specified"
+       end
+
+       def default_page_loaded_test
+         raise "Driver Not Specified"
+       end
+
+       def html
+         raise "Driver Not Specified"
+       end
+
+       def get_content(uri)
+         puts "requesting: #{uri}"
+         visit uri.to_s
+
+         wait_until_page_is_fully_loaded
+         html
+       end
+
+       def is_page_loaded?
+         if options.page_loaded_test.nil?
+           default_page_loaded_test
+         else
+           options.page_loaded_test.call(self)
+         end
+       end
+
+       def wait_until_page_is_fully_loaded
+         begin
+           while !is_page_loaded?
+             sleep options.poll_interval
+           end
+         rescue
+           #...squelch
+           puts "Timeout: #{$!}"
+         end
+       end
+     end
+   end
+ end
data/lib/google_ajax_crawler/options.rb ADDED
@@ -0,0 +1,22 @@
+ module GoogleAjaxCrawler
+   class Options
+     attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers
+
+     def initialize(app, &block)
+       @driver = Drivers::CapybaraWebkit.new(self)
+       @timeout = 30
+       @requested_route_key = '_escaped_fragment_'
+       @response_headers = { 'Content-Type' => 'text/html' }
+       @poll_interval = 0.5
+
+       instance_exec(self, &block) unless block.nil?
+
+       @app = app
+     end
+
+     def driver=(klass)
+       @driver = klass.new(self)
+     end
+
+   end
+ end
data/lib/google_ajax_crawler/page.rb ADDED
@@ -0,0 +1,18 @@
+ module GoogleAjaxCrawler
+   class Page
+     attr_reader :options
+
+     def self.read(uri, options)
+       page = Page.new(options)
+       page.get_page_content(uri)
+     end
+
+     def initialize(options)
+       @options = options
+     end
+
+     def get_page_content(uri = URI.parse("http://localhost"))
+       options.driver.get_content uri
+     end
+   end
+ end
data/spec/integration/capybara_webkit_spec.rb ADDED
@@ -0,0 +1,35 @@
+ require './spec/spec_helper'
+
+ describe 'CapybaraWebkit driver' do
+   let(:host) { "http://localhost:#{RackApp.port}/" }
+   let(:browser_route) { "#{host}#test" }
+   let(:snapshot_route) { "#{host}?_escaped_fragment_=test" }
+
+   before(:all) do
+     RackApp.configure_crawler do |config|
+       config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
+       config.poll_interval = 0.25
+       config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
+     end
+
+     RackApp.start
+   end
+
+   after(:all) do
+     RackApp.stop
+   end
+
+   describe 'when a browser requests a client side route (i.e.: /#my_route)' do
+     it 'should not serve a snapshot of the dom' do
+       response = Faraday.get browser_route
+       response.body.should_not =~ /Javascript rendering complete for client-side route #test/
+     end
+   end
+
+   describe 'when an ajax crawler requests a snapshot of a client side route' do
+     it 'should serve a snapshot of the dom that includes js rendered components' do
+       response = Faraday.get snapshot_route
+       response.body.should =~ /Javascript rendering complete for client-side route #test/
+     end
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,10 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ require './lib/google_ajax_crawler'
+ require 'faraday'
+
+ here = File.dirname __FILE__
+ Dir["#{here}/support/*.rb"].each {|file| require file }
+
+ class MockDriver < GoogleAjaxCrawler::Drivers::Driver; end
+
data/spec/support/page.html ADDED
@@ -0,0 +1,28 @@
+ <html>
+   <head></head>
+   <body>
+     <h1>A Simple State Test</h1>
+     <p>State: <span id='page_state'></span></p>
+     <div class='loading' id='loading'>Loading....</div>
+
+     <script type='text/javascript'>
+
+       var init = function() {
+         var writeHash = function() {
+           document.getElementById('page_state').innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;
+           var loadingMask = document.getElementById('loading');
+           loadingMask.parentNode.removeChild(loadingMask);
+           console.log('done...');
+         };
+
+         setTimeout(writeHash, 500);
+       };
+
+       //
+       // Only execute js if loading the page using an unescaped url
+       //
+       if(/#.*$/.test(document.location.href)) init();
+
+     </script>
+   </body>
+ </html>
data/spec/support/rack_app.rb ADDED
@@ -0,0 +1,87 @@
+ require 'rack'
+
+ class RackApp
+
+   def app
+     page_content = File.read('./spec/support/page.html')
+     Rack::Builder.new do
+
+       use GoogleAjaxCrawler::Crawler do |c|
+         RackApp.crawler_configuration.call(c)
+       end
+
+       map '/' do
+         run lambda {|env| [200, { 'Content-Type' => 'text/html' }, [page_content]] }
+       end
+     end
+   end
+
+   class << self
+     attr_reader :crawler_configuration
+
+     def app
+       (@app ||= RackApp.new).app
+     end
+
+     def configure_crawler(&block)
+       @crawler_configuration = block
+     end
+
+     def port
+       9999
+     end
+
+     def start
+       setup!
+       pid = Process.fork
+       if pid.nil?
+         Rack::Server.start(:app => app, :Port => port)
+         sleep 1
+       else
+         File.open(pidfile, 'w') { |f| f.write pid }
+         trap("SIGINT") { stop }
+         Process.detach pid
+       end
+     end
+
+     def stop
+       return unless running?
+
+       Process.kill 9, pid
+       File.delete pidfile
+       puts "[Stopped rack app...]"
+     end
+
+     def setup!
+       FileUtils.mkpath(File.dirname(pidfile))
+       FileUtils.mkpath(File.dirname(logfile))
+     end
+
+     def pidfile
+       "tmp/server.pid"
+     end
+
+     def logfile
+       "tmp/server.log"
+     end
+
+     def pid
+       running? ? File.read(pidfile).to_i : 0
+     end
+
+     def running?
+       File.exists?(pidfile)
+     end
+
+     def restart
+       stop if running?
+       start
+     end
+
+     def log_to_file
+       log = File.new RackApp.logfile, "a"
+       STDOUT.reopen log
+       STDERR.reopen log
+     end
+   end
+ end
data/spec/unit/crawler_spec.rb ADDED
@@ -0,0 +1,54 @@
+ require './spec/spec_helper'
+
+ describe GoogleAjaxCrawler::Crawler do
+   before(:each) do
+     GoogleAjaxCrawler::Crawler.configure do |config|
+       config.page_loaded_test = lambda{ page.find('.loading', count: 0) }
+     end
+   end
+
+   shared_examples 'a crawler configurer' do |method, *args|
+     it 'and facilitate the setting of crawler options' do
+       GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
+         config.timeout = 10
+         config.driver = MockDriver
+       end
+
+       GoogleAjaxCrawler::Crawler.options.timeout.should == 10
+       GoogleAjaxCrawler::Crawler.options.driver.should be_instance_of(MockDriver)
+     end
+   end
+
+   context 'configure' do
+     it_should_behave_like 'a crawler configurer', :configure
+   end
+
+   context 'initialize' do
+     it_should_behave_like 'a crawler configurer', :new, nil
+   end
+
+   context 'call' do
+     let(:app) { double(:app) }
+     let(:request) { {
+       'HTTP_METHOD' => 'GET',
+       'HTTP_HOST' => 'test.com',
+       'PATH_INFO' => '/test',
+       'QUERY_STRING' => 'some_key=some_val',
+       'rack.url_scheme' => 'http',
+       "rack.input" => :blah
+     } }
+     let(:search_engine_request) { request.merge('QUERY_STRING' => '_escaped_fragment_=test') }
+     let(:crawler) { GoogleAjaxCrawler::Crawler.new app }
+
+     it 'should delegate non snapshot requests to the configured app' do
+       app.should_receive(:call).with request
+       crawler.call request
+     end
+
+     it 'should generate a rendered snapshot on search engine requests' do
+       GoogleAjaxCrawler::Page.stub(:read).and_return :wibble
+       response = crawler.call search_engine_request
+       response.should == [200, GoogleAjaxCrawler::Crawler.options.response_headers, [:wibble]]
+     end
+   end
+ end
data/spec/unit/options_spec.rb ADDED
@@ -0,0 +1,35 @@
+ require './spec/spec_helper'
+ describe GoogleAjaxCrawler::Options do
+   context 'initialize' do
+     let(:app) { Class.new }
+
+     it 'should set default values' do
+       options = GoogleAjaxCrawler::Options.new(app)
+
+       options.timeout.should == 30
+       options.requested_route_key.should == '_escaped_fragment_'
+       options.response_headers.should == { 'Content-Type' => 'text/html' }
+       options.poll_interval.should == 0.5
+       options.driver.should be_a(GoogleAjaxCrawler::Drivers::CapybaraWebkit)
+       options.page_loaded_test.should be_nil
+     end
+
+     it 'should allow default overrides within block scope' do
+       options = GoogleAjaxCrawler::Options.new(app) do |config|
+         config.requested_route_key = :some_other_key
+         config.page_loaded_test = :some_other_page_loaded_test
+         config.poll_interval = 7000
+         config.response_headers = :some_other_headers
+         config.timeout = 20
+         config.driver = MockDriver
+       end
+
+       options.requested_route_key.should == :some_other_key
+       options.page_loaded_test.should == :some_other_page_loaded_test
+       options.poll_interval.should == 7000
+       options.response_headers.should == :some_other_headers
+       options.timeout.should == 20
+       options.driver.should be_instance_of(MockDriver)
+     end
+   end
+ end
data/spec/unit/page_spec.rb ADDED
@@ -0,0 +1,14 @@
+ require './spec/spec_helper'
+ describe GoogleAjaxCrawler::Page do
+   context 'read' do
+     let(:uri) { URI.parse('http://www.test.com') }
+     let(:options) { double(:options) }
+
+     it 'should ask the driver to fetch content from a given uri' do
+       options.stub_chain(:driver, :get_content).with(uri).and_return :wibble
+       content = GoogleAjaxCrawler::Page.read(uri, options)
+       content.should == :wibble
+     end
+
+   end
+ end
metadata ADDED
@@ -0,0 +1,97 @@
+ --- !ruby/object:Gem::Specification
+ name: google_ajax_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Ben Kitzelman
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-03-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: capybara-webkit
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.10.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.10.0
+ - !ruby/object:Gem::Dependency
+   name: rack
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Rack Middleware adhering to the Google Ajax Crawling Scheme, using a
+   headless browser to render JS heavy pages and serve a dom snapshot of the rendered
+   state to a requesting search engine.
+ email:
+ - benkitzelman@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .document
+ - .gitignore
+ - .rspec
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - google_ajax_crawler.gemspec
+ - lib/google_ajax_crawler.rb
+ - lib/google_ajax_crawler/crawler.rb
+ - lib/google_ajax_crawler/drivers/capybara_webkit.rb
+ - lib/google_ajax_crawler/drivers/driver.rb
+ - lib/google_ajax_crawler/options.rb
+ - lib/google_ajax_crawler/page.rb
+ - spec/integration/capybara_webkit_spec.rb
+ - spec/spec_helper.rb
+ - spec/support/page.html
+ - spec/support/rack_app.rb
+ - spec/unit/crawler_spec.rb
+ - spec/unit/options_spec.rb
+ - spec/unit/page_spec.rb
+ homepage: http://github.com/benkitzelman/google-ajax-crawler
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.3
+ signing_key:
+ specification_version: 4
+ summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your
+   JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search
+   engines.
+ test_files: []