google_ajax_crawler 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +38 -0
- data/.rspec +2 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +60 -0
- data/LICENSE.txt +20 -0
- data/README.md +57 -0
- data/Rakefile +12 -0
- data/google_ajax_crawler.gemspec +15 -0
- data/lib/google_ajax_crawler.rb +15 -0
- data/lib/google_ajax_crawler/crawler.rb +56 -0
- data/lib/google_ajax_crawler/drivers/capybara_webkit.rb +28 -0
- data/lib/google_ajax_crawler/drivers/driver.rb +50 -0
- data/lib/google_ajax_crawler/options.rb +22 -0
- data/lib/google_ajax_crawler/page.rb +18 -0
- data/spec/integration/capybara_webkit_spec.rb +35 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/support/page.html +28 -0
- data/spec/support/rack_app.rb +87 -0
- data/spec/unit/crawler_spec.rb +54 -0
- data/spec/unit/options_spec.rb +35 -0
- data/spec/unit/page_spec.rb +14 -0
- metadata +97 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
M2QxYzU1Yjc3YmEzYzRmYjg5MTdlMWViN2JiYWUyNjdiYWRmZmQ4Yg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDBmOTllZDkzYzg2NjE1ZmVjY2RiYzE4NzA1YjVhNzA4ZTNiMjE4ZA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
OTg5Y2E4MGZjZTRlOWU0ZmRkNDY3OGRmMTMwYzkzZTc3MzIyNDViMTdiMGQy
|
10
|
+
ZmQ5Yjc1ZGM2ZDUwYTk4YjliYzYwYzg1MmY0MzY0ODliNGI5MWZlMTlhYjhm
|
11
|
+
Njg1MTI2MzAzOTY1Mzg2MGYyMmExNDM5YmI4Y2ZkYzBkYmU5MjI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZGYzZDVlMmZkOTY2YTU5MWZhNDRlNWQ5MzllMmFmMjVlMjBhYjk2MjM0NDZi
|
14
|
+
YmFkMzE4MDBlMTliYTUzOWE4MWZkNjRmNjA3NTZmOTg5NWE2M2RjNzRmMDA4
|
15
|
+
ZGYyOTc5ZjI2N2NjMTFhZTA4NTdjMDVjZGU2MzIxNTRlNDlkM2M=
|
data/.document
ADDED
data/.gitignore
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
.rvmrc
|
2
|
+
tmp
|
3
|
+
# rcov generated
|
4
|
+
coverage
|
5
|
+
coverage.data
|
6
|
+
|
7
|
+
# rdoc generated
|
8
|
+
rdoc
|
9
|
+
|
10
|
+
# yard generated
|
11
|
+
doc
|
12
|
+
.yardoc
|
13
|
+
|
14
|
+
# bundler
|
15
|
+
.bundle
|
16
|
+
|
17
|
+
# jeweler generated
|
18
|
+
pkg
|
19
|
+
|
20
|
+
# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
|
21
|
+
#
|
22
|
+
# * Create a file at ~/.gitignore
|
23
|
+
# * Include files you want ignored
|
24
|
+
# * Run: git config --global core.excludesfile ~/.gitignore
|
25
|
+
#
|
26
|
+
# After doing this, these files will be ignored in all your git projects,
|
27
|
+
# saving you from having to 'pollute' every project you touch with them
|
28
|
+
#
|
29
|
+
# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember, remove the leading # of the line)
|
30
|
+
#
|
31
|
+
# For MacOS:
|
32
|
+
#
|
33
|
+
.DS_Store
|
34
|
+
|
35
|
+
# For TextMate
|
36
|
+
*.tmproj
|
37
|
+
tmtags
|
38
|
+
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
capybara (2.0.2)
|
5
|
+
mime-types (>= 1.16)
|
6
|
+
nokogiri (>= 1.3.3)
|
7
|
+
rack (>= 1.0.0)
|
8
|
+
rack-test (>= 0.5.4)
|
9
|
+
selenium-webdriver (~> 2.0)
|
10
|
+
xpath (~> 1.0.0)
|
11
|
+
capybara-webkit (0.14.2)
|
12
|
+
capybara (~> 2.0, >= 2.0.2)
|
13
|
+
json
|
14
|
+
childprocess (0.3.9)
|
15
|
+
ffi (~> 1.0, >= 1.0.11)
|
16
|
+
diff-lcs (1.2.1)
|
17
|
+
faraday (0.8.6)
|
18
|
+
multipart-post (~> 1.1)
|
19
|
+
ffi (1.4.0)
|
20
|
+
json (1.7.7)
|
21
|
+
mime-types (1.21)
|
22
|
+
multi_json (1.6.1)
|
23
|
+
multipart-post (1.2.0)
|
24
|
+
nokogiri (1.5.6)
|
25
|
+
rack (1.5.2)
|
26
|
+
rack-test (0.6.2)
|
27
|
+
rack (>= 1.0)
|
28
|
+
rake (10.0.3)
|
29
|
+
rspec (2.13.0)
|
30
|
+
rspec-core (~> 2.13.0)
|
31
|
+
rspec-expectations (~> 2.13.0)
|
32
|
+
rspec-mocks (~> 2.13.0)
|
33
|
+
rspec-core (2.13.0)
|
34
|
+
rspec-expectations (2.13.0)
|
35
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
36
|
+
rspec-mocks (2.13.0)
|
37
|
+
rubyzip (0.9.9)
|
38
|
+
selenium-webdriver (2.31.0)
|
39
|
+
childprocess (>= 0.2.5)
|
40
|
+
multi_json (~> 1.0)
|
41
|
+
rubyzip
|
42
|
+
websocket (~> 1.0.4)
|
43
|
+
simplecov (0.7.1)
|
44
|
+
multi_json (~> 1.0)
|
45
|
+
simplecov-html (~> 0.7.1)
|
46
|
+
simplecov-html (0.7.1)
|
47
|
+
websocket (1.0.7)
|
48
|
+
xpath (1.0.0)
|
49
|
+
nokogiri (~> 1.3)
|
50
|
+
|
51
|
+
PLATFORMS
|
52
|
+
ruby
|
53
|
+
|
54
|
+
DEPENDENCIES
|
55
|
+
capybara-webkit (>= 0.10.0)
|
56
|
+
faraday
|
57
|
+
rack
|
58
|
+
rake
|
59
|
+
rspec
|
60
|
+
simplecov
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Ben Kitzelman
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Google Ajax Crawler
|
2
|
+
|
3
|
+
Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS heavy pages and serve a dom snapshot of the rendered state to a requesting search engine.
|
4
|
+
|
5
|
+
Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
6
|
+
|
7
|
+
## Using
|
8
|
+
|
9
|
+
install
|
10
|
+
|
11
|
+
``` ruby
|
12
|
+
gem install google_ajax_crawler
|
13
|
+
```
|
14
|
+
|
15
|
+
In your config.ru
|
16
|
+
|
17
|
+
``` ruby
|
18
|
+
require 'google_ajax_crawler'
|
19
|
+
|
20
|
+
use GoogleAjaxCrawler::Crawler do |config|
|
21
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
|
22
|
+
end
|
23
|
+
|
24
|
+
app = lambda {|env| [200, {'Content-Type' => 'text/plain'}, "b" ] }
|
25
|
+
run app
|
26
|
+
|
27
|
+
```
|
28
|
+
|
29
|
+
## Configuration Options
|
30
|
+
|
31
|
+
### page_loaded_test
|
32
|
+
|
33
|
+
Tell the crawler when your page has finished loading / rendering. As determining when a page has completed rendering can depend on a number of qualitative factors (i.e. all ajax requests have responses, certain content has been displayed, or even when there are no loaders / spinners visible on the page), the page loaded test allows you to specify when the crawler should decide that your page has finished loading / rendering and to return a snapshot of the rendered dom at that time.
|
34
|
+
|
35
|
+
The current crawler driver is passed to the lambda to allow querying of the current page's dom state.
|
36
|
+
|
37
|
+
### timeout
|
38
|
+
|
39
|
+
The max time the crawler should wait before returning a response
|
40
|
+
|
41
|
+
### driver
|
42
|
+
|
43
|
+
The configured google ajax crawler driver used to query the current page state. Presently there is only one driver (now taking pull requests!): CapybaraWebkit
|
44
|
+
|
45
|
+
### poll_interval
|
46
|
+
|
47
|
+
How often (in seconds) to test the page state with the configured page_loaded_test
|
48
|
+
|
49
|
+
### response_headers
|
50
|
+
|
51
|
+
Which response headers should be returned with the dom snapshot. The default headers specify the content-type text/html
|
52
|
+
|
53
|
+
## License
|
54
|
+
|
55
|
+
All free - Use, modify, fork to your heart's content...
|
56
|
+
See LICENSE.txt for further details.
|
57
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
# Activate the gems of the :default and :development groups. When Bundler
# reports a problem, print its message plus a hint and exit with the status
# code carried by the error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end
|
12
|
+
require 'rake'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Locate the library entry point relative to this gemspec rather than the
# current working directory; it is loaded to read GoogleAjaxCrawler.version.
require File.expand_path('../lib/google_ajax_crawler', __FILE__)

Gem::Specification.new do |s|
  s.name        = 'google_ajax_crawler'
  s.version     = GoogleAjaxCrawler.version
  s.summary     = 'Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search engines.'
  s.description = 'Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS heavy pages and serve a dom snapshot of the rendered state to a requesting search engine.'
  s.authors     = ['Ben Kitzelman']
  s.email       = ['benkitzelman@gmail.com']
  s.homepage    = 'http://github.com/benkitzelman/google-ajax-crawler'
  # LICENSE.txt shipped with the gem contains the MIT license text.
  s.license     = 'MIT'
  # Package every file tracked by git; the gem ships no executables.
  s.files       = `git ls-files`.strip.split("\n")
  s.executables = []

  s.add_dependency 'capybara-webkit', '>= 0.10.0'
  s.add_dependency 'rack'
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rack/utils'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
# Directory holding the library's source files, resolved relative to this
# file so that loading works regardless of the process's working directory.
base_path = File.expand_path('../google_ajax_crawler', __FILE__)

# The base driver has to be loaded first: the other driver files subclass it.
require "#{base_path}/drivers/driver"

# Load every remaining source file. Dir[] gives no ordering guarantee, so the
# list is sorted to keep the load order the same on every platform.
[base_path, "#{base_path}/drivers"].each do |folder|
  Dir["#{folder}/*.rb"].sort.each { |file| require file }
end
|
10
|
+
|
11
|
+
module GoogleAjaxCrawler
  class << self
    # The gem's version string.
    def version
      "0.1.0"
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
|
2
|
+
module GoogleAjaxCrawler
  # Rack middleware. Requests carrying the configured route key as a parameter
  # are answered with a snapshot read through GoogleAjaxCrawler::Page; every
  # other request is passed to the wrapped app.
  class Crawler

    # Options are stored on the class, so they are shared by all instances.
    # They are built with defaults on first access if never configured.
    def self.options
      configure if @options.nil?
      @options
    end

    # Replaces the class-level options with a fresh Options object, yielding
    # it to the optional block for customisation.
    def self.configure(&block)
      @options = Options.new(self, &block)
    end

    # app   - the downstream Rack application (may be nil)
    # block - optional configuration block, forwarded to Crawler.configure
    def initialize(app = nil, &block)
      @app = app
      self.class.configure(&block)
    end

    # Convenience reader for the class-level options.
    def options
      self.class.options
    end

    # Rack entry point.
    def call(env)
      request = Rack::Request.new(env)
      return serve_crawlable_content_for(request) if is_search_engine?(request)

      @app.call(env)
    end

    protected

    # True when the request parameters contain the configured route key.
    def is_search_engine?(request)
      request.params.include?(options.requested_route_key)
    end

    # Rebuilds the url with the route key's value moved from the query string
    # into the fragment, and a search_engine=true parameter added to the query.
    def as_uri_with_fragment(url)
      snapshot_uri = URI.parse(url)
      query_params = Rack::Utils.parse_query(snapshot_uri.query).merge(search_engine: true)

      snapshot_uri.fragment = query_params.delete(options.requested_route_key)
      snapshot_uri.query    = Rack::Utils.build_query(query_params)
      snapshot_uri
    end

    # Reads the page for the rewritten url and wraps the result in a 200
    # response using the configured response headers.
    def serve_crawlable_content_for(request)
      puts ' -- GOOGLE Ajax Web Crawler Request --'
      snapshot = GoogleAjaxCrawler::Page.read(as_uri_with_fragment(request.url), options)

      [200, options.response_headers, [snapshot]]
    end

  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "capybara"
|
2
|
+
require "capybara/dsl"
|
3
|
+
require "capybara-webkit"
|
4
|
+
|
5
|
+
module GoogleAjaxCrawler
  module Drivers
    # Driver that mixes in Capybara::DSL and selects Capybara's :webkit driver.
    class CapybaraWebkit < Driver
      include Capybara::DSL

      # Forwards all arguments to Driver#initialize, then applies the
      # Capybara settings below.
      def initialize(*args)
        super(*args)
        configure
      end

      # Fallback loaded-check used when no page_loaded_test is configured:
      # true when the page's `$.active` evaluates to 0.
      # NOTE(review): presumably jQuery's active ajax request counter - confirm
      # the crawled pages actually load jQuery.
      def default_page_loaded_test
        active_count = page.evaluate_script('$.active')
        active_count == 0
      end

      protected

      # Applies global Capybara settings; note these are process-wide
      # assignments on the Capybara module, not per-instance state.
      def configure
        Capybara.run_server        = false
        Capybara.current_driver    = :webkit
        Capybara.default_wait_time = options.timeout
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module GoogleAjaxCrawler
  module Drivers
    # Base class for crawler drivers. Subclasses supply #visit, #html and
    # #default_page_loaded_test; this class provides the visit / poll /
    # snapshot sequence built on top of them.
    class Driver
      attr_reader :options

      # options - object exposing page_loaded_test and poll_interval, and
      #           optionally timeout (see #wait_until_page_is_fully_loaded)
      def initialize(options)
        @options = options
      end

      # Navigate to the given url. Must be provided by a concrete driver.
      def visit(url)
        raise "Driver Not Specified"
      end

      # Loaded-check used when no page_loaded_test is configured. Must be
      # provided by a concrete driver.
      def default_page_loaded_test
        raise "Driver Not Specified"
      end

      # Markup of the current page. Must be provided by a concrete driver.
      def html
        raise "Driver Not Specified"
      end

      # Visits the uri, waits until the page reports itself loaded (or the
      # wait gives up) and returns whatever #html yields at that point.
      def get_content(uri)
        puts "requesting: #{uri}"
        visit uri.to_s

        wait_until_page_is_fully_loaded
        html
      end

      # Runs the configured page_loaded_test, passing this driver to it, or
      # the driver's default test when none is configured.
      def is_page_loaded?
        if options.page_loaded_test.nil?
          default_page_loaded_test
        else
          options.page_loaded_test.call(self)
        end
      end

      # Polls #is_page_loaded? every options.poll_interval seconds until it
      # passes or options.timeout seconds have elapsed.
      #
      # Without the deadline a test that never passes would block the request
      # forever. Options objects that expose no timeout (or a nil one) keep
      # the unbounded behaviour. Errors raised while polling are reported on
      # stdout and swallowed so a snapshot is still returned (best effort).
      def wait_until_page_is_fully_loaded
        max_wait = options.respond_to?(:timeout) ? options.timeout : nil
        deadline = max_wait.nil? ? nil : Time.now + max_wait

        until is_page_loaded?
          if deadline && Time.now >= deadline
            puts "Timeout: page not loaded after #{max_wait} seconds"
            break
          end
          sleep options.poll_interval
        end
      rescue => error
        #...squelch
        puts "Timeout: #{error}"
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module GoogleAjaxCrawler
  # Configuration for the crawler middleware and its driver.
  class Options
    attr_accessor :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers

    # app   - stored in @app; not otherwise used by this class
    # block - optional; evaluated in the context of the new instance, which is
    #         also passed as the block argument, to override the defaults
    def initialize(app, &block)
      @timeout             = 30
      @requested_route_key = '_escaped_fragment_'
      @response_headers    = { 'Content-Type' => 'text/html' }
      @poll_interval       = 0.5

      instance_exec(self, &block) unless block.nil?

      @app = app

      # Build the default driver only now, after the defaults and the block's
      # overrides are in place: the driver receives this object in its
      # constructor and may read settings such as timeout straight away.
      # When the block already assigned a driver this is a no-op.
      driver
    end

    # The driver instance; falls back to a CapybaraWebkit driver when none has
    # been assigned yet.
    def driver
      @driver ||= Drivers::CapybaraWebkit.new(self)
    end

    # Takes a driver class (not an instance) and instantiates it with these
    # options.
    def driver=(klass)
      @driver = klass.new(self)
    end

  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module GoogleAjaxCrawler
  # Thin wrapper that asks the configured driver for a page's content.
  class Page
    attr_reader :options

    class << self
      # Builds a Page for the given options and returns the content the
      # driver produces for uri.
      def read(uri, options)
        Page.new(options).get_page_content(uri)
      end
    end

    # options - object whose #driver responds to #get_content
    def initialize(options)
      @options = options
    end

    # Delegates to the driver; uri defaults to http://localhost.
    def get_page_content(uri = URI.parse("http://localhost"))
      driver = options.driver
      driver.get_content(uri)
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require './spec/spec_helper'
|
2
|
+
|
3
|
+
# End-to-end check: boots RackApp and requests the page both as a browser
# (fragment route) and as a crawler (_escaped_fragment_ query parameter).
describe 'CapybaraWebkit driver' do
  let(:host)           { "http://localhost:#{RackApp.port}/" }
  let(:browser_route)  { "#{host}#test" }
  let(:snapshot_route) { "#{host}?_escaped_fragment_=test" }

  before(:all) do
    RackApp.configure_crawler do |config|
      config.driver           = GoogleAjaxCrawler::Drivers::CapybaraWebkit
      config.poll_interval    = 0.25
      # loaded once the element with id "loading" is gone from the dom
      config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
    end

    RackApp.start
  end

  after(:all) { RackApp.stop }

  describe 'when a browser requests a client side route (i.e.: /#my_route)' do
    it 'should not serve a snapshot of the dom' do
      body = Faraday.get(browser_route).body
      body.should_not match(/Javascript rendering complete for client-side route #test/)
    end
  end

  describe 'when an ajax crawler requests a snapshot of a client side route' do
    it 'should serve a snapshot of the dom that includes js rendered components' do
      body = Faraday.get(snapshot_route).body
      body.should match(/Javascript rendering complete for client-side route #test/)
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
<html>
|
2
|
+
<head></head>
|
3
|
+
<body>
|
4
|
+
<h1>A Simple State Test</h1>
|
5
|
+
<p>State: <span id='page_state'></span></p>
|
6
|
+
<div class='loading' id='loading'>Loading....</div>
|
7
|
+
|
8
|
+
<script type='text/javascript'>
|
9
|
+
|
10
|
+
// Schedules the "rendering": after 500ms the current location hash is
// written into #page_state and the #loading element is removed from the dom.
var init = function() {
  function writeHash() {
    var stateNode = document.getElementById('page_state');
    stateNode.innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;

    var mask = document.getElementById('loading');
    mask.parentNode.removeChild(mask);
    console.log('done...');
  }

  setTimeout(writeHash, 500);
};
|
20
|
+
|
21
|
+
//
|
22
|
+
// Only execute js if loading the page using an unescaped url
|
23
|
+
//
|
24
|
+
if(/#.*$/.test(document.location.href)) init();
|
25
|
+
|
26
|
+
</script>
|
27
|
+
</body>
|
28
|
+
</html>
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'rack'
|
2
|
+
|
3
|
+
# Spec helper that serves spec/support/page.html behind the crawler
# middleware, running the server in a forked child process whose pid is
# recorded in a pidfile.
class RackApp

  # Builds the Rack app: the crawler middleware in front of a handler that
  # returns the fixture page for '/'.
  def app
    page_content = File.read('./spec/support/page.html')
    Rack::Builder.new do

      use GoogleAjaxCrawler::Crawler do |c|
        # tolerate start being called before configure_crawler
        configuration = RackApp.crawler_configuration
        configuration.call(c) unless configuration.nil?
      end

      map '/' do
        run lambda {|env| [200, { 'Content-Type' => 'text/html' }, [page_content]] }
      end
    end
  end

  class << self
    attr_reader :crawler_configuration

    # The Rack app of a memoised RackApp instance.
    def app
      (@app ||= RackApp.new).app
    end

    # Stores the block later handed the crawler's options object.
    def configure_crawler(&block)
      @crawler_configuration = block
    end

    def port
      9999
    end

    # Forks a child that runs the server and records its pid in the pidfile.
    def start
      setup!
      pid = Process.fork
      if pid.nil?
        # Child: blocks here serving requests until the process is killed.
        Rack::Server.start(:app => app, :Port => port)
      else
        File.open(pidfile, 'w') { |f| f.write pid }
        trap("SIGINT") { stop }
        Process.detach pid
        # Give the forked server a moment to boot before the caller starts
        # issuing requests. (This pause used to sit after the blocking
        # Rack::Server.start call in the child, where it had no effect.)
        sleep 1
      end
    end

    # Kills the recorded server process and removes the pidfile.
    def stop
      return unless running?

      begin
        Process.kill 9, pid
      rescue Errno::ESRCH
        # stale pidfile: the process is already gone, just clean up below
      end
      File.delete pidfile
      puts "[Stopped rack app...]"
    end

    # Ensures the directories for the pidfile and logfile exist.
    def setup!
      require 'fileutils' # FileUtils is not loaded by `require 'rack'` itself
      FileUtils.mkpath(File.dirname(pidfile))
      FileUtils.mkpath(File.dirname(logfile))
    end

    def pidfile
      "tmp/server.pid"
    end

    def logfile
      "tmp/server.log"
    end

    # Pid read from the pidfile, or 0 when there is none.
    def pid
      running? ? File.read(pidfile).to_i : 0
    end

    # "Running" means only that the pidfile exists.
    def running?
      # File.exists? is deprecated and was removed in Ruby 3.2
      File.exist?(pidfile)
    end

    def restart
      stop if running?
      start
    end

    # Redirects this process's stdout and stderr to the logfile (appending).
    def log_to_file
      log = File.new RackApp.logfile, "a"
      STDOUT.reopen log
      STDERR.reopen log
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require './spec/spec_helper'
|
2
|
+
|
3
|
+
# Unit specs for the Crawler middleware: configuration through .configure and
# .new, and request routing in #call.
describe GoogleAjaxCrawler::Crawler do
  before(:each) do
    GoogleAjaxCrawler::Crawler.configure do |config|
      # Drivers invoke page_loaded_test with themselves as the only argument,
      # so the lambda has to accept it (a zero-arity lambda would raise
      # ArgumentError when called).
      config.page_loaded_test = lambda {|driver| driver.page.find('.loading', count: 0) }
    end
  end

  # method - class method of Crawler that accepts the configuration block
  # args   - positional arguments passed to that method ahead of the block
  shared_examples 'a crawler configurer' do |method, *args|
    it 'and facilitate the setting of crawler options' do
      GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
        config.timeout = 10
        config.driver = MockDriver
      end

      GoogleAjaxCrawler::Crawler.options.timeout.should == 10
      GoogleAjaxCrawler::Crawler.options.driver.should be_instance_of(MockDriver)
    end
  end

  context 'configure' do
    it_should_behave_like 'a crawler configurer', :configure
  end

  context 'initialize' do
    it_should_behave_like 'a crawler configurer', :new, nil
  end

  context 'call' do
    let(:app) { double(:app) }
    # Minimal Rack env for a plain (non crawler) GET request.
    let(:request) { {
      'HTTP_METHOD' => 'GET',
      'HTTP_HOST' => 'test.com',
      'PATH_INFO' => '/test',
      'QUERY_STRING' => 'some_key=some_val',
      'rack.url_scheme' => 'http',
      "rack.input" => :blah
    } }
    # Same env, but carrying the default requested_route_key in the query.
    let(:search_engine_request) { request.merge('QUERY_STRING' => '_escaped_fragment_=test') }
    let(:crawler) { GoogleAjaxCrawler::Crawler.new app }

    it 'should delegate non snapshot requests to the configured app' do
      app.should_receive(:call).with request
      crawler.call request
    end

    it 'should generate a rendered snapshot on search engine requests' do
      GoogleAjaxCrawler::Page.stub(:read).and_return :wibble
      response = crawler.call search_engine_request
      response.should == [200, GoogleAjaxCrawler::Crawler.options.response_headers, [:wibble]]
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require './spec/spec_helper'
|
2
|
+
# Unit specs for Options: defaults, and overrides made in the init block.
describe GoogleAjaxCrawler::Options do
  context 'initialize' do
    let(:app) { Class.new }

    it 'should set default values' do
      defaults = GoogleAjaxCrawler::Options.new(app)

      defaults.timeout.should eq(30)
      defaults.requested_route_key.should eq('_escaped_fragment_')
      defaults.response_headers.should eq({ 'Content-Type' => 'text/html' })
      defaults.poll_interval.should eq(0.5)
      defaults.driver.should be_a(GoogleAjaxCrawler::Drivers::CapybaraWebkit)
      defaults.page_loaded_test.should be_nil
    end

    it 'should allow default overrides within block scope' do
      overridden = GoogleAjaxCrawler::Options.new(app) do |config|
        config.requested_route_key = :some_other_key
        config.page_loaded_test    = :some_other_page_loaded_test
        config.poll_interval       = 7000
        config.response_headers    = :some_other_headers
        config.timeout             = 20
        config.driver              = MockDriver
      end

      overridden.requested_route_key.should eq(:some_other_key)
      overridden.page_loaded_test.should eq(:some_other_page_loaded_test)
      overridden.poll_interval.should eq(7000)
      overridden.response_headers.should eq(:some_other_headers)
      overridden.timeout.should eq(20)
      overridden.driver.should be_instance_of(MockDriver)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require './spec/spec_helper'
|
2
|
+
# Unit spec for Page.read: the content must come from the options' driver.
describe GoogleAjaxCrawler::Page do
  context 'read' do
    let(:options) { double(:options) }
    let(:uri)     { URI.parse('http://www.test.com') }

    it 'should ask the driver to fetch content from a given uri' do
      options.stub_chain(:driver, :get_content).with(uri).and_return :wibble

      GoogleAjaxCrawler::Page.read(uri, options).should eq(:wibble)
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google_ajax_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben Kitzelman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: capybara-webkit
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.10.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.10.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rack
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: Rack Middleware adhering to the Google Ajax Crawling Scheme, using a
|
42
|
+
headless browser to render JS heavy pages and serve a dom snapshot of the rendered
|
43
|
+
state to a requesting search engine.
|
44
|
+
email:
|
45
|
+
- benkitzelman@gmail.com
|
46
|
+
executables: []
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files: []
|
49
|
+
files:
|
50
|
+
- .document
|
51
|
+
- .gitignore
|
52
|
+
- .rspec
|
53
|
+
- Gemfile
|
54
|
+
- Gemfile.lock
|
55
|
+
- LICENSE.txt
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- google_ajax_crawler.gemspec
|
59
|
+
- lib/google_ajax_crawler.rb
|
60
|
+
- lib/google_ajax_crawler/crawler.rb
|
61
|
+
- lib/google_ajax_crawler/drivers/capybara_webkit.rb
|
62
|
+
- lib/google_ajax_crawler/drivers/driver.rb
|
63
|
+
- lib/google_ajax_crawler/options.rb
|
64
|
+
- lib/google_ajax_crawler/page.rb
|
65
|
+
- spec/integration/capybara_webkit_spec.rb
|
66
|
+
- spec/spec_helper.rb
|
67
|
+
- spec/support/page.html
|
68
|
+
- spec/support/rack_app.rb
|
69
|
+
- spec/unit/crawler_spec.rb
|
70
|
+
- spec/unit/options_spec.rb
|
71
|
+
- spec/unit/page_spec.rb
|
72
|
+
homepage: http://github.com/benkitzelman/google-ajax-crawler
|
73
|
+
licenses: []
|
74
|
+
metadata: {}
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options: []
|
77
|
+
require_paths:
|
78
|
+
- lib
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ! '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
requirements: []
|
90
|
+
rubyforge_project:
|
91
|
+
rubygems_version: 2.0.3
|
92
|
+
signing_key:
|
93
|
+
specification_version: 4
|
94
|
+
summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your
|
95
|
+
JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search
|
96
|
+
engines.
|
97
|
+
test_files: []
|