google_ajax_crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +38 -0
- data/.rspec +2 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +60 -0
- data/LICENSE.txt +20 -0
- data/README.md +57 -0
- data/Rakefile +12 -0
- data/google_ajax_crawler.gemspec +15 -0
- data/lib/google_ajax_crawler.rb +15 -0
- data/lib/google_ajax_crawler/crawler.rb +56 -0
- data/lib/google_ajax_crawler/drivers/capybara_webkit.rb +28 -0
- data/lib/google_ajax_crawler/drivers/driver.rb +50 -0
- data/lib/google_ajax_crawler/options.rb +22 -0
- data/lib/google_ajax_crawler/page.rb +18 -0
- data/spec/integration/capybara_webkit_spec.rb +35 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/support/page.html +28 -0
- data/spec/support/rack_app.rb +87 -0
- data/spec/unit/crawler_spec.rb +54 -0
- data/spec/unit/options_spec.rb +35 -0
- data/spec/unit/page_spec.rb +14 -0
- metadata +97 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
M2QxYzU1Yjc3YmEzYzRmYjg5MTdlMWViN2JiYWUyNjdiYWRmZmQ4Yg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDBmOTllZDkzYzg2NjE1ZmVjY2RiYzE4NzA1YjVhNzA4ZTNiMjE4ZA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
OTg5Y2E4MGZjZTRlOWU0ZmRkNDY3OGRmMTMwYzkzZTc3MzIyNDViMTdiMGQy
|
10
|
+
ZmQ5Yjc1ZGM2ZDUwYTk4YjliYzYwYzg1MmY0MzY0ODliNGI5MWZlMTlhYjhm
|
11
|
+
Njg1MTI2MzAzOTY1Mzg2MGYyMmExNDM5YmI4Y2ZkYzBkYmU5MjI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZGYzZDVlMmZkOTY2YTU5MWZhNDRlNWQ5MzllMmFmMjVlMjBhYjk2MjM0NDZi
|
14
|
+
YmFkMzE4MDBlMTliYTUzOWE4MWZkNjRmNjA3NTZmOTg5NWE2M2RjNzRmMDA4
|
15
|
+
ZGYyOTc5ZjI2N2NjMTFhZTA4NTdjMDVjZGU2MzIxNTRlNDlkM2M=
|
data/.document
ADDED
data/.gitignore
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
.rvmrc
|
2
|
+
tmp
|
3
|
+
# rcov generated
|
4
|
+
coverage
|
5
|
+
coverage.data
|
6
|
+
|
7
|
+
# rdoc generated
|
8
|
+
rdoc
|
9
|
+
|
10
|
+
# yard generated
|
11
|
+
doc
|
12
|
+
.yardoc
|
13
|
+
|
14
|
+
# bundler
|
15
|
+
.bundle
|
16
|
+
|
17
|
+
# jeweler generated
|
18
|
+
pkg
|
19
|
+
|
20
|
+
# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
|
21
|
+
#
|
22
|
+
# * Create a file at ~/.gitignore
|
23
|
+
# * Include files you want ignored
|
24
|
+
# * Run: git config --global core.excludesfile ~/.gitignore
|
25
|
+
#
|
26
|
+
# After doing this, these files will be ignored in all your git projects,
|
27
|
+
# saving you from having to 'pollute' every project you touch with them
|
28
|
+
#
|
29
|
+
# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
|
30
|
+
#
|
31
|
+
# For MacOS:
|
32
|
+
#
|
33
|
+
.DS_Store
|
34
|
+
|
35
|
+
# For TextMate
|
36
|
+
*.tmproj
|
37
|
+
tmtags
|
38
|
+
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
capybara (2.0.2)
|
5
|
+
mime-types (>= 1.16)
|
6
|
+
nokogiri (>= 1.3.3)
|
7
|
+
rack (>= 1.0.0)
|
8
|
+
rack-test (>= 0.5.4)
|
9
|
+
selenium-webdriver (~> 2.0)
|
10
|
+
xpath (~> 1.0.0)
|
11
|
+
capybara-webkit (0.14.2)
|
12
|
+
capybara (~> 2.0, >= 2.0.2)
|
13
|
+
json
|
14
|
+
childprocess (0.3.9)
|
15
|
+
ffi (~> 1.0, >= 1.0.11)
|
16
|
+
diff-lcs (1.2.1)
|
17
|
+
faraday (0.8.6)
|
18
|
+
multipart-post (~> 1.1)
|
19
|
+
ffi (1.4.0)
|
20
|
+
json (1.7.7)
|
21
|
+
mime-types (1.21)
|
22
|
+
multi_json (1.6.1)
|
23
|
+
multipart-post (1.2.0)
|
24
|
+
nokogiri (1.5.6)
|
25
|
+
rack (1.5.2)
|
26
|
+
rack-test (0.6.2)
|
27
|
+
rack (>= 1.0)
|
28
|
+
rake (10.0.3)
|
29
|
+
rspec (2.13.0)
|
30
|
+
rspec-core (~> 2.13.0)
|
31
|
+
rspec-expectations (~> 2.13.0)
|
32
|
+
rspec-mocks (~> 2.13.0)
|
33
|
+
rspec-core (2.13.0)
|
34
|
+
rspec-expectations (2.13.0)
|
35
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
36
|
+
rspec-mocks (2.13.0)
|
37
|
+
rubyzip (0.9.9)
|
38
|
+
selenium-webdriver (2.31.0)
|
39
|
+
childprocess (>= 0.2.5)
|
40
|
+
multi_json (~> 1.0)
|
41
|
+
rubyzip
|
42
|
+
websocket (~> 1.0.4)
|
43
|
+
simplecov (0.7.1)
|
44
|
+
multi_json (~> 1.0)
|
45
|
+
simplecov-html (~> 0.7.1)
|
46
|
+
simplecov-html (0.7.1)
|
47
|
+
websocket (1.0.7)
|
48
|
+
xpath (1.0.0)
|
49
|
+
nokogiri (~> 1.3)
|
50
|
+
|
51
|
+
PLATFORMS
|
52
|
+
ruby
|
53
|
+
|
54
|
+
DEPENDENCIES
|
55
|
+
capybara-webkit (>= 0.10.0)
|
56
|
+
faraday
|
57
|
+
rack
|
58
|
+
rake
|
59
|
+
rspec
|
60
|
+
simplecov
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Ben Kitzelman
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Google Ajax Crawler
|
2
|
+
|
3
|
+
Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS heavy pages and serve a dom snapshot of the rendered state to a requesting search engine.
|
4
|
+
|
5
|
+
Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
6
|
+
|
7
|
+
## Using
|
8
|
+
|
9
|
+
install
|
10
|
+
|
11
|
+
``` ruby
|
12
|
+
gem install google_ajax_crawler
|
13
|
+
```
|
14
|
+
|
15
|
+
In your config.ru
|
16
|
+
|
17
|
+
``` ruby
|
18
|
+
require 'google_ajax_crawler'
|
19
|
+
|
20
|
+
use GoogleAjaxCrawler::Crawler do |config|
|
21
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('document.getElementById("loading") == null') }
|
22
|
+
end
|
23
|
+
|
24
|
+
app = lambda {|env| [200, {'Content-Type' => 'text/plain'}, ["b"] ] }
|
25
|
+
run app
|
26
|
+
|
27
|
+
```
|
28
|
+
|
29
|
+
## Configuration Options
|
30
|
+
|
31
|
+
### page_loaded_test
|
32
|
+
|
33
|
+
Tell the crawler when your page has finished loading / rendering. As determining when a page has completed rendering can depend on a number of qualitative factors (i.e. all ajax requests have responses, certain content has been displayed, or even when there are no loaders / spinners visible on the page), the page loaded test allows you to specify when the crawler should decide that your page has finished loading / rendering and to return a snapshot of the rendered dom at that time.
|
34
|
+
|
35
|
+
The current crawler driver is passed to the lambda to allow querying of the current page's dom state.
|
36
|
+
|
37
|
+
### timeout
|
38
|
+
|
39
|
+
The maximum time (in seconds) the crawler should wait before returning a response. Defaults to 30.
|
40
|
+
|
41
|
+
### driver
|
42
|
+
|
43
|
+
The configured Google Ajax Crawler driver used to query the current page state. Presently there is only one driver, CapybaraWebkit (pull requests for more are welcome!).
|
44
|
+
|
45
|
+
### poll_interval
|
46
|
+
|
47
|
+
How often (in seconds) to test the page state with the configured page_loaded_test
|
48
|
+
|
49
|
+
### response_headers
|
50
|
+
|
51
|
+
Which response headers should be returned with the DOM snapshot. The default headers specify a Content-Type of text/html.
|
52
|
+
|
53
|
+
## License
|
54
|
+
|
55
|
+
All free - use, modify, fork to your heart's content...
|
56
|
+
See LICENSE.txt for further details.
|
57
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Activate the :default and :development gem groups before rake is loaded.
# When bundler cannot resolve them, report why on stderr and exit with the
# status code carried by the bundler error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'rake'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Load the library to read its version. The path is resolved relative to this
# gemspec rather than the current working directory.
require File.expand_path('../lib/google_ajax_crawler', __FILE__)

Gem::Specification.new do |s|
  s.name        = 'google_ajax_crawler'
  s.version     = GoogleAjaxCrawler.version
  s.summary     = 'Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search engines.'
  s.description = 'Rack Middleware adhering to the Google Ajax Crawling Scheme, using a headless browser to render JS heavy pages and serve a dom snapshot of the rendered state to a requesting search engine.'
  s.authors     = ['Ben Kitzelman']
  s.email       = ['benkitzelman@gmail.com']
  s.homepage    = 'http://github.com/benkitzelman/google-ajax-crawler'
  # LICENSE.txt carries the MIT license text; declare it so the published
  # metadata no longer reports an empty license list.
  s.license     = 'MIT'
  # Package exactly the files tracked by git.
  s.files       = `git ls-files`.strip.split("\n")
  s.executables = []

  s.add_dependency 'capybara-webkit', '>= 0.10.0'
  s.add_dependency 'rack'
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# The crawler middleware uses Rack::Request as well as Rack::Utils, so load
# rack itself rather than relying on the host application having done so.
require 'rack'
require 'rack/utils'
require 'uri'

# Resolve the library directory relative to this file. A path relative to the
# process working directory ('./lib/...') only works when the process is
# started from the gem's own root, which is never the case for an installed gem.
base_path = File.expand_path('../google_ajax_crawler', __FILE__)

# The abstract driver must be loaded first: concrete drivers subclass it.
require "#{base_path}/drivers/driver"
[base_path, "#{base_path}/drivers"].each do |folder|
  # Sort so the load order does not depend on the filesystem's listing order.
  Dir["#{folder}/*.rb"].sort.each { |file| require file }
end
|
10
|
+
|
11
|
+
# Top-level namespace of the gem.
module GoogleAjaxCrawler
  class << self
    # @return [String] the version number of this gem
    def version
      "0.1.0"
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
|
2
|
+
module GoogleAjaxCrawler
  # Rack middleware. Requests whose query string carries the configured
  # "requested route key" are answered with a DOM snapshot rendered by the
  # configured driver; every other request is passed to the wrapped app.
  class Crawler

    class << self
      # @return [Options] the shared crawler options, created with defaults
      #   on first access when nothing has been configured yet
      def options
        configure if @options.nil?
        @options
      end

      # Replaces the shared options with a fresh Options instance.
      #
      # @yield [config] optional block used to override the defaults
      # @return [Options]
      def configure(&block)
        @options = Options.new(self, &block)
      end
    end

    # @param app [#call, nil] the downstream Rack application
    # @yield [config] optional configuration block
    def initialize(app = nil, &block)
      @app = app
      # Only reconfigure when a block is supplied. Reconfiguring
      # unconditionally would discard options set earlier through
      # Crawler.configure whenever the middleware is built without a block.
      self.class.configure(&block) if block_given?
    end

    # @return [Options] the shared crawler options
    def options
      self.class.options
    end

    # Rack entry point.
    #
    # @param env [Hash] the Rack environment
    # @return [Array] a Rack response triplet
    def call(env)
      request = Rack::Request.new(env)
      if is_search_engine?(request)
        serve_crawlable_content_for request
      else
        @app.call(env)
      end
    end

    protected

    # Request parameter names are Strings, so normalise the configured key
    # (which may have been given as a Symbol) before looking it up.
    def route_key
      options.requested_route_key.to_s
    end

    # A request is treated as a snapshot request when it carries the route key.
    def is_search_engine?(request)
      request.params.include? route_key
    end

    # Rebuilds the browser-facing URI: the value of the route key becomes the
    # URI fragment and a search_engine=true parameter is added to the query.
    #
    # @param url [String]
    # @return [URI]
    def as_uri_with_fragment(url)
      uri    = URI.parse(url)
      params = Rack::Utils.parse_query(uri.query).merge('search_engine' => true)
      uri.fragment = params.delete route_key
      uri.query    = Rack::Utils.build_query params
      uri
    end

    # Renders the page through the driver and wraps the HTML in a 200 response
    # using the configured response headers.
    def serve_crawlable_content_for(request)
      puts ' -- GOOGLE Ajax Web Crawler Request --'
      html = GoogleAjaxCrawler::Page.read as_uri_with_fragment(request.url), options

      [200, options.response_headers, [html]]
    end

  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "capybara"
|
2
|
+
require "capybara/dsl"
|
3
|
+
require "capybara-webkit"
|
4
|
+
|
5
|
+
module GoogleAjaxCrawler
  module Drivers
    # Driver backed by capybara-webkit. Capybara::DSL supplies the page
    # access methods used here.
    class CapybaraWebkit < Driver
      include Capybara::DSL

      # Accepts the same arguments as Driver#initialize, then applies the
      # Capybara settings this driver needs.
      def initialize(*args)
        super(*args)
        configure
      end

      # Default "page has finished loading" check: no jQuery ajax requests
      # are in flight. Pages that do not load jQuery count as loaded;
      # evaluating `$.active` on such a page raises a script error, or never
      # equals 0 when `$` belongs to a different library.
      #
      # @return [Boolean]
      def default_page_loaded_test
        page.evaluate_script('typeof jQuery === "undefined" || jQuery.active === 0')
      end

      protected

      # Applies the global Capybara settings: no embedded app server, the
      # webkit driver, and the configured wait time.
      def configure
        Capybara.run_server = false
        Capybara.current_driver = :webkit
        # Leave Capybara's own default in place when no timeout is configured.
        Capybara.default_wait_time = options.timeout unless options.timeout.nil?
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module GoogleAjaxCrawler
  module Drivers
    # Base class for crawler drivers. Subclasses supply #visit, #html and
    # #default_page_loaded_test; this class implements the
    # visit / wait / snapshot sequence on top of them.
    class Driver
      attr_reader :options

      # @param options [#timeout, #poll_interval, #page_loaded_test]
      def initialize(options)
        @options = options
      end

      # Navigates to +url+. Must be provided by a concrete driver.
      def visit(url)
        raise "Driver Not Specified"
      end

      # Loaded-check used when no page_loaded_test is configured.
      # Must be provided by a concrete driver.
      def default_page_loaded_test
        raise "Driver Not Specified"
      end

      # Returns the current page markup. Must be provided by a concrete driver.
      def html
        raise "Driver Not Specified"
      end

      # Visits +uri+, waits until the page reports itself loaded (or the
      # timeout elapses) and returns the resulting markup.
      #
      # @param uri [#to_s]
      # @return whatever #html returns
      def get_content(uri)
        puts "requesting: #{uri}"
        visit uri.to_s

        wait_until_page_is_fully_loaded
        html
      end

      # Runs the configured page_loaded_test (passing this driver), or the
      # driver's default test when none is configured.
      def is_page_loaded?
        loaded_test = options.page_loaded_test
        loaded_test.nil? ? default_page_loaded_test : loaded_test.call(self)
      end

      # Polls #is_page_loaded? every options.poll_interval seconds until it
      # passes or options.timeout seconds have elapsed. A nil timeout means
      # "wait indefinitely". Errors raised by the loaded-test are logged and
      # swallowed so that a snapshot is still returned.
      def wait_until_page_is_fully_loaded
        deadline = options.timeout && (Time.now + options.timeout)

        until is_page_loaded?
          remaining = deadline && (deadline - Time.now)
          if remaining && remaining <= 0
            puts "Timeout: page not loaded after #{options.timeout} seconds"
            break
          end

          # Never sleep past the deadline.
          sleep(remaining ? [options.poll_interval, remaining].min : options.poll_interval)
        end
      rescue StandardError => e
        # Deliberate best effort: report the problem and fall through.
        puts "Timeout: #{e}"
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module GoogleAjaxCrawler
  # Holds the crawler configuration: the driver plus the settings it and the
  # middleware read.
  class Options
    # :driver is deliberately not listed here; it has a custom reader and
    # writer below.
    attr_accessor :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers

    # @param app the object these options belong to (stored, not otherwise used here)
    # @yield [config] optional block, evaluated against this instance and
    #   also passed it as argument, used to override the defaults
    def initialize(app, &block)
      @app = app
      @timeout = 30
      @requested_route_key = '_escaped_fragment_'
      @response_headers = { 'Content-Type' => 'text/html' }
      @poll_interval = 0.5
      @driver_class = nil
      @driver = nil
      @configured = false

      instance_exec(self, &block) unless block.nil?

      # Build the driver only now, so that it observes the final settings
      # (e.g. timeout) instead of the still-unset values it would see if it
      # were constructed before the defaults and the block were applied.
      @configured = true
      driver
    end

    # @return the driver instance, built on first access from the configured
    #   driver class (Drivers::CapybaraWebkit when none was configured)
    def driver
      @driver ||= (@driver_class || Drivers::CapybaraWebkit).new(self)
    end

    # Selects the driver by class; the class is instantiated with these options.
    #
    # @param klass [Class] a class whose constructor accepts an Options instance
    def driver=(klass)
      @driver_class = klass
      # While the configuration block is still running, defer construction
      # so later assignments in the same block are visible to the driver.
      @driver = @configured ? klass.new(self) : nil
    end

  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module GoogleAjaxCrawler
  # Fetches the content of a page through the driver held by the given options.
  class Page
    attr_reader :options

    class << self
      # Convenience wrapper: builds a Page for +options+ and reads +uri+.
      #
      # @return whatever the driver's get_content returns
      def read(uri, options)
        Page.new(options).get_page_content(uri)
      end
    end

    # @param options [#driver]
    def initialize(options)
      @options = options
    end

    # Asks the configured driver for the content at +uri+.
    def get_page_content(uri = URI.parse("http://localhost"))
      active_driver = options.driver
      active_driver.get_content(uri)
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require './spec/spec_helper'

# End-to-end check: a real rack app is started with the crawler middleware
# and queried over HTTP, once as a browser and once as an ajax crawler.
describe 'CapybaraWebkit driver' do
  let(:host)            { "http://localhost:#{RackApp.port}/" }
  let(:browser_route)   { "#{host}#test" }
  let(:snapshot_route)  { "#{host}?_escaped_fragment_=test" }
  # Text the test page writes once its javascript has run for route #test.
  let(:rendered_marker) { /Javascript rendering complete for client-side route #test/ }

  before(:all) do
    RackApp.configure_crawler do |config|
      config.driver           = GoogleAjaxCrawler::Drivers::CapybaraWebkit
      config.poll_interval    = 0.25
      # The page counts as loaded once the "loading" element has been removed.
      config.page_loaded_test = lambda do |driver|
        driver.page.evaluate_script('document.getElementById("loading") == null')
      end
    end

    RackApp.start
  end

  after(:all) { RackApp.stop }

  describe 'when a browser requests a client side route (i.e.: /#my_route)' do
    it 'should not serve a snapshot of the dom' do
      browser_response = Faraday.get(browser_route)
      browser_response.body.should_not =~ rendered_marker
    end
  end

  describe 'when an ajax crawler requests a snapshot of a client side route' do
    it 'should serve a snapshot of the dom that includes js rendered components' do
      crawler_response = Faraday.get(snapshot_route)
      crawler_response.body.should =~ rendered_marker
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
<html>
|
2
|
+
<head></head>
|
3
|
+
<body>
|
4
|
+
<h1>A Simple State Test</h1>
|
5
|
+
<p>State: <span id='page_state'></span></p>
|
6
|
+
<div class='loading' id='loading'>Loading....</div>
|
7
|
+
|
8
|
+
<script type='text/javascript'>
|
9
|
+
|
10
|
+
// Simulates a client-side render: after half a second, writes the current
// hash route into #page_state and removes the #loading element.
function init() {
  function writeHash() {
    var stateNode = document.getElementById('page_state');
    stateNode.innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;

    var loadingMask = document.getElementById('loading');
    loadingMask.parentNode.removeChild(loadingMask);

    console.log('done...');
  }

  setTimeout(writeHash, 500);
}

//
// Only execute js if loading the page using an unescaped url
//
if (/#.*$/.test(document.location.href)) {
  init();
}
|
25
|
+
|
26
|
+
</script>
|
27
|
+
</body>
|
28
|
+
</html>
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'rack'
require 'fileutils' # FileUtils.mkpath is used by setup!
require 'socket'    # TCPSocket is used to detect when the server is listening

# Spec support: builds a small rack app fronted by the crawler middleware and
# runs it in a forked child process whose pid is tracked in a pidfile.
class RackApp

  # @return [Rack::Builder] the crawler middleware in front of an endpoint
  #   that serves spec/support/page.html
  def app
    page_content = File.read('./spec/support/page.html')
    Rack::Builder.new do

      use GoogleAjaxCrawler::Crawler do |c|
        # Tolerate specs that never called RackApp.configure_crawler.
        configuration = RackApp.crawler_configuration
        configuration.call(c) unless configuration.nil?
      end

      map '/' do
        run lambda { |env| [200, { 'Content-Type' => 'text/html' }, [page_content]] }
      end
    end
  end

  class << self
    attr_reader :crawler_configuration

    def app
      (@app ||= RackApp.new).app
    end

    # Stores the block later used to configure the crawler middleware.
    def configure_crawler(&block)
      @crawler_configuration = block
    end

    def port
      9999
    end

    # Forks a child that runs the rack server, records its pid and returns
    # once the server accepts connections.
    def start
      setup!
      pid = Process.fork
      if pid.nil?
        # Child: Rack::Server.start blocks for the life of the server.
        Rack::Server.start(:app => app, :Port => port)
        # If the server ever returns, do not fall through into the caller's
        # code (the child is a copy of the spec run).
        exit!(0)
      else
        File.open(pidfile, 'w') { |f| f.write pid }
        trap("SIGINT") { stop }
        Process.detach pid
        wait_until_listening
      end
    end

    # Kills the server process (if a pidfile exists) and removes the pidfile.
    def stop
      return unless running?

      begin
        Process.kill 9, pid
      rescue Errno::ESRCH
        # The process is already gone; still clean up the stale pidfile.
      end
      File.delete pidfile
      puts "[Stopped rack app...]"
    end

    # Creates the directories that hold the pidfile and logfile.
    def setup!
      FileUtils.mkpath(File.dirname(pidfile))
      FileUtils.mkpath(File.dirname(logfile))
    end

    def pidfile
      "tmp/server.pid"
    end

    def logfile
      "tmp/server.log"
    end

    # @return [Integer] the recorded server pid, or 0 when no pidfile exists
    def pid
      running? ? File.read(pidfile).to_i : 0
    end

    # A pidfile on disk is taken to mean the server is running.
    def running?
      File.exist?(pidfile)
    end

    def restart
      stop if running?
      start
    end

    # Redirects this process's STDOUT and STDERR to the logfile (append mode).
    def log_to_file
      log = File.new RackApp.logfile, "a"
      STDOUT.reopen log
      STDERR.reopen log
    end

    # Blocks until a TCP connection to the server port succeeds.
    #
    # @param timeout [Numeric] seconds to keep retrying before giving up
    # @raise [RuntimeError] when the port is still refusing connections
    #   after +timeout+ seconds
    def wait_until_listening(timeout = 10)
      deadline = Time.now + timeout
      begin
        TCPSocket.new('localhost', port).close
      rescue Errno::ECONNREFUSED, Errno::EADDRNOTAVAIL
        raise "Rack app failed to start on port #{port}" if Time.now >= deadline
        sleep 0.1
        retry # bounded by the deadline check above
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require './spec/spec_helper'

describe GoogleAjaxCrawler::Crawler do
  before(:each) do
    GoogleAjaxCrawler::Crawler.configure do |config|
      # The driver invokes the test with itself as the only argument, so the
      # lambda must accept it; a zero-arity lambda would raise ArgumentError.
      config.page_loaded_test = lambda { |driver| driver.page.find('.loading', count: 0) }
    end
  end

  # Shared by every entry point that accepts a configuration block.
  shared_examples 'a crawler configurer' do |method, *args|
    it 'and facilitate the setting of crawler options' do
      GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
        config.timeout = 10
        config.driver  = MockDriver
      end

      GoogleAjaxCrawler::Crawler.options.timeout.should == 10
      GoogleAjaxCrawler::Crawler.options.driver.should be_instance_of(MockDriver)
    end
  end

  context 'configure' do
    it_should_behave_like 'a crawler configurer', :configure
  end

  context 'initialize' do
    it_should_behave_like 'a crawler configurer', :new, nil
  end

  context 'call' do
    let(:app) { double(:app) }
    # Minimal Rack environment. The request method lives under
    # REQUEST_METHOD in a Rack env; HTTP_METHOD is not a Rack key.
    let(:request) { {
      'REQUEST_METHOD'  => 'GET',
      'HTTP_HOST'       => 'test.com',
      'PATH_INFO'       => '/test',
      'QUERY_STRING'    => 'some_key=some_val',
      'rack.url_scheme' => 'http',
      "rack.input"      => :blah
    } }
    let(:search_engine_request) { request.merge('QUERY_STRING' => '_escaped_fragment_=test') }
    let(:crawler) { GoogleAjaxCrawler::Crawler.new app }

    it 'should delegate non snapshot requests to the configured app' do
      app.should_receive(:call).with request
      crawler.call request
    end

    it 'should generate a rendered snapshot on search engine requests' do
      GoogleAjaxCrawler::Page.stub(:read).and_return :wibble
      response = crawler.call search_engine_request
      response.should == [200, GoogleAjaxCrawler::Crawler.options.response_headers, [:wibble]]
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require './spec/spec_helper'

describe GoogleAjaxCrawler::Options do
  context 'initialize' do
    let(:app) do
      Class.new
    end

    it 'should set default values' do
      defaults = GoogleAjaxCrawler::Options.new(app)

      defaults.timeout.should             == 30
      defaults.requested_route_key.should == '_escaped_fragment_'
      defaults.response_headers.should    == { 'Content-Type' => 'text/html' }
      defaults.poll_interval.should       == 0.5
      defaults.driver.should be_a(GoogleAjaxCrawler::Drivers::CapybaraWebkit)
      defaults.page_loaded_test.should be_nil
    end

    it 'should allow default overrides within block scope' do
      # Every configurable setting is replaced inside the block.
      overridden = GoogleAjaxCrawler::Options.new(app) do |config|
        config.requested_route_key = :some_other_key
        config.page_loaded_test    = :some_other_page_loaded_test
        config.poll_interval       = 7000
        config.response_headers    = :some_other_headers
        config.timeout             = 20
        config.driver              = MockDriver
      end

      overridden.requested_route_key.should == :some_other_key
      overridden.page_loaded_test.should    == :some_other_page_loaded_test
      overridden.poll_interval.should       == 7000
      overridden.response_headers.should    == :some_other_headers
      overridden.timeout.should             == 20
      overridden.driver.should be_instance_of(MockDriver)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require './spec/spec_helper'

describe GoogleAjaxCrawler::Page do
  context 'read' do
    let(:uri) do
      URI.parse('http://www.test.com')
    end
    let(:options) do
      double(:options)
    end

    it 'should ask the driver to fetch content from a given uri' do
      # options.driver.get_content(uri) is stubbed to hand back a marker value.
      options.stub_chain(:driver, :get_content).with(uri).and_return(:wibble)

      GoogleAjaxCrawler::Page.read(uri, options).should == :wibble
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google_ajax_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben Kitzelman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: capybara-webkit
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.10.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.10.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rack
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: Rack Middleware adhering to the Google Ajax Crawling Scheme, using a
|
42
|
+
headless browser to render JS heavy pages and serve a dom snapshot of the rendered
|
43
|
+
state to a requesting search engine.
|
44
|
+
email:
|
45
|
+
- benkitzelman@gmail.com
|
46
|
+
executables: []
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files: []
|
49
|
+
files:
|
50
|
+
- .document
|
51
|
+
- .gitignore
|
52
|
+
- .rspec
|
53
|
+
- Gemfile
|
54
|
+
- Gemfile.lock
|
55
|
+
- LICENSE.txt
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- google_ajax_crawler.gemspec
|
59
|
+
- lib/google_ajax_crawler.rb
|
60
|
+
- lib/google_ajax_crawler/crawler.rb
|
61
|
+
- lib/google_ajax_crawler/drivers/capybara_webkit.rb
|
62
|
+
- lib/google_ajax_crawler/drivers/driver.rb
|
63
|
+
- lib/google_ajax_crawler/options.rb
|
64
|
+
- lib/google_ajax_crawler/page.rb
|
65
|
+
- spec/integration/capybara_webkit_spec.rb
|
66
|
+
- spec/spec_helper.rb
|
67
|
+
- spec/support/page.html
|
68
|
+
- spec/support/rack_app.rb
|
69
|
+
- spec/unit/crawler_spec.rb
|
70
|
+
- spec/unit/options_spec.rb
|
71
|
+
- spec/unit/page_spec.rb
|
72
|
+
homepage: http://github.com/benkitzelman/google-ajax-crawler
|
73
|
+
licenses: []
|
74
|
+
metadata: {}
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options: []
|
77
|
+
require_paths:
|
78
|
+
- lib
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ! '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
requirements: []
|
90
|
+
rubyforge_project:
|
91
|
+
rubygems_version: 2.0.3
|
92
|
+
signing_key:
|
93
|
+
specification_version: 4
|
94
|
+
summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your
|
95
|
+
JS rendered page states (i.e. BackboneJS routes) can be crawled and indexed by search
|
96
|
+
engines.
|
97
|
+
test_files: []
|