google_ajax_crawler 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.travis.yml +16 -0
- data/README.md +13 -0
- data/Rakefile +14 -11
- data/examples/capybara_webkit.ru +28 -5
- data/google_ajax_crawler.gemspec +2 -3
- data/lib/google_ajax_crawler.rb +11 -6
- data/lib/google_ajax_crawler/crawler.rb +3 -5
- data/lib/google_ajax_crawler/drivers/driver.rb +20 -11
- data/releases/google_ajax_crawler-0.1.2.gem +0 -0
- data/spec/fixtures/backbone.html +61 -0
- data/spec/{support/page.html → fixtures/simple_javascript.html} +7 -2
- data/spec/integration/capybara_webkit_spec.rb +1 -1
- data/spec/spec_helper.rb +2 -0
- data/spec/support/rack_app.rb +1 -1
- data/spec/unit/drivers/driver_spec.rb +19 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Nzk1MTI4M2Y1ZTgyZmEyYzIyNWVlM2ExYjlhNmQxMTYwYzI3MGU0OQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
N2Y0ZDhjZmFiOGJkYTQwNDA4Y2JlYzQyNGY3OGU1OGYwNzM3YThkZA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjYwY2Q0Y2I4NzQwNmYzYjBkYTc0NDljNDJjY2E2MDJlYjRkZTFjNzg5NjBk
|
10
|
+
ZmFjZjQxMGYzNTgyYjBmNDIzMTQ5ODlkM2U4NDhlMTk1YWQ2NzZjODAyYWQx
|
11
|
+
M2VmNTRjZGI2ODhmYjY4MDNiZTdlNDY1MDQyN2FkMTA0MTJlZWM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MDg0MWM3ZmRiODk1NjM1MTk3MWRmYjFhNDhhOThlOGJlN2FjMTJjZjY0OTNi
|
14
|
+
ZjQwYjAzNWI5MTI3N2Q2MTJkNGZhMjcyN2YwYTU2ZGQ4ZjM0MmVjMmVkZmJi
|
15
|
+
Nzg5MDY0ZDJlYTlhNTRlZmQzODUxYmFiZmIwNWQ2Y2NlNzY3NzM=
|
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
# Google Ajax Crawler
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/benkitzelman/google-ajax-crawler.png)](https://travis-ci.org/benkitzelman/google-ajax-crawler)
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/google_ajax_crawler.png)](http://badge.fury.io/rb/google_ajax_crawler)
|
5
|
+
|
3
6
|
Rack middleware adhering to the Google Ajax Crawling Scheme. It uses a headless browser to render JS-heavy pages and serves a DOM snapshot of the rendered state to the requesting search engine.
|
4
7
|
|
5
8
|
Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
@@ -44,6 +47,16 @@ Tell the crawler when your page has finished loading / rendering. As determining
|
|
44
47
|
|
45
48
|
The current crawler driver is passed to the lambda so that it can query the current page's DOM state.
|
46
49
|
|
50
|
+
A good pattern is to test your page state in a JS function that returns a boolean and is accessible from the window context, e.g.:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
|
54
|
+
use GoogleAjaxCrawler::Crawler do |config|
|
55
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('myApp.isPageLoaded()') }
|
56
|
+
end
|
57
|
+
|
58
|
+
```
|
59
|
+
|
47
60
|
### timeout
|
48
61
|
|
49
62
|
The maximum time the crawler should wait before returning a response.
|
data/Rakefile
CHANGED
@@ -1,12 +1,15 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require '
|
4
|
-
require '
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), 'lib')
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
|
6
|
+
desc "Run specifications"
|
7
|
+
RSpec::Core::RakeTask.new(:spec)
|
8
|
+
|
9
|
+
task :ci do
|
10
|
+
puts 'running tests on CI Server....'
|
11
|
+
system("export DISPLAY=:99.0 && bundle exec rake spec")
|
12
|
+
raise "rake spec failed!" unless $?.exitstatus == 0
|
11
13
|
end
|
12
|
-
|
14
|
+
|
15
|
+
task :default => :ci
|
data/examples/capybara_webkit.ru
CHANGED
@@ -1,11 +1,34 @@
|
|
1
|
+
#
|
2
|
+
# to run:
|
3
|
+
# $ rackup examples/capybara_webkit.ru -p 3000
|
4
|
+
# open browser to http://localhost:3000/#!test
|
5
|
+
#
|
1
6
|
require 'bundler/setup'
|
2
7
|
require './lib/google_ajax_crawler'
|
3
8
|
|
4
9
|
use GoogleAjaxCrawler::Crawler do |config|
|
5
|
-
config.driver
|
6
|
-
config.poll_interval
|
7
|
-
config.
|
10
|
+
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
11
|
+
config.poll_interval = 0.25
|
12
|
+
config.timeout = 5
|
13
|
+
|
14
|
+
#
|
15
|
+
# for the demo - the page is considered loaded when the loading mask has been removed from the DOM
|
16
|
+
# this could evaluate something like $.active == 0 to ensure no jquery ajax calls are pending
|
17
|
+
#
|
18
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
|
8
19
|
end
|
9
20
|
|
10
|
-
|
11
|
-
|
21
|
+
#
|
22
|
+
# a sample page using #! url fragments to seed page state
|
23
|
+
#
|
24
|
+
app = lambda do |env|
|
25
|
+
page_content = case env['PATH_INFO']
|
26
|
+
when /\/backbone(\/)?/
|
27
|
+
File.read('./spec/fixtures/backbone.html')
|
28
|
+
else
|
29
|
+
File.read('./spec/fixtures/simple_javascript.html')
|
30
|
+
end
|
31
|
+
|
32
|
+
[200, { 'Content-Type' => 'text/html' }, [page_content]]
|
33
|
+
end
|
34
|
+
run app
|
data/google_ajax_crawler.gemspec
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
require 'google_ajax_crawler'
|
1
|
+
require './lib/google_ajax_crawler'
|
3
2
|
Gem::Specification.new do |s|
|
4
3
|
s.name = 'google_ajax_crawler'
|
5
4
|
s.version = GoogleAjaxCrawler.version
|
@@ -13,4 +12,4 @@ Gem::Specification.new do |s|
|
|
13
12
|
|
14
13
|
s.add_dependency 'capybara-webkit', '>= 0.10.0'
|
15
14
|
s.add_dependency 'rack'
|
16
|
-
end
|
15
|
+
end
|
data/lib/google_ajax_crawler.rb
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
module GoogleAjaxCrawler
|
2
2
|
class << self
|
3
|
+
def env
|
4
|
+
(ENV['RACK_ENV'] || 'development').to_sym
|
5
|
+
end
|
6
|
+
|
3
7
|
def version
|
4
|
-
"0.1.
|
8
|
+
"0.1.3"
|
5
9
|
end
|
6
10
|
end
|
7
11
|
end
|
8
12
|
|
13
|
+
here = File.dirname(__FILE__)
|
9
14
|
require 'uri'
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
+
require "#{here}/google_ajax_crawler/crawler"
|
16
|
+
require "#{here}/google_ajax_crawler/options"
|
17
|
+
require "#{here}/google_ajax_crawler/page"
|
18
|
+
require "#{here}/google_ajax_crawler/drivers/driver"
|
19
|
+
require "#{here}/google_ajax_crawler/drivers/capybara_webkit"
|
@@ -24,11 +24,9 @@ module GoogleAjaxCrawler
|
|
24
24
|
|
25
25
|
def call(env)
|
26
26
|
request = Rack::Request.new(env)
|
27
|
-
if is_search_engine?(request)
|
28
|
-
|
29
|
-
|
30
|
-
@app.call(env)
|
31
|
-
end
|
27
|
+
return serve_crawlable_content_for(request) if is_search_engine?(request)
|
28
|
+
|
29
|
+
@app.call(env)
|
32
30
|
end
|
33
31
|
|
34
32
|
protected
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'timeout'
|
1
2
|
module GoogleAjaxCrawler
|
2
3
|
module Drivers
|
3
4
|
class Driver
|
@@ -20,10 +21,16 @@ module GoogleAjaxCrawler
|
|
20
21
|
end
|
21
22
|
|
22
23
|
def get_content(uri)
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
begin
|
25
|
+
puts "::requesting: #{uri}"
|
26
|
+
visit uri.to_s
|
27
|
+
wait_until_page_is_fully_loaded
|
28
|
+
rescue Timeout::Error
|
29
|
+
puts "-- Page Rendering Timed out: --\n"\
|
30
|
+
"Either your page_loaded_test didn't successfully detect when your page had loaded, \n"\
|
31
|
+
"or your page took longer than #{options.timeout} seconds to load \n"\
|
32
|
+
"-- Returning page snapshot in its present state --"
|
33
|
+
end
|
27
34
|
html
|
28
35
|
end
|
29
36
|
|
@@ -31,18 +38,20 @@ module GoogleAjaxCrawler
|
|
31
38
|
if options.page_loaded_test.nil?
|
32
39
|
default_page_loaded_test
|
33
40
|
else
|
34
|
-
options.page_loaded_test.call
|
41
|
+
options.page_loaded_test.call self
|
35
42
|
end
|
36
43
|
end
|
37
44
|
|
38
45
|
def wait_until_page_is_fully_loaded
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
Timeout::timeout(options.timeout) do
|
47
|
+
begin
|
48
|
+
while !is_page_loaded?
|
49
|
+
sleep options.poll_interval
|
50
|
+
end
|
51
|
+
rescue
|
52
|
+
#...squelch
|
53
|
+
puts "Exception: #{$!}"
|
42
54
|
end
|
43
|
-
rescue
|
44
|
-
#...squelch
|
45
|
-
puts "Timeout: #{$!}"
|
46
55
|
end
|
47
56
|
end
|
48
57
|
end
|
Binary file
|
@@ -0,0 +1,61 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<script src='http://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.4.4/underscore-min.js'></script>
|
4
|
+
<script src='http://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js'></script>
|
5
|
+
<script src='http://cdnjs.cloudflare.com/ajax/libs/backbone.js/1.0.0/backbone-min.js'></script>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
|
9
|
+
<div id='app'>
|
10
|
+
<div id='loading'>Loading...</div>
|
11
|
+
</div>
|
12
|
+
|
13
|
+
<script type='text/javascript'>
|
14
|
+
|
15
|
+
var View = Backbone.View.extend({
|
16
|
+
className: 'backbone-view',
|
17
|
+
template: _.template(
|
18
|
+
'<h1>Backbone Example</h1><p>This template was rendered by backbone js</p>'
|
19
|
+
),
|
20
|
+
|
21
|
+
render: function(){
|
22
|
+
var app = document.getElementById('app');
|
23
|
+
var _this = this;
|
24
|
+
this.el.innerHTML = _this.template();
|
25
|
+
return this;
|
26
|
+
}
|
27
|
+
});
|
28
|
+
|
29
|
+
var Router = Backbone.Router.extend({
|
30
|
+
routes: {
|
31
|
+
"*anything": "home"
|
32
|
+
},
|
33
|
+
|
34
|
+
home: function() {
|
35
|
+
el = document.getElementById('app');
|
36
|
+
var view = new View;
|
37
|
+
|
38
|
+
// simulate fetching data / async operation
|
39
|
+
_.delay(function() {
|
40
|
+
el.replaceChild(view.render().el, el.children[0]);
|
41
|
+
}, 1500);
|
42
|
+
}
|
43
|
+
});
|
44
|
+
|
45
|
+
function App() {
|
46
|
+
new Router();
|
47
|
+
Backbone.history.start();
|
48
|
+
};
|
49
|
+
|
50
|
+
App.prototype.pageHasLoaded = function() {
|
51
|
+
return !!document.getElementsByClassName('backbone-view').length;
|
52
|
+
};
|
53
|
+
|
54
|
+
|
55
|
+
$(function() {
|
56
|
+
window.app = new App();
|
57
|
+
});
|
58
|
+
|
59
|
+
</script>
|
60
|
+
</body>
|
61
|
+
</html>
|
@@ -7,7 +7,8 @@
|
|
7
7
|
|
8
8
|
<script type='text/javascript'>
|
9
9
|
|
10
|
-
|
10
|
+
|
11
|
+
function App() {
|
11
12
|
var writeHash = function() {
|
12
13
|
document.getElementById('page_state').innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;
|
13
14
|
var loadingMask = document.getElementById('loading');
|
@@ -19,10 +20,14 @@
|
|
19
20
|
setTimeout(writeHash, 500);
|
20
21
|
};
|
21
22
|
|
23
|
+
App.prototype.pageHasLoaded = function() {
|
24
|
+
return !document.getElementById('loading');
|
25
|
+
};
|
26
|
+
|
22
27
|
//
|
23
28
|
// Only execute js if loading the page using an unescaped url
|
24
29
|
//
|
25
|
-
if(/#.*$/.test(document.location.href))
|
30
|
+
if(/#.*$/.test(document.location.href)) window.app = new App();
|
26
31
|
|
27
32
|
</script>
|
28
33
|
</body>
|
@@ -9,7 +9,7 @@ describe 'CapybaraWebkit driver' do
|
|
9
9
|
RackApp.configure_crawler do |config|
|
10
10
|
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
11
11
|
config.poll_interval = 0.25
|
12
|
-
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('
|
12
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
|
13
13
|
end
|
14
14
|
|
15
15
|
RackApp.start
|
data/spec/spec_helper.rb
CHANGED
data/spec/support/rack_app.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
require './spec/spec_helper'
|
2
|
+
|
3
|
+
describe GoogleAjaxCrawler::Drivers::Driver do
|
4
|
+
let(:options) do
|
5
|
+
GoogleAjaxCrawler::Options.new(nil) do |o|
|
6
|
+
o.timeout = 0.05
|
7
|
+
o.page_loaded_test = lambda {|d| false }
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
describe '#wait_until_page_is_fully_loaded' do
|
12
|
+
it 'should raise a Timeout Exception when timeout limit reached' do
|
13
|
+
expect do
|
14
|
+
driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
|
15
|
+
driver.wait_until_page_is_fully_loaded
|
16
|
+
end.to raise_error(Timeout::Error)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_ajax_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Kitzelman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara-webkit
|
@@ -49,6 +49,7 @@ extra_rdoc_files: []
|
|
49
49
|
files:
|
50
50
|
- .gitignore
|
51
51
|
- .rspec
|
52
|
+
- .travis.yml
|
52
53
|
- Gemfile
|
53
54
|
- Gemfile.lock
|
54
55
|
- LICENSE.txt
|
@@ -64,11 +65,14 @@ files:
|
|
64
65
|
- lib/google_ajax_crawler/page.rb
|
65
66
|
- releases/google_ajax_crawler-0.1.0.gem
|
66
67
|
- releases/google_ajax_crawler-0.1.1.gem
|
68
|
+
- releases/google_ajax_crawler-0.1.2.gem
|
69
|
+
- spec/fixtures/backbone.html
|
70
|
+
- spec/fixtures/simple_javascript.html
|
67
71
|
- spec/integration/capybara_webkit_spec.rb
|
68
72
|
- spec/spec_helper.rb
|
69
|
-
- spec/support/page.html
|
70
73
|
- spec/support/rack_app.rb
|
71
74
|
- spec/unit/crawler_spec.rb
|
75
|
+
- spec/unit/drivers/driver_spec.rb
|
72
76
|
- spec/unit/options_spec.rb
|
73
77
|
- spec/unit/page_spec.rb
|
74
78
|
homepage: http://github.com/benkitzelman/google-ajax-crawler
|