google_ajax_crawler 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.travis.yml +16 -0
- data/README.md +13 -0
- data/Rakefile +14 -11
- data/examples/capybara_webkit.ru +28 -5
- data/google_ajax_crawler.gemspec +2 -3
- data/lib/google_ajax_crawler.rb +11 -6
- data/lib/google_ajax_crawler/crawler.rb +3 -5
- data/lib/google_ajax_crawler/drivers/driver.rb +20 -11
- data/releases/google_ajax_crawler-0.1.2.gem +0 -0
- data/spec/fixtures/backbone.html +61 -0
- data/spec/{support/page.html → fixtures/simple_javascript.html} +7 -2
- data/spec/integration/capybara_webkit_spec.rb +1 -1
- data/spec/spec_helper.rb +2 -0
- data/spec/support/rack_app.rb +1 -1
- data/spec/unit/drivers/driver_spec.rb +19 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Nzk1MTI4M2Y1ZTgyZmEyYzIyNWVlM2ExYjlhNmQxMTYwYzI3MGU0OQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
N2Y0ZDhjZmFiOGJkYTQwNDA4Y2JlYzQyNGY3OGU1OGYwNzM3YThkZA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjYwY2Q0Y2I4NzQwNmYzYjBkYTc0NDljNDJjY2E2MDJlYjRkZTFjNzg5NjBk
|
10
|
+
ZmFjZjQxMGYzNTgyYjBmNDIzMTQ5ODlkM2U4NDhlMTk1YWQ2NzZjODAyYWQx
|
11
|
+
M2VmNTRjZGI2ODhmYjY4MDNiZTdlNDY1MDQyN2FkMTA0MTJlZWM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MDg0MWM3ZmRiODk1NjM1MTk3MWRmYjFhNDhhOThlOGJlN2FjMTJjZjY0OTNi
|
14
|
+
ZjQwYjAzNWI5MTI3N2Q2MTJkNGZhMjcyN2YwYTU2ZGQ4ZjM0MmVjMmVkZmJi
|
15
|
+
Nzg5MDY0ZDJlYTlhNTRlZmQzODUxYmFiZmIwNWQ2Y2NlNzY3NzM=
|
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
# Google Ajax Crawler
|
2
2
|
|
3
|
+
[Build Status](https://travis-ci.org/benkitzelman/google-ajax-crawler)
|
4
|
+
[Gem Version](http://badge.fury.io/rb/google_ajax_crawler)
|
5
|
+
|
3
6
|
Rack middleware adhering to the Google Ajax Crawling Scheme. It uses a headless browser to render JS-heavy pages and serves a DOM snapshot of the rendered state to the requesting search engine.
|
4
7
|
|
5
8
|
Details of the scheme can be found at: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
@@ -44,6 +47,16 @@ Tell the crawler when your page has finished loading / rendering. As determining
|
|
44
47
|
|
45
48
|
The current crawler driver is passed to the lambda so that it can query the current page's DOM state.
|
46
49
|
|
50
|
+
A good pattern is to test your page state in a JS function that returns a boolean and is accessible from the window context, e.g.:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
|
54
|
+
use GoogleAjaxCrawler::Crawler do |config|
|
55
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('myApp.isPageLoaded()') }
|
56
|
+
end
|
57
|
+
|
58
|
+
```
|
59
|
+
|
47
60
|
### timeout
|
48
61
|
|
49
62
|
The maximum time, in seconds, the crawler should wait before returning a response
|
data/Rakefile
CHANGED
@@ -1,12 +1,15 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require '
|
4
|
-
require '
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), 'lib')
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
|
6
|
+
desc "Run specifications"
|
7
|
+
RSpec::Core::RakeTask.new(:spec)
|
8
|
+
|
9
|
+
task :ci do
|
10
|
+
puts 'running tests on CI Server....'
|
11
|
+
system("export DISPLAY=:99.0 && bundle exec rake spec")
|
12
|
+
raise "rake spec failed!" unless $?.exitstatus == 0
|
11
13
|
end
|
12
|
-
|
14
|
+
|
15
|
+
task :default => :ci
|
data/examples/capybara_webkit.ru
CHANGED
@@ -1,11 +1,34 @@
|
|
1
|
+
#
|
2
|
+
# to run:
|
3
|
+
# $ rackup examples/capybara_webkit.ru -p 3000
|
4
|
+
# open browser to http://localhost:3000/#!test
|
5
|
+
#
|
1
6
|
require 'bundler/setup'
|
2
7
|
require './lib/google_ajax_crawler'
|
3
8
|
|
4
9
|
use GoogleAjaxCrawler::Crawler do |config|
|
5
|
-
config.driver
|
6
|
-
config.poll_interval
|
7
|
-
config.
|
10
|
+
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
11
|
+
config.poll_interval = 0.25
|
12
|
+
config.timeout = 5
|
13
|
+
|
14
|
+
#
|
15
|
+
# for the demo - the page is considered loaded when the loading mask has been removed from the DOM
|
16
|
+
# this could evaluate something like $.active == 0 to ensure no jquery ajax calls are pending
|
17
|
+
#
|
18
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
|
8
19
|
end
|
9
20
|
|
10
|
-
|
11
|
-
|
21
|
+
#
|
22
|
+
# a sample page using #! url fragments to seed page state
|
23
|
+
#
|
24
|
+
app = lambda do |env|
|
25
|
+
page_content = case env['PATH_INFO']
|
26
|
+
when /\/backbone(\/)?/
|
27
|
+
File.read('./spec/fixtures/backbone.html')
|
28
|
+
else
|
29
|
+
File.read('./spec/fixtures/simple_javascript.html')
|
30
|
+
end
|
31
|
+
|
32
|
+
[200, { 'Content-Type' => 'text/html' }, [page_content]]
|
33
|
+
end
|
34
|
+
run app
|
data/google_ajax_crawler.gemspec
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
require 'google_ajax_crawler'
|
1
|
+
require './lib/google_ajax_crawler'
|
3
2
|
Gem::Specification.new do |s|
|
4
3
|
s.name = 'google_ajax_crawler'
|
5
4
|
s.version = GoogleAjaxCrawler.version
|
@@ -13,4 +12,4 @@ Gem::Specification.new do |s|
|
|
13
12
|
|
14
13
|
s.add_dependency 'capybara-webkit', '>= 0.10.0'
|
15
14
|
s.add_dependency 'rack'
|
16
|
-
end
|
15
|
+
end
|
data/lib/google_ajax_crawler.rb
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
module GoogleAjaxCrawler
|
2
2
|
class << self
|
3
|
+
def env
|
4
|
+
(ENV['RACK_ENV'] || 'development').to_sym
|
5
|
+
end
|
6
|
+
|
3
7
|
def version
|
4
|
-
"0.1.
|
8
|
+
"0.1.3"
|
5
9
|
end
|
6
10
|
end
|
7
11
|
end
|
8
12
|
|
13
|
+
here = File.dirname(__FILE__)
|
9
14
|
require 'uri'
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
+
require "#{here}/google_ajax_crawler/crawler"
|
16
|
+
require "#{here}/google_ajax_crawler/options"
|
17
|
+
require "#{here}/google_ajax_crawler/page"
|
18
|
+
require "#{here}/google_ajax_crawler/drivers/driver"
|
19
|
+
require "#{here}/google_ajax_crawler/drivers/capybara_webkit"
|
@@ -24,11 +24,9 @@ module GoogleAjaxCrawler
|
|
24
24
|
|
25
25
|
def call(env)
|
26
26
|
request = Rack::Request.new(env)
|
27
|
-
if is_search_engine?(request)
|
28
|
-
|
29
|
-
|
30
|
-
@app.call(env)
|
31
|
-
end
|
27
|
+
return serve_crawlable_content_for(request) if is_search_engine?(request)
|
28
|
+
|
29
|
+
@app.call(env)
|
32
30
|
end
|
33
31
|
|
34
32
|
protected
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'timeout'
|
1
2
|
module GoogleAjaxCrawler
|
2
3
|
module Drivers
|
3
4
|
class Driver
|
@@ -20,10 +21,16 @@ module GoogleAjaxCrawler
|
|
20
21
|
end
|
21
22
|
|
22
23
|
def get_content(uri)
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
begin
|
25
|
+
puts "::requesting: #{uri}"
|
26
|
+
visit uri.to_s
|
27
|
+
wait_until_page_is_fully_loaded
|
28
|
+
rescue Timeout::Error
|
29
|
+
puts "-- Page Rendering Timed out: --\n"\
|
30
|
+
"Either your page_loaded_test didn't successfully detect when your page had loaded, \n"\
|
31
|
+
"or your page took longer than #{options.timeout} seconds to load \n"\
|
32
|
+
"-- Returning page snapshot in its present state --"
|
33
|
+
end
|
27
34
|
html
|
28
35
|
end
|
29
36
|
|
@@ -31,18 +38,20 @@ module GoogleAjaxCrawler
|
|
31
38
|
if options.page_loaded_test.nil?
|
32
39
|
default_page_loaded_test
|
33
40
|
else
|
34
|
-
options.page_loaded_test.call
|
41
|
+
options.page_loaded_test.call self
|
35
42
|
end
|
36
43
|
end
|
37
44
|
|
38
45
|
def wait_until_page_is_fully_loaded
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
Timeout::timeout(options.timeout) do
|
47
|
+
begin
|
48
|
+
while !is_page_loaded?
|
49
|
+
sleep options.poll_interval
|
50
|
+
end
|
51
|
+
rescue
|
52
|
+
#...squelch
|
53
|
+
puts "Exception: #{$!}"
|
42
54
|
end
|
43
|
-
rescue
|
44
|
-
#...squelch
|
45
|
-
puts "Timeout: #{$!}"
|
46
55
|
end
|
47
56
|
end
|
48
57
|
end
|
Binary file
|
@@ -0,0 +1,61 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<script src='http://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.4.4/underscore-min.js'></script>
|
4
|
+
<script src='http://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js'></script>
|
5
|
+
<script src='http://cdnjs.cloudflare.com/ajax/libs/backbone.js/1.0.0/backbone-min.js'></script>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
|
9
|
+
<div id='app'>
|
10
|
+
<div id='loading'>Loading...</div>
|
11
|
+
</div>
|
12
|
+
|
13
|
+
<script type='text/javascript'>
|
14
|
+
|
15
|
+
var View = Backbone.View.extend({
|
16
|
+
className: 'backbone-view',
|
17
|
+
template: _.template(
|
18
|
+
'<h1>Backbone Example</h1><p>This template was rendered by backbone js</p>'
|
19
|
+
),
|
20
|
+
|
21
|
+
render: function(){
|
22
|
+
var app = document.getElementById('app');
|
23
|
+
var _this = this;
|
24
|
+
this.el.innerHTML = _this.template();
|
25
|
+
return this;
|
26
|
+
}
|
27
|
+
});
|
28
|
+
|
29
|
+
var Router = Backbone.Router.extend({
|
30
|
+
routes: {
|
31
|
+
"*anything": "home"
|
32
|
+
},
|
33
|
+
|
34
|
+
home: function() {
|
35
|
+
el = document.getElementById('app');
|
36
|
+
var view = new View;
|
37
|
+
|
38
|
+
// simulate fetching data / async operation
|
39
|
+
_.delay(function() {
|
40
|
+
el.replaceChild(view.render().el, el.children[0]);
|
41
|
+
}, 1500);
|
42
|
+
}
|
43
|
+
});
|
44
|
+
|
45
|
+
function App() {
|
46
|
+
new Router();
|
47
|
+
Backbone.history.start();
|
48
|
+
};
|
49
|
+
|
50
|
+
App.prototype.pageHasLoaded = function() {
|
51
|
+
return !!document.getElementsByClassName('backbone-view').length;
|
52
|
+
};
|
53
|
+
|
54
|
+
|
55
|
+
$(function() {
|
56
|
+
window.app = new App();
|
57
|
+
});
|
58
|
+
|
59
|
+
</script>
|
60
|
+
</body>
|
61
|
+
</html>
|
@@ -7,7 +7,8 @@
|
|
7
7
|
|
8
8
|
<script type='text/javascript'>
|
9
9
|
|
10
|
-
|
10
|
+
|
11
|
+
function App() {
|
11
12
|
var writeHash = function() {
|
12
13
|
document.getElementById('page_state').innerHTML = "Javascript rendering complete for client-side route " + document.location.hash;
|
13
14
|
var loadingMask = document.getElementById('loading');
|
@@ -19,10 +20,14 @@
|
|
19
20
|
setTimeout(writeHash, 500);
|
20
21
|
};
|
21
22
|
|
23
|
+
App.prototype.pageHasLoaded = function() {
|
24
|
+
return !document.getElementById('loading');
|
25
|
+
};
|
26
|
+
|
22
27
|
//
|
23
28
|
// Only execute js if loading the page using an unescaped url
|
24
29
|
//
|
25
|
-
if(/#.*$/.test(document.location.href))
|
30
|
+
if(/#.*$/.test(document.location.href)) window.app = new App();
|
26
31
|
|
27
32
|
</script>
|
28
33
|
</body>
|
@@ -9,7 +9,7 @@ describe 'CapybaraWebkit driver' do
|
|
9
9
|
RackApp.configure_crawler do |config|
|
10
10
|
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
11
11
|
config.poll_interval = 0.25
|
12
|
-
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('
|
12
|
+
config.page_loaded_test = lambda {|driver| driver.page.evaluate_script('app.pageHasLoaded()') }
|
13
13
|
end
|
14
14
|
|
15
15
|
RackApp.start
|
data/spec/spec_helper.rb
CHANGED
data/spec/support/rack_app.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
require './spec/spec_helper'
|
2
|
+
|
3
|
+
describe GoogleAjaxCrawler::Drivers::Driver do
|
4
|
+
let(:options) do
|
5
|
+
GoogleAjaxCrawler::Options.new(nil) do |o|
|
6
|
+
o.timeout = 0.05
|
7
|
+
o.page_loaded_test = lambda {|d| false }
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
describe '#wait_until_page_is_fully_loaded' do
|
12
|
+
it 'should raise a Timeout Exception when timeout limit reached' do
|
13
|
+
expect do
|
14
|
+
driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
|
15
|
+
driver.wait_until_page_is_fully_loaded
|
16
|
+
end.to raise_error(Timeout::Error)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_ajax_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Kitzelman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara-webkit
|
@@ -49,6 +49,7 @@ extra_rdoc_files: []
|
|
49
49
|
files:
|
50
50
|
- .gitignore
|
51
51
|
- .rspec
|
52
|
+
- .travis.yml
|
52
53
|
- Gemfile
|
53
54
|
- Gemfile.lock
|
54
55
|
- LICENSE.txt
|
@@ -64,11 +65,14 @@ files:
|
|
64
65
|
- lib/google_ajax_crawler/page.rb
|
65
66
|
- releases/google_ajax_crawler-0.1.0.gem
|
66
67
|
- releases/google_ajax_crawler-0.1.1.gem
|
68
|
+
- releases/google_ajax_crawler-0.1.2.gem
|
69
|
+
- spec/fixtures/backbone.html
|
70
|
+
- spec/fixtures/simple_javascript.html
|
67
71
|
- spec/integration/capybara_webkit_spec.rb
|
68
72
|
- spec/spec_helper.rb
|
69
|
-
- spec/support/page.html
|
70
73
|
- spec/support/rack_app.rb
|
71
74
|
- spec/unit/crawler_spec.rb
|
75
|
+
- spec/unit/drivers/driver_spec.rb
|
72
76
|
- spec/unit/options_spec.rb
|
73
77
|
- spec/unit/page_spec.rb
|
74
78
|
homepage: http://github.com/benkitzelman/google-ajax-crawler
|