google_ajax_crawler 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.travis.yml +0 -1
- data/README.md +73 -16
- data/examples/capybara_webkit.ru +12 -4
- data/lib/google_ajax_crawler.rb +1 -1
- data/lib/google_ajax_crawler/drivers/driver.rb +8 -8
- data/lib/google_ajax_crawler/options.rb +1 -1
- data/releases/google_ajax_crawler-0.1.3.gem +0 -0
- data/spec/fixtures/angular.html +32 -0
- data/spec/integration/capybara_webkit_spec.rb +41 -17
- data/spec/unit/crawler_spec.rb +0 -6
- data/spec/unit/drivers/driver_spec.rb +47 -5
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YTU0N2ZkNGQ0YzJiMDlkYmYyMTI4YWNkMDY0ZmQ2MmM1MzBkZGNkNQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ODdkODI5OGI3NGIzYTdhOWNlOTEwMGE5NzRkZjgxYzUzOTY0NzZjZg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZTBlNjdlOWI3NDU2ZDI3OTBkYmU5MDE4MTJlZWIxZDZmODMzYTg1NmUyMjE1
|
10
|
+
OTExMDNjY2UyOWZiZjljOGUwYzhhZTJiZjFmODY2OTE3NmFkYzEyNWZhY2E3
|
11
|
+
M2E1MzlmZTljYjVmMTBmOWQ0ZDIzMzQyMWY1NTA4NWY3MjgwYzE=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ODc3YzBjYzg0MzYyYTA5NzVkM2M2OWRkYWFjNjNlMDQ2MzI2OTI4NjE5MDc2
|
14
|
+
YTUzNmJkNzFlMTE4ZjJiNmRlYzcyMDNhZTUxN2YwOTQ4Y2U0M2MyZDRjNzU5
|
15
|
+
MGNjZDhlZTllY2NjYzUzODljOWJjYjliOGUzNWM0NThkZTE1NjY=
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Details of the scheme can be found at: https://developers.google.com/webmasters/
|
|
9
9
|
|
10
10
|
## Using
|
11
11
|
|
12
|
-
install
|
12
|
+
### install
|
13
13
|
|
14
14
|
``` ruby
|
15
15
|
gem install google_ajax_crawler
|
@@ -21,57 +21,114 @@ In your config.ru
|
|
21
21
|
require 'google_ajax_crawler'
|
22
22
|
|
23
23
|
use GoogleAjaxCrawler::Crawler do |config|
|
24
|
-
config.
|
24
|
+
config.page_loaded_js = "MyApp.isPageLoaded()"
|
25
25
|
end
|
26
26
|
|
27
|
-
app =
|
27
|
+
app = -> env { [200, {'Content-Type' => 'text/plain'}, "b" ] }
|
28
28
|
run app
|
29
29
|
|
30
30
|
```
|
31
31
|
|
32
|
+
### rails usage
|
33
|
+
|
34
|
+
create in the initializer folder :
|
35
|
+
``` ruby
|
36
|
+
google_ajax_crawler_middleware.rb
|
37
|
+
```
|
38
|
+
|
39
|
+
with
|
40
|
+
``` ruby
|
41
|
+
if defined?(Rails.configuration) && Rails.configuration.respond_to?(:middleware)
|
42
|
+
require 'google_ajax_crawler'
|
43
|
+
Rails.configuration.middleware.use GoogleAjaxCrawler::Crawler do |config|
|
44
|
+
config.page_loaded_test = -> driver { driver.page.evaluate_script('document.getElementById("loading") == null') }
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
#### Important
|
50
|
+
|
51
|
+
Concurrent requests must be enabled to allow your site to snapshot itself. If concurrent requests are not allowed, the site will simply hang on a crawler request.
|
52
|
+
|
53
|
+
In config/application.rb :
|
54
|
+
|
55
|
+
``` ruby
|
56
|
+
config.threadsafe!
|
57
|
+
```
|
58
|
+
|
32
59
|
## Examples
|
33
60
|
|
34
|
-
In the examples folder, each driver has a rackup file, which can be launched:
|
61
|
+
In the examples folder, each driver has a rackup file (at the moment only one driver, capybara-webkit, exists), which can be launched:
|
62
|
+
|
63
|
+
`rackup examples/capybara_webkit.ru`
|
35
64
|
|
36
|
-
|
65
|
+
Examples for how to use the crawler with Backbone.JS, Angular.JS and plain ol javascript are accessible via:
|
66
|
+
- http://localhost:9292/backbone
|
67
|
+
- http://localhost:9292/angular
|
68
|
+
- http://localhost:9292/
|
37
69
|
|
38
|
-
|
70
|
+
Curl, or open a browser to http://localhost:9292/[framework]#!test and view source.... This is how a search engine will see your page before snapshotting. *NOTE:* don't look at the markup through a web inspector as it will most likely display dom elements rendered on the fly by js.
|
39
71
|
|
40
|
-
Change the url to http://localhost:9292
|
72
|
+
Change the url to http://localhost:9292/[framework]?_escaped_fragment_=test , and then again curl or view source to see how the DOM state has been captured
|
41
73
|
|
42
74
|
## Configuration Options
|
43
75
|
|
44
|
-
###
|
76
|
+
### Page Loaded Tests
|
45
77
|
|
46
|
-
|
78
|
+
As determining when a page has completed rendering can depend on a number of qualitative factors (i.e. all ajax requests have responses, certain content has been displayed, or even when there are no loaders / spinners visible on the page), you can specify one of two ways to tell the crawler that your page has finished loading / rendering and to return a snapshot of the rendered dom at that time.
|
47
79
|
|
48
|
-
|
80
|
+
#### page_loaded_js (client side test)
|
49
81
|
|
50
|
-
|
82
|
+
Tell the crawler the client side javascript function (returning true/false) you have created, that determines when your page has finished loading / rendering.
|
51
83
|
|
52
84
|
```ruby
|
53
85
|
|
54
86
|
use GoogleAjaxCrawler::Crawler do |config|
|
55
|
-
config.
|
87
|
+
config.page_loaded_js = "MyApp.isPageLoaded()"
|
88
|
+
end
|
89
|
+
|
90
|
+
```
|
91
|
+
|
92
|
+
#### page_loaded_test (server side test)
|
93
|
+
|
94
|
+
A server side test determining when your page has finished loading / rendering.
|
95
|
+
The configured crawler driver is passed to the lambda to allow querying of the current page's dom state from the server side.
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
|
99
|
+
use GoogleAjaxCrawler::Crawler do |config|
|
100
|
+
config.page_loaded_test = -> driver { driver.page.has_css?('.loading') == false }
|
56
101
|
end
|
57
102
|
|
58
103
|
```
|
59
104
|
|
60
105
|
### timeout
|
61
106
|
|
62
|
-
The max time the crawler should wait before returning a response
|
107
|
+
The max time (in seconds) the crawler should wait before returning a response. After the timeout has been reached,
|
108
|
+
a snapshot of the DOM in its current state is returned. Defaults to 30 seconds.
|
63
109
|
|
64
110
|
### driver
|
65
111
|
|
66
|
-
The configured google ajax crawler driver used to query the current page state.
|
112
|
+
The configured google ajax crawler driver used to query the current page state. Defaults to capybara_webkit.
|
67
113
|
|
68
114
|
### poll_interval
|
69
115
|
|
70
|
-
How often (in seconds) to test the page state with the configured page_loaded_test
|
116
|
+
How often (in seconds) to test the page state with the configured page_loaded_test. Defaults to 0.5 seconds.
|
71
117
|
|
72
118
|
### response_headers
|
73
119
|
|
74
|
-
What response headers shoudl be returned with the dom snapshot. Default headers specify the content-type text/html
|
120
|
+
What response headers should be returned with the dom snapshot. Default headers specify the content-type text/html.
|
121
|
+
|
122
|
+
### requested_route_key
|
123
|
+
|
124
|
+
The parameter name used by a search bot to identify which client side route to snapshot. Defaults to _escaped_fragment_.
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
### Identifying Search Engine Requests
|
129
|
+
|
130
|
+
Snapshot requests are passed an additional query string param (?search_engine=true), allowing you to optionally execute client side code.
|
131
|
+
This is particularly handy should you have stats tracking code (i.e. Google Analytics), which you don't want executed / included when search engines are trawling your site.
|
75
132
|
|
76
133
|
## License
|
77
134
|
|
data/examples/capybara_webkit.ru
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
# to run:
|
3
3
|
# $ rackup examples/capybara_webkit.ru -p 3000
|
4
4
|
# open browser to http://localhost:3000/#!test
|
5
|
+
# or http://localhost:3000/backbone/#!test
|
6
|
+
# or http://localhost:3000/angular/#!test
|
5
7
|
#
|
6
8
|
require 'bundler/setup'
|
7
9
|
require './lib/google_ajax_crawler'
|
@@ -12,19 +14,25 @@ use GoogleAjaxCrawler::Crawler do |config|
|
|
12
14
|
config.timeout = 5
|
13
15
|
|
14
16
|
#
|
15
|
-
# for the demo -
|
16
|
-
#
|
17
|
+
# for the demo - in each example (simple, backbone and angular) there is a page loaded function signaling
|
18
|
+
# when the page has completed loading. If neither page_loaded_js or page_loaded_test has been configured, the crawler will default to
|
19
|
+
# executing $.active == 0 to ensure no jquery ajax calls are pending
|
17
20
|
#
|
18
|
-
config.
|
21
|
+
config.page_loaded_js = 'app.pageHasLoaded()'
|
22
|
+
|
23
|
+
# alternative server side test for the simple_javascript example:
|
24
|
+
# config.page_loaded_test = -> driver { !driver.page.has_css?('#loading') }
|
19
25
|
end
|
20
26
|
|
21
27
|
#
|
22
28
|
# a sample page using #! url fragments to seed page state
|
23
29
|
#
|
24
|
-
app =
|
30
|
+
app = -> env do
|
25
31
|
page_content = case env['PATH_INFO']
|
26
32
|
when /\/backbone(\/)?/
|
27
33
|
File.read('./spec/fixtures/backbone.html')
|
34
|
+
when /\/angular(\/)?/
|
35
|
+
File.read('./spec/fixtures/angular.html')
|
28
36
|
else
|
29
37
|
File.read('./spec/fixtures/simple_javascript.html')
|
30
38
|
end
|
data/lib/google_ajax_crawler.rb
CHANGED
@@ -12,6 +12,10 @@ module GoogleAjaxCrawler
|
|
12
12
|
raise "Driver Not Specified"
|
13
13
|
end
|
14
14
|
|
15
|
+
def evaluate_script(javascript)
|
16
|
+
raise "Driver Not Specified"
|
17
|
+
end
|
18
|
+
|
15
19
|
def default_page_loaded_test
|
16
20
|
raise "Driver Not Specified"
|
17
21
|
end
|
@@ -35,19 +39,15 @@ module GoogleAjaxCrawler
|
|
35
39
|
end
|
36
40
|
|
37
41
|
def is_page_loaded?
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
options.page_loaded_test.call self
|
42
|
-
end
|
42
|
+
return evaluate_script(options.page_loaded_js) unless options.page_loaded_js.nil?
|
43
|
+
return options.page_loaded_test.call(self) unless options.page_loaded_test.nil?
|
44
|
+
default_page_loaded_test
|
43
45
|
end
|
44
46
|
|
45
47
|
def wait_until_page_is_fully_loaded
|
46
48
|
Timeout::timeout(options.timeout) do
|
47
49
|
begin
|
48
|
-
while !is_page_loaded?
|
49
|
-
sleep options.poll_interval
|
50
|
-
end
|
50
|
+
sleep(options.poll_interval) while !is_page_loaded?
|
51
51
|
rescue
|
52
52
|
#...squelch
|
53
53
|
puts "Exception: #{$!}"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module GoogleAjaxCrawler
|
2
2
|
class Options
|
3
|
-
attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers
|
3
|
+
attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :page_loaded_js, :poll_interval, :response_headers
|
4
4
|
|
5
5
|
def initialize(app, &block)
|
6
6
|
@driver = Drivers::CapybaraWebkit.new(self)
|
Binary file
|
@@ -0,0 +1,32 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html ng-app>
|
3
|
+
<head>
|
4
|
+
<script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.0.6/angular.min.js"></script>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
|
8
|
+
<div ng-controller="renderTSCtrl">
|
9
|
+
<h1 id='title'>Angular tested route: {{route()}}</h1>
|
10
|
+
<p id='ts'>{{renderTime()}}</p>
|
11
|
+
</div>
|
12
|
+
|
13
|
+
<script type='text/javascript'>
|
14
|
+
function renderTSCtrl ($scope) {
|
15
|
+
$scope.route = function() {
|
16
|
+
return document.location.hash;
|
17
|
+
};
|
18
|
+
|
19
|
+
$scope.renderTime = function() {
|
20
|
+
return "This rendered at " + new Date().toString() +"!";
|
21
|
+
};
|
22
|
+
};
|
23
|
+
|
24
|
+
window.app = {
|
25
|
+
pageHasLoaded: function () {
|
26
|
+
return !!document.getElementById('title').innerText && !!document.getElementById('ts').innerText;
|
27
|
+
}
|
28
|
+
};
|
29
|
+
|
30
|
+
</script>
|
31
|
+
</body>
|
32
|
+
</html>
|
@@ -5,31 +5,55 @@ describe 'CapybaraWebkit driver' do
|
|
5
5
|
let(:browser_route) { "#{host}#!test" }
|
6
6
|
let(:snapshot_route) { "#{host}?_escaped_fragment_=test" }
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
shared_examples 'google ajax crawler' do
|
9
|
+
describe 'when a browser requests a client side route (i.e.: /#my_route)' do
|
10
|
+
it 'should not serve a snapshot of the dom' do
|
11
|
+
response = Faraday.get browser_route
|
12
|
+
response.body.should_not =~ /Javascript rendering complete for client-side route #!test/
|
13
|
+
end
|
13
14
|
end
|
14
15
|
|
15
|
-
|
16
|
+
describe 'when an ajax crawler requests a snapshot of a client side route' do
|
17
|
+
it 'should serve a snapshot of the dom that includes js rendered components' do
|
18
|
+
response = Faraday.get snapshot_route
|
19
|
+
response.body.should =~ /Javascript rendering complete for client-side route #!test/
|
20
|
+
end
|
21
|
+
end
|
16
22
|
end
|
17
23
|
|
18
|
-
|
19
|
-
|
20
|
-
|
24
|
+
describe 'with page_loaded_test' do
|
25
|
+
before(:all) do
|
26
|
+
RackApp.configure_crawler do |config|
|
27
|
+
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
28
|
+
config.poll_interval = 0.25
|
29
|
+
config.page_loaded_test = -> driver { driver.page.evaluate_script('app.pageHasLoaded()') }
|
30
|
+
end
|
21
31
|
|
22
|
-
|
23
|
-
it 'should not serve a snapshot of the dom' do
|
24
|
-
response = Faraday.get browser_route
|
25
|
-
response.body.should_not =~ /Javascript rendering complete for client-side route #!test/
|
32
|
+
RackApp.start
|
26
33
|
end
|
34
|
+
|
35
|
+
after(:all) do
|
36
|
+
RackApp.stop
|
37
|
+
end
|
38
|
+
|
39
|
+
it_should_behave_like 'google ajax crawler'
|
27
40
|
end
|
28
41
|
|
29
|
-
describe '
|
30
|
-
|
31
|
-
|
32
|
-
|
42
|
+
describe 'with page_loaded_js' do
|
43
|
+
before(:all) do
|
44
|
+
RackApp.configure_crawler do |config|
|
45
|
+
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
46
|
+
config.poll_interval = 0.25
|
47
|
+
config.page_loaded_js = 'app.pageHasLoaded()'
|
48
|
+
end
|
49
|
+
|
50
|
+
RackApp.start
|
51
|
+
end
|
52
|
+
|
53
|
+
after(:all) do
|
54
|
+
RackApp.stop
|
33
55
|
end
|
56
|
+
|
57
|
+
it_should_behave_like 'google ajax crawler'
|
34
58
|
end
|
35
59
|
end
|
data/spec/unit/crawler_spec.rb
CHANGED
@@ -1,12 +1,6 @@
|
|
1
1
|
require './spec/spec_helper'
|
2
2
|
|
3
3
|
describe GoogleAjaxCrawler::Crawler do
|
4
|
-
before(:each) do
|
5
|
-
GoogleAjaxCrawler::Crawler.configure do |config|
|
6
|
-
config.page_loaded_test = lambda{ page.find('.loading', count: 0) }
|
7
|
-
end
|
8
|
-
end
|
9
|
-
|
10
4
|
shared_examples 'a crawler configurer' do |method, *args|
|
11
5
|
it 'and facilitate the setting of crawler options' do
|
12
6
|
GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
|
@@ -1,17 +1,59 @@
|
|
1
1
|
require './spec/spec_helper'
|
2
2
|
|
3
3
|
describe GoogleAjaxCrawler::Drivers::Driver do
|
4
|
-
let(:options)
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
let(:options) { GoogleAjaxCrawler::Options.new(nil) { |o| o.timeout = 0.01 } }
|
5
|
+
let(:driver) { GoogleAjaxCrawler::Drivers::Driver.new(options) }
|
6
|
+
|
7
|
+
describe '#mandatory overrides' do
|
8
|
+
shared_examples 'an enforced override method' do |method, *args|
|
9
|
+
it 'should throw an exception if not overridden' do
|
10
|
+
expect { driver.send(method, *args) }.to raise_error(RuntimeError, "Driver Not Specified")
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
it_should_behave_like 'an enforced override method', :visit, 'http://test.com'
|
15
|
+
it_should_behave_like 'an enforced override method', :evaluate_script, 'myApp.isPageLoaded()'
|
16
|
+
it_should_behave_like 'an enforced override method', :default_page_loaded_test
|
17
|
+
it_should_behave_like 'an enforced override method', :html
|
18
|
+
end
|
19
|
+
|
20
|
+
describe '#is_page_loaded?' do
|
21
|
+
|
22
|
+
describe 'when page_loaded_test optioned' do
|
23
|
+
it 'should be called' do
|
24
|
+
driver.options.page_loaded_test = double
|
25
|
+
driver.options.page_loaded_test.should_receive(:call).with(driver)
|
26
|
+
driver.is_page_loaded?
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'when page_loaded_js optioned' do
|
31
|
+
it 'should call evaluate_script with the page_loaded_js' do
|
32
|
+
driver.options.page_loaded_js = 'MyApp.isPageLoaded()'
|
33
|
+
driver.stub :evaluate_script
|
34
|
+
driver.should_receive(:evaluate_script).with('MyApp.isPageLoaded()').once
|
35
|
+
driver.is_page_loaded?
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
describe 'when no loaded tests optioned' do
|
40
|
+
it 'should execute the default_page_loaded_test' do
|
41
|
+
driver.options.page_loaded_test = driver.options.page_loaded_js = nil
|
42
|
+
driver.stub :default_page_loaded_test
|
43
|
+
driver.should_receive(:default_page_loaded_test).once
|
44
|
+
driver.is_page_loaded?
|
45
|
+
end
|
8
46
|
end
|
9
47
|
end
|
10
48
|
|
11
49
|
describe '#wait_until_page_is_fully_loaded' do
|
50
|
+
before do
|
51
|
+
driver.options.page_loaded_test = double
|
52
|
+
driver.options.page_loaded_test.should_receive(:call).with(driver)
|
53
|
+
end
|
54
|
+
|
12
55
|
it 'should raise a Timeout Exception when timeout limit reached' do
|
13
56
|
expect do
|
14
|
-
driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
|
15
57
|
driver.wait_until_page_is_fully_loaded
|
16
58
|
end.to raise_error(Timeout::Error)
|
17
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_ajax_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Kitzelman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara-webkit
|
@@ -66,6 +66,8 @@ files:
|
|
66
66
|
- releases/google_ajax_crawler-0.1.0.gem
|
67
67
|
- releases/google_ajax_crawler-0.1.1.gem
|
68
68
|
- releases/google_ajax_crawler-0.1.2.gem
|
69
|
+
- releases/google_ajax_crawler-0.1.3.gem
|
70
|
+
- spec/fixtures/angular.html
|
69
71
|
- spec/fixtures/backbone.html
|
70
72
|
- spec/fixtures/simple_javascript.html
|
71
73
|
- spec/integration/capybara_webkit_spec.rb
|
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
96
|
version: '0'
|
95
97
|
requirements: []
|
96
98
|
rubyforge_project:
|
97
|
-
rubygems_version: 2.0.
|
99
|
+
rubygems_version: 2.0.5
|
98
100
|
signing_key:
|
99
101
|
specification_version: 4
|
100
102
|
summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your
|