google_ajax_crawler 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.travis.yml +0 -1
- data/README.md +73 -16
- data/examples/capybara_webkit.ru +12 -4
- data/lib/google_ajax_crawler.rb +1 -1
- data/lib/google_ajax_crawler/drivers/driver.rb +8 -8
- data/lib/google_ajax_crawler/options.rb +1 -1
- data/releases/google_ajax_crawler-0.1.3.gem +0 -0
- data/spec/fixtures/angular.html +32 -0
- data/spec/integration/capybara_webkit_spec.rb +41 -17
- data/spec/unit/crawler_spec.rb +0 -6
- data/spec/unit/drivers/driver_spec.rb +47 -5
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YTU0N2ZkNGQ0YzJiMDlkYmYyMTI4YWNkMDY0ZmQ2MmM1MzBkZGNkNQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ODdkODI5OGI3NGIzYTdhOWNlOTEwMGE5NzRkZjgxYzUzOTY0NzZjZg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZTBlNjdlOWI3NDU2ZDI3OTBkYmU5MDE4MTJlZWIxZDZmODMzYTg1NmUyMjE1
|
10
|
+
OTExMDNjY2UyOWZiZjljOGUwYzhhZTJiZjFmODY2OTE3NmFkYzEyNWZhY2E3
|
11
|
+
M2E1MzlmZTljYjVmMTBmOWQ0ZDIzMzQyMWY1NTA4NWY3MjgwYzE=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ODc3YzBjYzg0MzYyYTA5NzVkM2M2OWRkYWFjNjNlMDQ2MzI2OTI4NjE5MDc2
|
14
|
+
YTUzNmJkNzFlMTE4ZjJiNmRlYzcyMDNhZTUxN2YwOTQ4Y2U0M2MyZDRjNzU5
|
15
|
+
MGNjZDhlZTllY2NjYzUzODljOWJjYjliOGUzNWM0NThkZTE1NjY=
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Details of the scheme can be found at: https://developers.google.com/webmasters/
|
|
9
9
|
|
10
10
|
## Using
|
11
11
|
|
12
|
-
install
|
12
|
+
### install
|
13
13
|
|
14
14
|
``` ruby
|
15
15
|
gem install google_ajax_crawler
|
@@ -21,57 +21,114 @@ In your config.ru
|
|
21
21
|
require 'google_ajax_crawler'
|
22
22
|
|
23
23
|
use GoogleAjaxCrawler::Crawler do |config|
|
24
|
-
config.
|
24
|
+
config.page_loaded_js = "MyApp.isPageLoaded()"
|
25
25
|
end
|
26
26
|
|
27
|
-
app =
|
27
|
+
app = -> env { [200, {'Content-Type' => 'text/plain'}, "b" ] }
|
28
28
|
run app
|
29
29
|
|
30
30
|
```
|
31
31
|
|
32
|
+
### rails usage
|
33
|
+
|
34
|
+
create in the initializer folder :
|
35
|
+
``` ruby
|
36
|
+
google_ajax_crawler_middleware.rb
|
37
|
+
```
|
38
|
+
|
39
|
+
with
|
40
|
+
``` ruby
|
41
|
+
if defined?(Rails.configuration) && Rails.configuration.respond_to?(:middleware)
|
42
|
+
require 'google_ajax_crawler'
|
43
|
+
Rails.configuration.middleware.use GoogleAjaxCrawler::Crawler do |config|
|
44
|
+
config.page_loaded_test = -> driver { driver.page.evaluate_script('document.getElementById("loading") == null') }
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
#### Important
|
50
|
+
|
51
|
+
Concurrent requests must be enabled to allow your site to snapshot itself. If concurrent requests are not allowed, the site will simply hang on a crawler request.
|
52
|
+
|
53
|
+
In config/application.rb :
|
54
|
+
|
55
|
+
``` ruby
|
56
|
+
config.threadsafe!
|
57
|
+
```
|
58
|
+
|
32
59
|
## Examples
|
33
60
|
|
34
|
-
In the examples folder, each driver has a rackup file, which can be launched:
|
61
|
+
In the examples folder, each driver has a rackup file (at the moment only one driver, capybara-webkit, exists), which can be launched:
|
62
|
+
|
63
|
+
`rackup examples/capybara_webkit.ru`
|
35
64
|
|
36
|
-
|
65
|
+
Examples for how to use the crawler with Backbone.JS, Angular.JS and plain ol javascript are accessible via:
|
66
|
+
- http://localhost:9292/backbone
|
67
|
+
- http://localhost:9292/angular
|
68
|
+
- http://localhost:9292/
|
37
69
|
|
38
|
-
|
70
|
+
Curl, or open a browser to http://localhost:9292/[framework]#!test and view source.... This is how a search engine will see your page before snapshotting. *NOTE:* don't look at the markup through a web inspector as it will most likely display dom elements rendered on the fly by js.
|
39
71
|
|
40
|
-
Change the url to http://localhost:9292
|
72
|
+
Change the url to http://localhost:9292/[framework]?_escaped_fragment_=test , and then again curl or view source to see how the DOM state has been captured
|
41
73
|
|
42
74
|
## Configuration Options
|
43
75
|
|
44
|
-
###
|
76
|
+
### Page Loaded Tests
|
45
77
|
|
46
|
-
|
78
|
+
As determining when a page has completed rendering can depend on a number of qualitative factors (i.e. all ajax requests have responses, certain content has been displayed, or even when there are no loaders / spinners visible on the page), you can specify one of two ways to tell the crawler that your page has finished loading / rendering and to return a snapshot of the rendered dom at that time.
|
47
79
|
|
48
|
-
|
80
|
+
#### page_loaded_js (client side test)
|
49
81
|
|
50
|
-
|
82
|
+
Tell the crawler the client side javascript function (returning true/false) you have created, that determines when your page has finished loading / rendering.
|
51
83
|
|
52
84
|
```ruby
|
53
85
|
|
54
86
|
use GoogleAjaxCrawler::Crawler do |config|
|
55
|
-
config.
|
87
|
+
config.page_loaded_js = "MyApp.isPageLoaded()"
|
88
|
+
end
|
89
|
+
|
90
|
+
```
|
91
|
+
|
92
|
+
#### page_loaded_test (server side test)
|
93
|
+
|
94
|
+
A server side test determining when your page has finished loading / rendering.
|
95
|
+
The configured crawler driver is passed to the lambda to allow querying of the current page's dom state from the server side.
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
|
99
|
+
use GoogleAjaxCrawler::Crawler do |config|
|
100
|
+
config.page_loaded_test = -> driver { driver.page.has_css?('.loading') == false }
|
56
101
|
end
|
57
102
|
|
58
103
|
```
|
59
104
|
|
60
105
|
### timeout
|
61
106
|
|
62
|
-
The max time the crawler should wait before returning a response
|
107
|
+
The max time (in seconds) the crawler should wait before returning a response. After the timeout has been reached,
|
108
|
+
a snapshot of the DOM in its current state is returned. Defaults to 30 seconds.
|
63
109
|
|
64
110
|
### driver
|
65
111
|
|
66
|
-
The configured google ajax crawler driver used to query the current page state.
|
112
|
+
The configured google ajax crawler driver used to query the current page state. Defaults to capybara_webkit.
|
67
113
|
|
68
114
|
### poll_interval
|
69
115
|
|
70
|
-
How often (in seconds) to test the page state with the configured page_loaded_test
|
116
|
+
How often (in seconds) to test the page state with the configured page_loaded_test. Defaults to 0.5 seconds.
|
71
117
|
|
72
118
|
### response_headers
|
73
119
|
|
74
|
-
What response headers shoudl be returned with the dom snapshot. Default headers specify the content-type text/html
|
120
|
+
What response headers should be returned with the dom snapshot. Default headers specify the content-type text/html.
|
121
|
+
|
122
|
+
### requested_route_key
|
123
|
+
|
124
|
+
The parameter name used by a search bot to identify which client side route to snapshot. Defaults to _escaped_fragment_.
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
### Identifying Search Engine Requests
|
129
|
+
|
130
|
+
Snapshot requests are passed an additional query string param (?search_engine=true), allowing you to optionally execute client side code.
|
131
|
+
This is particularly handy should you have stats tracking code (i.e. Google Analytics), which you don't want executed / included when search engines are trawling your site.
|
75
132
|
|
76
133
|
## License
|
77
134
|
|
data/examples/capybara_webkit.ru
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
# to run:
|
3
3
|
# $ rackup examples/capybara_webkit.ru -p 3000
|
4
4
|
# open browser to http://localhost:3000/#!test
|
5
|
+
# or http://localhost:3000/backbone/#!test
|
6
|
+
# or http://localhost:3000/angular/#!test
|
5
7
|
#
|
6
8
|
require 'bundler/setup'
|
7
9
|
require './lib/google_ajax_crawler'
|
@@ -12,19 +14,25 @@ use GoogleAjaxCrawler::Crawler do |config|
|
|
12
14
|
config.timeout = 5
|
13
15
|
|
14
16
|
#
|
15
|
-
# for the demo -
|
16
|
-
#
|
17
|
+
# for the demo - in each example (simple, backbone and angular) there is a page loaded function signaling
|
18
|
+
# when the page has completed loading. If neither page_loaded_js nor page_loaded_test has been configured, the crawler will default to
|
19
|
+
# executing $.active == 0 to ensure no jquery ajax calls are pending
|
17
20
|
#
|
18
|
-
config.
|
21
|
+
config.page_loaded_js = 'app.pageHasLoaded()'
|
22
|
+
|
23
|
+
# alternative server side test for the simple_javascript example:
|
24
|
+
# config.page_loaded_test = -> driver { !driver.page.has_css?('#loading') }
|
19
25
|
end
|
20
26
|
|
21
27
|
#
|
22
28
|
# a sample page using #! url fragments to seed page state
|
23
29
|
#
|
24
|
-
app =
|
30
|
+
app = -> env do
|
25
31
|
page_content = case env['PATH_INFO']
|
26
32
|
when /\/backbone(\/)?/
|
27
33
|
File.read('./spec/fixtures/backbone.html')
|
34
|
+
when /\/angular(\/)?/
|
35
|
+
File.read('./spec/fixtures/angular.html')
|
28
36
|
else
|
29
37
|
File.read('./spec/fixtures/simple_javascript.html')
|
30
38
|
end
|
data/lib/google_ajax_crawler.rb
CHANGED
@@ -12,6 +12,10 @@ module GoogleAjaxCrawler
|
|
12
12
|
raise "Driver Not Specified"
|
13
13
|
end
|
14
14
|
|
15
|
+
def evaluate_script(javascript)
|
16
|
+
raise "Driver Not Specified"
|
17
|
+
end
|
18
|
+
|
15
19
|
def default_page_loaded_test
|
16
20
|
raise "Driver Not Specified"
|
17
21
|
end
|
@@ -35,19 +39,15 @@ module GoogleAjaxCrawler
|
|
35
39
|
end
|
36
40
|
|
37
41
|
def is_page_loaded?
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
options.page_loaded_test.call self
|
42
|
-
end
|
42
|
+
return evaluate_script(options.page_loaded_js) unless options.page_loaded_js.nil?
|
43
|
+
return options.page_loaded_test.call(self) unless options.page_loaded_test.nil?
|
44
|
+
default_page_loaded_test
|
43
45
|
end
|
44
46
|
|
45
47
|
def wait_until_page_is_fully_loaded
|
46
48
|
Timeout::timeout(options.timeout) do
|
47
49
|
begin
|
48
|
-
while !is_page_loaded?
|
49
|
-
sleep options.poll_interval
|
50
|
-
end
|
50
|
+
sleep(options.poll_interval) while !is_page_loaded?
|
51
51
|
rescue
|
52
52
|
#...squelch
|
53
53
|
puts "Exception: #{$!}"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module GoogleAjaxCrawler
|
2
2
|
class Options
|
3
|
-
attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :poll_interval, :response_headers
|
3
|
+
attr_accessor :driver, :timeout, :requested_route_key, :page_loaded_test, :page_loaded_js, :poll_interval, :response_headers
|
4
4
|
|
5
5
|
def initialize(app, &block)
|
6
6
|
@driver = Drivers::CapybaraWebkit.new(self)
|
Binary file
|
@@ -0,0 +1,32 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html ng-app>
|
3
|
+
<head>
|
4
|
+
<script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.0.6/angular.min.js"></script>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
|
8
|
+
<div ng-controller="renderTSCtrl">
|
9
|
+
<h1 id='title'>Angular tested route: {{route()}}</h1>
|
10
|
+
<p id='ts'>{{renderTime()}}</p>
|
11
|
+
</div>
|
12
|
+
|
13
|
+
<script type='text/javascript'>
|
14
|
+
function renderTSCtrl ($scope) {
|
15
|
+
$scope.route = function() {
|
16
|
+
return document.location.hash;
|
17
|
+
};
|
18
|
+
|
19
|
+
$scope.renderTime = function() {
|
20
|
+
return "This rendered at " + new Date().toString() +"!";
|
21
|
+
};
|
22
|
+
};
|
23
|
+
|
24
|
+
window.app = {
|
25
|
+
pageHasLoaded: function () {
|
26
|
+
return !!document.getElementById('title').innerText && !!document.getElementById('ts').innerText;
|
27
|
+
}
|
28
|
+
};
|
29
|
+
|
30
|
+
</script>
|
31
|
+
</body>
|
32
|
+
</html>
|
@@ -5,31 +5,55 @@ describe 'CapybaraWebkit driver' do
|
|
5
5
|
let(:browser_route) { "#{host}#!test" }
|
6
6
|
let(:snapshot_route) { "#{host}?_escaped_fragment_=test" }
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
shared_examples 'google ajax crawler' do
|
9
|
+
describe 'when a browser requests a client side route (i.e.: /#my_route)' do
|
10
|
+
it 'should not serve a snapshot of the dom' do
|
11
|
+
response = Faraday.get browser_route
|
12
|
+
response.body.should_not =~ /Javascript rendering complete for client-side route #!test/
|
13
|
+
end
|
13
14
|
end
|
14
15
|
|
15
|
-
|
16
|
+
describe 'when an ajax crawler requests a snapshot of a client side route' do
|
17
|
+
it 'should serve a snapshot of the dom that includes js rendered components' do
|
18
|
+
response = Faraday.get snapshot_route
|
19
|
+
response.body.should =~ /Javascript rendering complete for client-side route #!test/
|
20
|
+
end
|
21
|
+
end
|
16
22
|
end
|
17
23
|
|
18
|
-
|
19
|
-
|
20
|
-
|
24
|
+
describe 'with page_loaded_test' do
|
25
|
+
before(:all) do
|
26
|
+
RackApp.configure_crawler do |config|
|
27
|
+
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
28
|
+
config.poll_interval = 0.25
|
29
|
+
config.page_loaded_test = -> driver { driver.page.evaluate_script('app.pageHasLoaded()') }
|
30
|
+
end
|
21
31
|
|
22
|
-
|
23
|
-
it 'should not serve a snapshot of the dom' do
|
24
|
-
response = Faraday.get browser_route
|
25
|
-
response.body.should_not =~ /Javascript rendering complete for client-side route #!test/
|
32
|
+
RackApp.start
|
26
33
|
end
|
34
|
+
|
35
|
+
after(:all) do
|
36
|
+
RackApp.stop
|
37
|
+
end
|
38
|
+
|
39
|
+
it_should_behave_like 'google ajax crawler'
|
27
40
|
end
|
28
41
|
|
29
|
-
describe '
|
30
|
-
|
31
|
-
|
32
|
-
|
42
|
+
describe 'with page_loaded_js' do
|
43
|
+
before(:all) do
|
44
|
+
RackApp.configure_crawler do |config|
|
45
|
+
config.driver = GoogleAjaxCrawler::Drivers::CapybaraWebkit
|
46
|
+
config.poll_interval = 0.25
|
47
|
+
config.page_loaded_js = 'app.pageHasLoaded()'
|
48
|
+
end
|
49
|
+
|
50
|
+
RackApp.start
|
51
|
+
end
|
52
|
+
|
53
|
+
after(:all) do
|
54
|
+
RackApp.stop
|
33
55
|
end
|
56
|
+
|
57
|
+
it_should_behave_like 'google ajax crawler'
|
34
58
|
end
|
35
59
|
end
|
data/spec/unit/crawler_spec.rb
CHANGED
@@ -1,12 +1,6 @@
|
|
1
1
|
require './spec/spec_helper'
|
2
2
|
|
3
3
|
describe GoogleAjaxCrawler::Crawler do
|
4
|
-
before(:each) do
|
5
|
-
GoogleAjaxCrawler::Crawler.configure do |config|
|
6
|
-
config.page_loaded_test = lambda{ page.find('.loading', count: 0) }
|
7
|
-
end
|
8
|
-
end
|
9
|
-
|
10
4
|
shared_examples 'a crawler configurer' do |method, *args|
|
11
5
|
it 'and facilitate the setting of crawler options' do
|
12
6
|
GoogleAjaxCrawler::Crawler.send(method, *args) do |config|
|
@@ -1,17 +1,59 @@
|
|
1
1
|
require './spec/spec_helper'
|
2
2
|
|
3
3
|
describe GoogleAjaxCrawler::Drivers::Driver do
|
4
|
-
let(:options)
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
let(:options) { GoogleAjaxCrawler::Options.new(nil) { |o| o.timeout = 0.01 } }
|
5
|
+
let(:driver) { GoogleAjaxCrawler::Drivers::Driver.new(options) }
|
6
|
+
|
7
|
+
describe '#mandatory overrides' do
|
8
|
+
shared_examples 'an enforced override method' do |method, *args|
|
9
|
+
it 'should throw an exception if not overridden' do
|
10
|
+
expect { driver.send(method, *args) }.to raise_error(RuntimeError, "Driver Not Specified")
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
it_should_behave_like 'an enforced override method', :visit, 'http://test.com'
|
15
|
+
it_should_behave_like 'an enforced override method', :evaluate_script, 'myApp.isPageLoaded()'
|
16
|
+
it_should_behave_like 'an enforced override method', :default_page_loaded_test
|
17
|
+
it_should_behave_like 'an enforced override method', :html
|
18
|
+
end
|
19
|
+
|
20
|
+
describe '#is_page_loaded?' do
|
21
|
+
|
22
|
+
describe 'when page_loaded_test optioned' do
|
23
|
+
it 'should be called' do
|
24
|
+
driver.options.page_loaded_test = double
|
25
|
+
driver.options.page_loaded_test.should_receive(:call).with(driver)
|
26
|
+
driver.is_page_loaded?
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'when page_loaded_js optioned' do
|
31
|
+
it 'should call evaluate_script with the page_loaded_js' do
|
32
|
+
driver.options.page_loaded_js = 'MyApp.isPageLoaded()'
|
33
|
+
driver.stub :evaluate_script
|
34
|
+
driver.should_receive(:evaluate_script).with('MyApp.isPageLoaded()').once
|
35
|
+
driver.is_page_loaded?
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
describe 'when no loaded tests optioned' do
|
40
|
+
it 'should execute the default_page_loaded_test' do
|
41
|
+
driver.options.page_loaded_test = driver.options.page_loaded_js = nil
|
42
|
+
driver.stub :default_page_loaded_test
|
43
|
+
driver.should_receive(:default_page_loaded_test).once
|
44
|
+
driver.is_page_loaded?
|
45
|
+
end
|
8
46
|
end
|
9
47
|
end
|
10
48
|
|
11
49
|
describe '#wait_until_page_is_fully_loaded' do
|
50
|
+
before do
|
51
|
+
driver.options.page_loaded_test = double
|
52
|
+
driver.options.page_loaded_test.should_receive(:call).with(driver)
|
53
|
+
end
|
54
|
+
|
12
55
|
it 'should raise a Timeout Exception when timeout limit reached' do
|
13
56
|
expect do
|
14
|
-
driver = GoogleAjaxCrawler::Drivers::Driver.new(options)
|
15
57
|
driver.wait_until_page_is_fully_loaded
|
16
58
|
end.to raise_error(Timeout::Error)
|
17
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_ajax_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Kitzelman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara-webkit
|
@@ -66,6 +66,8 @@ files:
|
|
66
66
|
- releases/google_ajax_crawler-0.1.0.gem
|
67
67
|
- releases/google_ajax_crawler-0.1.1.gem
|
68
68
|
- releases/google_ajax_crawler-0.1.2.gem
|
69
|
+
- releases/google_ajax_crawler-0.1.3.gem
|
70
|
+
- spec/fixtures/angular.html
|
69
71
|
- spec/fixtures/backbone.html
|
70
72
|
- spec/fixtures/simple_javascript.html
|
71
73
|
- spec/integration/capybara_webkit_spec.rb
|
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
96
|
version: '0'
|
95
97
|
requirements: []
|
96
98
|
rubyforge_project:
|
97
|
-
rubygems_version: 2.0.
|
99
|
+
rubygems_version: 2.0.5
|
98
100
|
signing_key:
|
99
101
|
specification_version: 4
|
100
102
|
summary: Rack Middleware adhering to the Google Ajax Crawling Scheme ensuring your
|