indexable 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/indexable.rb +53 -0
- data/lib/phantomjs.rb +26 -0
- data/lib/render_page.js +82 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ea666f584199ac33afce60449c7e2ca89a58fe09
|
4
|
+
data.tar.gz: c6cd4101fa9c6ae24f67e8fc1fe58d0fe923cf16
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3d99e57df1a36bd2330f1f1b118c1433581a5411cd6427efd8642b2815003763a7128f51796e9d5bdcb79ef0c824cd6243f03f0380ea517d6ce8250dc3a423c9
|
7
|
+
data.tar.gz: 91554eb4a3c2ffa54239a0ceff1c766b9ff7d5b74641f5be6329a7293c9597df310babf217be4ca35f3f043151439100d3b15a0ed8b096c99e28fd80c0aeda13
|
data/lib/indexable.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'phantomjs'
require 'rack/request'
require 'tempfile'
|
3
|
+
|
4
|
+
module Rack
  # Rack middleware that renders HTML responses with PhantomJS before they are
  # served to crawlers, so that the crawler receives the page as it looks after
  # its JavaScript has run. Responses for other clients pass through untouched.
  class Indexable
    # User-agent patterns treated as crawlers.
    CRAWLER_USER_AGENTS = [
      /^Twitterbot/, /^curl/, /Googlebot/, /Mediapartners/, /Adsbot-Google/,
      /\(.*http(s|\(s\))?:\/\/.*\)/
    ].freeze

    # Output produced by Phantomjs#run when rendering times out.
    RENDER_FAILURE_MESSAGE = "Couldn't render page... orz."

    # @param app [#call] the downstream Rack application
    def initialize(app)
      @app = app
    end

    # Detect whether the current request comes from a bot. Based on the logic used
    # by Bustle.com (https://www.dropbox.com/s/s4oibqsxqpo3hll/bustle%20slizzle.pdf)
    #
    # @param env [Hash] the Rack environment
    # @return [Boolean] false when there is no User-Agent header; otherwise true
    #   if the user agent matches CRAWLER_USER_AGENTS, or the query has an
    #   `_escaped_fragment_` parameter, or `nojs=true`
    def request_from_crawler?(env)
      user_agent = env["HTTP_USER_AGENT"]
      return false unless user_agent
      return true if CRAWLER_USER_AGENTS.any? { |pattern| user_agent =~ pattern }

      # Only parse the parameters once the cheaper checks have failed.
      params = Rack::Request.new(env).params
      params.key?('_escaped_fragment_') || params['nojs'].eql?('true')
    end

    # Call the downstream app and, for successful HTML responses requested by a
    # crawler, replace the body with the PhantomJS-rendered markup.
    #
    # @param env [Hash] the Rack environment
    # @return [Array] the Rack response triplet [status, headers, body]
    def call(env)
      status, headers, content = *@app.call(env)

      if renderable?(status, headers) && request_from_crawler?(env)
        rendered = render(read_body(content), Rack::Request.new(env).url)
        status = 500 if rendered == RENDER_FAILURE_MESSAGE
        update_content_length(headers, rendered)
        content = [rendered]
      end

      [status, headers, content]
    end

    private

    # True for 200 responses whose Content-Type starts with text/html. A
    # missing Content-Type header is treated as "not HTML" instead of raising.
    def renderable?(status, headers)
      content_type = headers['Content-Type'] || headers['content-type']
      status.to_i == 200 && !(content_type.to_s =~ %r{\Atext/html}).nil?
    end

    # Collect the response body into a single string. Bodies are only required
    # to respond to #each, so do not assume an Array. The original body is
    # closed afterwards because it is being replaced and nothing downstream
    # will ever see it.
    def read_body(body)
      return body.body if body.respond_to?(:body)

      html = String.new
      body.each { |chunk| html << chunk.to_s }
      html
    ensure
      body.close if body.respond_to?(:close)
    end

    # Write the HTML to a temporary file and run render_page.js over it.
    # The temporary file is removed even when writing or rendering fails.
    def render(html, url)
      script = ::File.join(::File.dirname(__FILE__), 'render_page.js')
      file = ::Tempfile.new(['indexable', '.html'])
      begin
        file.write(html)
        file.close
        Phantomjs.new(script, file.path, url).run
      ensure
        file.close unless file.closed?
        file.unlink
      end
    end

    # Keep an existing Content-Length header in sync with the new body; the
    # value from the downstream app describes the body that was replaced.
    def update_content_length(headers, body)
      %w[Content-Length content-length].each do |name|
        headers[name] = body.bytesize.to_s if headers[name]
      end
    end
  end
end
|
data/lib/phantomjs.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'timeout'
|
2
|
+
|
3
|
+
# Runs a script with the PhantomJS command-line binary and captures what it
# prints on standard output.
class Phantomjs
  # Returned by #run when the process does not finish within the timeout.
  TIMEOUT_MESSAGE = "Couldn't render page... orz."

  # timeout: seconds to wait for the process before killing it (default 20).
  # command: executable to launch (default "phantomjs").
  attr_accessor :timeout, :command

  # @param script [String] path of the script handed to the executable
  # @param args [Array<String>] extra command-line arguments for the script
  def initialize(script, *args)
    @script = script
    @args = args
    @timeout = 20
    @command = 'phantomjs'
  end

  # Launch the process and return everything it wrote to standard output.
  #
  # @return [String] the captured output, or TIMEOUT_MESSAGE if the process
  #   had to be killed because it exceeded the timeout
  def run
    pipe = nil
    Timeout.timeout(@timeout) do
      pipe = IO.popen([@command, @script] + @args)
      # Drain the pipe BEFORE waiting for the child. Waiting first deadlocks
      # as soon as the output exceeds the OS pipe buffer, because the child
      # blocks on write while the parent blocks on wait.
      output = pipe.read
      pipe.close # also waits for the child to exit
      output
    end
  rescue Timeout::Error
    kill(pipe)
    TIMEOUT_MESSAGE
  ensure
    # Releases the file descriptor and reaps the child if still necessary.
    pipe.close if pipe && !pipe.closed?
  end

  private

  # Forcefully stop the child process, tolerating one that already exited.
  def kill(pipe)
    return if pipe.nil? || pipe.closed?

    Process.kill('KILL', pipe.pid)
  rescue Errno::ESRCH
    nil
  end
end
|
data/lib/render_page.js
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
// TODO Add header.
|
2
|
+
//
|
3
|
+
// Based off https://gist.github.com/pieterjongsma/4515412
|
4
|
+
|
5
|
+
var fs = require('fs');
|
6
|
+
var system = require('system');
|
7
|
+
var page = require('webpage').create();
|
8
|
+
// Inputs: the first script argument is the path of the HTML file to render,
// the second is the URL the page should be treated as loaded from. The file
// is read eagerly so it can be handed to the page later on.
var path = system.args[1],
    url = system.args[2],
    content = fs.read(path);
|
11
|
+
|
12
|
+
// Keep track of whether the page has already been exported, because the
|
13
|
+
// `setTimeout`s we use might cause it to be exported multiple times.
|
14
|
+
// Set once the page has been written out; later export attempts are ignored.
var pageHasBeenExported = false;

/**
 * Log the given markup with console.log and exit PhantomJS. Only the first
 * call has any effect.
 *
 * @param {string} content markup to output
 */
function exportPage(content) {
  if (pageHasBeenExported) {
    return;
  }

  pageHasBeenExported = true;
  console.log(content);
  phantom.exit();
}
|
22
|
+
|
23
|
+
// Since Ember has no method that indicates when everything has loaded, we keep
|
24
|
+
// track of resource requests. The claim is that the page is ready when there
|
25
|
+
// are no outstanding requests for resources.
|
26
|
+
//
|
27
|
+
// This is not strictly true, because sometimes a resource still needs to be
|
28
|
+
// rendered after loading, or the application might still be building a request.
|
29
|
+
// To work around this, we wait an additional 2 seconds to make sure rendering etc.
|
30
|
+
// can take place.
|
31
|
+
//
|
32
|
+
// We need to keep track of request/response IDs, not just the count because larger
|
33
|
+
// resources may be returned in chunks.
|
34
|
+
|
35
|
+
// IDs of requests that have been issued but not yet answered.
var activeRequests = [];
// Total number of (non-aborted) requests issued so far.
var cumulativeRequests = 0;

/**
 * Record every outgoing resource request, except for the Google Analytics
 * scripts, which are aborted instead.
 */
page.onResourceRequested = function(requestData, networkRequest) {
  var requestedUrl = requestData.url;
  var isAnalyticsScript = (/ga\.js/gi).test(requestedUrl) || (/dc\.js/gi).test(requestedUrl);

  if (isAnalyticsScript) {
    // Don't load ga.js and dc.js to avoid polluting Google Analytics data
    // regardless of whether external resources are allowed or not.
    networkRequest.abort();
    return;
  }

  cumulativeRequests += 1;
  activeRequests.push(requestData.id);
};
|
49
|
+
|
50
|
+
/**
 * Mark a request as answered and, once nothing is outstanding, schedule the
 * export of the rendered page.
 */
page.onResourceReceived = function(response) {
  // Remove the ID from the array, but only if it is actually there. The ID
  // can be absent (aborted requests are never recorded, and a resource that
  // arrives in chunks is reported more than once); splicing at index -1
  // would silently drop the LAST, unrelated, active request.
  var index = activeRequests.indexOf(response.id);
  if (index !== -1) {
    activeRequests.splice(index, 1);
  }

  // Should we output the HTML?
  if (activeRequests.length === 0) {
    // Wait 2 seconds and re-check, so that requests started in the meantime
    // postpone the export.
    window.setTimeout(function() {
      if (activeRequests.length === 0) {
        exportPage(page.content);
      }
    }, 2000);
  }
};
|
63
|
+
|
64
|
+
// Errors reported through page.onError are deliberately swallowed.
page.onError = function(message, trace) {
  // Intentionally empty: catch and ignore errors.
};
|
67
|
+
|
68
|
+
// Load the HTML into the page, using `url` as the page's location.
page.setContent(content, url);

// Hard deadline: if the page hasn't been exported in 8 seconds something is
// probably wrong, so export whatever has been rendered by then.
var exportUnconditionally = function() {
  exportPage(page.content);
};
setTimeout(exportUnconditionally, 8000);

// Early exit: if no request at all has been made after 2 seconds, assume the
// page does not depend on any external resources and has finished rendering.
var exportIfNoRequestsWereMade = function() {
  if (cumulativeRequests != 0) {
    return;
  }
  exportPage(page.content);
};
setTimeout(exportIfNoRequestsWereMade, 2000);
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: indexable
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vikhyat Korrapati
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-12-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rack
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description:
|
28
|
+
email: c@vikhyat.net
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/indexable.rb
|
34
|
+
- lib/render_page.js
|
35
|
+
- lib/phantomjs.rb
|
36
|
+
homepage: https://github.com/vikhyat/indexable
|
37
|
+
licenses:
|
38
|
+
- MIT
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
requirements: []
|
55
|
+
rubyforge_project:
|
56
|
+
rubygems_version: 2.0.3
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Rack middleware that executes javascript before serving pages to crawlers.
|
60
|
+
test_files: []
|