indexable 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/indexable.rb +53 -0
- data/lib/phantomjs.rb +26 -0
- data/lib/render_page.js +82 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ea666f584199ac33afce60449c7e2ca89a58fe09
|
4
|
+
data.tar.gz: c6cd4101fa9c6ae24f67e8fc1fe58d0fe923cf16
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3d99e57df1a36bd2330f1f1b118c1433581a5411cd6427efd8642b2815003763a7128f51796e9d5bdcb79ef0c824cd6243f03f0380ea517d6ce8250dc3a423c9
|
7
|
+
data.tar.gz: 91554eb4a3c2ffa54239a0ceff1c766b9ff7d5b74641f5be6329a7293c9597df310babf217be4ca35f3f043151439100d3b15a0ed8b096c99e28fd80c0aeda13
|
data/lib/indexable.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'tempfile'

require 'phantomjs'
require 'rack/request'
|
3
|
+
|
4
|
+
module Rack
  # Rack middleware that, for requests that look like they come from a
  # crawler, runs the application's HTML response through PhantomJS (using
  # render_page.js) and serves the resulting markup instead of the original
  # body. Every other response is passed through untouched.
  class Indexable
    # User-agent patterns that identify a crawler.
    CRAWLER_USER_AGENTS = [
      /^Twitterbot/, /^curl/, /Googlebot/, /Mediapartners/, /Adsbot-Google/,
      /\(.*http(s|\(s\))?:\/\/.*\)/
    ].freeze

    # Sentinel string Phantomjs#run returns when rendering timed out.
    RENDER_FAILURE = "Couldn't render page... orz."

    # @param app [#call] the downstream Rack application
    def initialize(app)
      @app = app
    end

    # Detect whether the current request comes from a bot. Based on the logic used
    # by Bustle.com (https://www.dropbox.com/s/s4oibqsxqpo3hll/bustle%20slizzle.pdf)
    #
    # A request without a User-Agent header is never treated as a crawler.
    # Otherwise it is a crawler request when the user agent matches one of
    # CRAWLER_USER_AGENTS, when the `_escaped_fragment_` parameter is present,
    # or when the `nojs` parameter equals "true".
    #
    # @param env [Hash] the Rack environment
    # @return [Boolean]
    def request_from_crawler?(env)
      user_agent = env["HTTP_USER_AGENT"]
      return false unless user_agent
      return true if CRAWLER_USER_AGENTS.any? { |pattern| user_agent =~ pattern }

      params = Rack::Request.new(env).params
      params.key?('_escaped_fragment_') || params['nojs'].eql?('true')
    end

    # Rack entry point.
    #
    # @param env [Hash] the Rack environment
    # @return [Array] the usual [status, headers, body] triplet; the body is
    #   replaced by a one-element array holding the rendered markup when the
    #   response was pre-rendered, and the status becomes 500 when rendering
    #   failed
    def call(env)
      status, headers, content = *@app.call(env)
      return [status, headers, content] unless prerender?(status, headers, env)

      rendered = render(extract_html(content), Rack::Request.new(env).url)

      # The body has changed size; a stale Content-Length would make clients
      # truncate or wait on the response.
      headers['Content-Length'] = rendered.bytesize.to_s if headers['Content-Length']
      status = 500 if rendered == RENDER_FAILURE

      [status, headers, [rendered]]
    end

    private

    # Only successful HTML responses to crawler requests are pre-rendered.
    # A response without a Content-Type header is passed through instead of
    # raising.
    def prerender?(status, headers, env)
      return false unless status.to_i == 200

      content_type = headers['Content-Type']
      return false unless content_type && content_type =~ /^text\/html/

      request_from_crawler?(env)
    end

    # Collects the response body into a single string. A Rack body is only
    # guaranteed to respond to #each, so it is iterated rather than joined.
    # The upstream body is closed afterwards because it is being replaced.
    def extract_html(content)
      source = content.respond_to?(:body) ? content.body : content
      return source.to_s unless source.respond_to?(:each)

      chunks = []
      source.each { |chunk| chunks << chunk.to_s }
      chunks.join
    ensure
      content.close if content.respond_to?(:close)
    end

    # Writes the HTML to a temporary file, hands it to PhantomJS and returns
    # whatever the script printed. The temporary file is always removed.
    def render(html, url)
      script = ::File.join(::File.dirname(__FILE__), 'render_page.js')
      file = Tempfile.new(['indexable', '.html'])
      begin
        file.write(html)
        file.close
        Phantomjs.new(script, file.path, url).run
      ensure
        file.close unless file.closed?
        file.unlink
      end
    end
  end
end
|
data/lib/phantomjs.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'timeout'
|
2
|
+
|
3
|
+
# Runs a script with the PhantomJS executable as a child process and captures
# everything the script writes to standard output.
class Phantomjs
  # Returned by #run when the child process did not finish in time.
  FAILURE_MESSAGE = "Couldn't render page... orz."

  # Maximum number of seconds to wait for the script (default: 20).
  attr_accessor :timeout

  # Executable to launch (default: "phantomjs", resolved through PATH).
  attr_accessor :command

  # @param script [String] path of the script to run
  # @param args [Array<String>] extra command-line arguments for the script
  def initialize(script, *args)
    @script = script
    @args = args
    @timeout = 20
    @command = "phantomjs"
  end

  # Runs the script and returns its standard output.
  #
  # The output is read *before* the child is reaped: waiting for the process
  # first would deadlock as soon as the script printed more than the OS pipe
  # buffer holds, because the child blocks on write while the parent blocks
  # on wait.
  #
  # @return [String] the script's output, or FAILURE_MESSAGE when the script
  #   exceeded the timeout (in which case the child is killed)
  def run
    pipe = nil
    Timeout.timeout(@timeout) do
      pipe = IO.popen([@command, @script] + @args)
      pipe.read
    end
  rescue Timeout::Error
    kill_child(pipe)
    FAILURE_MESSAGE
  ensure
    # Closing a popen pipe also waits for the child, so no zombie or file
    # descriptor is left behind on either path.
    pipe.close if pipe && !pipe.closed?
  end

  private

  # Forcefully terminates the child process behind +pipe+, if there is one.
  def kill_child(pipe)
    return unless pipe

    Process.kill('KILL', pipe.pid)
  rescue Errno::ESRCH
    # The child exited on its own between the timeout firing and the kill.
  end
end
|
data/lib/render_page.js
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
// TODO Add header.
|
2
|
+
//
|
3
|
+
// Based off https://gist.github.com/pieterjongsma/4515412
|
4
|
+
|
5
|
+
// PhantomJS modules, plus the page object the markup is loaded into.
var fs = require('fs'),
    system = require('system'),
    page = require('webpage').create();

// Script arguments: the file holding the HTML, and the URL that is passed
// to page.setContent together with it.
var path = system.args[1],
    url = system.args[2];

// The HTML read from the file at `path`.
var content = fs.read(path);
|
11
|
+
|
12
|
+
// Keep track of whether the page has already been exported, because the
|
13
|
+
// setTimeout callbacks we use might cause it to be exported multiple times.
|
14
|
+
// Set once the page has been written out; later export attempts are ignored.
var pageHasBeenExported = false;

/**
 * Writes `content` to the console and exits PhantomJS. Only the first call
 * does anything; subsequent calls return immediately.
 */
function exportPage(content) {
  if (pageHasBeenExported) {
    return;
  }

  pageHasBeenExported = true;
  console.log(content);
  phantom.exit();
}
|
22
|
+
|
23
|
+
// Since Ember has no method that indicates when everything has loaded, we keep
|
24
|
+
// track of resource requests. The claim is that the page is ready when there
|
25
|
+
// are no outstanding requests for resources.
|
26
|
+
//
|
27
|
+
// This is not strictly true, because sometimes a resource still needs to be
|
28
|
+
// rendered after loading, or the application might still be building a request.
|
29
|
+
// To work around this, we wait an additional 2 seconds to make sure rendering etc.
|
30
|
+
// can take place.
|
31
|
+
//
|
32
|
+
// We need to keep track of request/response IDs, not just the count because larger
|
33
|
+
// resources may be returned in chunks.
|
34
|
+
|
35
|
+
// IDs of requests that have been issued but not yet answered.
var activeRequests = [];
// Total number of (non-aborted) requests issued so far.
var cumulativeRequests = 0;

/**
 * Tracks every outgoing resource request, except for Google Analytics
 * scripts, which are aborted instead.
 */
page.onResourceRequested = function(requestData, networkRequest) {
  // Don't load ga.js and dc.js to avoid polluting Google Analytics data
  // regardless of whether external resources are allowed or not.
  //
  // The file name must be a whole path segment: an unanchored /ga\.js/ would
  // also abort unrelated scripts whose names merely end in "ga.js"
  // (e.g. "mega.js").
  if (/(^|\/)(ga|dc)\.js([?#]|$)/i.test(requestData.url)) {
    networkRequest.abort();
    return;
  }

  activeRequests.push(requestData.id);
  cumulativeRequests += 1;
};
|
49
|
+
|
50
|
+
/**
 * Marks a request as answered and, once nothing is outstanding, schedules
 * the export of the page.
 */
page.onResourceReceived = function(response) {
  // Remove the ID from the array. This callback can fire more than once for
  // the same ID (larger resources may be returned in chunks) and for IDs that
  // were never tracked; in those cases indexOf returns -1, and splice(-1, 1)
  // would silently drop the LAST, unrelated, active request.
  var index = activeRequests.indexOf(response.id);
  if (index !== -1) {
    activeRequests.splice(index, 1);
  }

  // Should we output the HTML?
  if (activeRequests.length === 0) {
    window.setTimeout(function() {
      // Re-check after the grace period: the page may have started new
      // requests in the meantime.
      if (activeRequests.length === 0) {
        exportPage(page.content);
      }
    }, 2000);
  }
};
|
63
|
+
|
64
|
+
// Errors raised by the page's own JavaScript are caught here and
// deliberately ignored.
page.onError = function(message, trace) {
  return;
};
|
67
|
+
|
68
|
+
// Hand the HTML and its URL to the page.
page.setContent(content, url);

// Hard deadline: if nothing has been exported after 8 seconds something is
// probably wrong, so export whatever the page currently contains.
setTimeout(function() {
  exportPage(page.content);
}, 8000);

// Fast path: when no request at all was issued during the first 2 seconds,
// assume the page does not depend on any external resources and has
// finished rendering.
setTimeout(function() {
  if (cumulativeRequests != 0) {
    return;
  }
  exportPage(page.content);
}, 2000);
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: indexable
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vikhyat Korrapati
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-12-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rack
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description:
|
28
|
+
email: c@vikhyat.net
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/indexable.rb
|
34
|
+
- lib/render_page.js
|
35
|
+
- lib/phantomjs.rb
|
36
|
+
homepage: https://github.com/vikhyat/indexable
|
37
|
+
licenses:
|
38
|
+
- MIT
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
requirements: []
|
55
|
+
rubyforge_project:
|
56
|
+
rubygems_version: 2.0.3
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Rack middleware that executes javascript before serving pages to crawlers.
|
60
|
+
test_files: []
|