indexable 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ea666f584199ac33afce60449c7e2ca89a58fe09
4
+ data.tar.gz: c6cd4101fa9c6ae24f67e8fc1fe58d0fe923cf16
5
+ SHA512:
6
+ metadata.gz: 3d99e57df1a36bd2330f1f1b118c1433581a5411cd6427efd8642b2815003763a7128f51796e9d5bdcb79ef0c824cd6243f03f0380ea517d6ce8250dc3a423c9
7
+ data.tar.gz: 91554eb4a3c2ffa54239a0ceff1c766b9ff7d5b74641f5be6329a7293c9597df310babf217be4ca35f3f043151439100d3b15a0ed8b096c99e28fd80c0aeda13
@@ -0,0 +1,53 @@
1
require 'tempfile'

require 'phantomjs'
require 'rack/request'
3
+
4
module Rack
  # Rack middleware that, for requests coming from crawlers, runs the HTML
  # produced by the wrapped application through PhantomJS (via the
  # render_page.js script next to this file) and serves the rendered page.
  # All other responses are passed through untouched.
  class Indexable
    # User-agent patterns that identify a crawler. The last pattern matches
    # agents that embed an http(s) URL in parentheses.
    CRAWLER_USER_AGENTS = [
      /^Twitterbot/, /^curl/, /Googlebot/, /Mediapartners/, /Adsbot-Google/,
      /\(.*http(s|\(s\))?:\/\/.*\)/
    ]

    # Sentinel string returned by Phantomjs#run when rendering timed out.
    RENDER_FAILURE = "Couldn't render page... orz."

    # app - the downstream Rack application.
    def initialize(app)
      @app = app
    end

    # Detect whether the current request comes from a bot. Based on the logic used
    # by Bustle.com (https://www.dropbox.com/s/s4oibqsxqpo3hll/bustle%20slizzle.pdf)
    #
    # env - the Rack environment hash.
    #
    # Returns false when no User-Agent header is present; otherwise true when
    # the agent matches CRAWLER_USER_AGENTS, when the `_escaped_fragment_`
    # parameter is present, or when `nojs=true` was passed.
    def request_from_crawler?(env)
      user_agent = env["HTTP_USER_AGENT"]
      return false unless user_agent
      return true if CRAWLER_USER_AGENTS.any? { |pattern| user_agent =~ pattern }

      # Only parse the request parameters once the cheap checks are exhausted.
      params = Rack::Request.new(env).params
      params.has_key?('_escaped_fragment_') || params['nojs'].eql?('true')
    end

    # Rack entry point.
    #
    # Returns the downstream [status, headers, body] triplet unchanged unless
    # the response is a 200 text/html response to a crawler request. In that
    # case the body is replaced by the PhantomJS output; the status becomes
    # 500 when rendering failed.
    def call(env)
      status, headers, content = *@app.call(env)
      unless html_success?(status, headers) && request_from_crawler?(env)
        return [status, headers, content]
      end

      html = read_body(content)
      rendered = render(html, Rack::Request.new(env).url)
      status = 500 if rendered == RENDER_FAILURE
      update_content_length(headers, rendered)

      [status, headers, [rendered]]
    end

    private

    # True for a 200 response whose Content-Type starts with text/html.
    # A missing Content-Type header is treated as "not HTML".
    def html_success?(status, headers)
      return false unless status.to_i == 200

      content_type = headers[header_key(headers, 'Content-Type')]
      !(content_type.to_s =~ /^text\/html/).nil?
    end

    # Finds the key under which `name` is stored in `headers`, ignoring case.
    # Returns nil when the header is absent.
    def header_key(headers, name)
      headers.keys.find { |key| key.to_s.casecmp(name) == 0 }
    end

    # Collects the downstream body into a single String. Response-like objects
    # expose #body; anything else is iterated with #each. The original body is
    # closed afterwards because it is replaced and the server will never see it.
    def read_body(content)
      body = content.respond_to?(:body) ? content.body : content
      return body.to_s unless body.respond_to?(:each)

      parts = []
      body.each { |part| parts << part.to_s }
      parts.join
    ensure
      content.close if content.respond_to?(:close)
    end

    # Writes `html` to a temporary file and lets PhantomJS render it as if it
    # had been loaded from `url`. The temporary file is always removed.
    def render(html, url)
      script = ::File.dirname(__FILE__) + "/render_page.js"
      file = Tempfile.new(['indexable', '.html'])
      begin
        file.write html
        file.close
        Phantomjs.new(script, file.path, url).run
      ensure
        file.close unless file.closed?
        file.unlink
      end
    end

    # Keeps an existing Content-Length header in sync with the new body so the
    # response is not truncated or padded by the server.
    def update_content_length(headers, body)
      key = header_key(headers, 'Content-Length')
      headers[key] = body.bytesize.to_s if key
    end
  end
end
@@ -0,0 +1,26 @@
1
+ require 'timeout'
2
+
3
# Thin wrapper that runs a PhantomJS script as a child process and captures
# everything it prints to standard output.
class Phantomjs
  # Returned by #run when the child process did not finish in time.
  FAILURE_MESSAGE = "Couldn't render page... orz."

  # timeout    - seconds to wait for the child process (default: 20).
  # executable - name or path of the binary to launch (default: "phantomjs").
  attr_accessor :timeout, :executable

  # script - path of the script handed to the executable.
  # args   - additional command-line arguments passed after the script.
  def initialize(script, *args)
    @script = script
    @args = args
    @timeout = 20
    @executable = 'phantomjs'
  end

  # Runs the script and returns its standard output as a String, or
  # FAILURE_MESSAGE when the process exceeds the timeout (in which case the
  # process is killed).
  #
  # The output is read *before* waiting for the process: a child that writes
  # more than the OS pipe buffer holds would otherwise block forever on its
  # write while the parent blocks waiting for it to exit.
  def run
    pipe = nil
    Timeout.timeout(@timeout) do
      pipe = IO.popen([@executable, @script] + @args)
      pipe.read
    end
  rescue Timeout::Error
    terminate(pipe)
    FAILURE_MESSAGE
  ensure
    # Closing a popen pipe also waits for (reaps) the child process.
    pipe.close if pipe && !pipe.closed?
  end

  private

  # Forcefully stops the child process behind `pipe`, if there is one.
  def terminate(pipe)
    Process.kill('KILL', pipe.pid) if pipe
  rescue Errno::ESRCH
    # The process already exited; nothing left to kill.
  end
end
@@ -0,0 +1,82 @@
1
+ // TODO Add header.
2
+ //
3
+ // Based off https://gist.github.com/pieterjongsma/4515412
4
+
5
var fs = require('fs');
var system = require('system');
var page = require('webpage').create();

// Command-line arguments: the file holding the HTML to render and the URL the
// page should appear to have been loaded from.
var path = system.args[1];
var url = system.args[2];
var content = fs.read(path);

// How long the page must stay idle before it is considered rendered.
var SETTLE_DELAY_MS = 2000;
// Upper bound after which the page is exported no matter what.
var HARD_TIMEOUT_MS = 8000;

// Keep track of whether the page has already been exported, because the
// `setTimeout`s we use might cause it to be exported multiple times.
var pageHasBeenExported = false;
function exportPage(content) {
  if (!pageHasBeenExported) {
    pageHasBeenExported = true;
    console.log(content);
    phantom.exit();
  }
}

// Since Ember has no method that indicates when everything has loaded, we
// keep track of resource requests. The claim is that the page is ready when
// there are no outstanding requests for resources.
//
// This is not strictly true, because sometimes a resource still needs to be
// rendered after loading, or the application might still be building a request.
// To work around this, we wait an additional 2 seconds to make sure rendering
// etc. can take place.
//
// We need to keep track of request/response IDs, not just the count, because
// larger resources may be returned in chunks.

var activeRequests = [];
var cumulativeRequests = 0;

// Marks the request with the given ID as finished and, when nothing is
// outstanding any more, schedules the export after the settle delay.
function finishRequest(id) {
  // The ID may be unknown: responses arrive in several stages/chunks, and
  // aborted requests were never recorded. Without this guard `splice(-1, 1)`
  // would drop the last, unrelated, active request.
  var index = activeRequests.indexOf(id);
  if (index !== -1) {
    activeRequests.splice(index, 1);
  }

  // Should we output the HTML?
  if (activeRequests.length === 0) {
    window.setTimeout(function() {
      if (activeRequests.length === 0) {
        exportPage(page.content);
      }
    }, SETTLE_DELAY_MS);
  }
}

page.onResourceRequested = function(requestData, networkRequest) {
  if ((/ga\.js/i).test(requestData.url) || (/dc\.js/i).test(requestData.url)) {
    // Don't load ga.js and dc.js to avoid polluting Google Analytics data
    // regardless of whether external resources are allowed or not.
    networkRequest.abort();
  }
  else {
    activeRequests.push(requestData.id);
    cumulativeRequests += 1;
  }
};

page.onResourceReceived = function(response) {
  finishRequest(response.id);
};

// A request that fails or times out will never be "received"; stop waiting
// for it so the export does not have to fall back to the hard timeout.
page.onResourceError = function(resourceError) {
  finishRequest(resourceError.id);
};

page.onResourceTimeout = function(request) {
  finishRequest(request.id);
};

page.onError = function(message, trace) {
  // Catch and ignore errors.
};

page.setContent(content, url);

// If the page hasn't been exported in 8 seconds something is probably wrong,
// go ahead and export it.
setTimeout(function() {
  exportPage(page.content);
}, HARD_TIMEOUT_MS);

// If there are no requests in 2 seconds, assume the page does not depend on any
// external resources and has finished rendering.
setTimeout(function() {
  if (cumulativeRequests === 0) {
    exportPage(page.content);
  }
}, SETTLE_DELAY_MS);
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indexable
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Vikhyat Korrapati
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rack
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description:
28
+ email: c@vikhyat.net
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/indexable.rb
34
+ - lib/render_page.js
35
+ - lib/phantomjs.rb
36
+ homepage: https://github.com/vikhyat/indexable
37
+ licenses:
38
+ - MIT
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ requirements: []
55
+ rubyforge_project:
56
+ rubygems_version: 2.0.3
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Rack middleware that executes javascript before serving pages to crawlers.
60
+ test_files: []