indexable 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ea666f584199ac33afce60449c7e2ca89a58fe09
4
+ data.tar.gz: c6cd4101fa9c6ae24f67e8fc1fe58d0fe923cf16
5
+ SHA512:
6
+ metadata.gz: 3d99e57df1a36bd2330f1f1b118c1433581a5411cd6427efd8642b2815003763a7128f51796e9d5bdcb79ef0c824cd6243f03f0380ea517d6ce8250dc3a423c9
7
+ data.tar.gz: 91554eb4a3c2ffa54239a0ceff1c766b9ff7d5b74641f5be6329a7293c9597df310babf217be4ca35f3f043151439100d3b15a0ed8b096c99e28fd80c0aeda13
@@ -0,0 +1,53 @@
1
require 'phantomjs'
require 'rack/request'
require 'tempfile'
3
+
4
module Rack
  # Rack middleware that pre-renders HTML pages for crawlers. When the wrapped
  # app answers 200 with a text/html body and the request looks like it comes
  # from a crawler, the HTML is written to a temporary file, run through
  # PhantomJS with render_page.js, and the script's output is served instead.
  class Indexable
    # User-agent patterns treated as crawlers. The last pattern matches agents
    # that carry an http(s) URL inside parentheses.
    CRAWLER_USER_AGENTS = [
      /^Twitterbot/, /^curl/, /Googlebot/, /Mediapartners/, /Adsbot-Google/,
      /\(.*http(s|\(s\))?:\/\/.*\)/
    ].freeze

    # Text that Phantomjs#run returns when rendering timed out.
    RENDER_FAILURE = "Couldn't render page... orz."

    # app - the downstream Rack application.
    def initialize(app)
      @app = app
    end

    # Detect whether the current request comes from a bot. Based on the logic used
    # by Bustle.com (https://www.dropbox.com/s/s4oibqsxqpo3hll/bustle%20slizzle.pdf)
    #
    # env - the Rack environment hash.
    #
    # Returns false when there is no User-Agent header; true when the user
    # agent matches CRAWLER_USER_AGENTS, when an `_escaped_fragment_` parameter
    # is present, or when `nojs=true` was passed; false otherwise.
    def request_from_crawler?(env)
      user_agent = env["HTTP_USER_AGENT"]
      return false unless user_agent
      return true if CRAWLER_USER_AGENTS.any? { |pattern| user_agent =~ pattern }

      # Parameters are only parsed once the cheap header checks are inconclusive.
      params = Rack::Request.new(env).params
      params.has_key?('_escaped_fragment_') || params['nojs'].eql?('true')
    end

    # Rack entry point.
    #
    # env - the Rack environment hash.
    #
    # Returns the downstream [status, headers, body] triplet untouched unless
    # the response qualifies for pre-rendering. Otherwise the body is replaced
    # by the PhantomJS output, the status becomes 500 if rendering timed out,
    # and an existing Content-Length header is updated to the new body size.
    def call(env)
      status, headers, content = *@app.call(env)
      return [status, headers, content] unless prerender?(status, headers, env)

      rendered = render(read_body(content), Rack::Request.new(env).url)
      status = 500 if rendered == RENDER_FAILURE

      # The rendered markup rarely has the same size as the original body, so a
      # Content-Length set downstream would otherwise be wrong.
      length_key = header_key(headers, 'Content-Length')
      headers[length_key] = rendered.bytesize.to_s if length_key

      [status, headers, [rendered]]
    end

    private

    # True when the response is a 200 text/html page requested by a crawler.
    # A missing Content-Type header simply disqualifies the response.
    def prerender?(status, headers, env)
      return false unless status.to_i == 200

      type_key = header_key(headers, 'Content-Type')
      return false unless type_key && headers[type_key].to_s =~ %r{\Atext/html}

      request_from_crawler?(env)
    end

    # Finds the key under which `name` is stored in `headers`, ignoring case.
    # Returns nil when the header is absent.
    def header_key(headers, name)
      headers.keys.find { |key| key.to_s.casecmp(name) == 0 }
    end

    # Collects the response body into a single string. Objects exposing #body
    # returning a String are used directly; anything else is iterated with
    # #each. The original body is closed afterwards because it is being
    # replaced and will not be handed to the server.
    def read_body(content)
      source = content.respond_to?(:body) ? content.body : content
      return source if source.is_a?(String)

      chunks = []
      source.each { |chunk| chunks << chunk }
      chunks.join
    ensure
      content.close if content.respond_to?(:close)
    end

    # Writes `html` to a temporary file and runs render_page.js on it with
    # PhantomJS, passing `url` as the page URL. Returns the script output, or
    # the Phantomjs failure text on timeout. The temporary file is always
    # removed.
    def render(html, url)
      script = ::File.dirname(__FILE__) + "/render_page.js"
      file = Tempfile.new(['indexable', '.html'])
      begin
        file.binmode # write the bytes as they are, without newline/encoding conversion
        file.write html
        file.close
        Phantomjs.new(script, file.path, url).run
      ensure
        file.close unless file.closed?
        file.unlink
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ require 'timeout'
2
+
3
# Runs a PhantomJS script as a child process and captures what it prints to
# standard output, giving up after a configurable number of seconds.
class Phantomjs
  # Seconds to wait for the script before killing it (defaults to 20).
  attr_accessor :timeout

  # script - path of the PhantomJS script to execute.
  # args   - extra command line arguments handed to the script.
  def initialize(script, *args)
    @script = script
    @args = args
    @timeout = 20
  end

  # Executes `phantomjs <script> <args...>`.
  #
  # Returns the script's standard output as a String, or the text
  # "Couldn't render page... orz." when the script did not finish in time.
  def run
    pipe = nil
    Timeout.timeout(@timeout) do
      pipe = IO.popen(["phantomjs", @script] + @args)
      # Read until EOF *before* waiting for the child: a child that prints more
      # than the OS pipe buffer holds blocks on write until someone reads, so
      # waiting first would stall until the timeout fires.
      pipe.read
    end
  rescue Timeout::Error
    kill_child(pipe)
    "Couldn't render page... orz."
  ensure
    # Closing a popen pipe releases the descriptor and waits for the child.
    pipe.close if pipe && !pipe.closed?
  end

  private

  # Forcefully stops the child behind `pipe`. Does nothing when the pipe was
  # never opened or the child has already exited.
  def kill_child(pipe)
    return unless pipe

    Process.kill 9, pipe.pid
  rescue Errno::ESRCH
    # The process is already gone; nothing left to kill.
  end
end
@@ -0,0 +1,82 @@
1
+ // TODO Add header.
2
+ //
3
+ // Based off https://gist.github.com/pieterjongsma/4515412
4
+
5
var fs = require('fs');
var system = require('system');
var page = require('webpage').create();

// Command line: phantomjs render_page.js <path of the HTML file> <page URL>
var path = system.args[1];
var url = system.args[2];
var content = fs.read(path);

// Keep track of whether the page has already been exported, because the
// timers used below might otherwise cause it to be exported multiple times.
var pageHasBeenExported = false;

// Prints the given markup to standard output once and terminates PhantomJS.
function exportPage(content) {
  if (!pageHasBeenExported) {
    pageHasBeenExported = true;
    console.log(content);
    phantom.exit();
  }
}

// Since Ember has no method that indicates when everything has loaded, we
// keep track of resource requests. The claim is that the page is ready when
// there are no outstanding requests for resources.
//
// This is not strictly true, because sometimes a resource still needs to be
// rendered after loading, or the application might still be building a request.
// To work around this, we wait an additional 2 seconds to make sure rendering
// etc. can take place.
//
// We need to keep track of request/response IDs, not just the count, because
// larger resources may be returned in chunks.

var activeRequests = [];
var cumulativeRequests = 0;

page.onResourceRequested = function(requestData, networkRequest) {
  if ((/ga\.js/i).test(requestData.url) || (/dc\.js/i).test(requestData.url)) {
    // Don't load ga.js and dc.js to avoid polluting Google Analytics data
    // regardless of whether external resources are allowed or not.
    networkRequest.abort();
  } else {
    activeRequests.push(requestData.id);
    cumulativeRequests += 1;
  }
};

page.onResourceReceived = function(response) {
  // Remove the ID from the array, but only if it is actually tracked. An ID
  // that is not in the list (an aborted request, or a later event for a
  // response that was already removed) yields index -1, and splice(-1, 1)
  // would wrongly drop the last, unrelated, active request.
  var index = activeRequests.indexOf(response.id);
  if (index !== -1) {
    activeRequests.splice(index, 1);
  }

  // Should we output the HTML?
  if (activeRequests.length == 0) {
    window.setTimeout(function() {
      // Re-check: new requests may have started during the grace period.
      if (activeRequests.length == 0) {
        exportPage(page.content);
      }
    }, 2000);
  }
};

page.onError = function(message, trace) {
  // Catch and ignore errors.
};

page.setContent(content, url);

// If the page hasn't been exported in 8 seconds something is probably wrong,
// go ahead and export it.
setTimeout(function() {
  exportPage(page.content);
}, 8000);

// If there are no requests in 2 seconds, assume the page does not depend on any
// external resources and has finished rendering.
setTimeout(function() {
  if (cumulativeRequests == 0) {
    exportPage(page.content);
  }
}, 2000);
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indexable
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Vikhyat Korrapati
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rack
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description:
28
+ email: c@vikhyat.net
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/indexable.rb
34
+ - lib/render_page.js
35
+ - lib/phantomjs.rb
36
+ homepage: https://github.com/vikhyat/indexable
37
+ licenses:
38
+ - MIT
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ requirements: []
55
+ rubyforge_project:
56
+ rubygems_version: 2.0.3
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Rack middleware that executes javascript before serving pages to crawlers.
60
+ test_files: []