crabfarm 0.0.11 → 0.0.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 45541f82668f0910accb44339916af9e37595470
4
- data.tar.gz: 99de58252a283fd334a5c1aafc491eec16610c21
3
+ metadata.gz: 9ffaef8409650267bf6e4272421008ff9c38d05e
4
+ data.tar.gz: 996ba45929699ec7eebac5d8d4a1a24f231cf4a1
5
5
  SHA512:
6
- metadata.gz: 826f03dc142d062f35c2e7a3ff0cf7c56c16127704c95f81d39a14a0b702ea57c4da07250471ad0b9f55b3825ef51312317b7fc8c0aced47e1fb3f395e87c5d4
7
- data.tar.gz: 63863c7f0ede26385cdc1993a97433cfb16911edd593d30488ff5569366aaa38654084f22391c4ce6dad47105b8b691947c37af4ab7ecd9146ee24c1fbc63066
6
+ metadata.gz: 4d5cffdf273b3f31c807502f036583ab89e9ad58b66aa2f13a79112350f418a246535702109bcdabf8cbded64410414b8fa39a581af079f99d205b28c1387cc9
7
+ data.tar.gz: 6dd417dbebd417b32fa738c58af2927651d6106f66909b4748328dcb9965976c3f206f9d792362740793ffd1b8ae0dc18bb07dc6ea5fe29f8581b53c60728fa6
@@ -27,7 +27,7 @@ module Crabfarm
27
27
  [:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.'],
28
28
 
29
29
  # Crabtrap launcher configuration
30
- [:crabtrap_bin, :string, 'Crabtrap binary path.'],
30
+ [:crabtrap_bin_path, :string, 'Crabtrap binary path.'],
31
31
  [:crabtrap_port, :integer, 'Crabtrap port, defaults to 4000.'],
32
32
  [:crabtrap_mode, ['capture', 'replay'], 'Crabtrap operation mode.']
33
33
  ]
@@ -68,7 +68,7 @@ module Crabfarm
68
68
  phantom_bin_path: 'phantomjs',
69
69
  phantom_lock_file: nil,
70
70
 
71
- crabtrap_bin: 'crabtrap',
71
+ crabtrap_bin_path: 'crabtrap',
72
72
  crabtrap_port: 4000
73
73
  }
74
74
  end
@@ -113,7 +113,7 @@ module Crabfarm
113
113
 
114
114
  def crabtrap_config
115
115
  {
116
- bin_path: crabtrap_bin,
116
+ bin_path: crabtrap_bin_path,
117
117
  port: crabtrap_port,
118
118
  proxy: proxy
119
119
  }
@@ -39,11 +39,11 @@ module Crabfarm
39
39
  end
40
40
 
41
41
  def driver_config
42
- super.merge(proxy: proxy_address)
42
+ if @runner.is_running? then super.merge(proxy: proxy_address) else super end
43
43
  end
44
44
 
45
45
  def phantom_config
46
- super.merge(proxy: proxy_address)
46
+ if @runner.is_running? then super.merge(proxy: proxy_address) else super end
47
47
  end
48
48
 
49
49
  def proxy_address
@@ -8,6 +8,10 @@ module Crabfarm
8
8
  @pid = nil
9
9
  end
10
10
 
11
+ def is_running?
12
+ not @pid.nil?
13
+ end
14
+
11
15
  def port
12
16
  @config[:port] # TODO: maybe select port dynamically...
13
17
  end
@@ -17,8 +21,13 @@ module Crabfarm
17
21
  end
18
22
 
19
23
  def start
20
- @pid = Process.spawn({}, crabtrap_cmd)
21
- wait_for_server
24
+ begin
25
+ @pid = Process.spawn({}, crabtrap_cmd)
26
+ wait_for_server
27
+ rescue
28
+ puts "Could not find crabtrap at #{@config[:bin_path]}, memento replaying is disabled!"
29
+ @pid = nil
30
+ end
22
31
  end
23
32
 
24
33
  def stop
@@ -6,7 +6,6 @@ module Crabfarm
6
6
  class SafeStateLoop
7
7
 
8
8
  def initialize
9
- @context = Crabfarm::Context.new
10
9
  @running = true
11
10
  @working = false
12
11
  @lock = Mutex.new
@@ -16,7 +15,6 @@ module Crabfarm
16
15
  def release
17
16
  @running = false
18
17
  @thread.join
19
- @context.release
20
18
  end
21
19
 
22
20
  def change_state(_name, _params={}, _wait=nil)
@@ -90,29 +88,38 @@ module Crabfarm
90
88
  end
91
89
 
92
90
  def crawl_loop
93
- while @running
94
- if @working
95
- @elapsed = Benchmark.measure do
96
- begin
97
- ActiveSupport::Dependencies.clear
98
- logger.info "StateLoop: loading state: #{@next_state_name}"
99
- @doc = @context.run_state(@next_state_name, @next_state_params).output_as_json
100
- logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
101
- @error = nil
102
- rescue Exception => e
103
- logger.error "StateLoop: error while loading state: #{@next_state_name}"
104
- logger.error e
105
- @doc = nil
106
- @error = e
107
- end
108
- end.real
109
-
110
- @lock.synchronize {
111
- @state_name = @next_state_name
112
- @state_params = @next_state_params
113
- @working = false
114
- }
115
- else sleep 0.2 end
91
+ context = Crabfarm::Context.new
92
+
93
+ begin
94
+ while @running
95
+ if @working
96
+ @elapsed = Benchmark.measure do
97
+ begin
98
+ ActiveSupport::Dependencies.clear
99
+ logger.info "StateLoop: loading state: #{@next_state_name}"
100
+ @doc = context.run_state(@next_state_name, @next_state_params).output_as_json
101
+ logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
102
+ @error = nil
103
+ rescue Exception => e
104
+ logger.error "StateLoop: error while loading state: #{@next_state_name}"
105
+ logger.error e
106
+ @doc = nil
107
+ @error = e
108
+ end
109
+ end.real
110
+
111
+ @lock.synchronize {
112
+ @state_name = @next_state_name
113
+ @state_params = @next_state_params
114
+ @working = false
115
+ }
116
+ else sleep 0.2 end
117
+ end
118
+ rescue Exception => e
119
+ logger.fatal "StateLoop: unhandled exception!"
120
+ logger.fatal e
121
+ ensure
122
+ context.release
116
123
  end
117
124
  end
118
125
 
@@ -30,6 +30,7 @@ module Crabfarm
30
30
  path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
31
31
  path(_name, 'spec', 'mementos', '.gitkeep').render('dot_gitkeep')
32
32
  path(_name, 'spec', 'integration', '.gitkeep').render('dot_gitkeep')
33
+ path(_name, 'logs').render('dot_gitkeep')
33
34
  end
34
35
  end
35
36
 
@@ -1,3 +1,3 @@
1
1
  module Crabfarm
2
- VERSION = "0.0.11"
2
+ VERSION = "0.0.12"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crabfarm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ignacio Baixas
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-25 00:00:00.000000000 Z
11
+ date: 2015-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: jbuilder
@@ -315,7 +315,6 @@ email:
315
315
  - ignacio@platan.us
316
316
  executables:
317
317
  - crabfarm
318
- - crabtrap
319
318
  extensions: []
320
319
  extra_rdoc_files: []
321
320
  files:
@@ -371,7 +370,6 @@ files:
371
370
  - lib/crabfarm/version.rb
372
371
  - lib/crabfarm.rb
373
372
  - bin/crabfarm
374
- - bin/crabtrap
375
373
  homepage: https://github.com/platanus/crabfarm-gem
376
374
  licenses:
377
375
  - MIT
data/bin/crabtrap DELETED
@@ -1,347 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- var net = require('net'),
4
- http = require('http'),
5
- https = require('https'),
6
- url = require('url'),
7
- fs = require('fs'),
8
- zlib = require('zlib');
9
-
10
- // Globals
11
-
12
- var HTTPS_OPTIONS = {
13
- key: '-----BEGIN RSA PRIVATE KEY-----\nMIIBOQIBAAJBAK/L/lXb/kxUzve1olo71s6mQLvuQCm3z2wqClq71NLerFnaXpN+\nFrNPy7+R3gZ1hdWXqbN5NqpWDMM9fcbd7p0CAwEAAQJAUDImN3Lhgl7Z/+TLSJCt\nwJ3VQCZC/QUOSdCv4o53Wy5aL/n8ootYFC3eoFC2Nal5bnH6onP9YR+X9l3HKLaT\n3QIhANXwb5SvJ+Kewa8F5wNHo9LFjSbL7WSSb1MyvYnOeFlPAiEA0lvaLz6UXRDL\n6T6Z1fkF0exmQqVimeL5qjY5o9Gk5lMCH1A52Z3oEQzqe7cmf3q7YrOnYUcrMdqF\nDzojzO/gfUECIQCe9fImiW+r9CljFH9Dhm6zd6S+8CNWjoKD8X4VITMvKQIgb3sg\nq9gPVzXn/+f8Qcc2KILSh3ffkIpA8yJK9omUIxI=\n-----END RSA PRIVATE KEY-----\n',
14
- cert: '-----BEGIN CERTIFICATE-----\nMIIBmDCCAUICCQDGtiGKgI9AXjANBgkqhkiG9w0BAQUFADBTMQswCQYDVQQGEwJD\nTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNhbnRpYWdvMREwDwYDVQQKEwhQbGF0\nYW51czERMA8GA1UEAxMIQ3JhYnRyYXAwHhcNMTUwMTE1MjAxNzMzWhcNNDIwNjAx\nMjAxNzMzWjBTMQswCQYDVQQGEwJDTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNh\nbnRpYWdvMREwDwYDVQQKEwhQbGF0YW51czERMA8GA1UEAxMIQ3JhYnRyYXAwXDAN\nBgkqhkiG9w0BAQEFAANLADBIAkEAr8v+Vdv+TFTO97WiWjvWzqZAu+5AKbfPbCoK\nWrvU0t6sWdpek34Ws0/Lv5HeBnWF1Zeps3k2qlYMwz19xt3unQIDAQABMA0GCSqG\nSIb3DQEBBQUAA0EAmecqIZqQ8OXSIj0V2VKaIXwz8RBnhLzU7BJwcsWJE/Bex7zB\nWP+vLv9ML5ZRLCsXjL5IOav8qAX/NZXjoN3e3Q==\n-----END CERTIFICATE-----\n'
15
- };
16
-
17
- var LOG = {
18
- DEBUG: 0,
19
- INFO: 1,
20
- WARN: 2,
21
- ERROR: 3
22
- };
23
-
24
- var STACK = [],
25
- MODE = false,
26
- SOURCE = null,
27
- PORT = 4000,
28
- LOG_LEVEL = LOG.WARN;
29
-
30
- (function() {
31
- if(process.argv.length < 2) throw 'Must provide a proxy mode';
32
- MODE = process.argv[2];
33
- var i = 3;
34
-
35
- if(MODE != 'pass') {
36
- if(process.argv.length < 3) throw 'Must provide a bucket path';
37
- SOURCE = process.argv[3];
38
- i = 4;
39
- }
40
-
41
- for(; i < process.argv.length; i++) {
42
- var parts = process.argv[i].split('=');
43
- switch(parts[0]) {
44
- case '--port': PORT = parseInt(parts[1], 10); break;
45
- case '--quiet': PORT = parseInt(parts[1], 10); break;
46
- default: throw 'Invalid option ' + parts[0];
47
- }
48
- }
49
- })();
50
-
51
- // Utility methods
52
-
53
- function log(_level, _message) {
54
- if(_level == LOG.DEBUG) _message = '\t' + _message;
55
- if(_level >= LOG_LEVEL) console.log(_message);
56
- }
57
-
58
- function forOwn(_obj, _cb) {
59
- for(var key in _obj) {
60
- if(_obj.hasOwnProperty(key)) {
61
- _cb(key, _obj[key]);
62
- }
63
- }
64
- }
65
-
66
- function keysToLowerCase(_obj) {
67
- var result = {};
68
- forOwn(_obj, function(k,v) { result[k.toLowerCase()] = v; });
69
- return result;
70
- }
71
-
72
- function pickRandomPort() {
73
- return 0; // This could fail on Linux...
74
- }
75
-
76
- function matchRequestToResource(_req, _resource) {
77
- return _resource.method.toLowerCase() == _req.method.toLowerCase() && _resource.url == _req.url;
78
- }
79
-
80
- function matchRequestToResourceWOQuery(_req, _resource) {
81
- if(_resource.method.toLowerCase() == _req.method.toLowerCase()) return false;
82
-
83
- var reqUrl = url.parse(_req.url, true),
84
- resUrl = url.parse(_resource.url, true);
85
-
86
- return reqUrl.hostname == resUrl.hostname && reqUrl.pathname == resUrl.pathname;
87
- }
88
-
89
- function findAndMoveLast(_req, _array, _matches) {
90
- for(var i = 0, l = _array.length; i < l; i++) {
91
- if(_matches(_req, _array[i])) {
92
- var resource = _array.splice(i, 1)[0];
93
- _array.push(resource);
94
- return resource;
95
- }
96
- }
97
-
98
- return null;
99
- }
100
-
101
- function loadStackFrom(_path, _then) {
102
- var data = fs.readFileSync(_path);
103
- zlib.gunzip(data, function(err, buffer) {
104
- if (!err) STACK = JSON.parse(buffer.toString());
105
- _then();
106
- });
107
- }
108
-
109
- function saveStackTo(_path, _then) {
110
- var data = JSON.stringify(STACK);
111
- zlib.gzip(data, function(err, buffer) {
112
- if (!err) fs.writeFileSync(_path, buffer);
113
- _then();
114
- });
115
- }
116
-
117
- function resolveAndServeResource(_req, _resp) {
118
- var resource = findInStack(_req);
119
- if(resource) {
120
- log(LOG.INFO, "Serving: " + resource.method + ' ' + resource.url);
121
- log(LOG.DEBUG, "HTTP " + resource.statusCode);
122
- log(LOG.DEBUG, JSON.stringify(resource.headers));
123
-
124
- serveResource(resource, _resp);
125
- } else {
126
- log(LOG.WARN, 'Not found: ' + _req.url);
127
- _resp.statusCode = 404;
128
- _resp.end();
129
- }
130
- }
131
-
132
- function serveLastResource(_resp) {
133
- serveResource(STACK[STACK.length-1], _resp);
134
- }
135
-
136
- function serveResource(_resource, _resp) {
137
- _resp.statusCode = _resource.statusCode;
138
-
139
- forOwn(_resource.headers, function(k, v) { _resp.setHeader(k, v); });
140
-
141
- if(_resource.content) {
142
- var buf = new Buffer(_resource.content, _resource.encoding);
143
- _resp.end(buf);
144
- } else {
145
- _resp.end();
146
- }
147
- }
148
-
149
- function findAndMoveLast(_req, _matches) {
150
- for(var i = 0, l = STACK.length; i < l; i++) {
151
- if(_matches(_req, STACK[i])) {
152
- var resource = STACK.splice(i, 1)[0];
153
- STACK.push(resource);
154
- return resource;
155
- }
156
- }
157
-
158
- return null;
159
- }
160
-
161
- function findInStack(_req, _partial) {
162
- return findAndMoveLast(_req, matchRequestToResource) ||
163
- findAndMoveLast(_req, matchRequestToResourceWOQuery);
164
- }
165
-
166
- function cacheResponse(_req, _resp, _cb) {
167
-
168
- log(LOG.INFO, "Caching Response");
169
- log(LOG.DEBUG, "HTTP " + _resp.statusCode);
170
- log(LOG.DEBUG, JSON.stringify(keysToLowerCase(_resp.headers)));
171
-
172
- var encoding = null,
173
- // TODO: consider storing port and protocoll in the resource.
174
- resource = {
175
- url: _req.url,
176
- statusCode: _resp.statusCode,
177
- method: _req.method,
178
- // inHeaders: req.headers, // store request headers to aid in recognition?
179
- headers: keysToLowerCase(_resp.headers),
180
- content: '',
181
- encoding: 'base64'
182
- },
183
- contentEncoding = resource.headers['content-encoding'],
184
- contentType = resource.headers['content-type'],
185
- outStream = _resp;
186
-
187
- // add decompression if supported encoding:
188
- if(contentEncoding == 'gzip') {
189
- outStream = _resp.pipe(zlib.createGunzip());
190
- delete resource.headers['content-encoding'];
191
- contentEncoding = null;
192
- } else if(contentEncoding == 'deflate') {
193
- outStream = _resp.pipe(zlib.createInflate());
194
- delete resource.headers['content-encoding'];
195
- contentEncoding = null;
196
- }
197
-
198
- // use utf8 encoding for uncompresed text:
199
- if(!contentEncoding && contentType) {
200
- contentType = contentType.match(/([^\/]+)\/([^\s]+)(?:\s+(.+))?/i);
201
- if(contentType && (contentType[1] == 'text' || contentType[1] == 'application')) {
202
- resource.encoding = 'utf-8';
203
- }
204
- }
205
-
206
- // remove unwanted headers:
207
- delete resource.headers['content-length'];
208
-
209
- // start receiving data:
210
- if(resource.encoding) outStream.setEncoding(resource.encoding);
211
- outStream.on('data', function(_chunk) {
212
- resource.content += _chunk;
213
- });
214
-
215
- // when all data is received, store resource (dont know how this will handle more than one request)
216
- outStream.on('end', function() {
217
- STACK.push(resource);
218
- _cb();
219
- });
220
- }
221
-
222
- function prepareForwardRequest(_req) {
223
- var urlObj = url.parse(_req.url);
224
-
225
- var options = {
226
- method: _req.method,
227
- host: urlObj.host,
228
- path: urlObj.path,
229
- rejectUnauthorized: false,
230
- headers: keysToLowerCase(_req.headers)
231
- };
232
-
233
- // Rewrite headers
234
- options.headers['accept-encoding'] = 'gzip,deflate';
235
- return options;
236
- }
237
-
238
- function passRequest(_req, _resp) {
239
- log(LOG.INFO, 'Passing through ' + _req.method + ' request for ' + _req.url);
240
-
241
- var urlObj = url.parse(_req.url);
242
- var forward = (urlObj.protocol == 'https:' ? https : http).request({
243
- method: _req.method,
244
- host: urlObj.host,
245
- path: urlObj.path,
246
- headers: _req.headers
247
- }, function(_fw_resp) {
248
- // pipe response back untouched
249
- _resp.writeHead(_fw_resp.statusCode, _fw_resp.headers);
250
- _fw_resp.pipe(_resp);
251
- });
252
-
253
- _req.pipe(forward);
254
- }
255
-
256
- function captureRequest(_req, _resp, _useSSL) {
257
- log(LOG.INFO, 'Forwarding ' + _req.method + ' request for ' + _req.url);
258
-
259
- var urlObj = url.parse(_req.url);
260
- var options = {
261
- method: _req.method,
262
- host: urlObj.host,
263
- path: urlObj.path,
264
- rejectUnauthorized: false,
265
- headers: keysToLowerCase(_req.headers)
266
- };
267
-
268
- // Rewrite headers
269
- options.headers['accept-encoding'] = 'gzip,deflate';
270
- log(LOG.DEBUG, JSON.stringify(options));
271
-
272
- var forward = (urlObj.protocol == 'https:' ? https : http).request(options, function(_fw_resp) {
273
- cacheResponse(_req, _fw_resp, function() {
274
- serveLastResource(_resp);
275
- });
276
- });
277
-
278
- _req.pipe(forward); // forward request data
279
- }
280
-
281
- function replayRequest(_req, _resp) {
282
- log(LOG.INFO, 'Resolving ' + _req.method + ' request for ' + _req.url);
283
- resolveAndServeResource(_req, _resp);
284
- }
285
-
286
- function selectProxy() {
287
- switch(MODE) {
288
- case 'pass': return passRequest;
289
- case 'capture': return captureRequest;
290
- case 'replay': return replayRequest;
291
- default: throw 'Invalid proxy mode';
292
- }
293
- }
294
-
295
- var PROXY_FUN = selectProxy(),
296
- SERVER = http.createServer(PROXY_FUN);
297
-
298
- // Special handler for HTTPS request, creates a dedicated HTTPS proxy per connection,
299
- // that way the CONNECT tunnel can be intercepted, requires support for self signed
300
- // certificates in the client.
301
- SERVER.on('connect', function (_req, _sock, _head) {
302
-
303
- var urlObj = url.parse('http://' + _req.url);
304
- log(LOG.INFO, 'New HTTPS request: starting https intercept on ' + urlObj.hostname);
305
-
306
- var httpsServ = https.createServer(HTTPS_OPTIONS, function(_req, _resp) {
307
- _req.url = 'https://' + urlObj.hostname + _req.url;
308
- PROXY_FUN(_req, _resp);
309
- });
310
-
311
- httpsServ.listen(pickRandomPort());
312
-
313
- var tunnelSock = net.connect(httpsServ.address().port, function() {
314
- _sock.write('HTTP/1.1 200 Connection Established\r\n' +
315
- 'Proxy-agent: Node-Proxy\r\n' +
316
- '\r\n');
317
- tunnelSock.write(_head);
318
- tunnelSock.pipe(_sock);
319
- _sock.pipe(tunnelSock);
320
- });
321
-
322
- _sock.on('close', function() {
323
- httpsServ.close();
324
- });
325
- });
326
-
327
- console.log("Starting crabtrap! mode: " + MODE);
328
-
329
- if(MODE == 'replay') {
330
- loadStackFrom(SOURCE, SERVER.listen.bind(SERVER, PORT));
331
- } else {
332
- SERVER.listen(PORT);
333
- }
334
-
335
- var EXITING = false;
336
- process.on('SIGINT', function() {
337
- if(EXITING) return;
338
- EXITING = true;
339
-
340
- console.log("Shutting down crabtrap!");
341
- SERVER.close();
342
- if(MODE == 'capture') {
343
- saveStackTo(SOURCE, process.exit.bind(process));
344
- } else {
345
- process.exit();
346
- }
347
- });