crabfarm 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 45541f82668f0910accb44339916af9e37595470
4
- data.tar.gz: 99de58252a283fd334a5c1aafc491eec16610c21
3
+ metadata.gz: 9ffaef8409650267bf6e4272421008ff9c38d05e
4
+ data.tar.gz: 996ba45929699ec7eebac5d8d4a1a24f231cf4a1
5
5
  SHA512:
6
- metadata.gz: 826f03dc142d062f35c2e7a3ff0cf7c56c16127704c95f81d39a14a0b702ea57c4da07250471ad0b9f55b3825ef51312317b7fc8c0aced47e1fb3f395e87c5d4
7
- data.tar.gz: 63863c7f0ede26385cdc1993a97433cfb16911edd593d30488ff5569366aaa38654084f22391c4ce6dad47105b8b691947c37af4ab7ecd9146ee24c1fbc63066
6
+ metadata.gz: 4d5cffdf273b3f31c807502f036583ab89e9ad58b66aa2f13a79112350f418a246535702109bcdabf8cbded64410414b8fa39a581af079f99d205b28c1387cc9
7
+ data.tar.gz: 6dd417dbebd417b32fa738c58af2927651d6106f66909b4748328dcb9965976c3f206f9d792362740793ffd1b8ae0dc18bb07dc6ea5fe29f8581b53c60728fa6
@@ -27,7 +27,7 @@ module Crabfarm
27
27
  [:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.'],
28
28
 
29
29
  # Crabtrap launcher configuration
30
- [:crabtrap_bin, :string, 'Crabtrap binary path.'],
30
+ [:crabtrap_bin_path, :string, 'Crabtrap binary path.'],
31
31
  [:crabtrap_port, :integer, 'Crabtrap port, defaults to 4000.'],
32
32
  [:crabtrap_mode, ['capture', 'replay'], 'Crabtrap operation mode.']
33
33
  ]
@@ -68,7 +68,7 @@ module Crabfarm
68
68
  phantom_bin_path: 'phantomjs',
69
69
  phantom_lock_file: nil,
70
70
 
71
- crabtrap_bin: 'crabtrap',
71
+ crabtrap_bin_path: 'crabtrap',
72
72
  crabtrap_port: 4000
73
73
  }
74
74
  end
@@ -113,7 +113,7 @@ module Crabfarm
113
113
 
114
114
  def crabtrap_config
115
115
  {
116
- bin_path: crabtrap_bin,
116
+ bin_path: crabtrap_bin_path,
117
117
  port: crabtrap_port,
118
118
  proxy: proxy
119
119
  }
@@ -39,11 +39,11 @@ module Crabfarm
39
39
  end
40
40
 
41
41
  def driver_config
42
- super.merge(proxy: proxy_address)
42
+ if @runner.is_running? then super.merge(proxy: proxy_address) else super end
43
43
  end
44
44
 
45
45
  def phantom_config
46
- super.merge(proxy: proxy_address)
46
+ if @runner.is_running? then super.merge(proxy: proxy_address) else super end
47
47
  end
48
48
 
49
49
  def proxy_address
@@ -8,6 +8,10 @@ module Crabfarm
8
8
  @pid = nil
9
9
  end
10
10
 
11
+ def is_running?
12
+ not @pid.nil?
13
+ end
14
+
11
15
  def port
12
16
  @config[:port] # TODO: maybe select port dynamically...
13
17
  end
@@ -17,8 +21,13 @@ module Crabfarm
17
21
  end
18
22
 
19
23
  def start
20
- @pid = Process.spawn({}, crabtrap_cmd)
21
- wait_for_server
24
+ begin
25
+ @pid = Process.spawn({}, crabtrap_cmd)
26
+ wait_for_server
27
+ rescue
28
+ puts "Could not find crabtrap at #{@config[:bin_path]}, memento replaying is disabled!"
29
+ @pid = nil
30
+ end
22
31
  end
23
32
 
24
33
  def stop
@@ -6,7 +6,6 @@ module Crabfarm
6
6
  class SafeStateLoop
7
7
 
8
8
  def initialize
9
- @context = Crabfarm::Context.new
10
9
  @running = true
11
10
  @working = false
12
11
  @lock = Mutex.new
@@ -16,7 +15,6 @@ module Crabfarm
16
15
  def release
17
16
  @running = false
18
17
  @thread.join
19
- @context.release
20
18
  end
21
19
 
22
20
  def change_state(_name, _params={}, _wait=nil)
@@ -90,29 +88,38 @@ module Crabfarm
90
88
  end
91
89
 
92
90
  def crawl_loop
93
- while @running
94
- if @working
95
- @elapsed = Benchmark.measure do
96
- begin
97
- ActiveSupport::Dependencies.clear
98
- logger.info "StateLoop: loading state: #{@next_state_name}"
99
- @doc = @context.run_state(@next_state_name, @next_state_params).output_as_json
100
- logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
101
- @error = nil
102
- rescue Exception => e
103
- logger.error "StateLoop: error while loading state: #{@next_state_name}"
104
- logger.error e
105
- @doc = nil
106
- @error = e
107
- end
108
- end.real
109
-
110
- @lock.synchronize {
111
- @state_name = @next_state_name
112
- @state_params = @next_state_params
113
- @working = false
114
- }
115
- else sleep 0.2 end
91
+ context = Crabfarm::Context.new
92
+
93
+ begin
94
+ while @running
95
+ if @working
96
+ @elapsed = Benchmark.measure do
97
+ begin
98
+ ActiveSupport::Dependencies.clear
99
+ logger.info "StateLoop: loading state: #{@next_state_name}"
100
+ @doc = context.run_state(@next_state_name, @next_state_params).output_as_json
101
+ logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
102
+ @error = nil
103
+ rescue Exception => e
104
+ logger.error "StateLoop: error while loading state: #{@next_state_name}"
105
+ logger.error e
106
+ @doc = nil
107
+ @error = e
108
+ end
109
+ end.real
110
+
111
+ @lock.synchronize {
112
+ @state_name = @next_state_name
113
+ @state_params = @next_state_params
114
+ @working = false
115
+ }
116
+ else sleep 0.2 end
117
+ end
118
+ rescue Exception => e
119
+ logger.fatal "StateLoop: unhandled exception!"
120
+ logger.fatal e
121
+ ensure
122
+ context.release
116
123
  end
117
124
  end
118
125
 
@@ -30,6 +30,7 @@ module Crabfarm
30
30
  path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
31
31
  path(_name, 'spec', 'mementos', '.gitkeep').render('dot_gitkeep')
32
32
  path(_name, 'spec', 'integration', '.gitkeep').render('dot_gitkeep')
33
+ path(_name, 'logs').render('dot_gitkeep')
33
34
  end
34
35
  end
35
36
 
@@ -1,3 +1,3 @@
1
1
  module Crabfarm
2
- VERSION = "0.0.11"
2
+ VERSION = "0.0.12"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crabfarm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ignacio Baixas
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-25 00:00:00.000000000 Z
11
+ date: 2015-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: jbuilder
@@ -315,7 +315,6 @@ email:
315
315
  - ignacio@platan.us
316
316
  executables:
317
317
  - crabfarm
318
- - crabtrap
319
318
  extensions: []
320
319
  extra_rdoc_files: []
321
320
  files:
@@ -371,7 +370,6 @@ files:
371
370
  - lib/crabfarm/version.rb
372
371
  - lib/crabfarm.rb
373
372
  - bin/crabfarm
374
- - bin/crabtrap
375
373
  homepage: https://github.com/platanus/crabfarm-gem
376
374
  licenses:
377
375
  - MIT
data/bin/crabtrap DELETED
@@ -1,347 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- var net = require('net'),
4
- http = require('http'),
5
- https = require('https'),
6
- url = require('url'),
7
- fs = require('fs'),
8
- zlib = require('zlib');
9
-
10
- // Globals
11
-
12
- var HTTPS_OPTIONS = {
13
- key: '-----BEGIN RSA PRIVATE KEY-----\nMIIBOQIBAAJBAK/L/lXb/kxUzve1olo71s6mQLvuQCm3z2wqClq71NLerFnaXpN+\nFrNPy7+R3gZ1hdWXqbN5NqpWDMM9fcbd7p0CAwEAAQJAUDImN3Lhgl7Z/+TLSJCt\nwJ3VQCZC/QUOSdCv4o53Wy5aL/n8ootYFC3eoFC2Nal5bnH6onP9YR+X9l3HKLaT\n3QIhANXwb5SvJ+Kewa8F5wNHo9LFjSbL7WSSb1MyvYnOeFlPAiEA0lvaLz6UXRDL\n6T6Z1fkF0exmQqVimeL5qjY5o9Gk5lMCH1A52Z3oEQzqe7cmf3q7YrOnYUcrMdqF\nDzojzO/gfUECIQCe9fImiW+r9CljFH9Dhm6zd6S+8CNWjoKD8X4VITMvKQIgb3sg\nq9gPVzXn/+f8Qcc2KILSh3ffkIpA8yJK9omUIxI=\n-----END RSA PRIVATE KEY-----\n',
14
- cert: '-----BEGIN CERTIFICATE-----\nMIIBmDCCAUICCQDGtiGKgI9AXjANBgkqhkiG9w0BAQUFADBTMQswCQYDVQQGEwJD\nTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNhbnRpYWdvMREwDwYDVQQKEwhQbGF0\nYW51czERMA8GA1UEAxMIQ3JhYnRyYXAwHhcNMTUwMTE1MjAxNzMzWhcNNDIwNjAx\nMjAxNzMzWjBTMQswCQYDVQQGEwJDTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNh\nbnRpYWdvMREwDwYDVQQKEwhQbGF0YW51czERMA8GA1UEAxMIQ3JhYnRyYXAwXDAN\nBgkqhkiG9w0BAQEFAANLADBIAkEAr8v+Vdv+TFTO97WiWjvWzqZAu+5AKbfPbCoK\nWrvU0t6sWdpek34Ws0/Lv5HeBnWF1Zeps3k2qlYMwz19xt3unQIDAQABMA0GCSqG\nSIb3DQEBBQUAA0EAmecqIZqQ8OXSIj0V2VKaIXwz8RBnhLzU7BJwcsWJE/Bex7zB\nWP+vLv9ML5ZRLCsXjL5IOav8qAX/NZXjoN3e3Q==\n-----END CERTIFICATE-----\n'
15
- };
16
-
17
- var LOG = {
18
- DEBUG: 0,
19
- INFO: 1,
20
- WARN: 2,
21
- ERROR: 3
22
- };
23
-
24
- var STACK = [],
25
- MODE = false,
26
- SOURCE = null,
27
- PORT = 4000,
28
- LOG_LEVEL = LOG.WARN;
29
-
30
- (function() {
31
- if(process.argv.length < 2) throw 'Must provide a proxy mode';
32
- MODE = process.argv[2];
33
- var i = 3;
34
-
35
- if(MODE != 'pass') {
36
- if(process.argv.length < 3) throw 'Must provide a bucket path';
37
- SOURCE = process.argv[3];
38
- i = 4;
39
- }
40
-
41
- for(; i < process.argv.length; i++) {
42
- var parts = process.argv[i].split('=');
43
- switch(parts[0]) {
44
- case '--port': PORT = parseInt(parts[1], 10); break;
45
- case '--quiet': PORT = parseInt(parts[1], 10); break;
46
- default: throw 'Invalid option ' + parts[0];
47
- }
48
- }
49
- })();
50
-
51
- // Utility methods
52
-
53
- function log(_level, _message) {
54
- if(_level == LOG.DEBUG) _message = '\t' + _message;
55
- if(_level >= LOG_LEVEL) console.log(_message);
56
- }
57
-
58
- function forOwn(_obj, _cb) {
59
- for(var key in _obj) {
60
- if(_obj.hasOwnProperty(key)) {
61
- _cb(key, _obj[key]);
62
- }
63
- }
64
- }
65
-
66
- function keysToLowerCase(_obj) {
67
- var result = {};
68
- forOwn(_obj, function(k,v) { result[k.toLowerCase()] = v; });
69
- return result;
70
- }
71
-
72
- function pickRandomPort() {
73
- return 0; // This could fail on Linux...
74
- }
75
-
76
- function matchRequestToResource(_req, _resource) {
77
- return _resource.method.toLowerCase() == _req.method.toLowerCase() && _resource.url == _req.url;
78
- }
79
-
80
- function matchRequestToResourceWOQuery(_req, _resource) {
81
- if(_resource.method.toLowerCase() == _req.method.toLowerCase()) return false;
82
-
83
- var reqUrl = url.parse(_req.url, true),
84
- resUrl = url.parse(_resource.url, true);
85
-
86
- return reqUrl.hostname == resUrl.hostname && reqUrl.pathname == resUrl.pathname;
87
- }
88
-
89
- function findAndMoveLast(_req, _array, _matches) {
90
- for(var i = 0, l = _array.length; i < l; i++) {
91
- if(_matches(_req, _array[i])) {
92
- var resource = _array.splice(i, 1)[0];
93
- _array.push(resource);
94
- return resource;
95
- }
96
- }
97
-
98
- return null;
99
- }
100
-
101
- function loadStackFrom(_path, _then) {
102
- var data = fs.readFileSync(_path);
103
- zlib.gunzip(data, function(err, buffer) {
104
- if (!err) STACK = JSON.parse(buffer.toString());
105
- _then();
106
- });
107
- }
108
-
109
- function saveStackTo(_path, _then) {
110
- var data = JSON.stringify(STACK);
111
- zlib.gzip(data, function(err, buffer) {
112
- if (!err) fs.writeFileSync(_path, buffer);
113
- _then();
114
- });
115
- }
116
-
117
- function resolveAndServeResource(_req, _resp) {
118
- var resource = findInStack(_req);
119
- if(resource) {
120
- log(LOG.INFO, "Serving: " + resource.method + ' ' + resource.url);
121
- log(LOG.DEBUG, "HTTP " + resource.statusCode);
122
- log(LOG.DEBUG, JSON.stringify(resource.headers));
123
-
124
- serveResource(resource, _resp);
125
- } else {
126
- log(LOG.WARN, 'Not found: ' + _req.url);
127
- _resp.statusCode = 404;
128
- _resp.end();
129
- }
130
- }
131
-
132
- function serveLastResource(_resp) {
133
- serveResource(STACK[STACK.length-1], _resp);
134
- }
135
-
136
- function serveResource(_resource, _resp) {
137
- _resp.statusCode = _resource.statusCode;
138
-
139
- forOwn(_resource.headers, function(k, v) { _resp.setHeader(k, v); });
140
-
141
- if(_resource.content) {
142
- var buf = new Buffer(_resource.content, _resource.encoding);
143
- _resp.end(buf);
144
- } else {
145
- _resp.end();
146
- }
147
- }
148
-
149
- function findAndMoveLast(_req, _matches) {
150
- for(var i = 0, l = STACK.length; i < l; i++) {
151
- if(_matches(_req, STACK[i])) {
152
- var resource = STACK.splice(i, 1)[0];
153
- STACK.push(resource);
154
- return resource;
155
- }
156
- }
157
-
158
- return null;
159
- }
160
-
161
- function findInStack(_req, _partial) {
162
- return findAndMoveLast(_req, matchRequestToResource) ||
163
- findAndMoveLast(_req, matchRequestToResourceWOQuery);
164
- }
165
-
166
- function cacheResponse(_req, _resp, _cb) {
167
-
168
- log(LOG.INFO, "Caching Response");
169
- log(LOG.DEBUG, "HTTP " + _resp.statusCode);
170
- log(LOG.DEBUG, JSON.stringify(keysToLowerCase(_resp.headers)));
171
-
172
- var encoding = null,
173
- // TODO: consider storing port and protocoll in the resource.
174
- resource = {
175
- url: _req.url,
176
- statusCode: _resp.statusCode,
177
- method: _req.method,
178
- // inHeaders: req.headers, // store request headers to aid in recognition?
179
- headers: keysToLowerCase(_resp.headers),
180
- content: '',
181
- encoding: 'base64'
182
- },
183
- contentEncoding = resource.headers['content-encoding'],
184
- contentType = resource.headers['content-type'],
185
- outStream = _resp;
186
-
187
- // add decompression if supported encoding:
188
- if(contentEncoding == 'gzip') {
189
- outStream = _resp.pipe(zlib.createGunzip());
190
- delete resource.headers['content-encoding'];
191
- contentEncoding = null;
192
- } else if(contentEncoding == 'deflate') {
193
- outStream = _resp.pipe(zlib.createInflate());
194
- delete resource.headers['content-encoding'];
195
- contentEncoding = null;
196
- }
197
-
198
- // use utf8 encoding for uncompresed text:
199
- if(!contentEncoding && contentType) {
200
- contentType = contentType.match(/([^\/]+)\/([^\s]+)(?:\s+(.+))?/i);
201
- if(contentType && (contentType[1] == 'text' || contentType[1] == 'application')) {
202
- resource.encoding = 'utf-8';
203
- }
204
- }
205
-
206
- // remove unwanted headers:
207
- delete resource.headers['content-length'];
208
-
209
- // start receiving data:
210
- if(resource.encoding) outStream.setEncoding(resource.encoding);
211
- outStream.on('data', function(_chunk) {
212
- resource.content += _chunk;
213
- });
214
-
215
- // when all data is received, store resource (dont know how this will handle more than one request)
216
- outStream.on('end', function() {
217
- STACK.push(resource);
218
- _cb();
219
- });
220
- }
221
-
222
- function prepareForwardRequest(_req) {
223
- var urlObj = url.parse(_req.url);
224
-
225
- var options = {
226
- method: _req.method,
227
- host: urlObj.host,
228
- path: urlObj.path,
229
- rejectUnauthorized: false,
230
- headers: keysToLowerCase(_req.headers)
231
- };
232
-
233
- // Rewrite headers
234
- options.headers['accept-encoding'] = 'gzip,deflate';
235
- return options;
236
- }
237
-
238
- function passRequest(_req, _resp) {
239
- log(LOG.INFO, 'Passing through ' + _req.method + ' request for ' + _req.url);
240
-
241
- var urlObj = url.parse(_req.url);
242
- var forward = (urlObj.protocol == 'https:' ? https : http).request({
243
- method: _req.method,
244
- host: urlObj.host,
245
- path: urlObj.path,
246
- headers: _req.headers
247
- }, function(_fw_resp) {
248
- // pipe response back untouched
249
- _resp.writeHead(_fw_resp.statusCode, _fw_resp.headers);
250
- _fw_resp.pipe(_resp);
251
- });
252
-
253
- _req.pipe(forward);
254
- }
255
-
256
- function captureRequest(_req, _resp, _useSSL) {
257
- log(LOG.INFO, 'Forwarding ' + _req.method + ' request for ' + _req.url);
258
-
259
- var urlObj = url.parse(_req.url);
260
- var options = {
261
- method: _req.method,
262
- host: urlObj.host,
263
- path: urlObj.path,
264
- rejectUnauthorized: false,
265
- headers: keysToLowerCase(_req.headers)
266
- };
267
-
268
- // Rewrite headers
269
- options.headers['accept-encoding'] = 'gzip,deflate';
270
- log(LOG.DEBUG, JSON.stringify(options));
271
-
272
- var forward = (urlObj.protocol == 'https:' ? https : http).request(options, function(_fw_resp) {
273
- cacheResponse(_req, _fw_resp, function() {
274
- serveLastResource(_resp);
275
- });
276
- });
277
-
278
- _req.pipe(forward); // forward request data
279
- }
280
-
281
- function replayRequest(_req, _resp) {
282
- log(LOG.INFO, 'Resolving ' + _req.method + ' request for ' + _req.url);
283
- resolveAndServeResource(_req, _resp);
284
- }
285
-
286
- function selectProxy() {
287
- switch(MODE) {
288
- case 'pass': return passRequest;
289
- case 'capture': return captureRequest;
290
- case 'replay': return replayRequest;
291
- default: throw 'Invalid proxy mode';
292
- }
293
- }
294
-
295
- var PROXY_FUN = selectProxy(),
296
- SERVER = http.createServer(PROXY_FUN);
297
-
298
- // Special handler for HTTPS request, creates a dedicated HTTPS proxy per connection,
299
- // that way the CONNECT tunnel can be intercepted, requires support for self signed
300
- // certificates in the client.
301
- SERVER.on('connect', function (_req, _sock, _head) {
302
-
303
- var urlObj = url.parse('http://' + _req.url);
304
- log(LOG.INFO, 'New HTTPS request: starting https intercept on ' + urlObj.hostname);
305
-
306
- var httpsServ = https.createServer(HTTPS_OPTIONS, function(_req, _resp) {
307
- _req.url = 'https://' + urlObj.hostname + _req.url;
308
- PROXY_FUN(_req, _resp);
309
- });
310
-
311
- httpsServ.listen(pickRandomPort());
312
-
313
- var tunnelSock = net.connect(httpsServ.address().port, function() {
314
- _sock.write('HTTP/1.1 200 Connection Established\r\n' +
315
- 'Proxy-agent: Node-Proxy\r\n' +
316
- '\r\n');
317
- tunnelSock.write(_head);
318
- tunnelSock.pipe(_sock);
319
- _sock.pipe(tunnelSock);
320
- });
321
-
322
- _sock.on('close', function() {
323
- httpsServ.close();
324
- });
325
- });
326
-
327
- console.log("Starting crabtrap! mode: " + MODE);
328
-
329
- if(MODE == 'replay') {
330
- loadStackFrom(SOURCE, SERVER.listen.bind(SERVER, PORT));
331
- } else {
332
- SERVER.listen(PORT);
333
- }
334
-
335
- var EXITING = false;
336
- process.on('SIGINT', function() {
337
- if(EXITING) return;
338
- EXITING = true;
339
-
340
- console.log("Shutting down crabtrap!");
341
- SERVER.close();
342
- if(MODE == 'capture') {
343
- saveStackTo(SOURCE, process.exit.bind(process));
344
- } else {
345
- process.exit();
346
- }
347
- });