crabfarm 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/crabtrap +347 -0
- data/lib/crabfarm/cli.rb +36 -2
- data/lib/crabfarm/configuration.rb +25 -8
- data/lib/crabfarm/context.rb +46 -2
- data/lib/crabfarm/crabtrap_context.rb +54 -0
- data/lib/crabfarm/crabtrap_runner.rb +54 -0
- data/lib/crabfarm/default_driver_factory.rb +63 -17
- data/lib/crabfarm/driver_bucket_pool.rb +3 -28
- data/lib/crabfarm/modes/console.rb +12 -18
- data/lib/crabfarm/modes/generator.rb +3 -0
- data/lib/crabfarm/modes/publisher.rb +189 -0
- data/lib/crabfarm/modes/recorder.rb +42 -0
- data/lib/crabfarm/rspec.rb +33 -1
- data/lib/crabfarm/templates/dot_crabfarm.erb +9 -0
- data/lib/crabfarm/templates/dot_gitignore.erb +0 -1
- data/lib/crabfarm/version.rb +1 -1
- metadata +37 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1031e279aeab473f8e46469b3f91383a4dffbd70
+  data.tar.gz: f366d36831117570d65b999de947bb5d6af0d0d1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5e46aeefc0f4fe6c96ad2f5d02ee312c826a5a724e20b9797818ddb4c876245b6e58331a85feaafc48ab7b7f97f38c3dd1fd497705c72020559bf84ee8c22e4
+  data.tar.gz: df9dd5a3874ef38e462d1adfcf8c4ae92bf4b8f7ed0cfda6456d7e3ed6a1ceb2b791e1dd6f89988bec33bdf8d3fa9ef7db86c38515c43ae1de47aa314876063e
data/bin/crabtrap
ADDED
@@ -0,0 +1,347 @@
+#!/usr/bin/env node
+
+var net = require('net'),
+    http = require('http'),
+    https = require('https'),
+    url = require('url'),
+    fs = require('fs'),
+    zlib = require('zlib');
+
+// Globals
+
+var HTTPS_OPTIONS = {
+  key: '-----BEGIN RSA PRIVATE KEY-----\nMIIBOQIBAAJBAK/L/lXb/kxUzve1olo71s6mQLvuQCm3z2wqClq71NLerFnaXpN+\nFrNPy7+R3gZ1hdWXqbN5NqpWDMM9fcbd7p0CAwEAAQJAUDImN3Lhgl7Z/+TLSJCt\nwJ3VQCZC/QUOSdCv4o53Wy5aL/n8ootYFC3eoFC2Nal5bnH6onP9YR+X9l3HKLaT\n3QIhANXwb5SvJ+Kewa8F5wNHo9LFjSbL7WSSb1MyvYnOeFlPAiEA0lvaLz6UXRDL\n6T6Z1fkF0exmQqVimeL5qjY5o9Gk5lMCH1A52Z3oEQzqe7cmf3q7YrOnYUcrMdqF\nDzojzO/gfUECIQCe9fImiW+r9CljFH9Dhm6zd6S+8CNWjoKD8X4VITMvKQIgb3sg\nq9gPVzXn/+f8Qcc2KILSh3ffkIpA8yJK9omUIxI=\n-----END RSA PRIVATE KEY-----\n',
+  cert: '-----BEGIN CERTIFICATE-----\nMIIBmDCCAUICCQDGtiGKgI9AXjANBgkqhkiG9w0BAQUFADBTMQswCQYDVQQGEwJD\nTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNhbnRpYWdvMREwDwYDVQQKEwhQbGF0\nYW51czERMA8GA1UEAxMIQ3JhYnRyYXAwHhcNMTUwMTE1MjAxNzMzWhcNNDIwNjAx\nMjAxNzMzWjBTMQswCQYDVQQGEwJDTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNh\nbnRpYWdvMREwDwYDVQQKEwhQbGF0YW51czERMA8GA1UEAxMIQ3JhYnRyYXAwXDAN\nBgkqhkiG9w0BAQEFAANLADBIAkEAr8v+Vdv+TFTO97WiWjvWzqZAu+5AKbfPbCoK\nWrvU0t6sWdpek34Ws0/Lv5HeBnWF1Zeps3k2qlYMwz19xt3unQIDAQABMA0GCSqG\nSIb3DQEBBQUAA0EAmecqIZqQ8OXSIj0V2VKaIXwz8RBnhLzU7BJwcsWJE/Bex7zB\nWP+vLv9ML5ZRLCsXjL5IOav8qAX/NZXjoN3e3Q==\n-----END CERTIFICATE-----\n'
+};
+
+var LOG = {
+  DEBUG: 0,
+  INFO: 1,
+  WARN: 2,
+  ERROR: 3
+};
+
+var STACK = [],
+    MODE = false,
+    SOURCE = null,
+    PORT = 4000,
+    LOG_LEVEL = LOG.WARN;
+
+(function() {
+  if(process.argv.length < 2) throw 'Must provide a proxy mode';
+  MODE = process.argv[2];
+  var i = 3;
+
+  if(MODE != 'pass') {
+    if(process.argv.length < 3) throw 'Must provide a bucket path';
+    SOURCE = process.argv[3];
+    i = 4;
+  }
+
+  for(; i < process.argv.length; i++) {
+    var parts = process.argv[i].split('=');
+    switch(parts[0]) {
+      case '--port': PORT = parseInt(parts[1], 10); break;
+      case '--quiet': PORT = parseInt(parts[1], 10); break;
+      default: throw 'Invalid option ' + parts[0];
+    }
+  }
+})();
+
+// Utility methods
+
+function log(_level, _message) {
+  if(_level == LOG.DEBUG) _message = '\t' + _message;
+  if(_level >= LOG_LEVEL) console.log(_message);
+}
+
+function forOwn(_obj, _cb) {
+  for(var key in _obj) {
+    if(_obj.hasOwnProperty(key)) {
+      _cb(key, _obj[key]);
+    }
+  }
+}
+
+function keysToLowerCase(_obj) {
+  var result = {};
+  forOwn(_obj, function(k,v) { result[k.toLowerCase()] = v; });
+  return result;
+}
+
+function pickRandomPort() {
+  return 0; // This could fail on Linux...
+}
+
+function matchRequestToResource(_req, _resource) {
+  return _resource.method.toLowerCase() == _req.method.toLowerCase() && _resource.url == _req.url;
+}
+
+function matchRequestToResourceWOQuery(_req, _resource) {
+  if(_resource.method.toLowerCase() == _req.method.toLowerCase()) return false;
+
+  var reqUrl = url.parse(_req.url, true),
+      resUrl = url.parse(_resource.url, true);
+
+  return reqUrl.hostname == resUrl.hostname && reqUrl.pathname == resUrl.pathname;
+}
+
+function findAndMoveLast(_req, _array, _matches) {
+  for(var i = 0, l = _array.length; i < l; i++) {
+    if(_matches(_req, _array[i])) {
+      var resource = _array.splice(i, 1)[0];
+      _array.push(resource);
+      return resource;
+    }
+  }
+
+  return null;
+}
+
+function loadStackFrom(_path, _then) {
+  var data = fs.readFileSync(_path);
+  zlib.gunzip(data, function(err, buffer) {
+    if (!err) STACK = JSON.parse(buffer.toString());
+    _then();
+  });
+}
+
+function saveStackTo(_path, _then) {
+  var data = JSON.stringify(STACK);
+  zlib.gzip(data, function(err, buffer) {
+    if (!err) fs.writeFileSync(_path, buffer);
+    _then();
+  });
+}
+
+function resolveAndServeResource(_req, _resp) {
+  var resource = findInStack(_req);
+  if(resource) {
+    log(LOG.INFO, "Serving: " + resource.method + ' ' + resource.url);
+    log(LOG.DEBUG, "HTTP " + resource.statusCode);
+    log(LOG.DEBUG, JSON.stringify(resource.headers));
+
+    serveResource(resource, _resp);
+  } else {
+    log(LOG.WARN, 'Not found: ' + _req.url);
+    _resp.statusCode = 404;
+    _resp.end();
+  }
+}
+
+function serveLastResource(_resp) {
+  serveResource(STACK[STACK.length-1], _resp);
+}
+
+function serveResource(_resource, _resp) {
+  _resp.statusCode = _resource.statusCode;
+
+  forOwn(_resource.headers, function(k, v) { _resp.setHeader(k, v); });
+
+  if(_resource.content) {
+    var buf = new Buffer(_resource.content, _resource.encoding);
+    _resp.end(buf);
+  } else {
+    _resp.end();
+  }
+}
+
+function findAndMoveLast(_req, _matches) {
+  for(var i = 0, l = STACK.length; i < l; i++) {
+    if(_matches(_req, STACK[i])) {
+      var resource = STACK.splice(i, 1)[0];
+      STACK.push(resource);
+      return resource;
+    }
+  }
+
+  return null;
+}
+
+function findInStack(_req, _partial) {
+  return findAndMoveLast(_req, matchRequestToResource) ||
+    findAndMoveLast(_req, matchRequestToResourceWOQuery);
+}
+
+function cacheResponse(_req, _resp, _cb) {
+
+  log(LOG.INFO, "Caching Response");
+  log(LOG.DEBUG, "HTTP " + _resp.statusCode);
+  log(LOG.DEBUG, JSON.stringify(keysToLowerCase(_resp.headers)));
+
+  var encoding = null,
+      // TODO: consider storing port and protocoll in the resource.
+      resource = {
+        url: _req.url,
+        statusCode: _resp.statusCode,
+        method: _req.method,
+        // inHeaders: req.headers, // store request headers to aid in recognition?
+        headers: keysToLowerCase(_resp.headers),
+        content: '',
+        encoding: 'base64'
+      },
+      contentEncoding = resource.headers['content-encoding'],
+      contentType = resource.headers['content-type'],
+      outStream = _resp;
+
+  // add decompression if supported encoding:
+  if(contentEncoding == 'gzip') {
+    outStream = _resp.pipe(zlib.createGunzip());
+    delete resource.headers['content-encoding'];
+    contentEncoding = null;
+  } else if(contentEncoding == 'deflate') {
+    outStream = _resp.pipe(zlib.createInflate());
+    delete resource.headers['content-encoding'];
+    contentEncoding = null;
+  }
+
+  // use utf8 encoding for uncompresed text:
+  if(!contentEncoding && contentType) {
+    contentType = contentType.match(/([^\/]+)\/([^\s]+)(?:\s+(.+))?/i);
+    if(contentType && (contentType[1] == 'text' || contentType[1] == 'application')) {
+      resource.encoding = 'utf-8';
+    }
+  }
+
+  // remove unwanted headers:
+  delete resource.headers['content-length'];
+
+  // start receiving data:
+  if(resource.encoding) outStream.setEncoding(resource.encoding);
+  outStream.on('data', function(_chunk) {
+    resource.content += _chunk;
+  });
+
+  // when all data is received, store resource (dont know how this will handle more than one request)
+  outStream.on('end', function() {
+    STACK.push(resource);
+    _cb();
+  });
+}
+
+function prepareForwardRequest(_req) {
+  var urlObj = url.parse(_req.url);
+
+  var options = {
+    method: _req.method,
+    host: urlObj.host,
+    path: urlObj.path,
+    rejectUnauthorized: false,
+    headers: keysToLowerCase(_req.headers)
+  };
+
+  // Rewrite headers
+  options.headers['accept-encoding'] = 'gzip,deflate';
+  return options;
+}
+
+function passRequest(_req, _resp) {
+  log(LOG.INFO, 'Passing through ' + _req.method + ' request for ' + _req.url);
+
+  var urlObj = url.parse(_req.url);
+  var forward = (urlObj.protocol == 'https:' ? https : http).request({
+    method: _req.method,
+    host: urlObj.host,
+    path: urlObj.path,
+    headers: _req.headers
+  }, function(_fw_resp) {
+    // pipe response back untouched
+    _resp.writeHead(_fw_resp.statusCode, _fw_resp.headers);
+    _fw_resp.pipe(_resp);
+  });
+
+  _req.pipe(forward);
+}
+
+function captureRequest(_req, _resp, _useSSL) {
+  log(LOG.INFO, 'Forwarding ' + _req.method + ' request for ' + _req.url);
+
+  var urlObj = url.parse(_req.url);
+  var options = {
+    method: _req.method,
+    host: urlObj.host,
+    path: urlObj.path,
+    rejectUnauthorized: false,
+    headers: keysToLowerCase(_req.headers)
+  };
+
+  // Rewrite headers
+  options.headers['accept-encoding'] = 'gzip,deflate';
+  log(LOG.DEBUG, JSON.stringify(options));
+
+  var forward = (urlObj.protocol == 'https:' ? https : http).request(options, function(_fw_resp) {
+    cacheResponse(_req, _fw_resp, function() {
+      serveLastResource(_resp);
+    });
+  });
+
+  _req.pipe(forward); // forward request data
+}
+
+function replayRequest(_req, _resp) {
+  log(LOG.INFO, 'Resolving ' + _req.method + ' request for ' + _req.url);
+  resolveAndServeResource(_req, _resp);
+}
+
+function selectProxy() {
+  switch(MODE) {
+    case 'pass': return passRequest;
+    case 'capture': return captureRequest;
+    case 'replay': return replayRequest;
+    default: throw 'Invalid proxy mode';
+  }
+}
+
+var PROXY_FUN = selectProxy(),
+    SERVER = http.createServer(PROXY_FUN);
+
+// Special handler for HTTPS request, creates a dedicated HTTPS proxy per connection,
+// that way the CONNECT tunnel can be intercepted, requires support for self signed
+// certificates in the client.
+SERVER.on('connect', function (_req, _sock, _head) {
+
+  var urlObj = url.parse('http://' + _req.url);
+  log(LOG.INFO, 'New HTTPS request: starting https intercept on ' + urlObj.hostname);
+
+  var httpsServ = https.createServer(HTTPS_OPTIONS, function(_req, _resp) {
+    _req.url = 'https://' + urlObj.hostname + _req.url;
+    PROXY_FUN(_req, _resp);
+  });
+
+  httpsServ.listen(pickRandomPort());
+
+  var tunnelSock = net.connect(httpsServ.address().port, function() {
+    _sock.write('HTTP/1.1 200 Connection Established\r\n' +
+                'Proxy-agent: Node-Proxy\r\n' +
+                '\r\n');
+    tunnelSock.write(_head);
+    tunnelSock.pipe(_sock);
+    _sock.pipe(tunnelSock);
+  });
+
+  _sock.on('close', function() {
+    httpsServ.close();
+  });
+});
+
+console.log("Starting crabtrap! mode: " + MODE);
+
+if(MODE == 'replay') {
+  loadStackFrom(SOURCE, SERVER.listen.bind(SERVER, PORT));
+} else {
+  SERVER.listen(PORT);
+}
+
+var EXITING = false;
+process.on('SIGINT', function() {
+  if(EXITING) return;
+  EXITING = true;
+
+  console.log("Shutting down crabtrap!");
+  SERVER.close();
+  if(MODE == 'capture') {
+    saveStackTo(SOURCE, process.exit.bind(process));
+  } else {
+    process.exit();
+  }
+});
data/lib/crabfarm/cli.rb
CHANGED
@@ -10,14 +10,28 @@ module Crabfarm
   desc "Starts the crawler in console mode"
   command [:console, :c] do |c|
 
+    c.desc "Capture to crabtrap file"
+    c.flag :capture
+
+    c.desc "Replay from crabtrap file"
+    c.flag :replay
+
     Support::GLI.generate_options c
 
     c.action do |global_options,options,args|
       next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
 
-      require "crabfarm/modes/console"
       Crabfarm.config.set Support::GLI.parse_options options
-
+
+      next puts "Cannot use --replay with --capture" if options[:capture] and options[:replay]
+
+      require 'crabfarm/crabtrap_context'
+      context = Crabfarm::CrabtrapContext.new
+      context.capture options[:capture] if options[:capture]
+      context.replay options[:replay] if options[:replay]
+
+      require "crabfarm/modes/console"
+      Crabfarm::Modes::Console.start context
     end
   end
 
@@ -84,9 +98,29 @@ module Crabfarm
     end
   end
 
+  desc "Perform an HTTP recording for use in tests"
+  command [:record, :r] do |c|
+    c.action do |global_options, options, args|
+      next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
+
+      require "crabfarm/modes/recorder"
+      Crabfarm::Modes::Recorder.start args[0]
+    end
+  end
+
+  desc "Publish the crawler to a crabfarm cloud"
   command :publish do |c|
+    c.desc "Just list the files that are beign packaged"
+    c.switch :dry, :default_value => false
+
+    c.desc "Don't check for pending changes"
+    c.switch :unsafe, :default_value => false
+
     c.action do |global_options,options,args|
+      next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
 
+      require "crabfarm/modes/publisher"
+      Crabfarm::Modes::Publisher.publish CF_PATH, options
     end
   end
 
data/lib/crabfarm/configuration.rb
CHANGED
@@ -9,6 +9,7 @@ module Crabfarm
     [:output_builder, :string, 'Default json output builder used by states'],
     [:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
     [:log_path, :string, 'Path where logs should be stored'],
+    [:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
 
     # Default driver configuration parameters
     [:driver, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Webdriver to be user, common options: chrome, firefox, phantomjs, remote.'],
@@ -21,10 +22,14 @@ module Crabfarm
 
     # Phantom launcher configuration
     [:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
-    [:phantom_proxy, :string, 'Phantonjs proxy address, only for phantomjs driver.'],
     [:phantom_ssl, ['sslv3', 'sslv2', 'tlsv1', 'any'], 'Phantomjs ssl mode: sslv3, sslv2, tlsv1 or any, only for phantomjs driver.'],
     [:phantom_bin_path, :string, 'Phantomjs binary path, only for phantomjs driver.'],
-    [:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.']
+    [:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.'],
+
+    # Crabtrap launcher configuration
+    [:crabtrap_bin, :string, 'Crabtrap binary path.'],
+    [:crabtrap_port, :integer, 'Crabtrap port, defaults to 4000.'],
+    [:crabtrap_mode, ['capture', 'replay'], 'Crabtrap operation mode.']
   ]
   .map { |o| Option.new *o }
 
@@ -48,6 +53,7 @@ module Crabfarm
       output_builder: :hash,
       driver_factory: nil,
       log_path: 'logs',
+      proxy: nil,
 
       driver: 'phantomjs',
       driver_capabilities: Selenium::WebDriver::Remote::Capabilities.firefox,
@@ -58,10 +64,12 @@ module Crabfarm
       driver_window_height: 800,
 
       phantom_load_images: false,
-      phantom_proxy: nil,
       phantom_ssl: 'any',
       phantom_bin_path: 'phantomjs',
-      phantom_lock_file: nil
+      phantom_lock_file: nil,
+
+      crabtrap_bin: 'crabtrap',
+      crabtrap_port: 4000
     }
   end
 
@@ -79,6 +87,7 @@ module Crabfarm
   def driver_config
     {
       name: driver,
+      proxy: proxy,
      capabilities: driver_capabilities,
      remote_host: driver_remote_host,
      remote_timeout: driver_remote_timeout,
@@ -94,7 +103,7 @@ module Crabfarm
   def phantom_config
     {
       load_images: phantom_load_images,
-      proxy:
+      proxy: proxy,
       ssl: phantom_ssl,
       bin_path: phantom_bin_path,
       lock_file: phantom_lock_file,
@@ -102,9 +111,17 @@ module Crabfarm
     }
   end
 
-
-
-
+  def crabtrap_config
+    {
+      bin_path: crabtrap_bin,
+      port: crabtrap_port,
+      proxy: proxy
+    }
+  end
+
+  # Add enviroment support (like a Gemfile)
+  # group :test { set_driver :phantom }
+  # set_driver :phantom, group: :test
 
 end
 
data/lib/crabfarm/context.rb
CHANGED
@@ -7,23 +7,67 @@ module Crabfarm
     def_delegators :@pool, :driver
 
     def initialize
-      @pool = DriverBucketPool.new
       @store = StateStore.new
+      @loaded = false
+    end
+
+    def load
+      unless @loaded
+        init_phantom_if_required
+        @pool = DriverBucketPool.new build_driver_factory
+        @loaded = true
+      end
     end
 
     def run_state(_name, _params={})
+      load
       state = LoaderService.load_state(_name).new @pool, @store, _params
       state.crawl
       state
     end
 
     def reset
+      load
      @store.reset
      @pool.reset
    end
 
     def release
-      @
+      if @loaded
+        @pool.release
+        @phantom.stop unless @phantom.nil?
+        @loaded = false
+      end
+    end
+
+    private
+
+    def init_phantom_if_required
+      if config.phantom_mode_enabled?
+        @phantom = PhantomRunner.new phantom_config
+        @phantom.start
+      end
+    end
+
+    def build_driver_factory
+      if @phantom
+        PhantomDriverFactory.new @phantom, driver_config
+      else
+        return config.driver_factory if config.driver_factory
+        DefaultDriverFactory.new driver_config
+      end
+    end
+
+    def config
+      Crabfarm.config
+    end
+
+    def driver_config
+      config.driver_config
+    end
+
+    def phantom_config
+      config.phantom_config
     end
 
   end
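For orientation, a minimal sketch of the lazy lifecycle introduced above, assuming a booted crabfarm application; the state name and params are illustrative, not taken from this diff:

  context = Crabfarm::Context.new        # no drivers or phantom are started yet
  state   = context.run_state :login, depth: 1   # run_state calls load, which builds the driver pool on first use
  context.release                        # releases the pool, stops phantom if started, marks the context unloaded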
data/lib/crabfarm/crabtrap_context.rb
ADDED
@@ -0,0 +1,54 @@
+require 'active_support'
+require 'crabfarm/crabtrap_runner'
+
+module Crabfarm
+  class CrabtrapContext < Context
+
+    def load
+      pass_through if @runner.nil?
+      super
+    end
+
+    def pass_through
+      restart_with_options(mode: :pass) if @runner.nil? or @runner.mode != :pass
+    end
+
+    def capture(_path)
+      restart_with_options(mode: :capture, bucket_path: _path)
+    end
+
+    def replay(_path)
+      restart_with_options(mode: :replay, bucket_path: _path)
+    end
+
+    def release
+      super
+      stop_daemon
+    end
+
+    private
+
+    def restart_with_options(_options)
+      stop_daemon
+      @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(_options)
+      @runner.start
+    end
+
+    def stop_daemon
+      @runner.stop unless @runner.nil?
+    end
+
+    def driver_config
+      super.merge(proxy: proxy_address)
+    end
+
+    def phantom_config
+      super.merge(proxy: proxy_address)
+    end
+
+    def proxy_address
+      "127.0.0.1:#{@runner.port}"
+    end
+
+  end
+end
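A sketch of how the console command above drives this class; the memento path and state name are illustrative:

  require 'crabfarm/crabtrap_context'

  context = Crabfarm::CrabtrapContext.new
  context.replay 'spec/mementos/login.json.gz'   # or context.capture(path), or context.pass_through
  state = context.run_state :login               # drivers are built with proxy: "127.0.0.1:<crabtrap port>"
  context.release                                # releases drivers, then stops the crabtrap daemon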
data/lib/crabfarm/crabtrap_runner.rb
ADDED
@@ -0,0 +1,54 @@
+require 'net/http'
+
+module Crabfarm
+  class CrabtrapRunner
+
+    def initialize(_config={})
+      @config = _config;
+      @pid = nil
+    end
+
+    def port
+      @config[:port] # TODO: maybe select port dynamically...
+    end
+
+    def mode
+      @config.fetch(:mode, :pass).to_sym
+    end
+
+    def start
+      @pid = Process.spawn({}, crabtrap_cmd)
+      wait_for_server
+    end
+
+    def stop
+      unless @pid.nil?
+        Process.kill("INT", @pid)
+        Process.wait @pid
+        @pid = nil
+      end
+    end
+
+    private
+
+    def crabtrap_cmd
+      cmd = [@config[:bin_path]]
+      cmd << mode.to_s
+      cmd << @config[:bucket_path] if mode != :pass
+      cmd << "--port=#{port}"
+      cmd.join(' ')
+    end
+
+    def wait_for_server
+      loop do
+        begin
+          # TODO: improve waiting, making this kind of request could change crabtrap's stack
+          Net::HTTP.get_response(URI.parse("http://127.0.0.1:#{port}/status"))
+          break
+        rescue
+        end
+      end
+    end
+
+  end
+end
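A sketch of driving the runner directly, which CrabtrapContext normally does for you; the bucket path below is illustrative:

  runner = Crabfarm::CrabtrapRunner.new(
    bin_path: 'crabtrap',                        # the node script added under data/bin
    port: 4000,
    mode: :capture,                              # :pass, :capture or :replay
    bucket_path: 'spec/mementos/login.json.gz'   # required for :capture and :replay
  )
  runner.start   # spawns `crabtrap capture spec/mementos/login.json.gz --port=4000` and polls until it answers
  runner.stop    # SIGINT + wait; in capture mode crabtrap writes the bucket on shutdown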
data/lib/crabfarm/default_driver_factory.rb
CHANGED
@@ -7,33 +7,79 @@ module Crabfarm
 
     def build_driver(_session_id)
 
-
-
+      raise ConfigurationError.new 'must provide a webdriver type' unless config_present? :name
+      driver_name = @config[:name].to_sym
 
-      case driver_name
+      driver = case driver_name
       when :noop
         require "crabfarm/mocks/noop_driver"
         driver = Crabfarm::Mocks::NoopDriver.new # TODO: improve dummy driver...
       when :remote
-
-
-
+        load_remote_driver
+      when :firefox
+        load_firefox_driver
+      when :chrome
+        load_chrome_driver
+      else
+        load_other_driver driver_name
+      end
 
-
-
-        :http_client => client,
-        :desired_capabilities => @config[:capabilities]
-      }
+      # apply browser configuration to new driver
+      driver.manage.window.resize_to(@config[:window_width], @config[:window_height]) rescue nil
 
-
-
-
+      return driver
+    end
+
+    def load_remote_driver
+      client = Selenium::WebDriver::Remote::Http::Default.new
+      client.timeout = @config[:remote_timeout]
 
-
-
+      if config_present? :proxy
+        client.proxy = Selenium::WebDriver::Proxy.new({
+          :http => @config[:proxy],
+          :ssl => @config[:proxy]
+        })
       end
 
-
+      Selenium::WebDriver.for(:remote, {
+        :url => @config[:remote_host],
+        :http_client => client,
+        :desired_capabilities => @config[:capabilities]
+      })
+    end
+
+    def load_firefox_driver
+      profile = Selenium::WebDriver::Firefox::Profile.new
+
+      if config_present? :proxy
+        profile.proxy = Selenium::WebDriver::Proxy.new({
+          :http => @config[:proxy],
+          :ssl => @config[:proxy]
+        })
+      end
+
+      Selenium::WebDriver.for :firefox, :profile => profile
+    end
+
+    def load_chrome_driver
+      switches = []
+
+      if config_present? :proxy
+        switches << "--proxy-server=#{@config[:proxy]}"
+        switches << "--ignore-certificate-errors"
+      end
+
+      Selenium::WebDriver.for :chrome, :switches => switches
+    end
+
+    def load_other_driver(_name)
+      raise ConfigurationError.new 'default driver does not support proxy' if config_present? :proxy
+
+      Selenium::WebDriver.for _name.to_sym
+    end
+
+    def config_present?(_key)
+      not (@config[_key].nil? or @config[_key].empty?)
    end
 
   end
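For reference, a sketch of building a proxied driver through the reworked factory; the literal hash below stands in for Crabfarm.config.driver_config and its values are illustrative:

  factory = Crabfarm::DefaultDriverFactory.new(
    name: 'chrome',
    proxy: '127.0.0.1:4000',     # e.g. a running crabtrap instance
    window_width: 1280,
    window_height: 800
  )
  driver = factory.build_driver :default_driver
  # With :proxy set, the chrome branch adds --proxy-server and --ignore-certificate-errors switches.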
data/lib/crabfarm/driver_bucket_pool.rb
CHANGED
@@ -1,17 +1,15 @@
 module Crabfarm
   class DriverBucketPool
 
-    def initialize
+    def initialize(_factory=nil)
+      @factory = _factory || DefaultDriverFactory.new(Crabfarm.config.driver_config)
       @buckets = Hash.new
-      @phantom = nil
-
-      init_phantom_if_required
     end
 
     def driver(_session_id=nil)
       _session_id ||= :default_driver
       bucket = @buckets[_session_id.to_sym]
-      bucket = @buckets[_session_id.to_sym] = DriverBucket.new(_session_id,
+      bucket = @buckets[_session_id.to_sym] = DriverBucket.new(_session_id, @factory) if bucket.nil?
       bucket
     end
 
@@ -22,29 +20,6 @@ module Crabfarm
 
     def release
       reset
-      @phantom.stop unless @phantom.nil?
-    end
-
-    private
-
-    def init_phantom_if_required
-      if config.phantom_mode_enabled?
-        @phantom = PhantomRunner.new config.phantom_config
-        @phantom.start
-      end
-    end
-
-    def build_driver_factory
-      if config.phantom_mode_enabled?
-        PhantomDriverFactory.new @phantom, config.driver_config
-      else
-        return config.driver_factory if config.driver_factory
-        DefaultDriverFactory.new config.driver_config
-      end
-    end
-
-    def config
-      Crabfarm.config
     end
 
  end
data/lib/crabfarm/modes/console.rb
CHANGED
@@ -10,20 +10,19 @@ module Crabfarm
 
     class ConsoleDsl
 
-
-
-      def initialize
-        reload!
+      def initialize(_context)
+        @context = _context
       end
 
       def reload!
-
-
-
-
-      end
+        puts "Reloading crawler source".color(:green)
+        ActiveSupport::Dependencies.clear
+        @context.reset
+      end
 
-
+      def reset
+        puts "Resetting crawling context".color(:green)
+        @context.reset
       end
 
       def transition(_name=nil, _params={})
@@ -53,17 +52,12 @@ module Crabfarm
         puts "Ejem..."
       end
 
-      def reset
-        puts "Resetting crawling context".color(:green)
-        @context.reset
-      end
-
       alias :t :transition
       alias :r :reset
     end
 
-    def self.start
-      dsl = ConsoleDsl.new
+    def self.start(_context)
+      dsl = ConsoleDsl.new _context
 
       loop do
         begin
@@ -78,7 +72,7 @@ module Crabfarm
      end
 
      puts "Releasing crawling context".color(:green)
-
+      _context.release
    end
 
  end
data/lib/crabfarm/modes/generator.rb
CHANGED
@@ -20,6 +20,7 @@ module Crabfarm
       path(_name, 'Gemfile').render('Gemfile', binding)
       path(_name, 'Crabfile').render('Crabfile', binding)
       path(_name, '.rspec').render('dot_rspec', binding)
+      path(_name, '.crabfarm').render('dot_crabfarm', binding)
       path(_name, 'boot.rb').render('boot.rb', binding)
       path(_name, 'bin', 'crabfarm').render('crabfarm_bin', binding, 0755)
       path(_name, 'app', 'parsers', '.gitkeep').render('dot_gitkeep')
@@ -27,6 +28,8 @@ module Crabfarm
       path(_name, 'app', 'helpers', '.gitkeep').render('dot_gitkeep')
       path(_name, 'spec', 'spec_helper.rb').render('spec_helper.rb', binding)
       path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
+      path(_name, 'spec', 'mementos', '.gitkeep').render('dot_gitkeep')
+      path(_name, 'spec', 'integration', '.gitkeep').render('dot_gitkeep')
     end
   end
 
data/lib/crabfarm/modes/publisher.rb
ADDED
@@ -0,0 +1,189 @@
+require 'yaml'
+require 'git'
+require 'zlib'
+require 'rubygems/package'
+require 'net/http/post/multipart'
+require 'rainbow'
+require 'rainbow/ext/string'
+require 'digest/sha1'
+
+module Crabfarm
+  module Modes
+    module Publisher
+      extend self
+
+      DEFAULT_HOST = 'http://www.crabfarm.io'
+
+      def publish(_path, _options={})
+
+        @crawler_path = _path
+        @options = _options
+
+        load_config
+        return unless dry_run or authenticated?
+        detect_git_repo
+
+        if inside_git_repo?
+          if not unsafe and is_tree_dirty?
+            puts "Aborting: Your working copy has uncommited changes! Use the --unsafe option to force.".color(:red)
+            return
+          end
+          load_files_from_git
+        else
+          load_files_from_fs
+        end
+
+        build_package
+        compress_package
+        generate_signature
+
+        send_package unless dry_run
+      end
+
+      private
+
+      def dry_run
+        @options.fetch(:dry, false)
+      end
+
+      def unsafe
+        @options.fetch(:unsafe, false)
+      end
+
+      def config_path
+        File.join(@crawler_path, '.crabfarm')
+      end
+
+      def home_config_path
+        File.join(Dir.home, '.crabfarm')
+      end
+
+      def load_config
+        config = YAML.load_file config_path
+
+        if File.exists? home_config_path
+          home_config = YAML.load_file home_config_path
+          config = home_config.merge config
+        end
+
+        @token = config['token']
+        @name = config['name']
+        @host = config['host'] || DEFAULT_HOST
+        @include = config['files']
+      end
+
+      def authenticated?
+        # TODO: if no token, ask for credentials and fetch token
+        if @token.nil? or @token.empty?
+          puts "No crabfarm API token has been provided".color(:red)
+          return false
+        end
+
+        true
+      end
+
+      def is_tree_dirty?
+        @git.chdir do
+          status = @git.status
+          (status.changed.count + status.added.count + status.deleted.count + status.untracked.count) > 0
+        end
+      end
+
+      def detect_git_repo
+        git_path = @crawler_path
+
+        path_to_git = []
+        while git_path != '/'
+          if File.exists? File.join(git_path, '.git')
+            @git = Git.open git_path
+            @rel_path = if path_to_git.count > 0 then File.join(*path_to_git.reverse!) else nil end
+            return
+          else
+            path_to_git << File.basename(git_path)
+            git_path = File.expand_path('..', git_path)
+          end
+        end
+
+        @git = nil
+      end
+
+      def inside_git_repo?
+        not @git.nil?
+      end
+
+      def load_files_from_git
+        @git.chdir do
+          @ref = @git.log.first.sha
+          puts "Packaging files from current HEAD (#{@ref}):".color(:green)
+          entries = @git.gtree(@ref).full_tree.map(&:split)
+          entries = entries.select { |e| e[1] == 'blob' }
+
+          @file_list = []
+          entries.each do |entry|
+            path = unless @rel_path.nil?
+              next unless entry[3].starts_with? @rel_path
+              entry[3][@rel_path.length+1..-1]
+            else entry[3] end
+
+            if @include.any? { |p| File.fnmatch? p, path }
+              @file_list << [path, entry[0].to_i(8), @git.show(@ref, entry[3])]
+            end
+          end
+        end
+      end
+
+      def load_files_from_fs
+        puts "Packaging files (no version control)".color(:green)
+        @file_list = Dir[*@include].map do |path|
+          full_path = File.join(@crawler_path, path)
+          [path, File.stat(full_path).mode, File.read(full_path)]
+        end
+        @ref = "filesystem"
+      end
+
+      def build_package
+        @package = StringIO.new("")
+        Gem::Package::TarWriter.new(@package) do |tar|
+          @file_list.each do |f|
+            puts "+ #{f[0]} - #{f[1]}"
+            path, mode, contents = f
+            tar.add_file(path, mode) { |tf| tf.write contents }
+          end
+        end
+
+        @package.rewind
+      end
+
+      def compress_package
+        @cpackage = StringIO.new("")
+        writer = Zlib::GzipWriter.new(@cpackage)
+        writer.write @package.string
+        writer.close
+      end
+
+      def generate_signature
+        @signature = Digest::SHA1.hexdigest @package.string
+        puts "Package SHA1: #{@signature}"
+      end
+
+      def send_package
+        url = URI.join(@host, 'api/crawlers/', @name)
+
+        req = Net::HTTP::Put::Multipart.new(url.path, {
+          "repo" => UploadIO.new(StringIO.new(@cpackage.string), "application/x-gzip", "tree.tar.gz"),
+          "sha" => @signature,
+          "ref" => @ref
+        }, {
+          'X-Api-Token' => @token
+        })
+
+        res = Net::HTTP.start(url.host, url.port) do |http|
+          http.request(req)
+        end
+
+        puts res.body
+      end
+
+    end
+  end
+end
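A sketch of invoking the publisher directly, which is what the CLI's publish command does; it expects a .crabfarm YAML file providing 'token', 'name', 'files' and optionally 'host' (Dir.pwd stands in for the app's CF_PATH):

  require 'crabfarm/modes/publisher'

  Crabfarm::Modes::Publisher.publish Dir.pwd, dry: true     # only list the files that would be packaged
  Crabfarm::Modes::Publisher.publish Dir.pwd, unsafe: true  # skip the dirty-working-tree check and upload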
data/lib/crabfarm/modes/recorder.rb
ADDED
@@ -0,0 +1,42 @@
+require 'rainbow'
+require 'rainbow/ext/string'
+require 'crabfarm/crabtrap_runner'
+
+module Crabfarm
+  module Modes
+    class Recorder
+
+      def self.start(_target)
+        return puts "Must provide a recording name" unless _target.is_a? String
+
+        crabtrap_config = Crabfarm.config.crabtrap_config
+        crabtrap_config[:mode] = :capture
+        crabtrap_config[:bucket_path] = File.join(CF_PATH, 'spec/mementos', _target + '.json.gz')
+
+        crabtrap = CrabtrapRunner.new crabtrap_config
+        crabtrap.start
+
+        driver_config = Crabfarm.config.driver_config
+        driver_config[:name] = :firefox
+        driver_config[:proxy] = "127.0.0.1:#{crabtrap.port}"
+
+        driver = DefaultDriverFactory.new(driver_config).build_driver nil
+
+        begin
+          puts "Press Ctrl-C to stop capturing."
+          loop do
+            driver.current_url
+            sleep 1.0
+          end
+        rescue Selenium::WebDriver::Error::WebDriverError, SystemExit, Interrupt
+          # noop
+        end
+
+        puts "Releasing crawling context".color(:green)
+        driver.quit rescue nil
+        crabtrap.stop
+      end
+
+    end
+  end
+end
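A sketch of what the new `record` command boils down to; the 'login' name is illustrative and CF_PATH is assumed to be defined by the app's boot file:

  require 'crabfarm/modes/recorder'
  Crabfarm::Modes::Recorder.start 'login'   # firefox behind crabtrap, traffic captured into spec/mementos/login.json.gz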
data/lib/crabfarm/rspec.rb
CHANGED
@@ -1,4 +1,7 @@
-
+require 'crabfarm/crabtrap_context'
+
+CF_TEST_CONTEXT = Crabfarm::CrabtrapContext::new
+CF_TEST_CONTEXT.load
 CF_TEST_BUCKET = CF_TEST_CONTEXT.driver
 
 module Crabfarm
@@ -15,6 +18,28 @@ module Crabfarm
       CF_TEST_BUCKET.parse(described_class, _options)
     end
 
+    def crawl(_state=nil, _params={})
+      if _state.is_a? Hash
+        _params = _state
+        _state = nil
+      end
+
+      if _state.nil?
+        return nil unless described_class < BaseState # TODO: maybe raise an error here.
+        @state = @last_state = CF_TEST_CONTEXT.run_state(described_class, _params)
+      else
+        @last_state = CF_TEST_CONTEXT.run_state(_state, _params)
+      end
+    end
+
+    def state
+      @state ||= crawl
+    end
+
+    def last_state
+      @last_state
+    end
+
     def parser
       @parser
     end
@@ -26,9 +51,16 @@ RSpec.configure do |config|
   config.include Crabfarm::RSpec
 
   config.before(:example) do |example|
+
     if example.metadata[:parsing]
       @parser = parse example.metadata[:parsing], example.metadata[:using] || {}
     end
+
+    if example.metadata[:crawling]
+      CF_TEST_CONTEXT.replay File.join(CF_PATH, 'spec/mementos', example.metadata[:crawling] + '.json.gz')
+    else
+      CF_TEST_CONTEXT.pass_through
+    end
   end
 
   config.after(:suite) do
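A sketch of a spec using the new helpers; LoginState and the 'login' memento are hypothetical names, not taken from this diff:

  describe LoginState, crawling: 'login' do   # replays spec/mementos/login.json.gz through crabtrap
    it "runs the described state" do
      crawl foo: 'bar'                        # a bare hash is treated as the state params
      expect(last_state).not_to be_nil
    end
  end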
data/lib/crabfarm/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: crabfarm
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.10
 platform: ruby
 authors:
 - Ignacio Baixas
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-02-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: jbuilder
@@ -142,6 +142,34 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 2.10.2
+- !ruby/object:Gem::Dependency
+  name: git
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: multipart-post
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -287,6 +315,7 @@ email:
 - ignacio@platan.us
 executables:
 - crabfarm
+- crabtrap
 extensions: []
 extra_rdoc_files: []
 files:
@@ -301,6 +330,8 @@ files:
 - lib/crabfarm/cli.rb
 - lib/crabfarm/configuration.rb
 - lib/crabfarm/context.rb
+- lib/crabfarm/crabtrap_context.rb
+- lib/crabfarm/crabtrap_runner.rb
 - lib/crabfarm/default_driver_factory.rb
 - lib/crabfarm/driver_bucket.rb
 - lib/crabfarm/driver_bucket_pool.rb
@@ -314,6 +345,8 @@ files:
 - lib/crabfarm/mocks/noop_driver.rb
 - lib/crabfarm/modes/console.rb
 - lib/crabfarm/modes/generator.rb
+- lib/crabfarm/modes/publisher.rb
+- lib/crabfarm/modes/recorder.rb
 - lib/crabfarm/modes/server.rb
 - lib/crabfarm/phantom_driver_factory.rb
 - lib/crabfarm/phantom_runner.rb
@@ -325,6 +358,7 @@ files:
 - lib/crabfarm/templates/boot.rb.erb
 - lib/crabfarm/templates/crabfarm_bin.erb
 - lib/crabfarm/templates/Crabfile.erb
+- lib/crabfarm/templates/dot_crabfarm.erb
 - lib/crabfarm/templates/dot_gitignore.erb
 - lib/crabfarm/templates/dot_gitkeep.erb
 - lib/crabfarm/templates/dot_rspec.erb
@@ -337,6 +371,7 @@ files:
 - lib/crabfarm/version.rb
 - lib/crabfarm.rb
 - bin/crabfarm
+- bin/crabtrap
 homepage: https://github.com/platanus/crabfarm-gem
 licenses:
 - MIT