crabfarm 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/crabtrap +347 -0
- data/lib/crabfarm/cli.rb +36 -2
- data/lib/crabfarm/configuration.rb +25 -8
- data/lib/crabfarm/context.rb +46 -2
- data/lib/crabfarm/crabtrap_context.rb +54 -0
- data/lib/crabfarm/crabtrap_runner.rb +54 -0
- data/lib/crabfarm/default_driver_factory.rb +63 -17
- data/lib/crabfarm/driver_bucket_pool.rb +3 -28
- data/lib/crabfarm/modes/console.rb +12 -18
- data/lib/crabfarm/modes/generator.rb +3 -0
- data/lib/crabfarm/modes/publisher.rb +189 -0
- data/lib/crabfarm/modes/recorder.rb +42 -0
- data/lib/crabfarm/rspec.rb +33 -1
- data/lib/crabfarm/templates/dot_crabfarm.erb +9 -0
- data/lib/crabfarm/templates/dot_gitignore.erb +0 -1
- data/lib/crabfarm/version.rb +1 -1
- metadata +37 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1031e279aeab473f8e46469b3f91383a4dffbd70
|
4
|
+
data.tar.gz: f366d36831117570d65b999de947bb5d6af0d0d1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a5e46aeefc0f4fe6c96ad2f5d02ee312c826a5a724e20b9797818ddb4c876245b6e58331a85feaafc48ab7b7f97f38c3dd1fd497705c72020559bf84ee8c22e4
|
7
|
+
data.tar.gz: df9dd5a3874ef38e462d1adfcf8c4ae92bf4b8f7ed0cfda6456d7e3ed6a1ceb2b791e1dd6f89988bec33bdf8d3fa9ef7db86c38515c43ae1de47aa314876063e
|
data/bin/crabtrap
ADDED
@@ -0,0 +1,347 @@
|
|
1
|
+
#!/usr/bin/env node
|
2
|
+
|
3
|
+
var net = require('net'),
|
4
|
+
http = require('http'),
|
5
|
+
https = require('https'),
|
6
|
+
url = require('url'),
|
7
|
+
fs = require('fs'),
|
8
|
+
zlib = require('zlib');
|
9
|
+
|
10
|
+
// Globals
|
11
|
+
|
12
|
+
// TLS credentials handed to `https.createServer` by the CONNECT handler below.
// NOTE(review): the private key is embedded in the published source, so it gives
// no confidentiality; it only lets the proxy terminate intercepted HTTPS tunnels.
// Clients must be configured to accept this self-signed certificate.
var HTTPS_OPTIONS = {
  // PEM-encoded RSA private key matching `cert`.
  key: '-----BEGIN RSA PRIVATE KEY-----\nMIIBOQIBAAJBAK/L/lXb/kxUzve1olo71s6mQLvuQCm3z2wqClq71NLerFnaXpN+\nFrNPy7+R3gZ1hdWXqbN5NqpWDMM9fcbd7p0CAwEAAQJAUDImN3Lhgl7Z/+TLSJCt\nwJ3VQCZC/QUOSdCv4o53Wy5aL/n8ootYFC3eoFC2Nal5bnH6onP9YR+X9l3HKLaT\n3QIhANXwb5SvJ+Kewa8F5wNHo9LFjSbL7WSSb1MyvYnOeFlPAiEA0lvaLz6UXRDL\n6T6Z1fkF0exmQqVimeL5qjY5o9Gk5lMCH1A52Z3oEQzqe7cmf3q7YrOnYUcrMdqF\nDzojzO/gfUECIQCe9fImiW+r9CljFH9Dhm6zd6S+8CNWjoKD8X4VITMvKQIgb3sg\nq9gPVzXn/+f8Qcc2KILSh3ffkIpA8yJK9omUIxI=\n-----END RSA PRIVATE KEY-----\n',
  // PEM-encoded certificate presented to clients of the intercepting server.
  cert: '-----BEGIN CERTIFICATE-----\nMIIBmDCCAUICCQDGtiGKgI9AXjANBgkqhkiG9w0BAQUFADBTMQswCQYDVQQGEwJD\nTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNhbnRpYWdvMREwDwYDVQQKEwhQbGF0\nYW51czERMA8GA1UEAxMIQ3JhYnRyYXAwHhcNMTUwMTE1MjAxNzMzWhcNNDIwNjAx\nMjAxNzMzWjBTMQswCQYDVQQGEwJDTDELMAkGA1UECBMCUk0xETAPBgNVBAcTCFNh\nbnRpYWdvMREwDwYDVQQKEwhQbGF0YW51czERMA8GA1UEAxMIQ3JhYnRyYXAwXDAN\nBgkqhkiG9w0BAQEFAANLADBIAkEAr8v+Vdv+TFTO97WiWjvWzqZAu+5AKbfPbCoK\nWrvU0t6sWdpek34Ws0/Lv5HeBnWF1Zeps3k2qlYMwz19xt3unQIDAQABMA0GCSqG\nSIb3DQEBBQUAA0EAmecqIZqQ8OXSIj0V2VKaIXwz8RBnhLzU7BJwcsWJE/Bex7zB\nWP+vLv9ML5ZRLCsXjL5IOav8qAX/NZXjoN3e3Q==\n-----END CERTIFICATE-----\n'
};
|
16
|
+
|
17
|
+
// Log severities; `log()` prints a message when its level is >= LOG_LEVEL.
var LOG = { DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3 };

// Recorded resources; the most recently served or captured one is kept last.
var STACK = [];
// Proxy mode ('pass', 'capture' or 'replay'); assigned from argv at startup.
var MODE = false;
// Bucket file read in replay mode and written in capture mode.
var SOURCE = null;
// Port the proxy listens on unless overridden with --port.
var PORT = 4000;
// Minimum severity that gets printed.
var LOG_LEVEL = LOG.WARN;
|
29
|
+
|
30
|
+
// Parses the command line: crabtrap <mode> [bucket_path] [--port=N] [--quiet]
// and stores the result in the MODE, SOURCE, PORT and LOG_LEVEL globals.
// Throws an Error when a required argument is missing or an option is unknown.
(function() {
  // argv[0] is the node binary and argv[1] the script path, so the first user
  // argument (the mode) lives at index 2.
  if(process.argv.length < 3) throw new Error('Must provide a proxy mode');
  MODE = process.argv[2];
  var i = 3;

  // Every mode except 'pass' needs a bucket file as its second argument.
  if(MODE != 'pass') {
    if(process.argv.length < 4) throw new Error('Must provide a bucket path');
    SOURCE = process.argv[3];
    i = 4;
  }

  for(; i < process.argv.length; i++) {
    var parts = process.argv[i].split('=');
    switch(parts[0]) {
      case '--port':
        PORT = parseInt(parts[1], 10);
        if(isNaN(PORT)) throw new Error('Invalid port ' + parts[1]);
        break;
      // NOTE(review): this option used to overwrite PORT (copy/paste of --port).
      // Raising the log threshold is the assumed intent - confirm.
      case '--quiet': LOG_LEVEL = LOG.ERROR; break;
      default: throw new Error('Invalid option ' + parts[0]);
    }
  }
})();
|
50
|
+
|
51
|
+
// Utility methods
|
52
|
+
|
53
|
+
// Prints _message to stdout when _level reaches the configured LOG_LEVEL.
// Debug messages are indented with a tab so they nest under the info line.
function log(_level, _message) {
  var text = (_level == LOG.DEBUG) ? '\t' + _message : _message;
  if(!(_level >= LOG_LEVEL)) return;
  console.log(text);
}
|
57
|
+
|
58
|
+
// Calls _cb(key, value) for every own enumerable property of _obj,
// skipping anything inherited through the prototype chain.
function forOwn(_obj, _cb) {
  var name;
  for(name in _obj) {
    if(!_obj.hasOwnProperty(name)) continue;
    _cb(name, _obj[name]);
  }
}
|
65
|
+
|
66
|
+
// Returns a shallow copy of _obj whose own property names are lower-cased.
// Used to normalise HTTP header maps before they are inspected or stored.
function keysToLowerCase(_obj) {
  var lowered = {}, name;
  for(name in _obj) {
    if(_obj.hasOwnProperty(name)) lowered[name.toLowerCase()] = _obj[name];
  }
  return lowered;
}
|
71
|
+
|
72
|
+
// Port passed to `listen()` by the per-connection HTTPS interceptor.
// Always returns 0; Node's `server.listen` documents port 0 as "let the
// operating system assign an unused port".
function pickRandomPort() {
  return 0; // This could fail on Linux...
}
|
75
|
+
|
76
|
+
// Exact matcher: true when the request has the same HTTP method
// (case-insensitive) and exactly the same URL as the stored resource.
function matchRequestToResource(_req, _resource) {
  var sameMethod = _resource.method.toLowerCase() == _req.method.toLowerCase();
  return sameMethod && _resource.url == _req.url;
}
|
79
|
+
|
80
|
+
// Loose matcher used as a fallback by findInStack: true when the request has
// the same HTTP method (case-insensitive), hostname and path as the stored
// resource. The query string is ignored.
function matchRequestToResourceWOQuery(_req, _resource) {
  // Different methods can never match. The original test was inverted and
  // rejected exactly the requests whose method DID match.
  if(_resource.method.toLowerCase() != _req.method.toLowerCase()) return false;

  var reqUrl = url.parse(_req.url),
      resUrl = url.parse(_resource.url);

  return reqUrl.hostname == resUrl.hostname && reqUrl.pathname == resUrl.pathname;
}
|
88
|
+
|
89
|
+
// Finds the first element of _array accepted by _matches(_req, element), moves
// it to the end of the array and returns it; returns null when nothing matches.
// NOTE(review): a second `findAndMoveLast(_req, _matches)` is declared further
// down this script; with two declarations of the same name the later one is
// the one that ends up bound, so this variant looks unused - confirm.
function findAndMoveLast(_req, _array, _matches) {
  var total = _array.length, idx = 0, found;
  while(idx < total) {
    if(_matches(_req, _array[idx])) {
      found = _array.splice(idx, 1)[0];
      _array.push(found);
      return found;
    }
    idx++;
  }

  return null;
}
|
100
|
+
|
101
|
+
// Reads the gzipped JSON bucket at _path into STACK, then calls _then().
// A missing file still throws from readFileSync. A bucket that cannot be
// decompressed or parsed is reported on stderr and STACK is left untouched,
// so _then() always runs.
function loadStackFrom(_path, _then) {
  var data = fs.readFileSync(_path);
  zlib.gunzip(data, function(err, buffer) {
    if(!err) {
      try {
        STACK = JSON.parse(buffer.toString());
      } catch(parseErr) {
        err = parseErr;
      }
    }

    // Report the failure instead of silently starting with an empty stack.
    if(err) console.error('Could not load crabtrap bucket ' + _path + ': ' + err.message);
    _then();
  });
}
|
108
|
+
|
109
|
+
// Serialises STACK as gzipped JSON into _path, then calls _then().
// Compression and write failures are reported on stderr, so _then() (the
// process exit during shutdown) always runs.
function saveStackTo(_path, _then) {
  var data = JSON.stringify(STACK);
  zlib.gzip(data, function(err, buffer) {
    if(!err) {
      try {
        fs.writeFileSync(_path, buffer);
      } catch(writeErr) {
        err = writeErr;
      }
    }

    if(err) console.error('Could not save crabtrap bucket ' + _path + ': ' + err.message);
    _then();
  });
}
|
116
|
+
|
117
|
+
// Looks the request up in the recorded stack and replays the stored response;
// answers with an empty 404 when no recorded resource matches.
function resolveAndServeResource(_req, _resp) {
  var match = findInStack(_req);

  if(!match) {
    log(LOG.WARN, 'Not found: ' + _req.url);
    _resp.statusCode = 404;
    _resp.end();
    return;
  }

  log(LOG.INFO, "Serving: " + match.method + ' ' + match.url);
  log(LOG.DEBUG, "HTTP " + match.statusCode);
  log(LOG.DEBUG, JSON.stringify(match.headers));

  serveResource(match, _resp);
}
|
131
|
+
|
132
|
+
// Replays the resource currently sitting at the end of STACK. captureRequest
// calls this right after cacheResponse has pushed the fresh capture.
function serveLastResource(_resp) {
  var newest = STACK[STACK.length - 1];
  serveResource(newest, _resp);
}
|
135
|
+
|
136
|
+
// Writes a recorded resource to the client: status code, every stored header
// and, when there is content, the body decoded from the resource's own
// encoding ('utf-8' or 'base64', as chosen by cacheResponse).
function serveResource(_resource, _resp) {
  _resp.statusCode = _resource.statusCode;

  forOwn(_resource.headers, function(k, v) { _resp.setHeader(k, v); });

  if(_resource.content) {
    // Buffer.from replaces the deprecated `new Buffer(string, encoding)`.
    var buf = Buffer.from(_resource.content, _resource.encoding);
    _resp.end(buf);
  } else {
    _resp.end();
  }
}
|
148
|
+
|
149
|
+
// Scans STACK for the first resource accepted by _matches(_req, resource),
// moves it to the end of STACK and returns it; returns null when none matches.
function findAndMoveLast(_req, _matches) {
  var total = STACK.length, idx = 0, found;
  while(idx < total) {
    if(_matches(_req, STACK[idx])) {
      found = STACK.splice(idx, 1)[0];
      STACK.push(found);
      return found;
    }
    idx++;
  }

  return null;
}
|
160
|
+
|
161
|
+
// Resolves a request against STACK: tries an exact method + URL match first
// and falls back to the matcher that ignores the query string.
// `_partial` is accepted but not used.
function findInStack(_req, _partial) {
  var exact = findAndMoveLast(_req, matchRequestToResource);
  if(exact) return exact;
  return findAndMoveLast(_req, matchRequestToResourceWOQuery);
}
|
165
|
+
|
166
|
+
// Tells whether a Content-Type header value describes text that can safely be
// stored as a UTF-8 string: text/*, or a textual application/* subtype, with
// no charset parameter or a UTF-8 / US-ASCII one.
function isUtf8TextContentType(_contentType) {
  var parts = String(_contentType).split(';'),
      mime = parts[0].trim().toLowerCase().match(/^([^\/\s]+)\/([^\s]+)$/),
      charset = /charset\s*=\s*"?([^";\s]+)/i.exec(parts.slice(1).join(';'));

  if(!mime) return false;
  if(charset && !/^(?:utf-?8|us-ascii)$/i.test(charset[1])) return false;
  if(mime[1] == 'text') return true;

  return mime[1] == 'application' &&
    /^(?:json|xml|(?:x-)?javascript|ecmascript|x-www-form-urlencoded|.+\+(?:json|xml))$/.test(mime[2]);
}

// Reads an upstream response completely, pushes it onto STACK as a resource
// ({ url, statusCode, method, headers, content, encoding }) and then calls _cb().
// gzip/deflate bodies are stored decompressed and lose their content-encoding
// header; content-length is always dropped because the stored body may differ
// in size. Textual bodies are stored as UTF-8 strings, everything else as base64.
function cacheResponse(_req, _resp, _cb) {

  log(LOG.INFO, "Caching Response");
  log(LOG.DEBUG, "HTTP " + _resp.statusCode);
  log(LOG.DEBUG, JSON.stringify(keysToLowerCase(_resp.headers)));

  // TODO: consider storing port and protocol in the resource.
  var resource = {
        url: _req.url,
        statusCode: _resp.statusCode,
        method: _req.method,
        // inHeaders: req.headers, // store request headers to aid in recognition?
        headers: keysToLowerCase(_resp.headers),
        content: '',
        encoding: 'base64'
      },
      contentEncoding = resource.headers['content-encoding'],
      contentType = resource.headers['content-type'],
      outStream = _resp;

  // add decompression if supported encoding:
  if(contentEncoding == 'gzip') {
    outStream = _resp.pipe(zlib.createGunzip());
    delete resource.headers['content-encoding'];
    contentEncoding = null;
  } else if(contentEncoding == 'deflate') {
    outStream = _resp.pipe(zlib.createInflate());
    delete resource.headers['content-encoding'];
    contentEncoding = null;
  }

  // use utf8 encoding for uncompressed text only: decoding binary payloads
  // (application/octet-stream, application/pdf, ...) or other charsets as
  // UTF-8 would replace bytes and corrupt the recording.
  if(!contentEncoding && contentType && isUtf8TextContentType(contentType)) {
    resource.encoding = 'utf-8';
  }

  // remove unwanted headers:
  delete resource.headers['content-length'];

  // start receiving data; setEncoding makes every chunk arrive as a string in
  // the storage encoding, so chunks can simply be concatenated:
  if(resource.encoding) outStream.setEncoding(resource.encoding);
  outStream.on('data', function(_chunk) {
    resource.content += _chunk;
  });

  // when all data is received, store resource (dont know how this will handle more than one request)
  outStream.on('end', function() {
    STACK.push(resource);
    _cb();
  });
}
|
221
|
+
|
222
|
+
// Builds the http(s).request options used to forward _req upstream.
// Returns { method, hostname, port, path, rejectUnauthorized, headers }.
// Header names are lower-cased and accept-encoding is forced to gzip,deflate,
// the two encodings cacheResponse can decompress.
function prepareForwardRequest(_req) {
  var urlObj = url.parse(_req.url);

  var options = {
    method: _req.method,
    // `urlObj.host` includes ":port", which http.request would treat as part
    // of the host name; pass name and port separately instead.
    hostname: urlObj.hostname,
    port: urlObj.port,
    path: urlObj.path,
    rejectUnauthorized: false,
    headers: keysToLowerCase(_req.headers)
  };

  // Rewrite headers
  options.headers['accept-encoding'] = 'gzip,deflate';
  return options;
}
|
237
|
+
|
238
|
+
// 'pass' mode handler: forwards the request upstream unchanged and pipes the
// upstream response (status, headers, body) straight back to the client.
// Nothing is recorded.
function passRequest(_req, _resp) {
  log(LOG.INFO, 'Passing through ' + _req.method + ' request for ' + _req.url);

  var urlObj = url.parse(_req.url);
  var forward = (urlObj.protocol == 'https:' ? https : http).request({
    method: _req.method,
    // name and port are passed separately: `urlObj.host` carries ":port" and
    // would make the lookup fail for URLs with an explicit port.
    hostname: urlObj.hostname,
    port: urlObj.port,
    path: urlObj.path,
    headers: _req.headers
  }, function(_fw_resp) {
    // pipe response back untouched
    _resp.writeHead(_fw_resp.statusCode, _fw_resp.headers);
    _fw_resp.pipe(_resp);
  });

  // Without a listener, a refused or reset upstream connection raises an
  // unhandled 'error' event and takes the whole proxy down.
  forward.on('error', function(_err) {
    log(LOG.WARN, 'Upstream request for ' + _req.url + ' failed: ' + _err.message);
    if(!_resp.headersSent) _resp.statusCode = 502;
    _resp.end();
  });

  _req.pipe(forward);
}
|
255
|
+
|
256
|
+
// 'capture' mode handler: forwards the request upstream, records the response
// on STACK through cacheResponse and then serves the client from that recording.
// Upstream certificates are not verified. `_useSSL` is accepted but not used.
function captureRequest(_req, _resp, _useSSL) {
  log(LOG.INFO, 'Forwarding ' + _req.method + ' request for ' + _req.url);

  var urlObj = url.parse(_req.url);
  var options = {
    method: _req.method,
    // name and port are passed separately: `urlObj.host` carries ":port" and
    // would make the lookup fail for URLs with an explicit port.
    hostname: urlObj.hostname,
    port: urlObj.port,
    path: urlObj.path,
    rejectUnauthorized: false,
    headers: keysToLowerCase(_req.headers)
  };

  // Rewrite headers: only ask for encodings cacheResponse can decompress.
  options.headers['accept-encoding'] = 'gzip,deflate';
  log(LOG.DEBUG, JSON.stringify(options));

  var forward = (urlObj.protocol == 'https:' ? https : http).request(options, function(_fw_resp) {
    cacheResponse(_req, _fw_resp, function() {
      serveLastResource(_resp);
    });
  });

  // Without a listener, a refused or reset upstream connection raises an
  // unhandled 'error' event and takes the whole proxy down. Nothing is
  // recorded for a failed request.
  forward.on('error', function(_err) {
    log(LOG.WARN, 'Upstream request for ' + _req.url + ' failed: ' + _err.message);
    if(!_resp.headersSent) _resp.statusCode = 502;
    _resp.end();
  });

  _req.pipe(forward); // forward request data
}
|
280
|
+
|
281
|
+
// 'replay' mode handler: answers the request from the recorded stack only;
// nothing is forwarded upstream.
function replayRequest(_req, _resp) {
  var summary = 'Resolving ' + _req.method + ' request for ' + _req.url;
  log(LOG.INFO, summary);
  resolveAndServeResource(_req, _resp);
}
|
285
|
+
|
286
|
+
// Returns the request handler for the configured MODE.
// Throws an Error for any mode other than 'pass', 'capture' or 'replay'.
function selectProxy() {
  switch(MODE) {
    case 'pass': return passRequest;
    case 'capture': return captureRequest;
    case 'replay': return replayRequest;
    // an Error (instead of a bare string) keeps the stack trace
    default: throw new Error('Invalid proxy mode');
  }
}
|
294
|
+
|
295
|
+
// Handler for the selected mode and the plain-HTTP proxy server that runs it.
var PROXY_FUN = selectProxy(),
    SERVER = http.createServer(PROXY_FUN);

// Special handler for HTTPS request, creates a dedicated HTTPS proxy per connection,
// that way the CONNECT tunnel can be intercepted, requires support for self signed
// certificates in the client.
SERVER.on('connect', function (_req, _sock, _head) {

  // CONNECT targets look like "host:port"; prefix a scheme so url.parse splits them.
  var urlObj = url.parse('http://' + _req.url);
  log(LOG.INFO, 'New HTTPS request: starting https intercept on ' + urlObj.hostname);

  // Local TLS endpoint that decrypts the tunnel and hands every request to the
  // regular mode handler with its absolute https URL restored.
  var httpsServ = https.createServer(HTTPS_OPTIONS, function(_req, _resp) {
    _req.url = 'https://' + urlObj.hostname + _req.url;
    PROXY_FUN(_req, _resp);
  });

  httpsServ.listen(pickRandomPort());

  // Bridge the client socket to the local TLS endpoint.
  var tunnelSock = net.connect(httpsServ.address().port, function() {
    _sock.write('HTTP/1.1 200 Connection Established\r\n' +
      'Proxy-agent: Node-Proxy\r\n' +
      '\r\n');
    tunnelSock.write(_head);
    tunnelSock.pipe(_sock);
    _sock.pipe(tunnelSock);
  });

  // A reset on either side of the tunnel must not crash the proxy: sockets
  // without an 'error' listener raise an uncaught exception.
  tunnelSock.on('error', function(_err) {
    log(LOG.WARN, 'HTTPS tunnel to ' + urlObj.hostname + ' failed: ' + _err.message);
    _sock.destroy();
  });

  _sock.on('error', function(_err) {
    log(LOG.WARN, 'Client connection for ' + urlObj.hostname + ' failed: ' + _err.message);
    tunnelSock.destroy();
  });

  // The interceptor only serves this one tunnel; stop it with the client socket.
  _sock.on('close', function() {
    httpsServ.close();
  });
});
|
326
|
+
|
327
|
+
console.log("Starting crabtrap! mode: " + MODE);

// Replay mode must have the bucket loaded before the first request is
// accepted; the other modes can listen right away.
if(MODE != 'replay') {
  SERVER.listen(PORT);
} else {
  loadStackFrom(SOURCE, function() { SERVER.listen(PORT); });
}

// Shutdown on Ctrl-C / SIGINT: stop accepting connections and, in capture
// mode, write the recorded stack to the bucket before exiting. EXITING makes
// repeated signals harmless while the bucket is being written.
var EXITING = false;
process.on('SIGINT', function() {
  if(!EXITING) {
    EXITING = true;

    console.log("Shutting down crabtrap!");
    SERVER.close();
    if(MODE != 'capture') {
      process.exit();
    } else {
      saveStackTo(SOURCE, function() { process.exit(); });
    }
  }
});
|
data/lib/crabfarm/cli.rb
CHANGED
@@ -10,14 +10,28 @@ module Crabfarm
|
|
10
10
|
desc "Starts the crawler in console mode"
|
11
11
|
command [:console, :c] do |c|
|
12
12
|
|
13
|
+
c.desc "Capture to crabtrap file"
|
14
|
+
c.flag :capture
|
15
|
+
|
16
|
+
c.desc "Replay from crabtrap file"
|
17
|
+
c.flag :replay
|
18
|
+
|
13
19
|
Support::GLI.generate_options c
|
14
20
|
|
15
21
|
c.action do |global_options,options,args|
|
16
22
|
next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
|
17
23
|
|
18
|
-
require "crabfarm/modes/console"
|
19
24
|
Crabfarm.config.set Support::GLI.parse_options options
|
20
|
-
|
25
|
+
|
26
|
+
next puts "Cannot use --replay with --capture" if options[:capture] and options[:replay]
|
27
|
+
|
28
|
+
require 'crabfarm/crabtrap_context'
|
29
|
+
context = Crabfarm::CrabtrapContext.new
|
30
|
+
context.capture options[:capture] if options[:capture]
|
31
|
+
context.replay options[:replay] if options[:replay]
|
32
|
+
|
33
|
+
require "crabfarm/modes/console"
|
34
|
+
Crabfarm::Modes::Console.start context
|
21
35
|
end
|
22
36
|
end
|
23
37
|
|
@@ -84,9 +98,29 @@ module Crabfarm
|
|
84
98
|
end
|
85
99
|
end
|
86
100
|
|
101
|
+
desc "Perform an HTTP recording for use in tests"
|
102
|
+
command [:record, :r] do |c|
|
103
|
+
c.action do |global_options, options, args|
|
104
|
+
next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
|
105
|
+
|
106
|
+
require "crabfarm/modes/recorder"
|
107
|
+
Crabfarm::Modes::Recorder.start args[0]
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
desc "Publish the crawler to a crabfarm cloud"
|
87
112
|
command :publish do |c|
|
113
|
+
c.desc "Just list the files that are beign packaged"
|
114
|
+
c.switch :dry, :default_value => false
|
115
|
+
|
116
|
+
c.desc "Don't check for pending changes"
|
117
|
+
c.switch :unsafe, :default_value => false
|
118
|
+
|
88
119
|
c.action do |global_options,options,args|
|
120
|
+
next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
|
89
121
|
|
122
|
+
require "crabfarm/modes/publisher"
|
123
|
+
Crabfarm::Modes::Publisher.publish CF_PATH, options
|
90
124
|
end
|
91
125
|
end
|
92
126
|
|
@@ -9,6 +9,7 @@ module Crabfarm
|
|
9
9
|
[:output_builder, :string, 'Default json output builder used by states'],
|
10
10
|
[:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
|
11
11
|
[:log_path, :string, 'Path where logs should be stored'],
|
12
|
+
[:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
|
12
13
|
|
13
14
|
# Default driver configuration parameters
|
14
15
|
[:driver, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Webdriver to be user, common options: chrome, firefox, phantomjs, remote.'],
|
@@ -21,10 +22,14 @@ module Crabfarm
|
|
21
22
|
|
22
23
|
# Phantom launcher configuration
|
23
24
|
[:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
|
24
|
-
[:phantom_proxy, :string, 'Phantonjs proxy address, only for phantomjs driver.'],
|
25
25
|
[:phantom_ssl, ['sslv3', 'sslv2', 'tlsv1', 'any'], 'Phantomjs ssl mode: sslv3, sslv2, tlsv1 or any, only for phantomjs driver.'],
|
26
26
|
[:phantom_bin_path, :string, 'Phantomjs binary path, only for phantomjs driver.'],
|
27
|
-
[:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.']
|
27
|
+
[:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.'],
|
28
|
+
|
29
|
+
# Crabtrap launcher configuration
|
30
|
+
[:crabtrap_bin, :string, 'Crabtrap binary path.'],
|
31
|
+
[:crabtrap_port, :integer, 'Crabtrap port, defaults to 4000.'],
|
32
|
+
[:crabtrap_mode, ['capture', 'replay'], 'Crabtrap operation mode.']
|
28
33
|
]
|
29
34
|
.map { |o| Option.new *o }
|
30
35
|
|
@@ -48,6 +53,7 @@ module Crabfarm
|
|
48
53
|
output_builder: :hash,
|
49
54
|
driver_factory: nil,
|
50
55
|
log_path: 'logs',
|
56
|
+
proxy: nil,
|
51
57
|
|
52
58
|
driver: 'phantomjs',
|
53
59
|
driver_capabilities: Selenium::WebDriver::Remote::Capabilities.firefox,
|
@@ -58,10 +64,12 @@ module Crabfarm
|
|
58
64
|
driver_window_height: 800,
|
59
65
|
|
60
66
|
phantom_load_images: false,
|
61
|
-
phantom_proxy: nil,
|
62
67
|
phantom_ssl: 'any',
|
63
68
|
phantom_bin_path: 'phantomjs',
|
64
|
-
phantom_lock_file: nil
|
69
|
+
phantom_lock_file: nil,
|
70
|
+
|
71
|
+
crabtrap_bin: 'crabtrap',
|
72
|
+
crabtrap_port: 4000
|
65
73
|
}
|
66
74
|
end
|
67
75
|
|
@@ -79,6 +87,7 @@ module Crabfarm
|
|
79
87
|
def driver_config
|
80
88
|
{
|
81
89
|
name: driver,
|
90
|
+
proxy: proxy,
|
82
91
|
capabilities: driver_capabilities,
|
83
92
|
remote_host: driver_remote_host,
|
84
93
|
remote_timeout: driver_remote_timeout,
|
@@ -94,7 +103,7 @@ module Crabfarm
|
|
94
103
|
def phantom_config
|
95
104
|
{
|
96
105
|
load_images: phantom_load_images,
|
97
|
-
proxy:
|
106
|
+
proxy: proxy,
|
98
107
|
ssl: phantom_ssl,
|
99
108
|
bin_path: phantom_bin_path,
|
100
109
|
lock_file: phantom_lock_file,
|
@@ -102,9 +111,17 @@ module Crabfarm
|
|
102
111
|
}
|
103
112
|
end
|
104
113
|
|
105
|
-
|
106
|
-
|
107
|
-
|
114
|
+
def crabtrap_config
|
115
|
+
{
|
116
|
+
bin_path: crabtrap_bin,
|
117
|
+
port: crabtrap_port,
|
118
|
+
proxy: proxy
|
119
|
+
}
|
120
|
+
end
|
121
|
+
|
122
|
+
# Add environment support (like a Gemfile)
|
123
|
+
# group :test { set_driver :phantom }
|
124
|
+
# set_driver :phantom, group: :test
|
108
125
|
|
109
126
|
end
|
110
127
|
|
data/lib/crabfarm/context.rb
CHANGED
@@ -7,23 +7,67 @@ module Crabfarm
|
|
7
7
|
def_delegators :@pool, :driver
|
8
8
|
|
9
9
|
def initialize
|
10
|
-
@pool = DriverBucketPool.new
|
11
10
|
@store = StateStore.new
|
11
|
+
@loaded = false
|
12
|
+
end
|
13
|
+
|
14
|
+
def load
|
15
|
+
unless @loaded
|
16
|
+
init_phantom_if_required
|
17
|
+
@pool = DriverBucketPool.new build_driver_factory
|
18
|
+
@loaded = true
|
19
|
+
end
|
12
20
|
end
|
13
21
|
|
14
22
|
def run_state(_name, _params={})
|
23
|
+
load
|
15
24
|
state = LoaderService.load_state(_name).new @pool, @store, _params
|
16
25
|
state.crawl
|
17
26
|
state
|
18
27
|
end
|
19
28
|
|
20
29
|
def reset
|
30
|
+
load
|
21
31
|
@store.reset
|
22
32
|
@pool.reset
|
23
33
|
end
|
24
34
|
|
25
35
|
def release
|
26
|
-
@
|
36
|
+
if @loaded
|
37
|
+
@pool.release
|
38
|
+
@phantom.stop unless @phantom.nil?
|
39
|
+
@loaded = false
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
45
|
+
def init_phantom_if_required
|
46
|
+
if config.phantom_mode_enabled?
|
47
|
+
@phantom = PhantomRunner.new phantom_config
|
48
|
+
@phantom.start
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def build_driver_factory
|
53
|
+
if @phantom
|
54
|
+
PhantomDriverFactory.new @phantom, driver_config
|
55
|
+
else
|
56
|
+
return config.driver_factory if config.driver_factory
|
57
|
+
DefaultDriverFactory.new driver_config
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def config
|
62
|
+
Crabfarm.config
|
63
|
+
end
|
64
|
+
|
65
|
+
def driver_config
|
66
|
+
config.driver_config
|
67
|
+
end
|
68
|
+
|
69
|
+
def phantom_config
|
70
|
+
config.phantom_config
|
27
71
|
end
|
28
72
|
|
29
73
|
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'crabfarm/crabtrap_runner'
|
3
|
+
|
4
|
+
module Crabfarm
  # Context that routes the drivers through a crabtrap proxy process.
  # The proxy can run in pass-through, capture or replay mode.
  class CrabtrapContext < Context

    # Starts a pass-through proxy when no runner has been started yet, then
    # performs the regular context load.
    def load
      pass_through if @runner.nil?
      super
    end

    # Ensures the proxy runs in :pass mode; restarts it only when it is not
    # running or runs in another mode.
    def pass_through
      restart_with_options(mode: :pass) if @runner.nil? or @runner.mode != :pass
    end

    # Restarts the proxy in :capture mode using _path as the bucket path.
    def capture(_path)
      restart_with_options(mode: :capture, bucket_path: _path)
    end

    # Restarts the proxy in :replay mode using _path as the bucket path.
    def replay(_path)
      restart_with_options(mode: :replay, bucket_path: _path)
    end

    # Releases the context first and stops the proxy afterwards.
    def release
      super
      stop_daemon
    end

  private

    # Stops any running proxy and starts a new one built from the global
    # crabtrap configuration merged with _options.
    def restart_with_options(_options)
      stop_daemon
      @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(_options)
      @runner.start
    end

    # Stops the proxy if a runner exists; the runner object itself is kept.
    def stop_daemon
      @runner.stop unless @runner.nil?
    end

    # Driver configuration with the proxy option pointing at the local crabtrap.
    def driver_config
      super.merge(proxy: proxy_address)
    end

    # Phantom configuration with the proxy option pointing at the local crabtrap.
    def phantom_config
      super.merge(proxy: proxy_address)
    end

    # Loopback address of the proxy; requires @runner to be set.
    def proxy_address
      "127.0.0.1:#{@runner.port}"
    end

  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
module Crabfarm
  # Spawns the crabtrap proxy as a child process and blocks until it answers
  # HTTP requests.
  #
  # Configuration keys read from the hash given to #initialize:
  # :bin_path, :port, :mode (defaults to :pass), :bucket_path (used unless the
  # mode is :pass) and the optional :start_timeout (seconds).
  class CrabtrapRunner

    # Default upper bound, in seconds, for crabtrap to start answering.
    START_TIMEOUT = 30

    # Pause between two readiness probes, in seconds.
    PROBE_INTERVAL = 0.1

    def initialize(_config={})
      @config = _config
      @pid = nil
    end

    # Port the proxy is told to listen on.
    def port
      @config[:port] # TODO: maybe select port dynamically...
    end

    # Proxy mode as a symbol; :pass when none was configured.
    def mode
      @config.fetch(:mode, :pass).to_sym
    end

    # Spawns crabtrap and waits until it accepts connections.
    # Raises RuntimeError when the process dies or does not come up in time.
    def start
      @pid = Process.spawn({}, crabtrap_cmd)
      wait_for_server
    end

    # Sends SIGINT to the proxy and waits for it to exit. No-op when the
    # proxy is not running.
    def stop
      unless @pid.nil?
        Process.kill("INT", @pid)
        Process.wait @pid
        @pid = nil
      end
    end

  private

    # Command line: <bin> <mode> [bucket_path] --port=<port>
    def crabtrap_cmd
      cmd = [@config[:bin_path]]
      cmd << mode.to_s
      cmd << @config[:bucket_path] if mode != :pass
      cmd << "--port=#{port}"
      cmd.join(' ')
    end

    # Polls the proxy until it answers an HTTP request. Between probes it
    # sleeps instead of spinning, gives up once the start timeout has elapsed
    # and fails immediately when the child process has already exited.
    def wait_for_server
      deadline = Time.now + @config.fetch(:start_timeout, START_TIMEOUT)

      loop do
        begin
          # TODO: improve waiting, making this kind of request could change crabtrap's stack
          Net::HTTP.get_response(URI.parse("http://127.0.0.1:#{port}/status"))
          break
        rescue StandardError
          # waitpid with WNOHANG returns the pid only when the child is gone
          if Process.waitpid(@pid, Process::WNOHANG)
            @pid = nil
            raise "crabtrap exited before accepting connections on port #{port}"
          end

          if Time.now > deadline
            stop
            raise "crabtrap did not accept connections on port #{port} in time"
          end

          sleep PROBE_INTERVAL
        end
      end
    end

  end
end
|
@@ -7,33 +7,79 @@ module Crabfarm
|
|
7
7
|
|
8
8
|
def build_driver(_session_id)
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
raise ConfigurationError.new 'must provide a webdriver type' unless config_present? :name
|
11
|
+
driver_name = @config[:name].to_sym
|
12
12
|
|
13
|
-
case driver_name
|
13
|
+
driver = case driver_name
|
14
14
|
when :noop
|
15
15
|
require "crabfarm/mocks/noop_driver"
|
16
16
|
driver = Crabfarm::Mocks::NoopDriver.new # TODO: improve dummy driver...
|
17
17
|
when :remote
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
load_remote_driver
|
19
|
+
when :firefox
|
20
|
+
load_firefox_driver
|
21
|
+
when :chrome
|
22
|
+
load_chrome_driver
|
23
|
+
else
|
24
|
+
load_other_driver driver_name
|
25
|
+
end
|
21
26
|
|
22
|
-
|
23
|
-
|
24
|
-
:http_client => client,
|
25
|
-
:desired_capabilities => @config[:capabilities]
|
26
|
-
}
|
27
|
+
# apply browser configuration to new driver
|
28
|
+
driver.manage.window.resize_to(@config[:window_width], @config[:window_height]) rescue nil
|
27
29
|
|
28
|
-
|
29
|
-
|
30
|
-
|
30
|
+
return driver
|
31
|
+
end
|
32
|
+
|
33
|
+
def load_remote_driver
|
34
|
+
client = Selenium::WebDriver::Remote::Http::Default.new
|
35
|
+
client.timeout = @config[:remote_timeout]
|
31
36
|
|
32
|
-
|
33
|
-
|
37
|
+
if config_present? :proxy
|
38
|
+
client.proxy = Selenium::WebDriver::Proxy.new({
|
39
|
+
:http => @config[:proxy],
|
40
|
+
:ssl => @config[:proxy]
|
41
|
+
})
|
34
42
|
end
|
35
43
|
|
36
|
-
|
44
|
+
Selenium::WebDriver.for(:remote, {
|
45
|
+
:url => @config[:remote_host],
|
46
|
+
:http_client => client,
|
47
|
+
:desired_capabilities => @config[:capabilities]
|
48
|
+
})
|
49
|
+
end
|
50
|
+
|
51
|
+
def load_firefox_driver
|
52
|
+
profile = Selenium::WebDriver::Firefox::Profile.new
|
53
|
+
|
54
|
+
if config_present? :proxy
|
55
|
+
profile.proxy = Selenium::WebDriver::Proxy.new({
|
56
|
+
:http => @config[:proxy],
|
57
|
+
:ssl => @config[:proxy]
|
58
|
+
})
|
59
|
+
end
|
60
|
+
|
61
|
+
Selenium::WebDriver.for :firefox, :profile => profile
|
62
|
+
end
|
63
|
+
|
64
|
+
def load_chrome_driver
|
65
|
+
switches = []
|
66
|
+
|
67
|
+
if config_present? :proxy
|
68
|
+
switches << "--proxy-server=#{@config[:proxy]}"
|
69
|
+
switches << "--ignore-certificate-errors"
|
70
|
+
end
|
71
|
+
|
72
|
+
Selenium::WebDriver.for :chrome, :switches => switches
|
73
|
+
end
|
74
|
+
|
75
|
+
def load_other_driver(_name)
|
76
|
+
raise ConfigurationError.new 'default driver does not support proxy' if config_present? :proxy
|
77
|
+
|
78
|
+
Selenium::WebDriver.for _name.to_sym
|
79
|
+
end
|
80
|
+
|
81
|
+
def config_present?(_key)
|
82
|
+
not (@config[_key].nil? or @config[_key].empty?)
|
37
83
|
end
|
38
84
|
|
39
85
|
end
|
@@ -1,17 +1,15 @@
|
|
1
1
|
module Crabfarm
|
2
2
|
class DriverBucketPool
|
3
3
|
|
4
|
-
def initialize
|
4
|
+
def initialize(_factory=nil)
|
5
|
+
@factory = _factory || DefaultDriverFactory.new(Crabfarm.config.driver_config)
|
5
6
|
@buckets = Hash.new
|
6
|
-
@phantom = nil
|
7
|
-
|
8
|
-
init_phantom_if_required
|
9
7
|
end
|
10
8
|
|
11
9
|
def driver(_session_id=nil)
|
12
10
|
_session_id ||= :default_driver
|
13
11
|
bucket = @buckets[_session_id.to_sym]
|
14
|
-
bucket = @buckets[_session_id.to_sym] = DriverBucket.new(_session_id,
|
12
|
+
bucket = @buckets[_session_id.to_sym] = DriverBucket.new(_session_id, @factory) if bucket.nil?
|
15
13
|
bucket
|
16
14
|
end
|
17
15
|
|
@@ -22,29 +20,6 @@ module Crabfarm
|
|
22
20
|
|
23
21
|
def release
|
24
22
|
reset
|
25
|
-
@phantom.stop unless @phantom.nil?
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
|
30
|
-
def init_phantom_if_required
|
31
|
-
if config.phantom_mode_enabled?
|
32
|
-
@phantom = PhantomRunner.new config.phantom_config
|
33
|
-
@phantom.start
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def build_driver_factory
|
38
|
-
if config.phantom_mode_enabled?
|
39
|
-
PhantomDriverFactory.new @phantom, config.driver_config
|
40
|
-
else
|
41
|
-
return config.driver_factory if config.driver_factory
|
42
|
-
DefaultDriverFactory.new config.driver_config
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def config
|
47
|
-
Crabfarm.config
|
48
23
|
end
|
49
24
|
|
50
25
|
end
|
@@ -10,20 +10,19 @@ module Crabfarm
|
|
10
10
|
|
11
11
|
class ConsoleDsl
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
def initialize
|
16
|
-
reload!
|
13
|
+
def initialize(_context)
|
14
|
+
@context = _context
|
17
15
|
end
|
18
16
|
|
19
17
|
def reload!
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
18
|
+
puts "Reloading crawler source".color(:green)
|
19
|
+
ActiveSupport::Dependencies.clear
|
20
|
+
@context.reset
|
21
|
+
end
|
25
22
|
|
26
|
-
|
23
|
+
def reset
|
24
|
+
puts "Resetting crawling context".color(:green)
|
25
|
+
@context.reset
|
27
26
|
end
|
28
27
|
|
29
28
|
def transition(_name=nil, _params={})
|
@@ -53,17 +52,12 @@ module Crabfarm
|
|
53
52
|
puts "Ejem..."
|
54
53
|
end
|
55
54
|
|
56
|
-
def reset
|
57
|
-
puts "Resetting crawling context".color(:green)
|
58
|
-
@context.reset
|
59
|
-
end
|
60
|
-
|
61
55
|
alias :t :transition
|
62
56
|
alias :r :reset
|
63
57
|
end
|
64
58
|
|
65
|
-
def self.start
|
66
|
-
dsl = ConsoleDsl.new
|
59
|
+
def self.start(_context)
|
60
|
+
dsl = ConsoleDsl.new _context
|
67
61
|
|
68
62
|
loop do
|
69
63
|
begin
|
@@ -78,7 +72,7 @@ module Crabfarm
|
|
78
72
|
end
|
79
73
|
|
80
74
|
puts "Releasing crawling context".color(:green)
|
81
|
-
|
75
|
+
_context.release
|
82
76
|
end
|
83
77
|
|
84
78
|
end
|
@@ -20,6 +20,7 @@ module Crabfarm
|
|
20
20
|
path(_name, 'Gemfile').render('Gemfile', binding)
|
21
21
|
path(_name, 'Crabfile').render('Crabfile', binding)
|
22
22
|
path(_name, '.rspec').render('dot_rspec', binding)
|
23
|
+
path(_name, '.crabfarm').render('dot_crabfarm', binding)
|
23
24
|
path(_name, 'boot.rb').render('boot.rb', binding)
|
24
25
|
path(_name, 'bin', 'crabfarm').render('crabfarm_bin', binding, 0755)
|
25
26
|
path(_name, 'app', 'parsers', '.gitkeep').render('dot_gitkeep')
|
@@ -27,6 +28,8 @@ module Crabfarm
|
|
27
28
|
path(_name, 'app', 'helpers', '.gitkeep').render('dot_gitkeep')
|
28
29
|
path(_name, 'spec', 'spec_helper.rb').render('spec_helper.rb', binding)
|
29
30
|
path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
|
31
|
+
path(_name, 'spec', 'mementos', '.gitkeep').render('dot_gitkeep')
|
32
|
+
path(_name, 'spec', 'integration', '.gitkeep').render('dot_gitkeep')
|
30
33
|
end
|
31
34
|
end
|
32
35
|
|
@@ -0,0 +1,189 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'git'
|
3
|
+
require 'zlib'
|
4
|
+
require 'rubygems/package'
|
5
|
+
require 'net/http/post/multipart'
|
6
|
+
require 'rainbow'
|
7
|
+
require 'rainbow/ext/string'
|
8
|
+
require 'digest/sha1'
|
9
|
+
|
10
|
+
module Crabfarm
|
11
|
+
module Modes
|
12
|
+
module Publisher
|
13
|
+
extend self
|
14
|
+
|
15
|
+
DEFAULT_HOST = 'http://www.crabfarm.io'
|
16
|
+
|
17
|
+
# Packages the crawler found at _path and sends it to the configured host.
#
# _options: :dry skips authentication and the upload; :unsafe allows
# publishing from a git working copy with pending changes.
#
# Files come from the git HEAD when _path is inside a repository, otherwise
# from the file system.
def publish(_path, _options={})
  @crawler_path = _path
  @options = _options

  load_config
  return if !dry_run && !authenticated?
  detect_git_repo

  unless inside_git_repo?
    load_files_from_fs
  else
    if !unsafe && is_tree_dirty?
      puts "Aborting: Your working copy has uncommited changes! Use the --unsafe option to force.".color(:red)
      return
    end
    load_files_from_git
  end

  build_package
  compress_package
  generate_signature

  send_package unless dry_run
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
45
|
+
# True when the :dry option was given; defaults to false.
def dry_run
  @options.fetch(:dry) { false }
end
|
48
|
+
|
49
|
+
# True when the :unsafe option was given; defaults to false.
def unsafe
  @options.fetch(:unsafe) { false }
end
|
52
|
+
|
53
|
+
# Path of the crawler's own `.crabfarm` configuration file.
def config_path
  file_name = '.crabfarm'
  File.join(@crawler_path, file_name)
end
|
56
|
+
|
57
|
+
# Path of the `.crabfarm` file in the current user's home directory.
def home_config_path
  file_name = '.crabfarm'
  File.join(Dir.home, file_name)
end
|
60
|
+
|
61
|
+
def load_config
|
62
|
+
config = YAML.load_file config_path
|
63
|
+
|
64
|
+
if File.exists? home_config_path
|
65
|
+
home_config = YAML.load_file home_config_path
|
66
|
+
config = home_config.merge config
|
67
|
+
end
|
68
|
+
|
69
|
+
@token = config['token']
|
70
|
+
@name = config['name']
|
71
|
+
@host = config['host'] || DEFAULT_HOST
|
72
|
+
@include = config['files']
|
73
|
+
end
|
74
|
+
|
75
|
+
def authenticated?
|
76
|
+
# TODO: if no token, ask for credentials and fetch token
|
77
|
+
if @token.nil? or @token.empty?
|
78
|
+
puts "No crabfarm API token has been provided".color(:red)
|
79
|
+
return false
|
80
|
+
end
|
81
|
+
|
82
|
+
true
|
83
|
+
end
|
84
|
+
|
85
|
+
def is_tree_dirty?
|
86
|
+
@git.chdir do
|
87
|
+
status = @git.status
|
88
|
+
(status.changed.count + status.added.count + status.deleted.count + status.untracked.count) > 0
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def detect_git_repo
|
93
|
+
git_path = @crawler_path
|
94
|
+
|
95
|
+
path_to_git = []
|
96
|
+
while git_path != '/'
|
97
|
+
if File.exists? File.join(git_path, '.git')
|
98
|
+
@git = Git.open git_path
|
99
|
+
@rel_path = if path_to_git.count > 0 then File.join(*path_to_git.reverse!) else nil end
|
100
|
+
return
|
101
|
+
else
|
102
|
+
path_to_git << File.basename(git_path)
|
103
|
+
git_path = File.expand_path('..', git_path)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
@git = nil
|
108
|
+
end
|
109
|
+
|
110
|
+
def inside_git_repo?
|
111
|
+
not @git.nil?
|
112
|
+
end
|
113
|
+
|
114
|
+
def load_files_from_git
|
115
|
+
@git.chdir do
|
116
|
+
@ref = @git.log.first.sha
|
117
|
+
puts "Packaging files from current HEAD (#{@ref}):".color(:green)
|
118
|
+
entries = @git.gtree(@ref).full_tree.map(&:split)
|
119
|
+
entries = entries.select { |e| e[1] == 'blob' }
|
120
|
+
|
121
|
+
@file_list = []
|
122
|
+
entries.each do |entry|
|
123
|
+
path = unless @rel_path.nil?
|
124
|
+
next unless entry[3].starts_with? @rel_path
|
125
|
+
entry[3][@rel_path.length+1..-1]
|
126
|
+
else entry[3] end
|
127
|
+
|
128
|
+
if @include.any? { |p| File.fnmatch? p, path }
|
129
|
+
@file_list << [path, entry[0].to_i(8), @git.show(@ref, entry[3])]
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def load_files_from_fs
|
136
|
+
puts "Packaging files (no version control)".color(:green)
|
137
|
+
@file_list = Dir[*@include].map do |path|
|
138
|
+
full_path = File.join(@crawler_path, path)
|
139
|
+
[path, File.stat(full_path).mode, File.read(full_path)]
|
140
|
+
end
|
141
|
+
@ref = "filesystem"
|
142
|
+
end
|
143
|
+
|
144
|
+
def build_package
|
145
|
+
@package = StringIO.new("")
|
146
|
+
Gem::Package::TarWriter.new(@package) do |tar|
|
147
|
+
@file_list.each do |f|
|
148
|
+
puts "+ #{f[0]} - #{f[1]}"
|
149
|
+
path, mode, contents = f
|
150
|
+
tar.add_file(path, mode) { |tf| tf.write contents }
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
@package.rewind
|
155
|
+
end
|
156
|
+
|
157
|
+
def compress_package
|
158
|
+
@cpackage = StringIO.new("")
|
159
|
+
writer = Zlib::GzipWriter.new(@cpackage)
|
160
|
+
writer.write @package.string
|
161
|
+
writer.close
|
162
|
+
end
|
163
|
+
|
164
|
+
def generate_signature
|
165
|
+
@signature = Digest::SHA1.hexdigest @package.string
|
166
|
+
puts "Package SHA1: #{@signature}"
|
167
|
+
end
|
168
|
+
|
169
|
+
def send_package
|
170
|
+
url = URI.join(@host, 'api/crawlers/', @name)
|
171
|
+
|
172
|
+
req = Net::HTTP::Put::Multipart.new(url.path, {
|
173
|
+
"repo" => UploadIO.new(StringIO.new(@cpackage.string), "application/x-gzip", "tree.tar.gz"),
|
174
|
+
"sha" => @signature,
|
175
|
+
"ref" => @ref
|
176
|
+
}, {
|
177
|
+
'X-Api-Token' => @token
|
178
|
+
})
|
179
|
+
|
180
|
+
res = Net::HTTP.start(url.host, url.port) do |http|
|
181
|
+
http.request(req)
|
182
|
+
end
|
183
|
+
|
184
|
+
puts res.body
|
185
|
+
end
|
186
|
+
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'rainbow'
|
2
|
+
require 'rainbow/ext/string'
|
3
|
+
require 'crabfarm/crabtrap_runner'
|
4
|
+
|
5
|
+
module Crabfarm
  module Modes
    # Interactive capture mode: starts crabtrap in :capture mode, opens a
    # Firefox driver proxied through it and keeps polling the browser until it
    # is closed or the user interrupts.
    class Recorder

      # Records a browsing session into spec/mementos/<_target>.json.gz.
      #
      # _target:: recording name; anything other than a String prints an error
      #           and returns without starting anything.
      def self.start(_target)
        return puts "Must provide a recording name" unless _target.is_a? String

        crabtrap_config = Crabfarm.config.crabtrap_config
        crabtrap_config[:mode] = :capture
        crabtrap_config[:bucket_path] = File.join(CF_PATH, 'spec/mementos', _target + '.json.gz')

        crabtrap = CrabtrapRunner.new crabtrap_config
        crabtrap.start

        driver = nil
        begin
          driver_config = Crabfarm.config.driver_config
          driver_config[:name] = :firefox
          driver_config[:proxy] = "127.0.0.1:#{crabtrap.port}"

          driver = DefaultDriverFactory.new(driver_config).build_driver nil

          begin
            puts "Press Ctrl-C to stop capturing."
            loop do
              # Polling the driver raises a WebDriverError once the browser
              # window has been closed, which ends the capture.
              driver.current_url
              sleep 1.0
            end
          rescue Selenium::WebDriver::Error::WebDriverError, SystemExit, Interrupt
            # noop
          end

          puts "Releasing crawling context".color(:green)
        ensure
          # Always release the proxy, even when the driver could not be built
          # or an unexpected error interrupted the capture.
          begin
            driver.quit unless driver.nil?
          rescue StandardError
            # best effort: the browser may already be gone
          end
          crabtrap.stop
        end
      end

    end
  end
end
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
-
|
1
|
+
require 'crabfarm/crabtrap_context'
|
2
|
+
|
3
|
+
CF_TEST_CONTEXT = Crabfarm::CrabtrapContext::new
|
4
|
+
CF_TEST_CONTEXT.load
|
2
5
|
CF_TEST_BUCKET = CF_TEST_CONTEXT.driver
|
3
6
|
|
4
7
|
module Crabfarm
|
@@ -15,6 +18,28 @@ module Crabfarm
|
|
15
18
|
CF_TEST_BUCKET.parse(described_class, _options)
|
16
19
|
end
|
17
20
|
|
21
|
+
# Runs a state through the test context and remembers the result.
#
# Called with no state (or with only a params hash) it runs the described
# class, provided it descends from BaseState, and stores the result as both
# the current and the last state. Called with an explicit state it only
# updates the last state.
def crawl(_state=nil, _params={})
  # A lone hash argument is the params for the described class.
  _state, _params = nil, _state if _state.is_a? Hash

  unless _state.nil?
    return @last_state = CF_TEST_CONTEXT.run_state(_state, _params)
  end

  return nil unless described_class < BaseState # TODO: maybe raise an error here.

  @last_state = CF_TEST_CONTEXT.run_state(described_class, _params)
  @state = @last_state
end
|
34
|
+
|
35
|
+
# Memoized result of crawling the described class; crawls on first access.
def state
  return @state if @state
  @state = crawl
end
|
38
|
+
|
39
|
+
# Returns @last_state, which crawl assigns every time it runs a state.
def last_state
|
40
|
+
@last_state
|
41
|
+
end
|
42
|
+
|
18
43
|
# Returns @parser, which the before(:example) hook sets for examples tagged
# with :parsing metadata.
def parser
|
19
44
|
@parser
|
20
45
|
end
|
@@ -26,9 +51,16 @@ RSpec.configure do |config|
|
|
26
51
|
config.include Crabfarm::RSpec
|
27
52
|
|
28
53
|
config.before(:example) do |example|
|
54
|
+
|
29
55
|
if example.metadata[:parsing]
|
30
56
|
@parser = parse example.metadata[:parsing], example.metadata[:using] || {}
|
31
57
|
end
|
58
|
+
|
59
|
+
if example.metadata[:crawling]
|
60
|
+
CF_TEST_CONTEXT.replay File.join(CF_PATH, 'spec/mementos', example.metadata[:crawling] + '.json.gz')
|
61
|
+
else
|
62
|
+
CF_TEST_CONTEXT.pass_through
|
63
|
+
end
|
32
64
|
end
|
33
65
|
|
34
66
|
config.after(:suite) do
|
data/lib/crabfarm/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: jbuilder
|
@@ -142,6 +142,34 @@ dependencies:
|
|
142
142
|
- - ~>
|
143
143
|
- !ruby/object:Gem::Version
|
144
144
|
version: 2.10.2
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
name: git
|
147
|
+
requirement: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - '>='
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
type: :runtime
|
153
|
+
prerelease: false
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - '>='
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '0'
|
159
|
+
- !ruby/object:Gem::Dependency
|
160
|
+
name: multipart-post
|
161
|
+
requirement: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
166
|
+
type: :runtime
|
167
|
+
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - '>='
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0'
|
145
173
|
- !ruby/object:Gem::Dependency
|
146
174
|
name: bundler
|
147
175
|
requirement: !ruby/object:Gem::Requirement
|
@@ -287,6 +315,7 @@ email:
|
|
287
315
|
- ignacio@platan.us
|
288
316
|
executables:
|
289
317
|
- crabfarm
|
318
|
+
- crabtrap
|
290
319
|
extensions: []
|
291
320
|
extra_rdoc_files: []
|
292
321
|
files:
|
@@ -301,6 +330,8 @@ files:
|
|
301
330
|
- lib/crabfarm/cli.rb
|
302
331
|
- lib/crabfarm/configuration.rb
|
303
332
|
- lib/crabfarm/context.rb
|
333
|
+
- lib/crabfarm/crabtrap_context.rb
|
334
|
+
- lib/crabfarm/crabtrap_runner.rb
|
304
335
|
- lib/crabfarm/default_driver_factory.rb
|
305
336
|
- lib/crabfarm/driver_bucket.rb
|
306
337
|
- lib/crabfarm/driver_bucket_pool.rb
|
@@ -314,6 +345,8 @@ files:
|
|
314
345
|
- lib/crabfarm/mocks/noop_driver.rb
|
315
346
|
- lib/crabfarm/modes/console.rb
|
316
347
|
- lib/crabfarm/modes/generator.rb
|
348
|
+
- lib/crabfarm/modes/publisher.rb
|
349
|
+
- lib/crabfarm/modes/recorder.rb
|
317
350
|
- lib/crabfarm/modes/server.rb
|
318
351
|
- lib/crabfarm/phantom_driver_factory.rb
|
319
352
|
- lib/crabfarm/phantom_runner.rb
|
@@ -325,6 +358,7 @@ files:
|
|
325
358
|
- lib/crabfarm/templates/boot.rb.erb
|
326
359
|
- lib/crabfarm/templates/crabfarm_bin.erb
|
327
360
|
- lib/crabfarm/templates/Crabfile.erb
|
361
|
+
- lib/crabfarm/templates/dot_crabfarm.erb
|
328
362
|
- lib/crabfarm/templates/dot_gitignore.erb
|
329
363
|
- lib/crabfarm/templates/dot_gitkeep.erb
|
330
364
|
- lib/crabfarm/templates/dot_rspec.erb
|
@@ -337,6 +371,7 @@ files:
|
|
337
371
|
- lib/crabfarm/version.rb
|
338
372
|
- lib/crabfarm.rb
|
339
373
|
- bin/crabfarm
|
374
|
+
- bin/crabtrap
|
340
375
|
homepage: https://github.com/platanus/crabfarm-gem
|
341
376
|
licenses:
|
342
377
|
- MIT
|