dcp-worker 3.2.24 → 3.2.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/dcp-worker CHANGED
@@ -9,46 +9,43 @@
9
9
  'use strict';
10
10
 
11
11
  const process = require('process');
12
- const os = require('os');
13
- const fs = require('fs');
14
-
15
- // Set to true to try to understand unhandled rejection.
16
- const ANALYZE_UNHANDELD_REJECTION = false;
17
-
18
- // Set default location for pid file
19
- let DEFAULT_PID_LOC;
20
- if (fs.existsSync('/var/dcp/run'))
21
- DEFAULT_PID_LOC = '/var/dcp/run/';
22
- else if (fs.existsSync('/var/run'))
23
- DEFAULT_PID_LOC = '/var/run/';
24
- else
25
- DEFAULT_PID_LOC = os.tmpdir();
12
+ const os = require('os');
13
+ const fs = require('fs');
14
+ const crypto = require('crypto');
15
+ const chalk = require('chalk');
26
16
 
17
+ const configName = process.env.DCP_CONFIG || '../etc/dcp-worker-config';
27
18
  const TOTAL_CPU_VCORES = os.cpus().length;
28
- const DEFAULT_CORES = TOTAL_CPU_VCORES - 1;
29
19
  var worker, dcpConfig;
30
20
 
31
- const EXIT_CLEAN = 0; // normal exit, no error
32
- const EXIT_SIGQUIT = 2; // received SIGQUIT
33
- const EXIT_ERROR_STOPPING = 3; // failed to stop worker, no other error
34
- const EXIT_TIMED_OUT = 4; // failed to exit after requesting graceful exit
35
- const EXIT_UNHANDLED = 5; // unhandled rejection
36
-
37
-
38
- process.on('unhandledRejection', (error) => {
39
- console.error('Unhandled rejection:', error);
40
- });
41
-
42
- async function main () {
43
- if (!ANALYZE_UNHANDELD_REJECTION)
44
- process.on('unhandledRejection', unhandledRejectionHandler);
21
+ const EXIT_UNHANDLED = 5;
22
+
23
+ /* Set the telnet REPL up early to ensure early-failure log messages are captured */
24
+ const replHelpers = {
25
+ help: {
26
+ report: 'Print a worker status & slice report',
27
+ kill: 'Try to kill the worker',
28
+ die: 'Kill the worker',
29
+ },
30
+ commands: {
31
+ report: printReport,
32
+ kill: process.exit,
33
+ die: () => worker && worker.stop()
34
+ },
35
+ };
36
+ require('../lib/remote-console').init(replHelpers);
45
37
 
46
- await require('dcp-client').init({
47
- configName: '../etc/dcp-config',
48
- });
38
+ /* Initialize dcp-client with local config defaults and run the main function. DCP_CONFIG_COOKIE becomes dcpConfig.cookie. */
39
+ process.env.DCP_CONFIG_COOKIE = (Math.random().toString(16)).slice(2) + '-' + process.pid + '-' + Date.now();
40
+ require('dcp-client').init({ configName }).then(main).catch(handleUnhandled);
49
41
 
42
+ function parseCliArgs()
43
+ {
44
+ var defaultPidFileName;
45
+
50
46
  dcpConfig = require('dcp/dcp-config');
51
-
47
+ defaultPidFileName = require('../lib/pidfile').getDefaultPidFileName(dcpConfig.worker.pidfile);
48
+
52
49
  const cliArgs = require('dcp/cli')
53
50
  .base('Standalone NodeJS DCP Worker')
54
51
  .options({
@@ -60,7 +57,7 @@ async function main () {
60
57
  alias: 'c',
61
58
  describe: 'Number of cores to work with',
62
59
  type: 'number',
63
- default: DEFAULT_CORES,
60
+ default: TOTAL_CPU_VCORES - 1,
64
61
  },
65
62
  verbose: {
66
63
  alias: 'v',
@@ -87,7 +84,7 @@ async function main () {
87
84
  alias: 'p',
88
85
  describe: 'Evaluator port',
89
86
  type: 'number',
90
- default: Number(dcpConfig.evaluator.location.port),
87
+ default: Number(dcpConfig.evaluator.listen.port),
91
88
  },
92
89
  priorityOnly: {
93
90
  alias: 'P',
@@ -171,11 +168,6 @@ async function main () {
171
168
  type: 'array'
172
169
  },
173
170
 
174
- replPort: {
175
- describe: 'If set, open a REPL on specified TCP port',
176
- type: 'number',
177
- default: undefined,
178
- },
179
171
  watchdogInterval: {
180
172
  alias: 'W',
181
173
  describe: 'Number of milliseconds between watchdog cycles',
@@ -187,9 +179,9 @@ async function main () {
187
179
  type: 'boolean',
188
180
  hidden: true,
189
181
  },
190
- pidFileLoc: {
182
+ pidFile: {
191
183
  alias: 'f',
192
- describe: 'If set, location to generate the worker pid file',
184
+ describe: `create a .pid file for the worker; value overrides default location (${defaultPidFileName})`,
193
185
  normalize: true
194
186
  },
195
187
  })
@@ -200,140 +192,138 @@ async function main () {
200
192
  if (cliArgs.dumpConfig)
201
193
  {
202
194
  console.debug(JSON.stringify(require('dcp/dcp-config'), null, 2));
203
- process.exit(1);
195
+ process.exit(0);
204
196
  }
205
197
 
206
- return startWorking(cliArgs);
198
+ return cliArgs;
207
199
  }
208
200
 
209
- // Preserve console.error, the dashboard replaces it with a custom logger
210
- const logError = console.error;
211
- main()
212
- .then(exitcode => process.exit(exitcode || 0))
213
- .catch(e => {
214
- logError("Script failed:");
215
- logError(e);
216
- process.exit(1);
217
- });
218
-
219
201
  // Imperfect, but handles CG { joinKey, joinHash }.
220
202
  function isHash(b) {
221
203
  return b && b.length === 68 && b.startsWith('eh1-');
222
204
  }
223
205
 
224
- async function startWorking(cliArgs) {
225
- //console.log('cliArgs', cliArgs);
206
+ /**
207
+ * Add one or more configuration objects into a target via leaf-merging.
208
+ */
209
+ function addConfig(target, ...objs)
210
+ {
211
+ const { leafMerge } = require('dcp/utils');
212
+ var tmp = target;
213
+
214
+ for (let obj of objs)
215
+ tmp = leafMerge(tmp, obj);
216
+
217
+ Object.assign(target, tmp);
218
+ }
219
+
220
+ /**
221
+ * Main program entry point. Assumes DCP client is already initialized and console logging is ready.
222
+ */
223
+ async function main()
224
+ {
226
225
  const wallet = require('dcp/wallet');
227
226
  const DCPWorker = require('dcp/worker').Worker;
228
227
  const { startWorkerLogger } = require('../lib/startWorkerLogger');
228
+ const cliArgs = parseCliArgs();
229
229
  const sawOptions = {
230
230
  hostname: cliArgs.hostname,
231
231
  port: cliArgs.port
232
232
  };
233
233
 
234
- let paymentAddress;
235
- if (cliArgs.paymentAddress)
236
- paymentAddress = new wallet.Address(cliArgs.paymentAddress);
237
- else
238
- paymentAddress = (await wallet.get()).address;
234
+ verifyDefaultConfigIntegrity();
239
235
 
240
- if (cliArgs.pidFileLoc)
241
- memoizePid(cliArgs.pidFileLoc)
236
+ process.on('SIGINT', handleSigDeath);
237
+ process.on('SIGTERM', handleSigDeath);
238
+ process.on('SIGQUIT', handleSigDeath);
239
+ process.on('unhandledRejection', handleUnhandled);
240
+ process.on('uncaughtException', handleUnhandled);
242
241
 
243
- // Different ways to get the identity:
244
- let identityKeystore = false;
242
+ let paymentAddress = false
243
+ || cliArgs.paymentAddress
244
+ || dcpConfig.worker.paymentAddress
245
+ || (await wallet.get()).address;
246
+ if (typeof paymentAddress === 'string')
247
+ paymentAddress = new wallet.Address(paymentAddress);
248
+
249
+ if (cliArgs.pidFile)
250
+ require('../lib/pidfile').write(cliArgs.pidFile);
245
251
 
252
+ /* Figure out the worker's identity and put that keystore in the wallet */
253
+ let identityKeystore = false;
246
254
  if (cliArgs.identityKey)
247
255
  identityKeystore = await new wallet.IdKeystore(cliArgs.identityKey, '');
248
256
  else if (cliArgs.identityKeystore)
249
257
  identityKeystore = await new wallet.IdKeystore(JSON.parse(cliArgs.identityKeystore), '');
250
258
  else
251
259
  identityKeystore = await wallet.getId();
252
-
253
- // Set the provided identity as the wallet's default
254
260
  await wallet.addId(identityKeystore);
255
261
 
256
-
257
- if (typeof dcpConfig.worker.unhandledRejectionCleanupTimeout !== 'undefined')
258
- unhandledRejectionHandler.timeout = dcpConfig.worker.unhandledRejectionCleanupTimeout;
259
-
260
- // Leave the public compute group, if desired
261
- if (cliArgs.leavePublicGroup || cliArgs.publicGroupFallback)
262
- dcpConfig.worker.leavePublicGroup = true;
263
-
264
-
265
- // The exitGuard will hold an "exit" method, and a Promise to await for
266
- // the exit code passed to exitGuard.exit()
267
- let exitcode = EXIT_CLEAN;
268
- const exitGuard = {
269
- promise: Promise.resolve(0), // will be overwritten when worker starts
270
- exit(code) { process.exit(code||exitcode||0) }, // will be overwritten when worker starts
271
- };
272
- process.on('SIGQUIT', () => {
273
- exitcode = EXIT_SIGQUIT;
274
- cliArgs.verbose >= 1 && console.info(`240: Caught SIGQUIT; exiting worker with exitcode ${exitcode}`);
275
- exitGuard.exit(exitcode);
276
- });
277
-
278
-
279
- /** @type {string[]} */
262
+ /* Build the worker options, which are largely given by dcpConfig.worker. We use a reference for
263
+ * dcpConfig.worker rather than copying it, so that runtime modifications to the worker configuration
264
+ * in memory take effect immediately.
265
+ *
266
+ * forceOptions override any setting in dcpConfig; this can be used for settings calculated above
267
+ * which were derived from dcpConfig in the first place. defaultOptions are overrideable by the usual
268
+ * dcpConfig mechanisms, but since they are dynamic (or non-user-facing) they don't come from the
269
+ * etc/dcp-worker-config.js file that ships with the worker.
270
+ */
280
271
  const dcpWorkerOptions = dcpConfig.worker;
281
-
282
- Object.assign(dcpWorkerOptions, {
272
+ const forceOptions = {
283
273
  paymentAddress,
274
+ leavePublicGroup: cliArgs.leavePublicGroup || dcpConfig.worker.leavePublicGroup || cliArgs.publicGroupFallback || false,
284
275
  maxWorkingSandboxes: cliArgs.cores,
285
- cores: { cpu: TOTAL_CPU_VCORES, gpu: undefined }, /** XXXpfr @todo: Figure out how many gpus. */
286
- targetLoad: { cpu: 1.0, gpu: 1.0 }, /** Use 100%: XXXpfr @todo Allow command-line override. */
276
+ };
277
+ const defaultOptions = {
287
278
  sandboxOptions: {
288
279
  SandboxConstructor: require('dcp-client/lib/standaloneWorker').workerFactory(sawOptions)
289
280
  },
290
- computeGroups: [], /* public group is implied */
291
- leavePublicGroup: cliArgs.leavePublicGroup || dcpConfig.worker.leavePublicGroup,
292
- });
281
+ };
282
+
283
+ addConfig(dcpWorkerOptions, defaultOptions, dcpConfig.worker, forceOptions);
293
284
 
294
285
  /* cliArgs.join is the list of compute groups to join */
295
286
  if (cliArgs.join && cliArgs.join.length)
296
287
  {
297
- dcpWorkerOptions.computeGroups = cliArgs.join
288
+ const cliComputeGroups = cliArgs.join
298
289
  .map((el) => {
299
290
  /* Map cliArgs.join to give us [{ joinKey, joinSecret/joinHash }...] */
300
291
  const [a, b] = el.split(',');
301
292
  return isHash(b) ? { joinKey: a, joinHash: b } : { joinKey: a, joinSecret: b };
302
293
  })
303
294
  .filter((el) => el.joinKey); /* Filter out entries with no joinKey */
304
- //console.log(dcpWorkerOptions.computeGroups);
295
+
296
+ addConfig(dcpWorkerOptions.computeGroups, dcpWorkerOptions.computeGroups, cliComputeGroups);
305
297
  }
306
-
298
+
307
299
  if (cliArgs.jobId)
308
300
  {
309
- dcpWorkerOptions.jobAddresses = cliArgs.jobId;
301
+ dcpWorkerOptions.jobAddresses.push(...cliArgs.jobId);
310
302
  dcpWorkerOptions.priorityOnly = true;
311
303
  }
304
+
312
305
  if (cliArgs.allowedOrigins)
313
- dcpConfig.worker.allowOrigins.any.push(...cliArgs.allowedOrigins);
306
+ {
307
+ if (!dcpWorkerOptions.allowOrigins)
308
+ dcpWorkerOptions.allowOrigins = {};
309
+ if (!dcpWorkerOptions.allowOrigins.any)
310
+ dcpWorkerOptions.allowOrigins.any = [];
311
+ dcpWorkerOptions.allowOrigins.any.push(...cliArgs.allowedOrigins);
312
+ }
314
313
  if (cliArgs.watchdogInterval)
315
314
  dcpWorkerOptions.watchdogInterval = cliArgs.watchdogInterval;
316
315
 
317
- worker = new DCPWorker(identityKeystore, dcpWorkerOptions);
316
+ worker = new DCPWorker(identityKeystore, dcpWorkerOptions);
317
+ worker.on('error', console.error);
318
+ worker.on('warning', console.warn);
318
319
 
319
- /**
320
- * NOTE: In Supervisor2 this function is a NOOP.
321
- * When (and if) we stop using Supevisor1, delete this reference to setDefaultIdentityKeystore
322
- * and delete the corresponding fucntion from Supervisor2.
323
- *
324
- * startWorkerLogger needs to be called before the worker is started so that
325
- * it can attach event listeners before the events fire, else UI events for
326
- * things such as progress will never get attached.
327
- *
328
- * setDefaultIdentityKeystore needs to be called before the logger because it
329
- * tries access the identity of the worker before it has started, i.e. where
330
- * it sets its identity, throwing an assertion error.
331
- *
332
- * FIXME(bryan-hoang): This is a fragile solution that is too coupled with the
333
- * implementation of the worker that should be addressed in Supervisor 2
320
+ /* Let incorrect event-loop references keep us alive when linked with a debug library, but
321
+ * exit quickly/accurately for production code even when the library isn't perfect.
334
322
  */
335
- await worker.supervisor.setDefaultIdentityKeystore();
336
-
323
+ if (require('dcp/build').config.build !== 'debug')
324
+ worker.on('end', process.exit);
325
+ else
326
+ worker.on('end', () => setTimeout(process.exit, getCleanupTimeoutMs()).unref());
337
327
 
338
328
  if (cliArgs.eventDebug)
339
329
  {
@@ -341,43 +331,34 @@ async function startWorking(cliArgs) {
341
331
  worker.supervisor.debug = true;
342
332
  }
343
333
 
334
+ worker.on('stop', () => { console.log('Worker is stopping') });
335
+ worker.on('end', () => { logClosing('log', 'Worker has stopped') });
336
+ startWorkerLogger(worker, cliArgs);
344
337
 
345
- // if the worker stops internally (eg. schedmsg stop), then exit without
346
- // changing the saved exitcode
347
- worker.on('stop', () => {
348
- exitGuard.exit();
349
- });
350
-
351
-
352
- startWorkerLogger(worker, {
353
- exitGuard,
354
- verbose: cliArgs.verbose,
355
- outputMode: cliArgs.outputMode,
356
-
357
- logfile: cliArgs.logfile,
338
+ require('../lib/remote-console').setMainEval(function mainEval() { return eval(arguments[0]) });
358
339
 
359
- syslogAddress: cliArgs.syslogAddress,
360
- syslogTransport: cliArgs.syslogTransport,
361
- syslogPort: cliArgs.syslogPort,
362
- });
363
-
364
- try
365
- {
366
- require('../lib/remote-console').init(cliArgs.replPort, {
367
- help: {
368
- report: 'Print a worker status & slice report',
369
- kill: 'Kill the worker',
370
- },
371
- commands: {
372
- report: printReport,
373
- kill: exitcode => exitGuard.exit(exitcode),
374
- },
375
- });
376
- require('../lib/remote-console').setMainEval(function mainEval() { return eval(arguments[0]) });
377
- }
378
- catch (error)
340
+ // Activate public group fallback
341
+ // If requested by CLI
342
+ // OR if requested by dcpConfig and not forbidden by the cli
343
+ if (cliArgs.publicGroupFallback
344
+ || (dcpConfig.worker?.leavePublicGroup === 'fallback'
345
+ && typeof cliArgs.publicGroupFallback !== false))
379
346
  {
380
- console.warn('350: Failed to initialize remote console:', error.message);
347
+ dcpWorkerOptions.publicGroupFallback = true;
348
+
349
+ // If local config blocks the public group, then complain instead of activating fallback
350
+ if (dcpConfig.worker?.leavePublicGroup === true)
351
+ {
352
+ console.warn('* Public Group fallback has been requested, but the public group is blocked by local configuration');
353
+ }
354
+ else
355
+ {
356
+ worker.on('fetchend', slicesFetched => {
357
+ // Iff we got work in this fetch, then leave the public group for the
358
+ // next fetch
359
+ dcpConfig.worker.leavePublicGroup = Boolean(slicesFetched > 0);
360
+ });
361
+ }
381
362
  }
382
363
 
383
364
  let introBanner = '';
@@ -395,7 +376,7 @@ async function startWorking(cliArgs) {
395
376
  plural = singular + 's';
396
377
  if (!amount)
397
378
  return plural;
398
- if (amount == 1)
379
+ if (Number(amount) === 1)
399
380
  return singular;
400
381
  return plural;
401
382
  }
@@ -417,269 +398,263 @@ async function startWorking(cliArgs) {
417
398
  introBanner += ' . ready' + '\n';
418
399
 
419
400
  console.log(introBanner);
420
-
421
401
  require('../lib/check-scheduler-version').check();
422
402
 
423
-
424
- /** print the slice report via console.log */
425
- function printReport()
403
+ if (parseFloat(cliArgs.reportInterval))
426
404
  {
427
- console.log(sliceReport());
405
+ if (cliArgs.outputMode !== 'dashboard')
406
+ setInterval(printReport, parseFloat(cliArgs.reportInterval) * 1000).unref();
407
+ else
408
+ console.log('Ignoring --reportInterval in dashboard output mode');
428
409
  }
429
410
 
430
- /** retrieve a slice report screen */
431
- function sliceReport()
411
+ /* Start the worker. Normal process exit happens by virtue of the worker<end> event */
412
+ await worker.start();
413
+ }
414
+
415
+ /**
416
+ * Log a closing message (or messages). Since the dashboard clears the screen on exit, we use the
417
+ * memoized console property to log the message after we destroy the instance of screen.
418
+ */
419
+ function logClosing(facility, ...message)
420
+ {
421
+ var screen = require('../lib/worker-loggers/dashboard').screen;
422
+
423
+ if (!screen)
424
+ console[facility](message);
425
+ else
432
426
  {
433
- const sup = worker.supervisor;
434
- let report = '';
435
-
436
- report += ('='.repeat(78)) + '\n';
437
-
438
- const sbStates = {
439
- WORKING: 0,
440
- ASSIGNED: 0,
441
- READY: 0,
442
- TERMINATED: 0,
443
- };
444
- const stateNames = {
445
- WORKING: 'Working',
446
- ASSIGNED: 'Assigned',
447
- READY: 'Ready',
448
- TERMINATED: 'Terminated',
449
- };
450
- sup.sandboxes.forEach(sb => {
451
- const { state } = sb;
452
- if (!sbStates[state])
453
- sbStates[state] = 0;
454
- sbStates[state]++;
455
- });
456
-
457
- report += (Date()) + '\n';
458
- report += ('Sandboxes:') + '\n';
459
- Object.keys(sbStates).forEach(state => {
460
- const stateName = stateNames[state] || state;
461
- report += (` ${(stateName + ':').padEnd(12)} ${sbStates[state]}`) + '\n';
462
- })
463
- report += (` * ALL: ${sup.sandboxes.length}`) + '\n';
464
-
465
- report += ('Progress:') + '\n';
466
- sup.workingSandboxes.forEach(sb => {
467
- const jobName = sb.job && sb.job.public && sb.job.public.name || `idek (${sb.jobAddress})`;
468
- let el = Date.now() - sb.sliceStartTime;
469
- const t = el < 1000000
470
- ? toInterval(el)
471
- : 'new';
472
-
473
- el = sb.progressReports && sb.progressReports.last
474
- ? Date.now() - (sb.sliceStartTime + sb.progressReports.last.timestamp)
475
- : 0;
476
- const pct = (typeof sb.progress) === 'number'
477
- ? `${Number(sb.progress).toFixed(0).padStart(2)}%`
478
- : 'ind';
479
- const stale = (el < 2000) ? '' : `(stale: ${toInterval(el)})`;
480
-
481
- report += (` ${String(sb.id).padStart(4)}: ${sb.jobAddress} ${jobName.padEnd(34)} `+ `${t} ${pct} ${stale}`.padStart(13)) + '\n';
482
- });
483
-
484
- report += ('Slices:') + '\n';
485
- report += (` working: ${sup.allocatedSlices.length}`) + '\n';
486
- report += (` queued: ${sup.queuedSlices.length}`) + '\n';
487
-
488
- report += ('='.repeat(78)) + '\n';
489
-
490
- return report;
427
+ /* Turn off fullscreen TUI and resume "normal" console logging.
428
+ * FUTURE: dashboard API should know how to unregister its hook so that we don't have to clobber
429
+ * it here.
430
+ */
431
+ screen.log(...message);
432
+ screen.destroy();
433
+ screen = false;
434
+ console = new (require('console').Console)(process);
435
+ require('../lib/remote-console').reintercept();
436
+ console[facility](...message);
491
437
  }
438
+ }
492
439
 
493
- /**
494
- * Convert a timespan in ms to a human-readable interval in minutes and seconds
495
- *
496
- * @param {number} el Milliseconds to convert
497
- * @return {string} Timespan formatted as `m:ss`
498
- */
499
- function toInterval(el)
440
+ /**
441
+ * Fatal error handler: __must not ever throw no matter what__.
442
+ * If we hit a fatal error, we are by definition no longer confident of our program state, meaning that
443
+ * the worker must be restarted. This handler does its best to report the rejection and give the worker a few
444
+ * seconds in which to attempt to return slices to the scheduler before it gives up completely.
445
+ */
446
+ async function handleUnhandled(error)
447
+ {
448
+ var _worker = worker;
449
+ worker = false;
450
+
451
+ process.exitCode = process.exitCode || EXIT_UNHANDLED;
452
+
453
+ try
500
454
  {
501
- const m = Math.floor((el / 1000) / 60).toString(10);
502
- const s = Math.floor((el / 1000) % 60).toString(10).padStart(2, '0');
503
- return `${m}:${s}`;
504
- }
455
+ logClosing(error);
456
+ } catch(e) {};
505
457
 
506
- if (parseFloat(cliArgs.reportInterval))
458
+ if (!_worker)
459
+ console.error('trapped unhandled error:', error)
460
+ else
507
461
  {
508
- if (cliArgs.outputMode !== 'dashboard')
509
- setInterval(printReport, parseFloat(cliArgs.reportInterval) * 1000);
510
- else
511
- console.log('Ignoring --reportInterval in dashboard output mode');
462
+ console.error('trapped unhandled error -- stopping worker:', error);
463
+ _worker.on('end', process.exit);
464
+ _worker.stop();
512
465
  }
513
466
 
467
+ setTimeout(() => {
468
+ logClosing('error', 'handleFatalError timeout - exiting now');
469
+ process.exit();
470
+ }, getCleanupTimeoutMs()).unref();
514
471
 
515
- // Set the exit guard - this method can be called by signal and exception
516
- // handlers
517
- exitGuard.promise = new Promise(resolve => {
518
- exitGuard.exit = resolve;
519
- });
472
+ try {
473
+ let log = dcpConfig && dcpConfig.worker && dcpConfig.worker.unhandledRejectionLog;
474
+ if (!log) log = process.env.DCP_WORKER_UNHANDLED_REJECTION_LOG;
475
+ if (log) {
476
+ fs.appendFileSync(process.env.DCP_WORKER_UNHANDLED_REJECTION_LOG,
477
+ `${Date.now()}: ${error.message}\n${error.stack}\n\n`);
478
+ }
479
+ } catch(e) {};
480
+ }
520
481
 
482
+ /** print the slice report via console.log */
483
+ function printReport()
484
+ {
485
+ console.log(sliceReport());
486
+ }
521
487
 
522
- await worker.start();
488
+ /**
489
+ * Convert a timespan in ms to a human-readable interval in minutes and seconds
490
+ *
491
+ * @param {number} el Milliseconds to convert
492
+ * @return {string} Timespan formatted as `m:ss`
493
+ */
494
+ function toInterval(el)
495
+ {
496
+ const m = Math.floor((el / 1000) / 60).toString(10);
497
+ const s = Math.floor((el / 1000) % 60).toString(10).padStart(2, '0');
498
+ return `${m}:${s}`;
499
+ }
500
+
501
+ /** retrieve a slice report screen */
502
+ function sliceReport()
503
+ {
504
+ const sup = worker.supervisor;
505
+ let report = '';
523
506
 
524
- exitcode = await exitGuard.promise;
525
-
526
- cliArgs.verbose >= 1 && console.log(`418: exit guard called with ${exitcode}`);
527
-
528
- const exitTimeAllowed = 30; // seconds to allow for worker to stop gracefully
529
- const forceExitTimeout = setTimeout(() => {
530
- console.error(`396: Worker failed to exit within ${exitTimeAllowed} seconds; terminating forcibly.`);
531
- process.exit(exitcode || EXIT_TIMED_OUT)
532
- }, exitTimeAllowed * 1000);
533
-
534
- await worker.stop(true)
535
- .catch(error => {
536
- if (error.message.includes('Already stopped'))
537
- return;
538
- console.error('255: Unexpected error stopping worker:',
539
- error.code
540
- ? `${error.code}: ${error.message}`
541
- : error.message);
542
- exitcode = exitcode || EXIT_ERROR_STOPPING;
507
+ report += ('='.repeat(78)) + '\n';
508
+
509
+ const sbStates = {
510
+ WORKING: 0,
511
+ ASSIGNED: 0,
512
+ READY: 0,
513
+ TERMINATED: 0,
514
+ };
515
+ const stateNames = {
516
+ WORKING: 'Working',
517
+ ASSIGNED: 'Assigned',
518
+ READY: 'Ready',
519
+ TERMINATED: 'Terminated',
520
+ };
521
+ sup.sandboxes.forEach(sb => {
522
+ const { state } = sb;
523
+ if (!sbStates[state])
524
+ sbStates[state] = 0;
525
+ sbStates[state]++;
543
526
  });
544
527
 
545
- clearTimeout(forceExitTimeout);
528
+ report += (Date()) + '\n';
529
+ report += ('Sandboxes:') + '\n';
530
+ Object.keys(sbStates).forEach(state => {
531
+ const stateName = stateNames[state] || state;
532
+ report += (` ${(stateName + ':').padEnd(12)} ${sbStates[state]}`) + '\n';
533
+ })
534
+ report += (` * ALL: ${sup.sandboxes.length}`) + '\n';
535
+
536
+ report += ('Progress:') + '\n';
537
+ sup.workingSandboxes.forEach(sb => {
538
+ const jobName = sb.job && sb.job.public && sb.job.public.name || `idek (${sb.jobAddress})`;
539
+ let el = Date.now() - sb.sliceStartTime;
540
+ const t = el < 1000000
541
+ ? toInterval(el)
542
+ : 'new';
543
+
544
+ el = sb.progressReports && sb.progressReports.last
545
+ ? Date.now() - (sb.sliceStartTime + sb.progressReports.last.timestamp)
546
+ : 0;
547
+ const pct = (typeof sb.progress) === 'number'
548
+ ? `${Number(sb.progress).toFixed(0).padStart(2)}%`
549
+ : 'ind';
550
+ const stale = (el < 2000) ? '' : `(stale: ${toInterval(el)})`;
551
+
552
+ report += (` ${String(sb.id).padStart(4)}: ${sb.jobAddress} ${jobName.padEnd(34)} `+ `${t} ${pct} ${stale}`.padStart(13)) + '\n';
553
+ });
554
+
555
+ report += ('Slices:') + '\n';
556
+ report += (` working: ${sup.allocatedSlices.length}`) + '\n';
557
+ report += (` queued: ${sup.queuedSlices.length}`) + '\n';
558
+
559
+ report += ('='.repeat(78)) + '\n';
546
560
 
547
- return exitcode;
561
+ return report;
548
562
  }
549
563
 
550
- // Create the PID file for the worker
551
- function memoizePid(dir)
564
+ /**
565
+ * Handle a signal which requests the death of the Worker by
566
+ * - stopping the worker
567
+ * - unregistering the handler (this allows a second signal to forcibly terminate the process
568
+ * if that is the default behaviour)
569
+ * - set a long timeout (dcpConfig.worker.cleanupTimeout seconds), after which the process
570
+ * exits forcibly with a non-zero exit code (unix standard for various signals)
571
+ */
572
+ function handleSigDeath(signalName, signal)
552
573
  {
553
- const path = require('path');
554
- const program = path.basename(require.main.filename, '.js');
555
- let location;
556
- let filename;
574
+ process.off(signalName, handleSigDeath);
557
575
 
558
- if (dir.length && dir.length > 0)
559
- {
560
- location = path.dirname(dir);
561
- if (fs.existsSync(dir))
562
- {
563
- if (fs.statSync(dir).isDirectory())
564
- filename = program;
565
- else
566
- {
567
- console.warn('Previous PID file was not cleaned up');
568
- filename = path.basename(dir);
569
- }
570
- }
571
- else if (dir.endsWith(path.sep))
572
- filename = program;
573
- else
574
- filename = path.basename(dir);
575
- }
576
+ if (!worker)
577
+ console.error(`trapped ${signalName}, signal ${signal}`);
576
578
  else
577
579
  {
578
- location = DEFAULT_PID_LOC;
579
- filename = program;
580
+ console.error(`trapped ${signalName}, signal ${signal} -- stopping worker`);
581
+ worker.stop();
580
582
  }
581
583
 
582
- const pidfile = path.join(
583
- location,
584
- filename + '.pid'
585
- )
586
- try
587
- {
588
- if (fs.existsSync(pidfile))
589
- {
590
- const oldPid = fs.readFileSync(pidfile, 'utf8')
591
- console.warn(`Warning: Previous invocation${oldPid.length ? ' pid#' + parseInt(String(oldPid)) : ''} did not remove ${pidfile}`);
592
- }
593
- else
594
- memoizePid.fd = fs.openSync(pidfile, 'wx');
584
+ setTimeout(() => process.exit(signal - 128), getCleanupTimeoutMs()).unref();
585
+ }
595
586
 
596
- fs.writeSync(memoizePid.fd, Buffer.from(process.pid + '\n'), 0);
597
- }
598
- catch (error)
587
+ /**
588
+ * Returns the duration of the cleanup timeout in milliseconds. It is possible to specify zero.
589
+ */
590
+ function getCleanupTimeoutMs()
591
+ {
592
+ const defaultCT = 60;
593
+ var cleanupTimeout = dcpConfig.worker.cleanupTimeout;
594
+
595
+ if (typeof cleanupTimeout === 'undefined')
596
+ cleanupTimeout = defaultCT;
597
+ if (typeof cleanupTimeout !== 'number')
598
+ cleanupTimeout = Number(cleanupTimeout)
599
+ if (isNaN(cleanupTimeout))
599
600
  {
600
- console.warn(`Warning: Could not create pidfile at ${pidfile} (${error.code || error.message})`);
601
-
602
- if (typeof memoizePid.fd === 'number')
601
+ cleanupTimeout = defaultCT;
602
+ if (!getCleanupTimeoutMs.warned)
603
603
  {
604
- fs.closeSync(memoizePid.fd);
605
- delete memoizePid.fd;
604
+ console.warn(`warning: dcpConfig.worker.cleanupTimeout is not a number (${dcpConfig.worker.cleanupTimeout})`);
605
+ getCleanupTimeoutMs.warned = true;
606
606
  }
607
- return;
608
607
  }
608
+ return cleanupTimeout * 1000;
609
+ }
609
610
 
610
-
611
+ /**
612
+ * Ensure the default configuration hasn't been modified by the end-user-sysadmin. It is an
613
+ * attractive nuisance, as it looks just like the file they should modify, but if they make
614
+ * security changes there that are overwritten in a subsequent update, it will be a problem.
615
+ *
616
+ * Every time a new package is generated, the default config file has its md5 checksum recorded
617
+ * via the pack npm hook; all we do is make sure it hasn't changed.
618
+ */
619
+ function verifyDefaultConfigIntegrity()
620
+ {
621
+ const workerConfPath = require('dcp-client').__cn;
622
+ const md5sumPath = workerConfPath + '.md5';
611
623
 
612
- function exitHandler()
624
+ if (!fs.existsSync(md5sumPath))
613
625
  {
614
- try
615
- {
616
- fs.unlinkSync(pidfile);
617
- fs.closeSync(memoizePid.fd);
618
- delete memoizePid.fd;
619
- }
620
- catch (error)
621
- {
622
- console.warn(`Warning: Could not remove pidfile at ${pidfile} (${error.code})`);
623
- }
626
+ console.log(chalk.bold.red(` ! warning: ${md5sumPath} not found; cannot verify configuration integrity`));
627
+ require('dcp/utils').sleep(2);
624
628
  }
625
-
626
- // Cleanup PID file
627
- process.on('dcpExit', exitHandler);
628
- }
629
-
630
- /**
631
- * Unhandled rejection handler: __must not ever throw no matter what__.
632
- * If we hit an unhandled rejection, we are by definition no longer confident of our program state, meaning that
633
- * the worker must be restarted. This handler does its best to report the rejection and give the worker a few
634
- * seconds in which to attempt to return slices to the scheduler before it gives up completely.
635
- */
636
- async function unhandledRejectionHandler (error) {
637
- let _worker = worker;
638
-
639
- if (!worker)
640
- return;
641
629
  else
642
- worker = false;
643
-
644
- try {
645
- let log = dcpConfig && dcpConfig.worker && dcpConfig.worker.unhandledRejectionLog;
646
- if (!log) log = process.env.DCP_WORKER_UNHANDLED_REJECTION_LOG;
647
- if (log) {
648
- fs.appendFileSync(process.env.DCP_WORKER_UNHANDLED_REJECTION_LOG,
649
- `${Date.now()}: ${error.message}\n${error.stack}\n\n`);
650
- }
651
- } catch(e) {};
630
+ {
631
+ const originalMd5sum = fs.readFileSync(md5sumPath, 'ascii');
632
+ const actualMd5sum = crypto.createHash('md5')
633
+ .update(fs.readFileSync(workerConfPath, 'ascii'))
634
+ .digest('hex');
652
635
 
653
- try {
654
- let screen = require('../lib/worker-loggers/dashboard').screen;
655
-
656
- if (screen) {
657
- screen.log(error.message + '\n' + error.stack);
658
- screen.destroy();
659
- logError(error.message + '\n' + error.stack);
660
- } else {
661
- console.error('Unhandled rejection - preparing to exit:', error.message);
662
- }
663
- } catch(e) {};
664
-
665
- function bail(exitCode) {
666
- try {
667
- const util = require('util');
668
- process.stderr.write('\nWorker stop timeout; bailing due to earlier unhandled rejection:\n');
669
- process.stderr.write(util.inspect(error) + '\n');
670
- } catch(e) {
671
- console.error(error);
636
+ if (!originalMd5sum.startsWith(actualMd5sum))
637
+ {
638
+ console.warn(chalk.yellow(` ! Detected modified ${workerConfPath};`));
639
+ console.warn(' . DCP Worker configuration changes should not be made by updating the default');
640
+ console.warn(' config, as that file will be overwritten on the next npm update. Instead,');
641
+ console.warn(' make changes via one of the following locations:');
642
+ console.warn(' - ~/.dcp/dcp-worker/dcp-config.js');
643
+ console.warn(' - /etc/dcp/dcp-worker/dcp-config.js');
644
+ console.warn(' - /etc/override/dcp/dcp-worker/dcp-config.js');
645
+ console.warn(' - the Windows Registry');
646
+
647
+ if (require('dcp/build').config.build !== 'debug')
648
+ process.exit(1);
649
+
650
+ console.log(chalk.bold.red.inverse("If this wasn't a debug build, the worker would exit now."));
651
+ require('dcp/utils').sleep(2);
672
652
  }
673
- process.exit(exitCode || EXIT_UNHANDLED);
674
653
  }
675
- setTimeout(bail, 1000 * unhandledRejectionHandler.timeout);
676
654
 
677
- try {
678
- await _worker.stop(true);
679
- } catch(e) {
680
- console.log('Error during worker.stop:', e);
655
+ if (dcpConfig.cookie !== process.env.DCP_CONFIG_COOKIE || !dcpConfig.cookie)
656
+ {
657
+ console.error(' ! DCP Worker default configuration was not loaded; exiting.');
658
+ process.exit(1);
681
659
  }
682
-
683
- setImmediate(() => bail(EXIT_UNHANDLED));
684
- };
685
- unhandledRejectionHandler.timeout = 5;
660
+ }