teraslice 0.87.1 → 0.88.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/cluster-service.js +24 -18
  2. package/dist/src/index.js +42 -0
  3. package/package.json +11 -15
  4. package/service.js +4 -6
  5. package/worker-service.js +6 -6
  6. package/index.js +0 -21
  7. package/lib/cluster/cluster_master.js +0 -164
  8. package/lib/cluster/node_master.js +0 -393
  9. package/lib/cluster/services/api.js +0 -581
  10. package/lib/cluster/services/assets.js +0 -211
  11. package/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs +0 -86
  12. package/lib/cluster/services/cluster/backends/kubernetes/index.js +0 -225
  13. package/lib/cluster/services/cluster/backends/kubernetes/jobs/execution_controller.hbs +0 -69
  14. package/lib/cluster/services/cluster/backends/kubernetes/k8s.js +0 -450
  15. package/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +0 -443
  16. package/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +0 -67
  17. package/lib/cluster/services/cluster/backends/kubernetes/utils.js +0 -58
  18. package/lib/cluster/services/cluster/backends/native/index.js +0 -611
  19. package/lib/cluster/services/cluster/backends/native/messaging.js +0 -563
  20. package/lib/cluster/services/cluster/backends/state-utils.js +0 -49
  21. package/lib/cluster/services/cluster/index.js +0 -15
  22. package/lib/cluster/services/execution.js +0 -459
  23. package/lib/cluster/services/jobs.js +0 -303
  24. package/lib/config/default-sysconfig.js +0 -47
  25. package/lib/config/index.js +0 -32
  26. package/lib/config/schemas/system.js +0 -333
  27. package/lib/processors/save_file/index.js +0 -9
  28. package/lib/processors/save_file/processor.js +0 -17
  29. package/lib/processors/save_file/schema.js +0 -17
  30. package/lib/processors/script.js +0 -130
  31. package/lib/processors/stdout/index.js +0 -9
  32. package/lib/processors/stdout/processor.js +0 -19
  33. package/lib/processors/stdout/schema.js +0 -18
  34. package/lib/storage/analytics.js +0 -106
  35. package/lib/storage/assets.js +0 -275
  36. package/lib/storage/backends/elasticsearch_store.js +0 -567
  37. package/lib/storage/backends/mappings/analytics.json +0 -49
  38. package/lib/storage/backends/mappings/asset.json +0 -40
  39. package/lib/storage/backends/mappings/ex.json +0 -55
  40. package/lib/storage/backends/mappings/job.json +0 -31
  41. package/lib/storage/backends/mappings/state.json +0 -37
  42. package/lib/storage/execution.js +0 -331
  43. package/lib/storage/index.js +0 -16
  44. package/lib/storage/jobs.js +0 -97
  45. package/lib/storage/state.js +0 -302
  46. package/lib/utils/api_utils.js +0 -173
  47. package/lib/utils/asset_utils.js +0 -117
  48. package/lib/utils/date_utils.js +0 -58
  49. package/lib/utils/encoding_utils.js +0 -29
  50. package/lib/utils/events.js +0 -7
  51. package/lib/utils/file_utils.js +0 -118
  52. package/lib/utils/id_utils.js +0 -19
  53. package/lib/utils/port_utils.js +0 -83
  54. package/lib/workers/assets/loader.js +0 -109
  55. package/lib/workers/assets/spawn.js +0 -78
  56. package/lib/workers/context/execution-context.js +0 -16
  57. package/lib/workers/context/terafoundation-context.js +0 -10
  58. package/lib/workers/execution-controller/execution-analytics.js +0 -211
  59. package/lib/workers/execution-controller/index.js +0 -1033
  60. package/lib/workers/execution-controller/recovery.js +0 -188
  61. package/lib/workers/execution-controller/scheduler.js +0 -461
  62. package/lib/workers/execution-controller/slice-analytics.js +0 -115
  63. package/lib/workers/helpers/job.js +0 -93
  64. package/lib/workers/helpers/op-analytics.js +0 -22
  65. package/lib/workers/helpers/terafoundation.js +0 -43
  66. package/lib/workers/helpers/worker-shutdown.js +0 -187
  67. package/lib/workers/metrics/index.js +0 -139
  68. package/lib/workers/worker/index.js +0 -344
  69. package/lib/workers/worker/slice.js +0 -143
package/lib/cluster/node_master.js
@@ -1,393 +0,0 @@
- 'use strict';
-
- const ms = require('ms');
- const _ = require('lodash');
- const { Mutex } = require('async-mutex');
- const { getFullErrorStack } = require('@terascope/utils');
- const { makeLogger } = require('../workers/helpers/terafoundation');
- const messageModule = require('./services/cluster/backends/native/messaging');
- const spawnAssetLoader = require('../workers/assets/spawn');
- const { safeEncode } = require('../utils/encoding_utils');
- const { findPort, getPorts } = require('../utils/port_utils');
-
- const nodeVersion = process.version;
- const terasliceVersion = require('../../package.json').version;
-
- module.exports = async function nodeMaster(context) {
-     const logger = makeLogger(context, 'node_master');
-     const configWorkerLimit = context.sysconfig.teraslice.workers;
-     const config = context.sysconfig.teraslice;
-     const events = context.apis.foundation.getSystemEvents();
-     const mutex = new Mutex();
-
-     const messaging = messageModule(context, logger);
-     const host = messaging.getHostUrl();
-     const isShuttingDown = false;
-     const ports = getPorts(context);
-
-     logger.info(`node ${context.sysconfig._nodeName} is attempting to connect to cluster_master: ${host}`);
-
-     function sendNodeStateNow() {
-         if (isShuttingDown) return;
-         const state = getNodeState();
-         messaging.send({
-             to: 'cluster_master',
-             message: 'node:state',
-             node_id: state.node_id,
-             payload: state
-         });
-     }
-
-     const sendNodeState = _.debounce(sendNodeStateNow, 500, { leading: false, trailing: true });
-
-     let pendingAllocations = 0;
-
-     function allocateWorkers(count, exConfig, fn) {
-         const startTime = Date.now();
-         pendingAllocations += count;
-         sendNodeStateNow();
-         const locked = mutex.isLocked() ? ' (locked)' : '';
-         logger.info(`allocating ${count} workers...${locked}`);
-
-         return mutex.runExclusive(async () => {
-             try {
-                 await loadAssetsIfNeeded(exConfig.job, exConfig.ex_id);
-             } catch (err) {
-                 logger.error(`Failure to allocated assets for execution ${exConfig.ex_id}`);
-                 throw err;
-             } finally {
-                 pendingAllocations -= count;
-             }
-
-             try {
-                 const workers = await fn();
-                 const elapsed = Date.now() - startTime;
-                 if (workers.length === count) {
-                     logger.info(`allocated ${workers.length} workers, took ${ms(elapsed)}`);
-                 } else {
-                     logger.info(`allocated ${workers.length} out of the requested ${count} workers, took ${ms(elapsed)}`);
-                 }
-                 return workers.length;
-             } catch (err) {
-                 logger.error(`Failure to allocate workers for execution ${exConfig.ex_id}`);
-                 throw err;
-             }
-         });
-     }
-
-     function canAllocateWorkers(requestedWorkers) {
-         const numOfCurrentWorkers = Object.keys(context.cluster.workers).length;
-         // if there is an over allocation, send back rest to be enqueued
-         if (configWorkerLimit < numOfCurrentWorkers + requestedWorkers) {
-             return configWorkerLimit - numOfCurrentWorkers > 0;
-         }
-         return true;
-     }
-
-     messaging.registerChildOnlineHook(sendNodeState);
-
-     messaging.register({
-         event: 'network:connect',
-         callback: () => {
-             logger.info(`node has successfully connected to: ${host}`);
-             const state = getNodeState();
-             messaging.send({
-                 to: 'cluster_master', message: 'node:online', node_id: state.node_id, payload: state
-             });
-         }
-     });
-
-     messaging.register({
-         event: 'network:disconnect',
-         callback: () => logger.info(`node has disconnected from: ${host}`)
-     });
-
-     messaging.register({
-         event: 'network:error',
-         callback: (err) => logger.warn(err, `Attempting to connect to cluster_master: ${host}`)
-     });
-
-     messaging.register({
-         event: 'cluster:execution_controller:create',
-         callback: (createSlicerRequest) => {
-             const createSlicerMsg = createSlicerRequest.payload;
-             logger.info(`starting execution_controller for execution ${createSlicerMsg.ex_id}...`);
-
-             allocateWorkers(1, createSlicerMsg, () => {
-                 const controllerContext = {
-                     assignment: 'execution_controller',
-                     NODE_TYPE: 'execution_controller',
-                     EX: safeEncode(createSlicerMsg.job),
-                     job: createSlicerMsg.job,
-                     node_id: context.sysconfig._nodeName,
-                     ex_id: createSlicerMsg.ex_id,
-                     job_id: createSlicerMsg.job_id,
-                     slicer_port: createSlicerMsg.slicer_port
-                 };
-                 logger.trace('starting a execution controller', controllerContext);
-                 return context.foundation.startWorkers(1, controllerContext);
-             })
-                 .then(() => messaging.respond(createSlicerRequest))
-                 .catch((error) => {
-                     messaging.respond(createSlicerRequest, {
-                         error: getFullErrorStack(error),
-                     });
-                 });
-         }
-     });
138
-
139
- messaging.register({
140
- event: 'cluster:workers:create',
141
- callback: (createWorkerRequest) => {
142
- const createWorkerMsg = createWorkerRequest.payload;
143
- const requestedWorkers = createWorkerMsg.workers;
144
- logger.info(`starting ${requestedWorkers} workers for execution ${createWorkerMsg.ex_id}...`);
145
-
146
- if (!canAllocateWorkers(requestedWorkers)) {
147
- logger.warn(`worker is overallocated, maximum number of workers of ${configWorkerLimit}`);
148
- messaging.respond(createWorkerRequest, {
149
- payload: {
150
- createdWorkers: 0,
151
- }
152
- });
153
- return;
154
- }
155
-
156
- allocateWorkers(requestedWorkers, createWorkerMsg, () => {
157
- let newWorkers = requestedWorkers;
158
- const numOfCurrentWorkers = Object.keys(context.cluster.workers).length;
159
- // if there is an over allocation, send back rest to be enqueued
160
- if (configWorkerLimit < numOfCurrentWorkers + requestedWorkers) {
161
- newWorkers = configWorkerLimit - numOfCurrentWorkers;
162
- logger.warn(`worker allocation request would exceed maximum number of workers of ${configWorkerLimit}`);
163
- logger.warn(`reducing allocation to ${newWorkers} workers.`);
164
- }
165
-
166
- let workers = [];
167
- if (newWorkers > 0) {
168
- logger.trace(`starting ${newWorkers} workers`, createWorkerMsg.ex_id);
169
- workers = context.foundation.startWorkers(newWorkers, {
170
- NODE_TYPE: 'worker',
171
- EX: safeEncode(createWorkerMsg.job),
172
- assignment: 'worker',
173
- node_id: context.sysconfig._nodeName,
174
- job: createWorkerMsg.job,
175
- ex_id: createWorkerMsg.ex_id,
176
- job_id: createWorkerMsg.job_id
177
- });
178
- }
179
-
180
- return workers;
181
- })
182
- .then((createdWorkers) => messaging.respond(createWorkerRequest, {
183
- payload: {
184
- createdWorkers,
185
- }
186
- }))
187
- .catch(() => messaging.respond(createWorkerRequest, {
188
- payload: {
189
- createdWorkers: 0,
190
- }
191
- }));
192
- }
193
- });
194
-
195
- messaging.register({ event: 'cluster:node:state', callback: () => sendNodeState() });
196
-
197
- // this fires when entire server will be shutdown
198
- events.once('terafoundation:shutdown', () => {
199
- logger.debug('received shutdown notice from terafoundation');
200
-
201
- const filterFn = () => context.cluster.workers;
202
- const isActionCompleteFn = () => _.isEmpty(getNodeState().active);
203
- shutdownProcesses({}, filterFn, isActionCompleteFn, true);
204
- });
205
-
206
- messaging.register({
207
- event: 'cluster:execution:stop',
208
- callback: (networkMsg) => {
209
- const exId = networkMsg.ex_id;
210
- logger.debug(`received cluster execution stop for execution ${exId}`);
211
-
212
- const filterFn = () => _.filter(
213
- context.cluster.workers,
214
- (worker) => worker.ex_id === exId
215
- );
216
- function actionCompleteFn() {
217
- const children = getNodeState().active;
218
- const workers = _.filter(children, (worker) => worker.ex_id === exId);
219
- logger.debug(`waiting for ${workers.length} to stop for ex: ${exId}`);
220
- return workers.length === 0;
221
- }
222
-
223
- shutdownProcesses(networkMsg, filterFn, actionCompleteFn);
224
- }
225
- });
226
-
227
- messaging.register({
228
- event: 'cluster:workers:remove',
229
- callback: (networkMsg) => {
230
- const numberToRemove = networkMsg.payload.workers;
231
- const children = getNodeState().active;
232
- const startingWorkerCount = _.filter(children, (worker) => worker.ex_id === networkMsg.ex_id && worker.assignment === 'worker').length;
233
- const filterFn = () => _.filter(
234
- children,
235
- (worker) => worker.ex_id === networkMsg.ex_id && worker.assignment === 'worker'
236
- ).slice(0, numberToRemove);
237
-
238
- function actionCompleteFn() {
239
- const childWorkers = getNodeState().active;
240
- const currentWorkersForJob = _.filter(childWorkers, (worker) => worker.ex_id === networkMsg.ex_id && worker.assignment === 'worker').length;
241
- return currentWorkersForJob + numberToRemove <= startingWorkerCount;
242
- }
243
-
244
- shutdownProcesses(networkMsg, filterFn, actionCompleteFn);
245
- }
246
- });
-
-     // used to find an open port for slicer
-     messaging.register({
-         event: 'cluster:node:get_port',
-         callback: async (msg) => {
-             const port = await findPort(ports);
-             logger.debug(`assigning port ${port} for new job`);
-             messaging.respond(msg, { port });
-         }
-     });
-
-     messaging.register({
-         event: 'cluster:error:terminal',
-         callback: () => {
-             logger.error('terminal error in cluster_master, flushing logs and shutting down');
-             logger.flush()
-                 .then(() => process.exit(0));
-         }
-     });
-
-     messaging.register({
-         event: 'child:exit',
-         callback: () => sendNodeState()
-     });
-
-     function getAssetsFromJob(jobStr) {
-         const job = typeof jobStr === 'string' ? JSON.parse(jobStr) : jobStr;
-         return job.assets || [];
-     }
-
-     async function loadAssetsIfNeeded(job, exId) {
-         const assets = getAssetsFromJob(job);
-         if (!assets.length) return;
-
-         logger.info(`node ${context.sysconfig._nodeName} is checking assets for job, exId: ${exId}`);
-         await spawnAssetLoader(assets, context);
-     }
-
-     function shutdownWorkers(signal, filterFn) {
-         const allWorkersForJob = filterFn();
-         _.each(allWorkersForJob, (worker) => {
-             const workerID = worker.worker_id || worker.id;
-             if (_.has(context.cluster.workers, workerID)) {
-                 const clusterWorker = context.cluster.workers[workerID];
-                 const processId = clusterWorker.process.pid;
-                 if (clusterWorker.isDead()) return;
-                 // if the worker has already been sent a SIGTERM signal it should send a SIGKILL
-                 logger.warn(`sending ${signal} to process ${processId}, assignment: ${worker.assignment}, ex_id: ${worker.ex_id}`);
-                 clusterWorker.kill(signal);
-             }
-         });
-     }
-
-     function shutdownProcesses(message, filterFn, isActionCompleteFn, onlySigKill = false) {
-         const intervalTime = 200;
-         const needsResponse = message.response && message.to;
-
-         // give a little extra time to finish shutting down
-         let stopTime = config.shutdown_timeout + 3000;
-
-         if (!onlySigKill) {
-             shutdownWorkers('SIGTERM', filterFn);
-         }
-
-         const stop = setInterval(() => {
-             if (isActionCompleteFn()) {
-                 clearInterval(stop);
-                 if (needsResponse) messaging.respond(message);
-             }
-             if (stopTime <= 0) {
-                 clearInterval(stop);
-                 shutdownWorkers('SIGKILL', filterFn);
-                 if (needsResponse) messaging.respond(message);
-             }
-
-             stopTime -= intervalTime;
-         }, intervalTime);
-     }
-
-     function getNodeState() {
-         const nodeId = context.sysconfig._nodeName;
-
-         const state = {
-             node_id: nodeId,
-             hostname: context.sysconfig.teraslice.hostname,
-             pid: process.pid,
-             node_version: nodeVersion,
-             teraslice_version: terasliceVersion,
-             total: context.sysconfig.teraslice.workers,
-             state: 'connected'
-         };
-
-         const clusterWorkers = context.cluster.workers;
-         const active = [];
-
-         _.forOwn(clusterWorkers, (worker) => {
-             const child = {
-                 worker_id: worker.id,
-                 assignment: worker.assignment,
-                 pid: worker.process.pid
-             };
-
-             if (worker.ex_id) {
-                 child.ex_id = worker.ex_id;
-             }
-
-             if (worker.job_id) {
-                 child.job_id = worker.job_id;
-             }
-
-             if (worker.assets) {
-                 child.assets = worker.assets.map((asset) => asset.id);
-             }
-
-             active.push(child);
-         });
-
-         state.available = state.total - active.length - pendingAllocations;
-         state.active = active;
-
-         return state;
-     }
-
-     messaging.listen({
-         query: {
-             node_id: context.sysconfig._nodeName
-         }
-     });
-
-     if (context.sysconfig.teraslice.master) {
-         logger.debug(`node ${context.sysconfig._nodeName} is creating the cluster_master`);
-         context.foundation.startWorkers(1, {
-             assignment: 'cluster_master',
-             assets_port: ports.assetsPort,
-             node_id: context.sysconfig._nodeName
-         });
-
-         logger.debug(`node ${context.sysconfig._nodeName} is creating assets endpoint on port ${ports.assetsPort}`);
-
-         context.foundation.startWorkers(1, {
-             assignment: 'assets_service',
-             // key needs to be called port to bypass cluster port sharing
-             port: ports.assetsPort,
-             node_id: context.sysconfig._nodeName
-         });
-     }
- };
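
Two patterns in the deleted node_master.js above are worth isolating: the debounced node:state broadcast (sendNodeState) and the poll-and-escalate shutdown (shutdownProcesses). The sketch below restates them as standalone functions. It is a minimal illustration under stated assumptions, not teraslice's API: makeStateSender, and the send, getState, workers, and shutdownTimeout parameters, are hypothetical stand-ins for messaging.send, getNodeState, context.cluster.workers, and the shutdown_timeout system setting.

'use strict';

const _ = require('lodash');

// Debounced broadcast, as in `sendNodeState` above: bursts of child exits and
// allocations collapse into a single trailing-edge send per 500ms window.
function makeStateSender(send, getState) {
    return _.debounce(() => send(getState()), 500, { leading: false, trailing: true });
}

// Poll-and-escalate shutdown, as in `shutdownProcesses` above. `workers` is
// assumed here to be an array of child_process.ChildProcess instances; a
// process that is still running has both exitCode and signalCode set to null.
function shutdownProcesses(workers, shutdownTimeout, onDone) {
    const intervalTime = 200;
    // give a little extra time to finish shutting down, mirroring the original
    let stopTime = shutdownTimeout + 3000;

    const alive = () => workers.filter((w) => w.exitCode === null && w.signalCode === null);
    alive().forEach((w) => w.kill('SIGTERM')); // polite first pass

    const stop = setInterval(() => {
        if (alive().length === 0) {
            clearInterval(stop);
            onDone();
            return;
        }
        if (stopTime <= 0) {
            clearInterval(stop);
            alive().forEach((w) => w.kill('SIGKILL')); // hard stop once the deadline passes
            onDone();
        }
        stopTime -= intervalTime;
    }, intervalTime);
}

The fixed 200ms poll mirrors the original; listening for each child's 'exit' event would avoid polling, but re-running the filter every tick is what lets the original re-evaluate worker membership while a shutdown is in flight.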