teraslice 0.87.0 → 0.88.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cluster-service.js +24 -18
- package/dist/src/index.js +42 -0
- package/package.json +11 -15
- package/service.js +4 -6
- package/worker-service.js +6 -6
- package/index.js +0 -21
- package/lib/cluster/cluster_master.js +0 -164
- package/lib/cluster/node_master.js +0 -393
- package/lib/cluster/services/api.js +0 -581
- package/lib/cluster/services/assets.js +0 -211
- package/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs +0 -86
- package/lib/cluster/services/cluster/backends/kubernetes/index.js +0 -225
- package/lib/cluster/services/cluster/backends/kubernetes/jobs/execution_controller.hbs +0 -69
- package/lib/cluster/services/cluster/backends/kubernetes/k8s.js +0 -450
- package/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +0 -443
- package/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +0 -67
- package/lib/cluster/services/cluster/backends/kubernetes/utils.js +0 -58
- package/lib/cluster/services/cluster/backends/native/index.js +0 -611
- package/lib/cluster/services/cluster/backends/native/messaging.js +0 -563
- package/lib/cluster/services/cluster/backends/state-utils.js +0 -49
- package/lib/cluster/services/cluster/index.js +0 -15
- package/lib/cluster/services/execution.js +0 -459
- package/lib/cluster/services/jobs.js +0 -303
- package/lib/config/default-sysconfig.js +0 -47
- package/lib/config/index.js +0 -32
- package/lib/config/schemas/system.js +0 -333
- package/lib/processors/save_file/index.js +0 -9
- package/lib/processors/save_file/processor.js +0 -17
- package/lib/processors/save_file/schema.js +0 -17
- package/lib/processors/script.js +0 -130
- package/lib/processors/stdout/index.js +0 -9
- package/lib/processors/stdout/processor.js +0 -19
- package/lib/processors/stdout/schema.js +0 -18
- package/lib/storage/analytics.js +0 -106
- package/lib/storage/assets.js +0 -275
- package/lib/storage/backends/elasticsearch_store.js +0 -567
- package/lib/storage/backends/mappings/analytics.json +0 -49
- package/lib/storage/backends/mappings/asset.json +0 -40
- package/lib/storage/backends/mappings/ex.json +0 -55
- package/lib/storage/backends/mappings/job.json +0 -31
- package/lib/storage/backends/mappings/state.json +0 -37
- package/lib/storage/execution.js +0 -331
- package/lib/storage/index.js +0 -16
- package/lib/storage/jobs.js +0 -97
- package/lib/storage/state.js +0 -302
- package/lib/utils/api_utils.js +0 -173
- package/lib/utils/asset_utils.js +0 -117
- package/lib/utils/date_utils.js +0 -58
- package/lib/utils/encoding_utils.js +0 -29
- package/lib/utils/events.js +0 -7
- package/lib/utils/file_utils.js +0 -118
- package/lib/utils/id_utils.js +0 -19
- package/lib/utils/port_utils.js +0 -83
- package/lib/workers/assets/loader.js +0 -109
- package/lib/workers/assets/spawn.js +0 -78
- package/lib/workers/context/execution-context.js +0 -16
- package/lib/workers/context/terafoundation-context.js +0 -10
- package/lib/workers/execution-controller/execution-analytics.js +0 -211
- package/lib/workers/execution-controller/index.js +0 -1033
- package/lib/workers/execution-controller/recovery.js +0 -188
- package/lib/workers/execution-controller/scheduler.js +0 -461
- package/lib/workers/execution-controller/slice-analytics.js +0 -115
- package/lib/workers/helpers/job.js +0 -93
- package/lib/workers/helpers/op-analytics.js +0 -22
- package/lib/workers/helpers/terafoundation.js +0 -43
- package/lib/workers/helpers/worker-shutdown.js +0 -187
- package/lib/workers/metrics/index.js +0 -139
- package/lib/workers/worker/index.js +0 -344
- package/lib/workers/worker/slice.js +0 -143
@@ -1,611 +0,0 @@
-'use strict';
-
-const _ = require('lodash');
-const {
-    Queue, TSError, getFullErrorStack, pDelay, cloneDeep
-} = require('@terascope/utils');
-const { makeLogger } = require('../../../../../workers/helpers/terafoundation');
-const stateUtils = require('../state-utils');
-const Messaging = require('./messaging');
-
-/*
- Execution Life Cycle for _status
- pending -> scheduling -> running -> [ paused -> running ] -> [ stopped | completed ]
- Exceptions
- rejected - when a job is rejected prior to scheduling
- failed - when there is an error while the job is running
- aborted - when a job was running at the point when the cluster shutsdown
- */
-
-module.exports = function nativeClustering(context, clusterMasterServer) {
-    const events = context.apis.foundation.getSystemEvents();
-    const logger = makeLogger(context, 'native_cluster_service');
-    const pendingWorkerRequests = new Queue();
-    const nodeDisconnectTimeout = context.sysconfig.teraslice.node_disconnect_timeout;
-    const nodeStateInterval = context.sysconfig.teraslice.node_state_interval;
-    const slicerAllocationAttempts = context.sysconfig.teraslice.slicer_allocation_attempts;
-    const clusterState = {};
-    const messaging = Messaging(context, logger);
-
-    let exStore;
-    let clusterStateInterval;
-
-    // temporary holding spot used to attach nodes that are non responsive or
-    // disconnect before final cleanup
-    const droppedNodes = {};
-
-    messaging.register({
-        event: 'node:online',
-        identifier: 'node_id',
-        callback: (data, nodeId) => {
-            logger.info(`node ${nodeId} has connected`);
-            // if a reconnect happens stop timer
-            if (droppedNodes[nodeId]) {
-                clearTimeout(droppedNodes[nodeId]);
-                delete droppedNodes[nodeId];
-            }
-            logger.trace(`node ${nodeId} has state:`, data.payload);
-            clusterState[nodeId] = data.payload;
-            // if new node comes online, check if jobs need more workers
-            events.emit('cluster:available_workers');
-        }
-    });
-
-    messaging.register({
-        event: 'node:state',
-        callback: (stateMsg) => {
-            const data = stateMsg.payload;
-            clusterState[data.node_id] = data;
-            logger.trace(`node ${data.node_id} state is being updated`, data);
-            // check to see if we can provision any additional workers
-            events.emit('cluster:available_workers');
-        }
-    });
-
-    messaging.register({
-        event: 'network:error',
-        callback: (err) => logger.error(err, 'cluster_master had an error with one of its connections')
-    });
-
-    messaging.register({
-        event: 'network:disconnect',
-        identifier: 'node_id',
-        callback: (msg, nodeId) => {
-            if (!clusterState[nodeId]) return;
-
-            if (clusterState[nodeId].active.length === 0) {
-                logger.warn(`node ${nodeId} has disconnected`);
-                delete clusterState[nodeId];
-            } else {
-                clusterState[nodeId].state = 'disconnected';
-                const timer = setTimeout(() => {
-                    _cleanUpNode(nodeId);
-                }, nodeDisconnectTimeout);
-
-                droppedNodes[nodeId] = timer;
-            }
-        }
-    });
-
-    function _cleanUpNode(nodeId) {
-        // check workers and slicers
-        const node = _checkNode(clusterState[nodeId]);
-        // if disconnected node had a slicer, we stop the execution of each slicer on it
-        // and mark it as failed
-        if (node.hasSlicer) {
-            _.forIn(node.slicerExecutions, async (exId) => {
-                const errMsg = `node ${nodeId} has been disconnected from cluster_master past the allowed timeout, it has an active slicer for execution: ${exId} which will be marked as terminated and shut down`;
-                logger.error(errMsg);
-                const metaData = exStore.executionMetaData(null, errMsg);
-                pendingWorkerRequests.remove(exId, 'ex_id');
-
-                try {
-                    await exStore.setStatus(exId, 'terminated', metaData);
-                } catch (err) {
-                    logger.error(err, `failure to set execution ${exId} status to terminated`);
-                } finally {
-                    messaging.broadcast('cluster:execution:stop', { ex_id: exId });
-                }
-            });
-        }
-        // for any other worker not part of what is being shutdown, we attempt to reallocate
-        _.forIn(node.workerExecutions, async (__, exId) => {
-            // looking for unique ex_id's not in slicerJobID
-            if (!node.slicerExecutions[exId]) {
-                const activeWorkers = clusterState[nodeId].active;
-                const numOfWorkers = activeWorkers.filter((worker) => worker.ex_id === exId).length;
-
-                try {
-                    const execution = await exStore.getActiveExecution(exId);
-                    addWorkers(execution, numOfWorkers);
-                } catch (err) {
-                    logger.error(err, `failure to add workers to execution ${exId}`);
-                }
-            }
-        });
-
-        // cleanup key so we don't get ever growing obj
-        delete droppedNodes[nodeId];
-        delete clusterState[nodeId];
-    }
-
-    function getClusterState() {
-        return cloneDeep(clusterState);
-    }
-
-    function _checkNode(node) {
-        const obj = {
-            hasSlicer: false,
-            numOfSlicers: 0,
-            slicerExecutions: {},
-            workerExecutions: {},
-            numOfWorkers: 0,
-            id: node.id,
-            available: node.available
-        };
-
-        return node.active.reduce((prev, curr) => {
-            if (curr.assignment === 'execution_controller') {
-                prev.hasSlicer = true;
-                prev.numOfSlicers += 1;
-                prev.slicerExecutions[curr.ex_id] = curr.ex_id;
-            }
-
-            if (curr.assignment === 'worker') {
-                prev.numOfWorkers += 1;
-                // if not resgistered, set it to one, if so then increment it
-                if (!prev.workerExecutions[curr.ex_id]) {
-                    prev.workerExecutions[curr.ex_id] = 1;
-                } else {
-                    prev.workerExecutions[curr.ex_id] += 1;
-                }
-            }
-
-            return prev;
-        }, obj);
-    }
-
-    function _findNodeForSlicer(stateArray, errorNodes) {
-        let slicerNode = null;
-        for (let i = 0; i < stateArray.length; i += 1) {
-            if (stateArray[i].state === 'connected' && stateArray[i].available > 0 && !errorNodes[stateArray[i].node_id]) {
-                const node = _checkNode(stateArray[i]);
-
-                if (!node.hasSlicer) {
-                    slicerNode = stateArray[i].node_id;
-                    break;
-                }
-            }
-        }
-
-        // if all nodes have a slicer
-        if (!slicerNode) {
-            // list is already sorted by num available since stateArray is sorted
-            slicerNode = stateArray[0].node_id;
-        }
-
-        return slicerNode;
-    }
-
-    function _findNodesForExecution(exId, slicerOnly) {
-        const nodes = [];
-        _.forOwn(clusterState, (node) => {
-            if (node.state !== 'disconnected') {
-                const hasJob = node.active.filter((worker) => {
-                    if (slicerOnly) {
-                        return worker.ex_id === exId && worker.assignment === 'execution_controller';
-                    }
-
-                    return worker.ex_id === exId;
-                });
-
-                if (hasJob.length >= 1) {
-                    nodes.push({
-                        node_id: node.node_id,
-                        ex_id: exId,
-                        hostname: node.hostname,
-                        workers: hasJob
-                    });
-                }
-            }
-        });
-
-        return nodes;
-    }
-
-    function _availableWorkers(all, forceCheck) {
-        let num = 0;
-        // determine which key to search for in cluster state
-        if (pendingWorkerRequests.size() === 0 || forceCheck) {
-            const key = all ? 'total' : 'available';
-
-            _.forOwn(clusterState, (node) => {
-                if (node.state === 'connected') {
-                    num += node[key];
-                }
-            });
-        }
-
-        return num;
-    }
-
-    function _findPort(nodeId) {
-        return messaging.send({
-            to: 'node_master',
-            address: nodeId,
-            message: 'cluster:node:get_port',
-            response: true
-        });
-    }
-
-    function _makeDispatch() {
-        const methods = {};
-        const dispatch = {};
-
-        methods.set = (nodeId, numOfWorkers) => {
-            if (dispatch[nodeId]) {
-                dispatch[nodeId] += numOfWorkers;
-            } else {
-                dispatch[nodeId] = numOfWorkers;
-            }
-        };
-        methods.getDispatch = () => dispatch;
-
-        return methods;
-    }
-
-    // designed to allocate additional workers, not any future slicers
-    function allocateWorkers(execution, numOfWorkersRequested) {
-        const exId = execution.ex_id;
-        const jobId = execution.job_id;
-        const jobStr = JSON.stringify(execution);
-        const sortedNodes = _.orderBy(clusterState, 'available', 'desc');
-        let workersRequested = numOfWorkersRequested;
-        let availWorkers = _availableWorkers(false, true);
-
-        const dispatch = _makeDispatch();
-
-        while (workersRequested > 0 && availWorkers > 0) {
-            for (let i = 0; i < sortedNodes.length; i += 1) {
-                // each iteration check if it can allocate
-                if (workersRequested > 0 && availWorkers > 0) {
-                    if (sortedNodes[i].available >= 1) {
-                        dispatch.set(sortedNodes[i].node_id, 1);
-                        availWorkers -= 1;
-                        workersRequested -= 1;
-                    }
-                } else {
-                    break;
-                }
-            }
-        }
-        // if left over worker requests, enqueue them, queue works based off of id
-        // so it redundantly references ex_id
-
-        const workerData = {
-            job: jobStr,
-            id: exId,
-            ex_id: exId,
-            job_id: jobId,
-            workers: 1,
-            assignment: 'worker'
-        };
-
-        while (workersRequested > 0) {
-            logger.trace(`adding worker to pending queue for ex: ${exId}`);
-            pendingWorkerRequests.enqueue(workerData);
-            workersRequested -= 1;
-        }
-        const results = [];
-
-        _.forOwn(dispatch.getDispatch(), (workerCount, nodeId) => {
-            const requestedWorkersData = {
-                job: jobStr,
-                id: exId,
-                ex_id: exId,
-                job_id: jobId,
-                workers: workerCount,
-                assignment: 'worker'
-            };
-
-            const createRequest = messaging.send({
-                to: 'node_master',
-                address: nodeId,
-                message: 'cluster:workers:create',
-                payload: requestedWorkersData,
-                response: true
-            }).then((msg) => {
-                const createdWorkers = _.get(msg, 'payload.createdWorkers');
-                if (!_.isInteger(createdWorkers)) {
-                    logger.error(`malformed response from create workers request to node ${nodeId}`, msg);
-                    return;
-                }
-                if (createdWorkers < workerCount) {
-                    logger.warn(`node ${nodeId} was only able to allocate ${createdWorkers} the request worker count of ${workerCount}, enqueing the remainder`);
-                    const newWorkersRequest = _.cloneDeep(requestedWorkersData);
-                    newWorkersRequest.workers = workerCount - createdWorkers;
-                    pendingWorkerRequests.enqueue(newWorkersRequest);
-                } else {
-                    logger.debug(`node ${nodeId} allocated ${createdWorkers}`);
-                }
-            }).catch((err) => {
-                logger.error(err, `An error has occurred in allocating : ${workerCount} workers to node : ${nodeId}, the worker request has been enqueued`);
-                pendingWorkerRequests.enqueue(requestedWorkersData);
-            });
-
-            results.push(createRequest);
-        });
-
-        // this will resolve successfully if one worker was actually allocated
-        return Promise.all(results);
-    }
-
-    async function _createSlicer(ex, errorNodes) {
-        const execution = cloneDeep(ex);
-        const sortedNodes = _.orderBy(clusterState, 'available', 'desc');
-        const slicerNodeID = _findNodeForSlicer(sortedNodes, errorNodes);
-
-        // need to mutate job so that workers will know the specific port and
-        // hostname of the created slicer
-        const portObj = await _findPort(slicerNodeID);
-        execution.slicer_port = portObj.port;
-        execution.slicer_hostname = clusterState[slicerNodeID].hostname;
-
-        logger.debug(`node ${clusterState[slicerNodeID].hostname} has been elected for slicer, listening on port: ${portObj.port}`);
-
-        const exId = execution.ex_id;
-        const jobId = execution.job_id;
-        const jobStr = JSON.stringify(execution);
-
-        const data = {
-            job: jobStr,
-            ex_id: exId,
-            job_id: jobId,
-            workers: 1,
-            slicer_port: portObj.port,
-            node_id: slicerNodeID,
-            assignment: 'execution_controller'
-        };
-
-        try {
-            await messaging.send({
-                to: 'node_master',
-                address: slicerNodeID,
-                message: 'cluster:execution_controller:create',
-                payload: data,
-                response: true
-            });
-            return execution;
-        } catch (err) {
-            const error = new TSError(err, {
-                reason: `failed to allocate execution_controller to ${slicerNodeID}`
-            });
-            logger.error(error);
-            errorNodes[slicerNodeID] = getFullErrorStack(error);
-            throw err;
-        }
-    }
-
-    async function allocateSlicer(ex) {
-        let retryCount = 0;
-        const errorNodes = {};
-
-        async function _allocateSlicer() {
-            try {
-                return await _createSlicer(ex, errorNodes);
-            } catch (err) {
-                retryCount += 1;
-                if (retryCount >= slicerAllocationAttempts) {
-                    throw new Error(`Failed to allocate execution_controller to nodes: ${JSON.stringify(errorNodes)}`);
-                } else {
-                    await pDelay(100);
-                    return _allocateSlicer();
-                }
-            }
-        }
-        return _allocateSlicer();
-    }
-
-    const schedulePendingRequests = _.debounce(() => {
-        if (pendingWorkerRequests.size() && _availableWorkers(false, true) >= 1) {
-            const requestedWorker = pendingWorkerRequests.dequeue();
-            const job = JSON.parse(requestedWorker.job);
-
-            allocateWorkers(job, requestedWorker.workers)
-                .catch((err) => {
-                    const error = new TSError(err, {
-                        reason: 'Error processing pending requests'
-                    });
-                    logger.error(error);
-                });
-        }
-    }, 500, { leading: false, trailing: true });
-
-    events.on('cluster:available_workers', schedulePendingRequests);
-
-    function addWorkers(execution, workerNum) {
-        const workerData = {
-            job: JSON.stringify(execution),
-            id: execution.ex_id,
-            ex_id: execution.ex_id,
-            job_id: execution.job_id,
-            workers: workerNum,
-            assignment: 'worker'
-        };
-        pendingWorkerRequests.enqueue(workerData);
-        return { action: 'enqueued', ex_id: execution.ex_id, workerNum };
-    }
-
-    function setWorkers(execution, workerNum) {
-        const totalWorker = stateUtils.findWorkersByExecutionID(
-            clusterState,
-            execution.ex_id
-        ).length;
-        if (totalWorker > workerNum) {
-            const removedWorkersCount = totalWorker - workerNum;
-            return removeWorkers(execution.ex_id, removedWorkersCount);
-        }
-        if (totalWorker < workerNum) {
-            const addWorkersCount = workerNum - totalWorker;
-            return addWorkers(execution, addWorkersCount);
-        }
-        // if they are equal then no work needs to be done
-        return { action: 'set', ex_id: execution.ex_id, workerNum };
-    }
-
-    function removeWorkers(exId, workerNum) {
-        const dispatch = _makeDispatch();
-        const workers = stateUtils.findWorkersByExecutionID(clusterState, exId);
-        let workerCount = workerNum;
-        const workersData = workers.reduce((prev, curr) => {
-            if (!prev[curr.node_id]) {
-                prev[curr.node_id] = 1;
-            } else {
-                prev[curr.node_id] += 1;
-            }
-            prev._total += 1;
-
-            return prev;
-        }, { _total: 0 });
-
-        if (workerNum >= workersData._total) {
-            const errMsg = `workers to be removed: ${workerNum} cannot be >= to current workers: ${workersData._total}`;
-            const error = new TSError(errMsg, {
-                statusCode: 400,
-            });
-            logger.error(error);
-            return Promise.reject(error);
-        }
-
-        function stateForDispatch(__, key) {
-            if (key !== '_total') {
-                if (workersData[key] >= 1 && workerCount > 0) {
-                    dispatch.set(key, 1);
-                    workersData[key] -= 1;
-                    workerCount -= 1;
-                }
-            }
-        }
-
-        while (workerCount) {
-            _.forOwn(workersData, stateForDispatch);
-        }
-
-        const nodes = dispatch.getDispatch();
-        const results = _.map(nodes, (val, key) => messaging.send({
-            to: 'node_master',
-            address: key,
-            message: 'cluster:workers:remove',
-            ex_id: exId,
-            payload: { workers: val },
-            response: true
-        }));
-
-        return Promise.all(results)
-            .then(() => ({ action: 'remove', ex_id: exId, workerNum }))
-            .catch((err) => {
-                const error = new TSError(err, {
-                    reason: `Error while releasing workers from job ${exId}`
-                });
-                logger.error(error);
-                return Promise.reject(error);
-            });
-    }
-
-    function _notifyNodesWithExecution(exId, messageData, excludeNode) {
-        return new Promise(((resolve, reject) => {
-            let nodes = _findNodesForExecution(exId);
-            if (excludeNode) {
-                nodes = nodes.filter((node) => node.hostname !== excludeNode);
-            } else if (messageData.message !== 'cluster:execution:stop' && nodes.length === 0) {
-                // exclude node is only in regards to a shutdown on the cluster_master, which
-                // already receives the shutdown notice so it can be empty, in all other
-                // circumstances if the node list length is zero then reject
-                const error = new Error(`Could not find active execution processes for ex_id: ${exId}`);
-                error.code = 404;
-                reject(error);
-                return;
-            }
-
-            const promises = nodes.map((node) => {
-                const sendingMsg = Object.assign(messageData, {
-                    to: 'node_master',
-                    address: node.node_id,
-                    ex_id: exId,
-                    response: false
-                });
-
-                logger.trace(`notifying node ${node.node_id} to stop execution ${exId}`, sendingMsg);
-
-                return messaging.send(sendingMsg);
-            });
-
-            Promise.all(promises)
-                .then(() => {
-                    resolve(true);
-                })
-                .catch((err) => {
-                    const error = new Error(`Failure to notify node with execution ${exId}, caused by ${err.message}`);
-                    logger.error(error);
-                    reject(error);
-                });
-        }));
-    }
-
-    function readyForAllocation() {
-        return _availableWorkers() >= 2;
-    }
-
-    function clusterAvailable() {}
-
-    function stopExecution(exId, timeout, exclude) {
-        // we are allowing stopExecution to be non blocking, we block at api level
-        const excludeNode = exclude || null;
-        pendingWorkerRequests.remove(exId, 'ex_id');
-        const sendingMessage = { message: 'cluster:execution:stop' };
-        if (timeout) {
-            sendingMessage.timeout = timeout;
-        }
-        return _notifyNodesWithExecution(exId, sendingMessage, excludeNode);
-    }
-
-    async function shutdown() {
-        clearInterval(clusterStateInterval);
-
-        logger.info('native clustering shutting down');
-        if (messaging) {
-            await messaging.shutdown();
-        } else {
-            await pDelay(100);
-        }
-    }
-
-    async function initialize() {
-        logger.info('native clustering initializing');
-        exStore = context.stores.execution;
-        if (!exStore) {
-            throw new Error('Missing required stores');
-        }
-        const server = clusterMasterServer.httpServer;
-        await messaging.listen({ server });
-
-        clusterStateInterval = setInterval(() => {
-            logger.trace('cluster_master requesting state update for all nodes');
-            messaging.broadcast('cluster:node:state');
-        }, nodeStateInterval);
-    }
-
-    return {
-        getClusterState,
-        allocateWorkers,
-        allocateSlicer,
-        initialize,
-        shutdown,
-        stopExecution,
-        removeWorkers,
-        addWorkers,
-        setWorkers,
-        readyForAllocation,
-        clusterAvailable
-    };
-};