teraslice 3.2.1 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/package.json +26 -30
- package/dist/src/interfaces.js +0 -12
- package/dist/src/lib/cluster/cluster_master.js +0 -246
- package/dist/src/lib/cluster/node_master.js +0 -355
- package/dist/src/lib/cluster/services/api.js +0 -663
- package/dist/src/lib/cluster/services/assets.js +0 -224
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/index.js +0 -192
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/interfaces.js +0 -2
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8s.js +0 -419
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sDeploymentResource.js +0 -60
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sJobResource.js +0 -55
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.js +0 -357
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sServiceResource.js +0 -37
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sState.js +0 -60
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/utils.js +0 -170
- package/dist/src/lib/cluster/services/cluster/backends/native/dispatch.js +0 -13
- package/dist/src/lib/cluster/services/cluster/backends/native/index.js +0 -526
- package/dist/src/lib/cluster/services/cluster/backends/native/messaging.js +0 -548
- package/dist/src/lib/cluster/services/cluster/backends/state-utils.js +0 -26
- package/dist/src/lib/cluster/services/cluster/index.js +0 -13
- package/dist/src/lib/cluster/services/execution.js +0 -435
- package/dist/src/lib/cluster/services/index.js +0 -6
- package/dist/src/lib/cluster/services/interfaces.js +0 -2
- package/dist/src/lib/cluster/services/jobs.js +0 -458
- package/dist/src/lib/config/default-sysconfig.js +0 -25
- package/dist/src/lib/config/index.js +0 -20
- package/dist/src/lib/config/schemas/system.js +0 -360
- package/dist/src/lib/storage/analytics.js +0 -86
- package/dist/src/lib/storage/assets.js +0 -401
- package/dist/src/lib/storage/backends/elasticsearch_store.js +0 -496
- package/dist/src/lib/storage/backends/mappings/analytics.js +0 -20
- package/dist/src/lib/storage/backends/mappings/asset.js +0 -32
- package/dist/src/lib/storage/backends/mappings/ex.js +0 -53
- package/dist/src/lib/storage/backends/mappings/job.js +0 -42
- package/dist/src/lib/storage/backends/mappings/state.js +0 -16
- package/dist/src/lib/storage/backends/s3_store.js +0 -237
- package/dist/src/lib/storage/execution.js +0 -302
- package/dist/src/lib/storage/index.js +0 -7
- package/dist/src/lib/storage/jobs.js +0 -81
- package/dist/src/lib/storage/state.js +0 -254
- package/dist/src/lib/utils/api_utils.js +0 -128
- package/dist/src/lib/utils/asset_utils.js +0 -94
- package/dist/src/lib/utils/date_utils.js +0 -52
- package/dist/src/lib/utils/encoding_utils.js +0 -27
- package/dist/src/lib/utils/events.js +0 -4
- package/dist/src/lib/utils/file_utils.js +0 -124
- package/dist/src/lib/utils/id_utils.js +0 -15
- package/dist/src/lib/utils/port_utils.js +0 -32
- package/dist/src/lib/workers/assets/index.js +0 -3
- package/dist/src/lib/workers/assets/loader-executable.js +0 -40
- package/dist/src/lib/workers/assets/loader.js +0 -73
- package/dist/src/lib/workers/assets/spawn.js +0 -55
- package/dist/src/lib/workers/context/execution-context.js +0 -12
- package/dist/src/lib/workers/context/terafoundation-context.js +0 -8
- package/dist/src/lib/workers/execution-controller/execution-analytics.js +0 -188
- package/dist/src/lib/workers/execution-controller/index.js +0 -1024
- package/dist/src/lib/workers/execution-controller/recovery.js +0 -151
- package/dist/src/lib/workers/execution-controller/scheduler.js +0 -390
- package/dist/src/lib/workers/execution-controller/slice-analytics.js +0 -96
- package/dist/src/lib/workers/helpers/job.js +0 -80
- package/dist/src/lib/workers/helpers/op-analytics.js +0 -22
- package/dist/src/lib/workers/helpers/terafoundation.js +0 -34
- package/dist/src/lib/workers/helpers/worker-shutdown.js +0 -147
- package/dist/src/lib/workers/metrics/index.js +0 -108
- package/dist/src/lib/workers/worker/index.js +0 -378
- package/dist/src/lib/workers/worker/slice.js +0 -122
- package/dist/test/config/schemas/system_schema-spec.js +0 -26
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8s-v2-spec.js +0 -458
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sResource-v2-spec.js +0 -818
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-multicluster-v2-spec.js +0 -67
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-v2-spec.js +0 -84
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/utils-v2-spec.js +0 -320
- package/dist/test/lib/cluster/services/cluster/backends/state-utils-spec.js +0 -37
- package/dist/test/node_master-spec.js +0 -194
- package/dist/test/services/api-spec.js +0 -79
- package/dist/test/services/assets-spec.js +0 -158
- package/dist/test/services/messaging-spec.js +0 -440
- package/dist/test/storage/assets_storage-spec.js +0 -95
- package/dist/test/storage/s3_store-spec.js +0 -149
- package/dist/test/test.config.js +0 -23
- package/dist/test/test.setup.js +0 -6
- package/dist/test/utils/api_utils-spec.js +0 -25
- package/dist/test/utils/asset_utils-spec.js +0 -141
- package/dist/test/utils/elastic_utils-spec.js +0 -25
- package/dist/test/workers/execution-controller/execution-controller-spec.js +0 -371
- package/dist/test/workers/execution-controller/execution-special-test-cases-spec.js +0 -519
- package/dist/test/workers/execution-controller/execution-test-cases-spec.js +0 -343
- package/dist/test/workers/execution-controller/recovery-spec.js +0 -160
- package/dist/test/workers/execution-controller/scheduler-spec.js +0 -249
- package/dist/test/workers/execution-controller/slice-analytics-spec.js +0 -121
- package/dist/test/workers/fixtures/ops/example-op/processor.js +0 -20
- package/dist/test/workers/fixtures/ops/example-op/schema.js +0 -19
- package/dist/test/workers/fixtures/ops/example-reader/fetcher.js +0 -20
- package/dist/test/workers/fixtures/ops/example-reader/schema.js +0 -41
- package/dist/test/workers/fixtures/ops/example-reader/slicer.js +0 -37
- package/dist/test/workers/fixtures/ops/new-op/processor.js +0 -29
- package/dist/test/workers/fixtures/ops/new-op/schema.js +0 -18
- package/dist/test/workers/fixtures/ops/new-reader/fetcher.js +0 -19
- package/dist/test/workers/fixtures/ops/new-reader/schema.js +0 -23
- package/dist/test/workers/fixtures/ops/new-reader/slicer.js +0 -13
- package/dist/test/workers/helpers/configs.js +0 -128
- package/dist/test/workers/helpers/execution-controller-helper.js +0 -49
- package/dist/test/workers/helpers/index.js +0 -5
- package/dist/test/workers/helpers/test-context.js +0 -210
- package/dist/test/workers/helpers/zip-directory.js +0 -25
- package/dist/test/workers/worker/slice-spec.js +0 -333
- package/dist/test/workers/worker/worker-spec.js +0 -356
package/dist/src/lib/cluster/services/cluster/backends/native/index.js
@@ -1,526 +0,0 @@
-/* eslint-disable @typescript-eslint/prefer-for-of */
-import { TSError, getFullErrorStack, debounce, pDelay, cloneDeep, pMap, orderBy, isInteger, get, Queue } from '@terascope/core-utils';
-import { Dispatch } from './dispatch.js';
-import { makeLogger } from '../../../../../workers/helpers/terafoundation.js';
-import { findWorkersByExecutionID } from '../state-utils.js';
-import { Messaging } from './messaging.js';
-export class NativeClustering {
-    context;
-    logger;
-    events;
-    executionStore;
-    pendingWorkerRequests = new Queue();
-    nodeStateInterval;
-    slicerAllocationAttempts;
-    clusterState = {};
-    clusterStateInterval;
-    messaging;
-    droppedNodes = {};
-    clusterMasterServer;
-    constructor(context, clusterMasterServer) {
-        this.context = context;
-        this.events = context.apis.foundation.getSystemEvents();
-        this.logger = makeLogger(context, 'native_cluster_service');
-        const nodeDisconnectTimeout = context.sysconfig.teraslice.node_disconnect_timeout;
-        this.nodeStateInterval = context.sysconfig.teraslice.node_state_interval;
-        this.slicerAllocationAttempts = context.sysconfig.teraslice.slicer_allocation_attempts;
-        this.messaging = new Messaging(context, this.logger);
-        this.clusterMasterServer = clusterMasterServer;
-        // temporary holding spot used to attach nodes that are non responsive or
-        // disconnect before final cleanup
-        this.messaging.register({
-            event: 'node:online',
-            identifier: 'node_id',
-            callback: (data, nodeId) => {
-                this.logger.info(`node ${nodeId} has connected`);
-                // if a reconnect happens stop timer
-                if (this.droppedNodes[nodeId]) {
-                    clearTimeout(this.droppedNodes[nodeId]);
-                    delete this.droppedNodes[nodeId];
-                }
-                this.logger.trace(`node ${nodeId} has state:`, data.payload);
-                this.clusterState[nodeId] = data.payload;
-                // if new node comes online, check if jobs need more workers
-                this.events.emit('cluster:available_workers');
-            }
-        });
-        this.messaging.register({
-            event: 'node:state',
-            callback: (stateMsg) => {
-                const data = stateMsg.payload;
-                this.clusterState[data.node_id] = data;
-                this.logger.trace(`node ${data.node_id} state is being updated`, data);
-                // check to see if we can provision any additional workers
-                this.events.emit('cluster:available_workers');
-            }
-        });
-        this.messaging.register({
-            event: 'network:error',
-            callback: (err) => this.logger.error(err, 'cluster_master had an error with one of its connections')
-        });
-        this.messaging.register({
-            event: 'network:disconnect',
-            identifier: 'node_id',
-            callback: (msg, nodeId) => {
-                if (!this.clusterState[nodeId])
-                    return;
-                if (this.clusterState[nodeId].active.length === 0) {
-                    this.logger.warn(`node ${nodeId} has disconnected`);
-                    delete this.clusterState[nodeId];
-                }
-                else {
-                    this.clusterState[nodeId].state = 'disconnected';
-                    const timer = setTimeout(async () => {
-                        await this._cleanUpNode(nodeId);
-                    }, nodeDisconnectTimeout);
-                    this.droppedNodes[nodeId] = timer;
-                }
-            }
-        });
-        // TODO: should this be in initialize?
-        const schedulePendingRequests = debounce(() => {
-            if (this.pendingWorkerRequests.size() && this._availableWorkers(false, true) >= 1) {
-                const requestedWorker = this.pendingWorkerRequests.dequeue();
-                const job = JSON.parse(requestedWorker.job);
-                this.allocateWorkers(job, requestedWorker.workers)
-                    .catch((err) => {
-                        const error = new TSError(err, {
-                            reason: 'Error processing pending requests'
-                        });
-                        this.logger.error(error);
-                    });
-            }
-        }, 500, { leading: false, trailing: true });
-        this.events.on('cluster:available_workers', schedulePendingRequests);
-    }
-    async _cleanUpNode(nodeId) {
-        // check workers and slicers
-        const node = this._checkNode(this.clusterState[nodeId]);
-        // if disconnected node had a slicer, we stop the execution of each slicer on it
-        // and mark it as failed
-        if (node.hasSlicer) {
-            await pMap(Object.values(node.slicerExecutions), async (exId) => {
-                const errMsg = `node ${nodeId} has been disconnected from cluster_master past the allowed timeout, it has an active slicer for execution: ${exId} which will be marked as terminated and shut down`;
-                this.logger.error(errMsg);
-                const metaData = this.executionStore.executionMetaData(null, errMsg);
-                this.pendingWorkerRequests.remove(exId, 'ex_id');
-                try {
-                    await this.executionStore.setStatus(exId, 'terminated', metaData);
-                }
-                catch (err) {
-                    this.logger.error(err, `failure to set execution ${exId} status to terminated`);
-                }
-                finally {
-                    this.messaging.broadcast('cluster:execution:stop', { ex_id: exId });
-                }
-            });
-        }
-        // for any other worker not part of what is being shutdown, we attempt to reallocate
-        await pMap(Object.keys(node.workerExecutions), async (exId) => {
-            // looking for unique ex_id's not in slicerJobID
-            if (!node.slicerExecutions[exId]) {
-                const activeWorkers = this.clusterState[nodeId].active;
-                const numOfWorkers = activeWorkers.filter((worker) => worker.ex_id === exId).length;
-                try {
-                    const execution = await this.executionStore.getActiveExecution(exId);
-                    this.addWorkers(execution, numOfWorkers);
-                }
-                catch (err) {
-                    this.logger.error(err, `failure to add workers to execution ${exId}`);
-                }
-            }
-        });
-        // cleanup key so we don't get ever growing obj
-        delete this.droppedNodes[nodeId];
-        delete this.clusterState[nodeId];
-    }
-    async initialize() {
-        this.logger.info('native clustering initializing');
-        this.executionStore = this.context.stores.executionStorage;
-        if (!this.executionStore) {
-            throw new Error('Missing required stores');
-        }
-        const server = this.clusterMasterServer.httpServer;
-        await this.messaging.listen({ server });
-        this.clusterStateInterval = setInterval(() => {
-            this.logger.trace('cluster_master requesting state update for all nodes');
-            this.messaging.broadcast('cluster:node:state');
-        }, this.nodeStateInterval);
-    }
-    getClusterState() {
-        return cloneDeep(this.clusterState);
-    }
-    _checkNode(node) {
-        const obj = {
-            hasSlicer: false,
-            numOfSlicers: 0,
-            slicerExecutions: {},
-            workerExecutions: {},
-            numOfWorkers: 0,
-            available: node.available
-        };
-        return node.active.reduce((prev, curr) => {
-            if (curr.assignment === 'execution_controller') {
-                prev.hasSlicer = true;
-                prev.numOfSlicers += 1;
-                prev.slicerExecutions[curr.ex_id] = curr.ex_id;
-            }
-            if (curr.assignment === 'worker') {
-                prev.numOfWorkers += 1;
-                // if not resgistered, set it to one, if so then increment it
-                if (!prev.workerExecutions[curr.ex_id]) {
-                    prev.workerExecutions[curr.ex_id] = 1;
-                }
-                else {
-                    prev.workerExecutions[curr.ex_id] += 1;
-                }
-            }
-            return prev;
-        }, obj);
-    }
-    _findNodeForSlicer(stateArray, errorNodes) {
-        let slicerNode = null;
-        for (let i = 0; i < stateArray.length; i += 1) {
-            if (stateArray[i].state === 'connected' && stateArray[i].available > 0 && !errorNodes[stateArray[i].node_id]) {
-                const node = this._checkNode(stateArray[i]);
-                if (!node.hasSlicer) {
-                    slicerNode = stateArray[i].node_id;
-                    break;
-                }
-            }
-        }
-        // if all nodes have a slicer
-        if (!slicerNode) {
-            // list is already sorted by num available since stateArray is sorted
-            slicerNode = stateArray[0].node_id;
-        }
-        return slicerNode;
-    }
-    _findNodesForExecution(exId, slicerOnly) {
-        const nodes = [];
-        for (const [, node] of Object.entries(this.clusterState)) {
-            if (node.state !== 'disconnected') {
-                const hasJob = node.active.filter((worker) => {
-                    if (slicerOnly) {
-                        return worker.ex_id === exId && worker.assignment === 'execution_controller';
-                    }
-                    return worker.ex_id === exId;
-                });
-                if (hasJob.length >= 1) {
-                    nodes.push({
-                        node_id: node.node_id,
-                        ex_id: exId,
-                        hostname: node.hostname,
-                        workers: hasJob
-                    });
-                }
-            }
-        }
-        return nodes;
-    }
-    _availableWorkers(all, forceCheck) {
-        let num = 0;
-        // determine which key to search for in cluster state
-        if (this.pendingWorkerRequests.size() === 0 || forceCheck) {
-            const key = all ? 'total' : 'available';
-            for (const [, node] of Object.entries(this.clusterState)) {
-                if (node.state === 'connected') {
-                    num += node[key];
-                }
-            }
-        }
-        return num;
-    }
-    _findPort(nodeId) {
-        return this.messaging.send({
-            to: 'node_master',
-            address: nodeId,
-            message: 'cluster:node:get_port',
-            response: true
-        });
-    }
-    // designed to allocate additional workers, not any future slicers
-    async allocateWorkers(execution, numOfWorkersRequested) {
-        const exId = execution.ex_id;
-        const jobId = execution.job_id;
-        const jobStr = JSON.stringify(execution);
-        const sortedNodes = orderBy(this.clusterState, 'available', 'desc');
-        let workersRequested = numOfWorkersRequested;
-        let availWorkers = this._availableWorkers(false, true);
-        const dispatch = new Dispatch();
-        while (workersRequested > 0 && availWorkers > 0) {
-            for (let i = 0; i < sortedNodes.length; i += 1) {
-                // each iteration check if it can allocate
-                if (workersRequested > 0 && availWorkers > 0) {
-                    if (sortedNodes[i].available >= 1) {
-                        dispatch.set(sortedNodes[i].node_id, 1);
-                        availWorkers -= 1;
-                        workersRequested -= 1;
-                    }
-                }
-                else {
-                    break;
-                }
-            }
-        }
-        // if left over worker requests, enqueue them, queue works based off of id
-        // so it redundantly references ex_id
-        const workerData = {
-            job: jobStr,
-            id: exId,
-            ex_id: exId,
-            job_id: jobId,
-            workers: 1,
-            assignment: 'worker'
-        };
-        while (workersRequested > 0) {
-            this.logger.trace(`adding worker to pending queue for ex: ${exId}`);
-            this.pendingWorkerRequests.enqueue(workerData);
-            workersRequested -= 1;
-        }
-        const results = [];
-        for (const [nodeId, workerCount] of Object.entries(dispatch.getDispatch())) {
-            const requestedWorkersData = {
-                job: jobStr,
-                id: exId,
-                ex_id: exId,
-                job_id: jobId,
-                workers: workerCount,
-                assignment: 'worker'
-            };
-            const createRequest = this.messaging.send({
-                to: 'node_master',
-                address: nodeId,
-                message: 'cluster:workers:create',
-                payload: requestedWorkersData,
-                response: true
-            }).then((msg) => {
-                const createdWorkers = get(msg, 'payload.createdWorkers');
-                if (!isInteger(createdWorkers)) {
-                    this.logger.error(`malformed response from create workers request to node ${nodeId}`, msg);
-                    return;
-                }
-                if (createdWorkers < workerCount) {
-                    this.logger.warn(`node ${nodeId} was only able to allocate ${createdWorkers} the request worker count of ${workerCount}, enqueing the remainder`);
-                    const newWorkersRequest = cloneDeep(requestedWorkersData);
-                    newWorkersRequest.workers = workerCount - createdWorkers;
-                    this.pendingWorkerRequests.enqueue(newWorkersRequest);
-                }
-                else {
-                    this.logger.debug(`node ${nodeId} allocated ${createdWorkers}`);
-                }
-            })
-                .catch((err) => {
-                    this.logger.error(err, `An error has occurred in allocating : ${workerCount} workers to node : ${nodeId}, the worker request has been enqueued`);
-                    this.pendingWorkerRequests.enqueue(requestedWorkersData);
-                });
-            results.push(createRequest);
-        }
-        // this will resolve successfully if one worker was actually allocated
-        return Promise.all(results);
-    }
-    async _createSlicer(ex, errorNodes) {
-        const execution = cloneDeep(ex);
-        const sortedNodes = orderBy(this.clusterState, 'available', 'desc');
-        const slicerNodeID = this._findNodeForSlicer(sortedNodes, errorNodes);
-        // need to mutate job so that workers will know the specific port and
-        // hostname of the created slicer
-        const portObj = await this._findPort(slicerNodeID);
-        execution.slicer_port = portObj.port;
-        execution.slicer_hostname = this.clusterState[slicerNodeID].hostname;
-        this.logger.debug(`node ${this.clusterState[slicerNodeID].hostname} has been elected for slicer, listening on port: ${portObj.port}`);
-        const exId = execution.ex_id;
-        const jobId = execution.job_id;
-        const jobStr = JSON.stringify(execution);
-        const data = {
-            job: jobStr,
-            ex_id: exId,
-            job_id: jobId,
-            workers: 1,
-            slicer_port: portObj.port,
-            node_id: slicerNodeID,
-            assignment: 'execution_controller'
-        };
-        try {
-            await this.messaging.send({
-                to: 'node_master',
-                address: slicerNodeID,
-                message: 'cluster:execution_controller:create',
-                payload: data,
-                response: true
-            });
-            return execution;
-        }
-        catch (err) {
-            const error = new TSError(err, {
-                reason: `failed to allocate execution_controller to ${slicerNodeID}`
-            });
-            this.logger.error(error);
-            errorNodes[slicerNodeID] = getFullErrorStack(error);
-            throw err;
-        }
-    }
-    async allocateSlicer(ex) {
-        let retryCount = 0;
-        const errorNodes = {};
-        const _allocateSlicer = async () => {
-            try {
-                return await this._createSlicer(ex, errorNodes);
-            }
-            catch (err) {
-                retryCount += 1;
-                if (retryCount >= this.slicerAllocationAttempts) {
-                    throw new Error(`Failed to allocate execution_controller to nodes: ${JSON.stringify(errorNodes)}`);
-                }
-                else {
-                    await pDelay(100);
-                    return _allocateSlicer();
-                }
-            }
-        };
-        return _allocateSlicer();
-    }
-    addWorkers(execution, workerNum) {
-        const workerData = {
-            job: JSON.stringify(execution),
-            id: execution.ex_id,
-            ex_id: execution.ex_id,
-            job_id: execution.job_id,
-            workers: workerNum,
-            assignment: 'worker'
-        };
-        this.pendingWorkerRequests.enqueue(workerData);
-        return { action: 'enqueued', ex_id: execution.ex_id, workerNum };
-    }
-    setWorkers(execution, workerNum) {
-        const totalWorker = findWorkersByExecutionID(this.clusterState, execution.ex_id).length;
-        if (totalWorker > workerNum) {
-            const removedWorkersCount = totalWorker - workerNum;
-            return this.removeWorkers(execution.ex_id, removedWorkersCount);
-        }
-        if (totalWorker < workerNum) {
-            const addWorkersCount = workerNum - totalWorker;
-            return this.addWorkers(execution, addWorkersCount);
-        }
-        // if they are equal then no work needs to be done
-        return { action: 'set', ex_id: execution.ex_id, workerNum };
-    }
-    removeWorkers(exId, workerNum) {
-        const dispatch = new Dispatch();
-        const workers = findWorkersByExecutionID(this.clusterState, exId);
-        let workerCount = workerNum;
-        const workersData = workers.reduce((prev, curr) => {
-            if (!prev[curr.node_id]) {
-                prev[curr.node_id] = 1;
-            }
-            else {
-                prev[curr.node_id] += 1;
-            }
-            prev._total += 1;
-            return prev;
-        }, { _total: 0 });
-        if (workerNum >= workersData._total) {
-            const errMsg = `workers to be removed: ${workerNum} cannot be >= to current workers: ${workersData._total}`;
-            const error = new TSError(errMsg, {
-                statusCode: 400,
-            });
-            this.logger.error(error);
-            return Promise.reject(error);
-        }
-        while (workerCount) {
-            for (const [key] of Object.entries(workersData)) {
-                if (key !== '_total') {
-                    if (workersData[key] >= 1 && workerCount > 0) {
-                        dispatch.set(key, 1);
-                        workersData[key] -= 1;
-                        workerCount -= 1;
-                    }
-                }
-            }
-        }
-        const nodes = dispatch.getDispatch();
-        const messagesSent = [];
-        for (const [key, val] of Object.entries(nodes)) {
-            messagesSent.push(this.messaging.send({
-                to: 'node_master',
-                address: key,
-                message: 'cluster:workers:remove',
-                ex_id: exId,
-                payload: { workers: val },
-                response: true
-            }));
-        }
-        return Promise.all(messagesSent)
-            .then(() => ({ action: 'remove', ex_id: exId, workerNum }))
-            .catch((err) => {
-                const error = new TSError(err, {
-                    reason: `Error while releasing workers from job ${exId}`
-                });
-                this.logger.error(error);
-                return Promise.reject(error);
-            });
-    }
-    _notifyNodesWithExecution(exId, messageData, excludeNode) {
-        return new Promise((resolve, reject) => {
-            let nodes = this._findNodesForExecution(exId);
-            if (excludeNode) {
-                nodes = nodes.filter((node) => node.hostname !== excludeNode);
-            }
-            else if (messageData.message !== 'cluster:execution:stop' && nodes.length === 0) {
-                // exclude node is only in regards to a shutdown on the cluster_master, which
-                // already receives the shutdown notice so it can be empty, in all other
-                // circumstances if the node list length is zero then reject
-                const error = new TSError(`Could not find active execution processes for ex_id: ${exId}`);
-                error.statusCode = 404;
-                reject(error);
-                return;
-            }
-            const promises = nodes.map((node) => {
-                const sendingMsg = Object.assign(messageData, {
-                    to: 'node_master',
-                    address: node.node_id,
-                    ex_id: exId,
-                    response: false
-                });
-                this.logger.trace(`notifying node ${node.node_id} to stop execution ${exId}`, sendingMsg);
-                return this.messaging.send(sendingMsg);
-            });
-            Promise.all(promises)
-                .then(() => {
-                    resolve(true);
-                })
-                .catch((err) => {
-                    const error = new Error(`Failure to notify node with execution ${exId}, caused by ${err.message}`);
-                    this.logger.error(error);
-                    reject(error);
-                });
-        });
-    }
-    readyForAllocation() {
-        return this._availableWorkers() >= 2;
-    }
-    clusterAvailable() { }
-    async stopExecution(exId, options) {
-        // we are allowing stopExecution to be non blocking, we block at api level
-        this.pendingWorkerRequests.remove(exId, 'ex_id');
-        const sendingMessage = { message: 'cluster:execution:stop' };
-        if (options?.timeout) {
-            sendingMessage.timeout = options.timeout;
-        }
-        return this._notifyNodesWithExecution(exId, sendingMessage, options?.excludeNode);
-    }
-    async shutdown() {
-        clearInterval(this.clusterStateInterval);
-        this.logger.info('native clustering shutting down');
-        if (this.messaging) {
-            await this.messaging.shutdown();
-        }
-        else {
-            await pDelay(100);
-        }
-    }
-    async listResourcesForJobId() {
-        return [];
-    }
-}
-//# sourceMappingURL=index.js.map