teraslice 2.10.0 → 2.12.0
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
- package/dist/src/interfaces.js +12 -0
- package/dist/src/lib/cluster/cluster_master.js +246 -0
- package/dist/src/lib/cluster/node_master.js +355 -0
- package/dist/src/lib/cluster/services/api.js +663 -0
- package/dist/src/lib/cluster/services/assets.js +226 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/index.js +192 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8s.js +481 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +414 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +59 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/utils.js +43 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/index.js +192 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/interfaces.js +2 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8s.js +423 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sDeploymentResource.js +60 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sJobResource.js +55 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.js +359 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sServiceResource.js +37 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sState.js +60 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/utils.js +170 -0
- package/dist/src/lib/cluster/services/cluster/backends/native/dispatch.js +13 -0
- package/dist/src/lib/cluster/services/cluster/backends/native/index.js +526 -0
- package/dist/src/lib/cluster/services/cluster/backends/native/messaging.js +547 -0
- package/dist/src/lib/cluster/services/cluster/backends/state-utils.js +26 -0
- package/dist/src/lib/cluster/services/cluster/index.js +17 -0
- package/dist/src/lib/cluster/services/execution.js +435 -0
- package/dist/src/lib/cluster/services/index.js +6 -0
- package/dist/src/lib/cluster/services/interfaces.js +2 -0
- package/dist/src/lib/cluster/services/jobs.js +454 -0
- package/dist/src/lib/config/default-sysconfig.js +26 -0
- package/dist/src/lib/config/index.js +22 -0
- package/dist/src/lib/config/schemas/system.js +360 -0
- package/dist/src/lib/storage/analytics.js +86 -0
- package/dist/src/lib/storage/assets.js +401 -0
- package/dist/src/lib/storage/backends/elasticsearch_store.js +494 -0
- package/dist/src/lib/storage/backends/mappings/analytics.js +50 -0
- package/dist/src/lib/storage/backends/mappings/asset.js +41 -0
- package/dist/src/lib/storage/backends/mappings/ex.js +62 -0
- package/dist/src/lib/storage/backends/mappings/job.js +38 -0
- package/dist/src/lib/storage/backends/mappings/state.js +38 -0
- package/dist/src/lib/storage/backends/s3_store.js +237 -0
- package/dist/src/lib/storage/execution.js +300 -0
- package/dist/src/lib/storage/index.js +7 -0
- package/dist/src/lib/storage/jobs.js +81 -0
- package/dist/src/lib/storage/state.js +255 -0
- package/dist/src/lib/utils/api_utils.js +157 -0
- package/dist/src/lib/utils/asset_utils.js +94 -0
- package/dist/src/lib/utils/date_utils.js +52 -0
- package/dist/src/lib/utils/encoding_utils.js +27 -0
- package/dist/src/lib/utils/events.js +4 -0
- package/dist/src/lib/utils/file_utils.js +124 -0
- package/dist/src/lib/utils/id_utils.js +15 -0
- package/dist/src/lib/utils/port_utils.js +32 -0
- package/dist/src/lib/workers/assets/index.js +3 -0
- package/dist/src/lib/workers/assets/loader-executable.js +40 -0
- package/dist/src/lib/workers/assets/loader.js +73 -0
- package/dist/src/lib/workers/assets/spawn.js +55 -0
- package/dist/src/lib/workers/context/execution-context.js +12 -0
- package/dist/src/lib/workers/context/terafoundation-context.js +8 -0
- package/dist/src/lib/workers/execution-controller/execution-analytics.js +188 -0
- package/dist/src/lib/workers/execution-controller/index.js +1024 -0
- package/dist/src/lib/workers/execution-controller/recovery.js +151 -0
- package/dist/src/lib/workers/execution-controller/scheduler.js +390 -0
- package/dist/src/lib/workers/execution-controller/slice-analytics.js +96 -0
- package/dist/src/lib/workers/helpers/job.js +80 -0
- package/dist/src/lib/workers/helpers/op-analytics.js +22 -0
- package/dist/src/lib/workers/helpers/terafoundation.js +34 -0
- package/dist/src/lib/workers/helpers/worker-shutdown.js +169 -0
- package/dist/src/lib/workers/metrics/index.js +108 -0
- package/dist/src/lib/workers/worker/index.js +378 -0
- package/dist/src/lib/workers/worker/slice.js +122 -0
- package/dist/test/config/schemas/system_schema-spec.js +37 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js +316 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sResource-spec.js +795 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-multicluster-spec.js +67 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-spec.js +84 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js +132 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8s-v2-spec.js +455 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sResource-v2-spec.js +818 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-multicluster-v2-spec.js +67 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-v2-spec.js +84 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/utils-v2-spec.js +320 -0
- package/dist/test/lib/cluster/services/cluster/backends/state-utils-spec.js +37 -0
- package/dist/test/node_master-spec.js +188 -0
- package/dist/test/services/api-spec.js +80 -0
- package/dist/test/services/assets-spec.js +158 -0
- package/dist/test/services/messaging-spec.js +440 -0
- package/dist/test/storage/assets_storage-spec.js +95 -0
- package/dist/test/storage/s3_store-spec.js +138 -0
- package/dist/test/test.config.js +8 -0
- package/dist/test/test.setup.js +6 -0
- package/dist/test/utils/api_utils-spec.js +86 -0
- package/dist/test/utils/asset_utils-spec.js +141 -0
- package/dist/test/utils/elastic_utils-spec.js +25 -0
- package/dist/test/workers/execution-controller/execution-controller-spec.js +371 -0
- package/dist/test/workers/execution-controller/execution-special-test-cases-spec.js +520 -0
- package/dist/test/workers/execution-controller/execution-test-cases-spec.js +338 -0
- package/dist/test/workers/execution-controller/recovery-spec.js +160 -0
- package/dist/test/workers/execution-controller/scheduler-spec.js +249 -0
- package/dist/test/workers/execution-controller/slice-analytics-spec.js +121 -0
- package/dist/test/workers/fixtures/ops/example-op/processor.js +20 -0
- package/dist/test/workers/fixtures/ops/example-op/schema.js +19 -0
- package/dist/test/workers/fixtures/ops/example-reader/fetcher.js +20 -0
- package/dist/test/workers/fixtures/ops/example-reader/schema.js +41 -0
- package/dist/test/workers/fixtures/ops/example-reader/slicer.js +37 -0
- package/dist/test/workers/fixtures/ops/new-op/processor.js +29 -0
- package/dist/test/workers/fixtures/ops/new-op/schema.js +18 -0
- package/dist/test/workers/fixtures/ops/new-reader/fetcher.js +19 -0
- package/dist/test/workers/fixtures/ops/new-reader/schema.js +23 -0
- package/dist/test/workers/fixtures/ops/new-reader/slicer.js +13 -0
- package/dist/test/workers/helpers/configs.js +130 -0
- package/dist/test/workers/helpers/execution-controller-helper.js +49 -0
- package/dist/test/workers/helpers/index.js +5 -0
- package/dist/test/workers/helpers/test-context.js +210 -0
- package/dist/test/workers/helpers/zip-directory.js +25 -0
- package/dist/test/workers/worker/slice-spec.js +333 -0
- package/dist/test/workers/worker/worker-spec.js +356 -0
- package/package.json +94 -93
- package/service.js +0 -0
package/dist/src/lib/cluster/services/execution.js
@@ -0,0 +1,435 @@
+import { Queue, TSError, getFullErrorStack, logError, get, withoutNil, isEmpty, multiFieldSort, isString, flatten, includes, cloneDeep } from '@terascope/utils';
+import { makeLogger } from '../../workers/helpers/terafoundation.js';
+/**
+ * New execution result
+ * @typedef NewExecutionResult
+ * @property {string} job_id
+ * @property {string} ex_id
+ */
+/*
+ Execution Life Cycle for _status
+ pending -> scheduling -> running -> [ paused -> running ] -> [ stopped | completed ]
+ Exceptions
+ rejected - when a execution is rejected prior to scheduling
+ failed - when there is an error while the execution is running
+ aborted - when a execution was running at the point when the cluster shutsdown
+ */
+export class ExecutionService {
+    logger;
+    pendingExecutionQueue = new Queue();
+    isNative;
+    context;
+    clusterMasterServer;
+    executionStorage;
+    stateStorage;
+    clusterService;
+    allocateInterval;
+    reapInterval;
+    constructor(context, { clusterMasterServer }) {
+        this.context = context;
+        this.logger = makeLogger(context, 'execution_service');
+        this.isNative = context.sysconfig.teraslice.cluster_manager_type === 'native';
+        this.clusterMasterServer = clusterMasterServer;
+    }
+    async initialize() {
+        const { executionStorage, stateStorage } = this.context.stores;
+        if (executionStorage == null || stateStorage == null) {
+            throw new Error('Missing required stores');
+        }
+        const { clusterService } = this.context.services;
+        if (clusterService == null) {
+            throw new Error('Missing required services');
+        }
+        this.executionStorage = executionStorage;
+        this.stateStorage = stateStorage;
+        this.clusterService = clusterService;
+        this.logger.info('execution service is initializing...');
+        // listen for an execution finished events
+        // TODO: look closer at the types of the callback
+        this.clusterMasterServer.onExecutionFinished(this._finishExecution.bind(this));
+        // lets call this before calling it
+        // in the background
+        await this.reapExecutions();
+        const pending = await executionStorage.search('_status:pending', undefined, 10000, '_created:asc');
+        for (const execution of pending) {
+            this.logger.info(`enqueuing ${execution._status} execution: ${execution.ex_id}`);
+            this.enqueue(execution);
+        }
+        const queueSize = this.pendingExecutionQueue.size();
+        if (queueSize > 0) {
+            this.logger.info(`execution queue initialization complete, ${this.pendingExecutionQueue.size()} pending executions have been enqueued`);
+        }
+        else {
+            this.logger.debug('execution queue initialization complete');
+        }
+        const executionAllocator = this._executionAllocator().bind(this);
+        this.allocateInterval = setInterval(executionAllocator, 1000);
+        this.reapInterval = setInterval(this.reapExecutions.bind(this), this.context.sysconfig.teraslice.shutdown_timeout || 30000);
+    }
+    enqueue(ex) {
+        const size = this.pendingExecutionQueue.size();
+        this.logger.debug(ex, `enqueueing execution to be processed (queue size ${size})`);
+        this.pendingExecutionQueue.enqueue(cloneDeep(ex));
+    }
+    getClusterAnalytics() {
+        return this.clusterMasterServer.getClusterAnalytics();
+    }
+    async waitForExecutionStatus(exId, _status) {
+        const status = _status || 'stopped';
+        return new Promise((resolve) => {
+            const checkCluster = async () => {
+                const state = this.clusterService.getClusterState();
+                const dict = Object.create(null);
+                Object.values(state).forEach((node) => node.active.forEach((worker) => {
+                    if (worker.ex_id) {
+                        dict[worker.ex_id] = true;
+                    }
+                }));
+                // if found, do not resolve
+                if (dict[exId]) {
+                    setTimeout(checkCluster, 3000);
+                    return;
+                }
+                try {
+                    await this.executionStorage.verifyStatusUpdate(exId, status);
+                    await this.executionStorage.setStatus(exId, status);
+                }
+                catch (err) {
+                    logError(this.logger, err, `failure setting execution, ${exId}, to ${status}`);
+                }
+                finally {
+                    resolve(true);
+                }
+            };
+            checkCluster();
+        });
+    }
+    async shutdown() {
+        this.logger.info('shutting down');
+        clearInterval(this.allocateInterval);
+        clearInterval(this.reapInterval);
+        this.allocateInterval = undefined;
+        this.reapInterval = undefined;
+        const query = this.executionStorage.getLivingStatuses().map((str) => `_status:${str}`)
+            .join(' OR ');
+        const executions = await this.executionStorage.search(query);
+        await Promise.all(executions.map(async (execution) => {
+            if (!this.isNative)
+                return;
+            this.logger.warn(`marking execution ex_id: ${execution.ex_id}, job_id: ${execution.job_id} as terminated`);
+            const exId = execution.ex_id;
+            const { hostname } = this.context.sysconfig.teraslice;
+            // need to exclude sending a stop to cluster master host, the shutdown event
+            // has already been propagated this can cause a condition of it waiting for
+            // stop to return but it already has which pauses this service shutdown
+            await this.stopExecution(exId, { excludeNode: hostname });
+            await this.waitForExecutionStatus(exId, 'terminated');
+        }));
+    }
+    findAllWorkers() {
+        return flatten(Object.values(this.clusterService.getClusterState())
+            .filter((node) => node.state === 'connected')
+            .map((node) => {
+            const workers = node.active.filter(Boolean);
+            return workers.map((worker) => {
+                worker.node_id = node.node_id;
+                worker.hostname = node.hostname;
+                return worker;
+            });
+        }))
+            .filter(Boolean);
+    }
+    async addWorkers(exId, workerNum) {
+        return this.executionStorage.getActiveExecution(exId)
+            .then((execution) => this.clusterService.addWorkers(execution, workerNum));
+    }
+    async setWorkers(exId, workerNum) {
+        return this.executionStorage.getActiveExecution(exId)
+            .then((execution) => this.clusterService.setWorkers(execution, workerNum));
+    }
+    async removeWorkers(exId, workerNum) {
+        return this.executionStorage.getActiveExecution(exId)
+            .then((execution) => this.clusterService.removeWorkers(execution.ex_id, workerNum));
+    }
+    /**
+     * Check if the execution is in a terminal status
+     *
+     * @param {import('@terascope/job-components').ExecutionConfig} execution
+     * @returns {boolean}
+     */
+    isExecutionTerminal(execution) {
+        const terminalList = this.executionStorage.getTerminalStatuses();
+        return terminalList.find((tStat) => tStat === execution._status) != null;
+    }
+    // safely stop the execution without setting the ex status to stopping or stopped
+    async _finishExecution(exId, err) {
+        if (err) {
+            const error = new TSError(err, {
+                reason: `terminal error for execution: ${exId}, shutting down execution`,
+                context: {
+                    ex_id: exId,
+                }
+            });
+            this.logger.error(error);
+        }
+        const execution = await this.getExecutionContext(exId);
+        if (!execution) {
+            throw new Error(`Execution: ${exId} was not found to finish execution`);
+        }
+        const status = execution._status;
+        if (['stopping', 'stopped'].includes(status)) {
+            this.logger.debug(`execution ${exId} is already stopping which means there is no need to stop the execution`);
+            return;
+        }
+        const runningStatuses = this.executionStorage.getRunningStatuses();
+        if (runningStatuses.includes(status)) {
+            // This should never happen. If we get here with a running status
+            // something has gone wrong. Mark execution as failed before shutdown.
+            this.logger.warn(`Cluster_master is changing status of execution ${exId} from ${status} to failed`);
+            await this.executionStorage.setStatus(exId, 'failed', this.executionStorage.executionMetaData(null, getFullErrorStack(err)));
+        }
+        this.logger.debug(`execution ${exId} finished, shutting down execution`);
+        try {
+            await this.clusterService.stopExecution(exId);
+        }
+        catch (stopErr) {
+            const stopError = new TSError(stopErr, {
+                reason: 'error finishing the execution',
+                context: {
+                    ex_id: exId,
+                }
+            });
+            logError(this.logger, stopError);
+        }
+    }
+    async stopExecution(exId, options) {
+        const execution = await this.getExecutionContext(exId);
+        if (!execution) {
+            throw new Error(`Execution: ${exId} was not found`);
+        }
+        const isTerminal = this.isExecutionTerminal(execution);
+        if (!options.force) {
+            if (isTerminal) {
+                this.logger.info(`execution ${exId} is in terminal status "${execution._status}", it cannot be stopped`);
+                return;
+            }
+            if (execution._status === 'stopping') {
+                this.logger.info('execution is already stopping...');
+                // we are kicking this off in the background, not part of the promise chain
+                this.waitForExecutionStatus(exId);
+                return;
+            }
+            this.logger.debug(`stopping execution ${exId}...`, withoutNil(options));
+            await this.executionStorage.setStatus(exId, 'stopping');
+        }
+        else {
+            this.logger.info(`force stopping execution ${exId}...`, withoutNil(options));
+        }
+        await this.clusterService.stopExecution(exId, options);
+        // we are kicking this off in the background, not part of the promise chain
+        this.waitForExecutionStatus(exId);
+    }
+    async pauseExecution(exId) {
+        const status = 'paused';
+        const execution = await this.executionStorage.getActiveExecution(exId);
+        if (!this.clusterMasterServer.isClientReady(execution.ex_id)) {
+            throw new Error(`Execution ${execution.ex_id} is not available to pause`);
+        }
+        await this.clusterMasterServer.sendExecutionPause(exId);
+        await this.executionStorage.setStatus(exId, status);
+        return { status };
+    }
+    async resumeExecution(exId) {
+        const status = 'running';
+        const execution = await this.executionStorage.getActiveExecution(exId);
+        if (!this.clusterMasterServer.isClientReady(execution.ex_id)) {
+            throw new Error(`Execution ${execution.ex_id} is not available to resume`);
+        }
+        await this.clusterMasterServer.sendExecutionResume(execution.ex_id);
+        await this.executionStorage.setStatus(execution.ex_id, status);
+        return { status };
+    }
+    async getControllerStats(exId) {
+        // if no exId is provided it returns all running executions
+        const specificId = exId ?? false;
+        const exIds = await this.getRunningExecutions(exId);
+        const clients = this.clusterMasterServer.onlineClients.filter(({ clientId }) => {
+            if (specificId && clientId === specificId)
+                return true;
+            return includes(exIds, clientId);
+        });
+        function formatResponse(msg) {
+            const payload = get(msg, 'payload', {});
+            const identifiers = {
+                ex_id: payload.ex_id,
+                job_id: payload.job_id,
+                name: payload.name
+            };
+            return Object.assign(identifiers, payload.stats);
+        }
+        if (isEmpty(clients)) {
+            if (specificId) {
+                throw new TSError(`Could not find active slicer for ex_id: ${specificId}`, {
+                    statusCode: 404
+                });
+            }
+            return [];
+        }
+        const promises = clients.map((client) => {
+            const { clientId } = client;
+            return this.clusterMasterServer
+                .sendExecutionAnalyticsRequest(clientId)
+                .then(formatResponse);
+        });
+        const results = await Promise.all(promises);
+        return multiFieldSort(results, ['name', 'started']).reverse();
+    }
+    /**
+     * Create a new execution context
+     *
+     * @param {string|import('@terascope/job-components').JobConfigParams} job
+     * @return {Promise<NewExecutionResult>}
+     */
+    async createExecutionContext(job) {
+        const ex = await this.executionStorage.create(job);
+        this.enqueue(ex);
+        return { job_id: ex.job_id, ex_id: ex.ex_id };
+    }
+    async getExecutionContext(exId) {
+        try {
+            const record = this.executionStorage.get(exId);
+            if (!record) {
+                throw new Error(`Execution ${exId} was not found`);
+            }
+            return record;
+        }
+        catch (err) {
+            logError(this.logger, err, `error getting execution context for ex: ${exId}`);
+            throw err;
+        }
+    }
+    async softDeleteExecutionContext(exId) {
+        const exIds = await this.getRunningExecutions(exId);
+        if (exIds.length > 0) {
+            throw new TSError(`Execution ${exId} is currently running, cannot delete a running execution.`, {
+                statusCode: 409
+            });
+        }
+        return this.executionStorage.softDelete(exId);
+    }
+    async getRunningExecutions(exId) {
+        let query = this.executionStorage.getRunningStatuses().map((state) => ` _status:${state} `)
+            .join('OR');
+        if (exId) {
+            query = `ex_id:"${exId}" AND (${query.trim()})`;
+        }
+        const exs = await this.executionStorage.search(query, undefined, undefined, '_created:desc');
+        return exs.map((ex) => ex.ex_id);
+    }
+    /**
+     * Recover the execution
+     *
+     * @param {string|import('@terascope/job-components').ExecutionConfig} exIdOrEx
+     * @param {import('@terascope/job-components').RecoveryCleanupType} [cleanupType]
+     * @return {Promise<NewExecutionResult>}
+     */
+    async recoverExecution(exIdOrEx, cleanupType) {
+        const recoverFromEx = isString(exIdOrEx)
+            ? await this.getExecutionContext(exIdOrEx)
+            : cloneDeep(exIdOrEx);
+        if (!recoverFromEx) {
+            throw new Error(`Could not find execution: ${exIdOrEx} to recover from`);
+        }
+        const ex = await this.executionStorage.createRecoveredExecution(recoverFromEx, cleanupType);
+        this.enqueue(ex);
+        return { job_id: ex.job_id, ex_id: ex.ex_id };
+    }
+    _executionAllocator() {
+        let allocatingExecution = false;
+        const allocator = async () => {
+            const canAllocate = !allocatingExecution
+                && this.pendingExecutionQueue.size() > 0
+                && this.clusterService.readyForAllocation();
+            if (!canAllocate)
+                return;
+            allocatingExecution = true;
+            let execution = this.pendingExecutionQueue.dequeue();
+            this.logger.info(`Scheduling execution: ${execution.ex_id}`);
+            try {
+                execution = await this.executionStorage.setStatus(execution.ex_id, 'scheduling');
+                execution = await this.clusterService.allocateSlicer(execution);
+                execution = await this.executionStorage.setStatus(execution.ex_id, 'initializing', {
+                    slicer_port: execution.slicer_port,
+                    slicer_hostname: execution.slicer_hostname
+                });
+                try {
+                    await this.clusterService.allocateWorkers(execution, execution.workers);
+                }
+                catch (err) {
+                    throw new TSError(err, {
+                        reason: `Failure to allocateWorkers ${execution.ex_id}`
+                    });
+                }
+            }
+            catch (err) {
+                const msg = `Failed to provision execution ${execution.ex_id}`;
+                const error = new TSError(err, {
+                    reason: msg
+                });
+                this.logger.warn(msg);
+                try {
+                    await this.executionStorage.setStatus(execution.ex_id, 'failed', this.executionStorage.executionMetaData(null, getFullErrorStack(error)));
+                }
+                catch (failedErr) {
+                    this.logger.error(new TSError(err, {
+                        reason: 'Failure to set execution status to failed after provision failed'
+                    }));
+                }
+                const clusteringType = this.context.sysconfig.teraslice.cluster_manager_type;
+                if (clusteringType === 'kubernetes' || clusteringType === 'kubernetesV2') {
+                    // Since this condition is only hit in cases where the pods
+                    // are never scheduled, all this call to stopExecution
+                    // accomplishes is to delete the k8s resources, which is
+                    // probably just the k8s job for the execution controller.
+                    // Calling delete on the worker deployment that doesn't
+                    // exist is OK.
+                    this.logger.warn(`Calling stopExecution on execution: ${execution.ex_id} to clean up k8s resources.`);
+                    await this.clusterService.stopExecution(execution.ex_id);
+                }
+            }
+            finally {
+                allocatingExecution = false;
+                allocator();
+            }
+        };
+        return allocator;
+    }
+    async reapExecutions() {
+        // make sure to capture the error avoid throwing an
+        // unhandled rejection
+        try {
+            // sometimes in development an execution gets stuck in stopping
+            // status since the process gets force killed in before it
+            // can be updated to stopped.
+            const stopping = await this.executionStorage.search('_status:stopping');
+            for (const execution of stopping) {
+                const updatedAt = new Date(execution._updated).getTime();
+                const timeout = this.context.sysconfig.teraslice.shutdown_timeout;
+                const updatedWithTimeout = updatedAt + timeout;
+                // Since we don't want to break executions that actually are "stopping"
+                // we need to verify that the job has exceeded the shutdown timeout
+                if (Date.now() > updatedWithTimeout) {
+                    this.logger.info(`stopping stuck executing ${execution._status} execution: ${execution.ex_id}`);
+                    await this.executionStorage.setStatus(execution.ex_id, 'stopped');
+                }
+            }
+        }
+        catch (err) {
+            this.logger.error(err, 'failure reaping executions');
+        }
+    }
+    async listResourcesForJobId(jobId) {
+        return this.clusterService.listResourcesForJobId(jobId);
+    }
+}
+//# sourceMappingURL=execution.js.map
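For orientation, below is a minimal usage sketch of the ExecutionService API added in this file. It assumes the caller already has a terafoundation context (with context.stores and context.services populated) and a cluster master server instance, which in the real package are wired up by the cluster master (see cluster_master.js in the file list above); the runExample function and jobSpec argument are hypothetical stand-ins, not part of the package.

// Hypothetical sketch only: illustrates the call sequence against the
// ExecutionService shown in the diff above. `context`, `clusterMasterServer`
// and `jobSpec` are assumed to be supplied by the cluster master wiring.
import { ExecutionService } from './execution.js';

async function runExample(context, clusterMasterServer, jobSpec) {
    const service = new ExecutionService(context, { clusterMasterServer });

    // Wires up the stores/services and starts the allocation interval (1s)
    // and the reaper interval (shutdown_timeout, default 30s).
    await service.initialize();

    // Persists the execution as `pending` and enqueues it; the allocator
    // then moves it through scheduling -> initializing as the slicer and
    // workers are allocated.
    const { job_id, ex_id } = await service.createExecutionContext(jobSpec);
    console.log(`created execution ${ex_id} for job ${job_id}`);

    // stopExecution reads `options.force`, so always pass an options object.
    await service.stopExecution(ex_id, { force: false });

    // Clears the intervals and, under the native cluster manager, stops any
    // still-living executions before returning.
    await service.shutdown();
}

The same service also exposes pauseExecution, resumeExecution, recoverExecution and getControllerStats, which follow the _status life cycle described in the comment at the top of the file.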