teraslice 2.10.0 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/interfaces.js +12 -0
- package/dist/src/lib/cluster/cluster_master.js +246 -0
- package/dist/src/lib/cluster/node_master.js +355 -0
- package/dist/src/lib/cluster/services/api.js +663 -0
- package/dist/src/lib/cluster/services/assets.js +226 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/index.js +192 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8s.js +481 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +414 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +59 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetes/utils.js +43 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/index.js +192 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/interfaces.js +2 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8s.js +423 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sDeploymentResource.js +60 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sJobResource.js +55 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.js +359 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sServiceResource.js +37 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sState.js +60 -0
- package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/utils.js +170 -0
- package/dist/src/lib/cluster/services/cluster/backends/native/dispatch.js +13 -0
- package/dist/src/lib/cluster/services/cluster/backends/native/index.js +526 -0
- package/dist/src/lib/cluster/services/cluster/backends/native/messaging.js +547 -0
- package/dist/src/lib/cluster/services/cluster/backends/state-utils.js +26 -0
- package/dist/src/lib/cluster/services/cluster/index.js +17 -0
- package/dist/src/lib/cluster/services/execution.js +435 -0
- package/dist/src/lib/cluster/services/index.js +6 -0
- package/dist/src/lib/cluster/services/interfaces.js +2 -0
- package/dist/src/lib/cluster/services/jobs.js +454 -0
- package/dist/src/lib/config/default-sysconfig.js +26 -0
- package/dist/src/lib/config/index.js +22 -0
- package/dist/src/lib/config/schemas/system.js +360 -0
- package/dist/src/lib/storage/analytics.js +86 -0
- package/dist/src/lib/storage/assets.js +401 -0
- package/dist/src/lib/storage/backends/elasticsearch_store.js +494 -0
- package/dist/src/lib/storage/backends/mappings/analytics.js +50 -0
- package/dist/src/lib/storage/backends/mappings/asset.js +41 -0
- package/dist/src/lib/storage/backends/mappings/ex.js +62 -0
- package/dist/src/lib/storage/backends/mappings/job.js +38 -0
- package/dist/src/lib/storage/backends/mappings/state.js +38 -0
- package/dist/src/lib/storage/backends/s3_store.js +237 -0
- package/dist/src/lib/storage/execution.js +300 -0
- package/dist/src/lib/storage/index.js +7 -0
- package/dist/src/lib/storage/jobs.js +81 -0
- package/dist/src/lib/storage/state.js +255 -0
- package/dist/src/lib/utils/api_utils.js +157 -0
- package/dist/src/lib/utils/asset_utils.js +94 -0
- package/dist/src/lib/utils/date_utils.js +52 -0
- package/dist/src/lib/utils/encoding_utils.js +27 -0
- package/dist/src/lib/utils/events.js +4 -0
- package/dist/src/lib/utils/file_utils.js +124 -0
- package/dist/src/lib/utils/id_utils.js +15 -0
- package/dist/src/lib/utils/port_utils.js +32 -0
- package/dist/src/lib/workers/assets/index.js +3 -0
- package/dist/src/lib/workers/assets/loader-executable.js +40 -0
- package/dist/src/lib/workers/assets/loader.js +73 -0
- package/dist/src/lib/workers/assets/spawn.js +55 -0
- package/dist/src/lib/workers/context/execution-context.js +12 -0
- package/dist/src/lib/workers/context/terafoundation-context.js +8 -0
- package/dist/src/lib/workers/execution-controller/execution-analytics.js +188 -0
- package/dist/src/lib/workers/execution-controller/index.js +1024 -0
- package/dist/src/lib/workers/execution-controller/recovery.js +151 -0
- package/dist/src/lib/workers/execution-controller/scheduler.js +390 -0
- package/dist/src/lib/workers/execution-controller/slice-analytics.js +96 -0
- package/dist/src/lib/workers/helpers/job.js +80 -0
- package/dist/src/lib/workers/helpers/op-analytics.js +22 -0
- package/dist/src/lib/workers/helpers/terafoundation.js +34 -0
- package/dist/src/lib/workers/helpers/worker-shutdown.js +169 -0
- package/dist/src/lib/workers/metrics/index.js +108 -0
- package/dist/src/lib/workers/worker/index.js +378 -0
- package/dist/src/lib/workers/worker/slice.js +122 -0
- package/dist/test/config/schemas/system_schema-spec.js +37 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js +316 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sResource-spec.js +795 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-multicluster-spec.js +67 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-spec.js +84 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js +132 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8s-v2-spec.js +455 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sResource-v2-spec.js +818 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-multicluster-v2-spec.js +67 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-v2-spec.js +84 -0
- package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/utils-v2-spec.js +320 -0
- package/dist/test/lib/cluster/services/cluster/backends/state-utils-spec.js +37 -0
- package/dist/test/node_master-spec.js +188 -0
- package/dist/test/services/api-spec.js +80 -0
- package/dist/test/services/assets-spec.js +158 -0
- package/dist/test/services/messaging-spec.js +440 -0
- package/dist/test/storage/assets_storage-spec.js +95 -0
- package/dist/test/storage/s3_store-spec.js +138 -0
- package/dist/test/test.config.js +8 -0
- package/dist/test/test.setup.js +6 -0
- package/dist/test/utils/api_utils-spec.js +86 -0
- package/dist/test/utils/asset_utils-spec.js +141 -0
- package/dist/test/utils/elastic_utils-spec.js +25 -0
- package/dist/test/workers/execution-controller/execution-controller-spec.js +371 -0
- package/dist/test/workers/execution-controller/execution-special-test-cases-spec.js +520 -0
- package/dist/test/workers/execution-controller/execution-test-cases-spec.js +338 -0
- package/dist/test/workers/execution-controller/recovery-spec.js +160 -0
- package/dist/test/workers/execution-controller/scheduler-spec.js +249 -0
- package/dist/test/workers/execution-controller/slice-analytics-spec.js +121 -0
- package/dist/test/workers/fixtures/ops/example-op/processor.js +20 -0
- package/dist/test/workers/fixtures/ops/example-op/schema.js +19 -0
- package/dist/test/workers/fixtures/ops/example-reader/fetcher.js +20 -0
- package/dist/test/workers/fixtures/ops/example-reader/schema.js +41 -0
- package/dist/test/workers/fixtures/ops/example-reader/slicer.js +37 -0
- package/dist/test/workers/fixtures/ops/new-op/processor.js +29 -0
- package/dist/test/workers/fixtures/ops/new-op/schema.js +18 -0
- package/dist/test/workers/fixtures/ops/new-reader/fetcher.js +19 -0
- package/dist/test/workers/fixtures/ops/new-reader/schema.js +23 -0
- package/dist/test/workers/fixtures/ops/new-reader/slicer.js +13 -0
- package/dist/test/workers/helpers/configs.js +130 -0
- package/dist/test/workers/helpers/execution-controller-helper.js +49 -0
- package/dist/test/workers/helpers/index.js +5 -0
- package/dist/test/workers/helpers/test-context.js +210 -0
- package/dist/test/workers/helpers/zip-directory.js +25 -0
- package/dist/test/workers/worker/slice-spec.js +333 -0
- package/dist/test/workers/worker/worker-spec.js +356 -0
- package/package.json +94 -93
- package/service.js +0 -0
|
@@ -0,0 +1,1024 @@
|
|
|
1
|
+
import ms from 'ms';
|
|
2
|
+
import { formatURL, ExecutionController as ExController, ClusterMaster } from '@terascope/teraslice-messaging';
|
|
3
|
+
import { TSError, includes, get, pDelay, getFullErrorStack, logError, pWhile, makeISODate, debounce, throttle } from '@terascope/utils';
|
|
4
|
+
import { isPromAvailable } from '@terascope/job-components';
|
|
5
|
+
import { waitForWorkerShutdown } from '../helpers/worker-shutdown.js';
|
|
6
|
+
import { StateStorage, ExecutionStorage, SliceState } from '../../storage/index.js';
|
|
7
|
+
import { makeLogger, generateWorkerId } from '../helpers/terafoundation.js';
|
|
8
|
+
import { ExecutionAnalytics } from './execution-analytics.js';
|
|
9
|
+
import { SliceAnalytics } from './slice-analytics.js';
|
|
10
|
+
import { Scheduler } from './scheduler.js';
|
|
11
|
+
import { Metrics } from '../metrics/index.js';
|
|
12
|
+
import { getPackageJSON } from '../../utils/file_utils.js';
|
|
13
|
+
export class ExecutionController {
|
|
14
|
+
context;
|
|
15
|
+
executionContext;
|
|
16
|
+
events;
|
|
17
|
+
logger;
|
|
18
|
+
server;
|
|
19
|
+
client;
|
|
20
|
+
stateStorage;
|
|
21
|
+
executionStorage;
|
|
22
|
+
isPaused = false;
|
|
23
|
+
isShutdown = false;
|
|
24
|
+
isShuttingDown = false;
|
|
25
|
+
isInitialized = false;
|
|
26
|
+
isStarted = false;
|
|
27
|
+
pendingDispatches = 0;
|
|
28
|
+
pendingSlices = 0;
|
|
29
|
+
isDoneProcessing = false;
|
|
30
|
+
isExecutionFinished = false;
|
|
31
|
+
isExecutionDone = false;
|
|
32
|
+
workersHaveConnected = false;
|
|
33
|
+
_handlers = new Map();
|
|
34
|
+
executionAnalytics;
|
|
35
|
+
scheduler;
|
|
36
|
+
metrics;
|
|
37
|
+
workerId;
|
|
38
|
+
exId;
|
|
39
|
+
shutdownTimeout;
|
|
40
|
+
workerDisconnectTimeout;
|
|
41
|
+
collectAnalytics;
|
|
42
|
+
slicerAnalytics;
|
|
43
|
+
_updateExecutionStats;
|
|
44
|
+
_startSliceFailureWatchDog;
|
|
45
|
+
workerConnectTimeoutId;
|
|
46
|
+
workerDisconnectTimeoutId;
|
|
47
|
+
sliceFailureInterval;
|
|
48
|
+
verifyStoresInterval;
|
|
49
|
+
slicerFailed = false;
|
|
50
|
+
startTime;
|
|
51
|
+
isDoneDispatching;
|
|
52
|
+
/**
 * Wires up the execution controller: messaging server for workers, client
 * connection back to the cluster master, analytics, scheduler, metrics,
 * and the state/execution storage backends.
 *
 * @param {object} context - terafoundation context (sysconfig, logger, apis)
 * @param {object} executionContext - the execution context for this job run
 */
constructor(context, executionContext) {
    const workerId = generateWorkerId(context);
    // Use the bunyan logger.level() function to set the log level of context.logger equal
    // to the log level of executionContext.logger.
    // If a log_level was given in the job config, it will have overwritten the default
    // log_level in the execution context.
    // NOTE: this mutates context.logger BEFORE makeLogger is called below,
    // so the child logger inherits the job-configured level.
    context.logger.level(executionContext.logger.level());
    const logger = makeLogger(context, 'execution_controller');
    const events = context.apis.foundation.getSystemEvents();
    const slicerPort = executionContext.config.slicer_port;
    const performanceMetrics = executionContext.config.performance_metrics;
    const config = context.sysconfig.teraslice;
    const networkLatencyBuffer = get(config, 'network_latency_buffer');
    const actionTimeout = get(config, 'action_timeout');
    const workerDisconnectTimeout = get(config, 'worker_disconnect_timeout');
    const nodeDisconnectTimeout = get(config, 'node_disconnect_timeout');
    const shutdownTimeout = get(config, 'shutdown_timeout');
    // messaging server that the job's workers connect to for slice dispatch
    this.server = new ExController.Server({
        port: slicerPort,
        networkLatencyBuffer,
        requestListener: this.requestListener.bind(this),
        actionTimeout,
        workerDisconnectTimeout,
        logger
    });
    const clusterMasterPort = get(config, 'port');
    const clusterMasterHostname = get(config, 'master_hostname');
    // client connection from this controller back up to the cluster master
    this.client = new ClusterMaster.Client({
        clusterMasterUrl: formatURL(clusterMasterHostname, clusterMasterPort),
        nodeDisconnectTimeout,
        networkLatencyBuffer,
        actionTimeout,
        exId: executionContext.exId,
        connectTimeout: nodeDisconnectTimeout,
        logger
    });
    this.executionAnalytics = new ExecutionAnalytics(context, executionContext, this.client);
    this.scheduler = new Scheduler(context, executionContext);
    // performance metrics are opt-in via the job config
    this.metrics = performanceMetrics
        ? new Metrics({
            logger
        })
        : null;
    this.exId = executionContext.exId;
    this.workerId = workerId;
    this.logger = logger;
    this.events = events;
    this.context = context;
    this.executionContext = executionContext;
    this.collectAnalytics = this.executionContext.config.analytics;
    this.shutdownTimeout = shutdownTimeout;
    this.workerDisconnectTimeout = workerDisconnectTimeout;
    this.executionStorage = new ExecutionStorage(context);
    this.stateStorage = new StateStorage(context);
    // TODO: see if I can remove this debounce
    // Debounced (leading + trailing, max 500ms) so bursts of slice events
    // collapse into periodic stat updates instead of one write per event.
    this._updateExecutionStats = debounce(() => {
        this._updateExecutionStatsNow();
    }, 100, {
        leading: true,
        trailing: true,
        maxWait: 500
    });
    this._startSliceFailureWatchDog = this._initSliceFailureWatchDog();
}
|
|
116
|
+
async initialize() {
|
|
117
|
+
if (this.context.sysconfig.teraslice.cluster_manager_type === 'native') {
|
|
118
|
+
this.logger.warn('Skipping PromMetricsAPI initialization: incompatible with native clustering.');
|
|
119
|
+
}
|
|
120
|
+
else {
|
|
121
|
+
const { terafoundation } = this.context.sysconfig;
|
|
122
|
+
const { config, exId, jobId } = this.executionContext;
|
|
123
|
+
await this.context.apis.foundation.promMetrics.init({
|
|
124
|
+
terasliceName: this.context.sysconfig.teraslice.name,
|
|
125
|
+
assignment: 'execution_controller',
|
|
126
|
+
logger: this.logger,
|
|
127
|
+
tf_prom_metrics_add_default: terafoundation.prom_metrics_add_default,
|
|
128
|
+
tf_prom_metrics_enabled: terafoundation.prom_metrics_enabled,
|
|
129
|
+
tf_prom_metrics_port: terafoundation.prom_metrics_port,
|
|
130
|
+
job_prom_metrics_add_default: config.prom_metrics_add_default,
|
|
131
|
+
job_prom_metrics_enabled: config.prom_metrics_enabled,
|
|
132
|
+
job_prom_metrics_port: config.prom_metrics_port,
|
|
133
|
+
labels: {
|
|
134
|
+
ex_id: exId,
|
|
135
|
+
job_id: jobId,
|
|
136
|
+
job_name: config.name,
|
|
137
|
+
assignment: 'execution_controller'
|
|
138
|
+
},
|
|
139
|
+
prefix: 'teraslice_job_',
|
|
140
|
+
prom_metrics_display_url: terafoundation.prom_metrics_display_url
|
|
141
|
+
});
|
|
142
|
+
await this.setupPromMetrics();
|
|
143
|
+
}
|
|
144
|
+
await Promise.all([
|
|
145
|
+
this.executionStorage.initialize(),
|
|
146
|
+
this.stateStorage.initialize(),
|
|
147
|
+
this.client.start()
|
|
148
|
+
]);
|
|
149
|
+
let verified;
|
|
150
|
+
let verifiedErr;
|
|
151
|
+
try {
|
|
152
|
+
verified = await this._verifyExecution();
|
|
153
|
+
}
|
|
154
|
+
catch (err) {
|
|
155
|
+
verifiedErr = err;
|
|
156
|
+
}
|
|
157
|
+
if (!verified) {
|
|
158
|
+
this.isShutdown = true;
|
|
159
|
+
await Promise.all([
|
|
160
|
+
this.executionStorage.shutdown(true),
|
|
161
|
+
this.stateStorage.shutdown(true),
|
|
162
|
+
this.client.shutdown()
|
|
163
|
+
]);
|
|
164
|
+
if (verifiedErr) {
|
|
165
|
+
throw verifiedErr;
|
|
166
|
+
}
|
|
167
|
+
return;
|
|
168
|
+
}
|
|
169
|
+
await this.server.start();
|
|
170
|
+
if (this.metrics != null) {
|
|
171
|
+
await this.metrics.initialize();
|
|
172
|
+
}
|
|
173
|
+
/// We set this to true later down the line. Not sure why
|
|
174
|
+
this.isInitialized = true;
|
|
175
|
+
this.server.onClientOnline((workerId) => {
|
|
176
|
+
clearTimeout(this.workerConnectTimeoutId);
|
|
177
|
+
this.workerConnectTimeoutId = undefined;
|
|
178
|
+
this.logger.trace(`worker ${workerId} is online`);
|
|
179
|
+
this.workersHaveConnected = true;
|
|
180
|
+
this.executionAnalytics.increment('workers_joined');
|
|
181
|
+
this._updateExecutionStats();
|
|
182
|
+
});
|
|
183
|
+
this.server.onClientAvailable((workerId) => {
|
|
184
|
+
this.logger.trace(`worker ${workerId} is available`);
|
|
185
|
+
this.executionAnalytics.set('workers_active', this.server.activeWorkerCount);
|
|
186
|
+
this.executionAnalytics.set('workers_available', this.server.availableClientCount);
|
|
187
|
+
this._updateExecutionStats();
|
|
188
|
+
});
|
|
189
|
+
this.server.onClientUnavailable(() => {
|
|
190
|
+
this.executionAnalytics.set('workers_active', this.server.activeWorkerCount);
|
|
191
|
+
this.executionAnalytics.set('workers_available', this.server.availableClientCount);
|
|
192
|
+
});
|
|
193
|
+
this.server.onClientDisconnect((workerId) => {
|
|
194
|
+
this.logger.trace(`worker ${workerId} disconnected but it may reconnect`);
|
|
195
|
+
this.executionAnalytics.increment('workers_disconnected');
|
|
196
|
+
this.executionAnalytics.set('workers_active', this.server.activeWorkerCount);
|
|
197
|
+
this._startWorkerDisconnectWatchDog();
|
|
198
|
+
this._updateExecutionStats();
|
|
199
|
+
});
|
|
200
|
+
this.server.onClientReconnect((workerId) => {
|
|
201
|
+
clearTimeout(this.workerDisconnectTimeoutId);
|
|
202
|
+
this.workerConnectTimeoutId = undefined;
|
|
203
|
+
this.logger.trace(`worker ${workerId} is reconnected`);
|
|
204
|
+
this.executionAnalytics.increment('workers_reconnected');
|
|
205
|
+
});
|
|
206
|
+
this.client.onExecutionPause(() => this.pause());
|
|
207
|
+
this.client.onExecutionResume(() => this.resume());
|
|
208
|
+
this.server.onSliceSuccess((workerId, response) => {
|
|
209
|
+
process.nextTick(() => {
|
|
210
|
+
const { slice_id: sliceId } = response.slice;
|
|
211
|
+
this.logger.info(`worker ${workerId} has completed its slice ${sliceId}`);
|
|
212
|
+
this.events.emit('slice:success', response);
|
|
213
|
+
this._removePendingSlice();
|
|
214
|
+
this._updateExecutionStats();
|
|
215
|
+
this.executionContext.onSliceComplete(response);
|
|
216
|
+
});
|
|
217
|
+
});
|
|
218
|
+
this.server.onSliceFailure((workerId, response) => {
|
|
219
|
+
process.nextTick(() => {
|
|
220
|
+
this.logger.error(`worker: ${workerId} has failure completing its slice`, response);
|
|
221
|
+
this.events.emit('slice:failure', response);
|
|
222
|
+
if (this.scheduler.canComplete()) {
|
|
223
|
+
this.setFailingStatus('slice failure event');
|
|
224
|
+
}
|
|
225
|
+
else if (this.scheduler.isRecovering()) {
|
|
226
|
+
this._terminalError(new Error('Slice failed while recovering'));
|
|
227
|
+
}
|
|
228
|
+
else {
|
|
229
|
+
// in persistent mode we set watchdogs to monitor
|
|
230
|
+
// when failing can be set back to running
|
|
231
|
+
this._startSliceFailureWatchDog();
|
|
232
|
+
}
|
|
233
|
+
this._removePendingSlice();
|
|
234
|
+
this._updateExecutionStats();
|
|
235
|
+
this.executionContext.onSliceComplete(response);
|
|
236
|
+
});
|
|
237
|
+
});
|
|
238
|
+
this._handlers.set('slicer:execution:update', (data) => {
|
|
239
|
+
this.logger.warn(data, 'event slicer:execution:update has been removed, used context.apis.executionContext.setMetadata(key, value): Promise<void>');
|
|
240
|
+
});
|
|
241
|
+
this._handlers.set('slicers:finished', (err) => {
|
|
242
|
+
if (err) {
|
|
243
|
+
this._terminalError(err);
|
|
244
|
+
}
|
|
245
|
+
});
|
|
246
|
+
this._handlers.set('recovery:failure', (err) => {
|
|
247
|
+
logError(this.logger, err, 'recovery finished due to failure');
|
|
248
|
+
this._terminalError(err);
|
|
249
|
+
});
|
|
250
|
+
for (const [event, handler] of this._handlers.entries()) {
|
|
251
|
+
if (handler !== null) {
|
|
252
|
+
this.events.on(event, handler);
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
if (this.collectAnalytics) {
|
|
256
|
+
this.slicerAnalytics = new SliceAnalytics(this.context, this.executionContext);
|
|
257
|
+
}
|
|
258
|
+
// This initializes user code, need to throw terminal error
|
|
259
|
+
// so it can be surfaced
|
|
260
|
+
try {
|
|
261
|
+
await this.scheduler.initialize(this.stateStorage, this.executionStorage);
|
|
262
|
+
}
|
|
263
|
+
catch (err) {
|
|
264
|
+
await this._terminalError(err);
|
|
265
|
+
throw err;
|
|
266
|
+
}
|
|
267
|
+
this.logger.info(`execution: ${this.exId} initialized execution_controller`);
|
|
268
|
+
this.isInitialized = true;
|
|
269
|
+
/// This will change the '/ready' endpoint to Ready
|
|
270
|
+
this.server.executionReady = true;
|
|
271
|
+
}
|
|
272
|
+
/**
 * Run the execution to completion.
 *
 * No-op unless initialize() completed. Starts the worker-connect watchdog
 * and execution analytics, runs the execution (errors are logged, not
 * rethrown), then walks through the wind-down sequence: emit
 * 'worker:shutdown', shut down the execution context, finish the
 * execution, notify the cluster master, and wait for workers to exit.
 *
 * @returns {Promise<void>}
 */
async run() {
    if (!this.isInitialized)
        return;
    this._startWorkConnectWatchDog();
    this.executionAnalytics.start();
    try {
        await this._runExecution();
    }
    catch (err) {
        // a run failure should not skip the wind-down below
        logError(this.logger, err, 'Run execution error');
    }
    this.events.emit('worker:shutdown');
    await this.executionContext.shutdown();
    // help the workers go offline
    this.server.isShuttingDown = true;
    await this._finishExecution();
    try {
        // tell the cluster master we're done while waiting for workers
        // to disconnect; both are best-effort
        await Promise.all([this.client.sendExecutionFinished(), this._waitForWorkersToExit()]);
    }
    catch (err) {
        logError(this.logger, err, 'Failure sending execution finished');
    }
    this.logger.debug(`execution ${this.exId} is done`);
}
|
|
296
|
+
async resume() {
|
|
297
|
+
if (!this.isPaused)
|
|
298
|
+
return;
|
|
299
|
+
this.logger.info(`execution ${this.exId} is resuming...`);
|
|
300
|
+
this.isPaused = false;
|
|
301
|
+
this.scheduler.start();
|
|
302
|
+
await pDelay(100);
|
|
303
|
+
}
|
|
304
|
+
async pause() {
|
|
305
|
+
if (this.isPaused)
|
|
306
|
+
return;
|
|
307
|
+
this.logger.info(`execution ${this.exId} is pausing...`);
|
|
308
|
+
this.isPaused = true;
|
|
309
|
+
this.scheduler.pause();
|
|
310
|
+
await pDelay(100);
|
|
311
|
+
}
|
|
312
|
+
async setFailingStatus(reason) {
|
|
313
|
+
const errMsg = `execution ${this.exId} has encountered a processing error, reason: ${reason}`;
|
|
314
|
+
this.logger.error(errMsg);
|
|
315
|
+
const executionStats = this.executionAnalytics.getAnalytics();
|
|
316
|
+
const errorMeta = this.executionStorage.executionMetaData(executionStats, errMsg);
|
|
317
|
+
try {
|
|
318
|
+
await this.executionStorage.setStatus(this.exId, 'failing', errorMeta);
|
|
319
|
+
}
|
|
320
|
+
catch (err) {
|
|
321
|
+
logError(this.logger, err, 'Failure to set execution status to "failing"');
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
async _terminalError(err) {
|
|
325
|
+
if (this.isExecutionDone)
|
|
326
|
+
return;
|
|
327
|
+
this.slicerFailed = true;
|
|
328
|
+
const error = new TSError(err, {
|
|
329
|
+
reason: `slicer for ex ${this.exId} had an error, shutting down execution`
|
|
330
|
+
});
|
|
331
|
+
this.logger.error(error);
|
|
332
|
+
const executionStats = this.executionAnalytics.getAnalytics();
|
|
333
|
+
const fullStack = getFullErrorStack(error);
|
|
334
|
+
const errorMeta = this.executionStorage.executionMetaData(executionStats, fullStack);
|
|
335
|
+
try {
|
|
336
|
+
await this.executionStorage.setStatus(this.exId, 'failed', errorMeta);
|
|
337
|
+
}
|
|
338
|
+
catch (_err) {
|
|
339
|
+
logError(this.logger, _err, 'failure setting status to failed');
|
|
340
|
+
}
|
|
341
|
+
this.logger.fatal(`execution ${this.exId} is ended because of slice failure`);
|
|
342
|
+
await this._endExecution();
|
|
343
|
+
}
|
|
344
|
+
/**
 * Shut down the execution controller and all of its subsystems.
 *
 * Handles several special cases before the normal teardown:
 * - an 'error' shutdown for a known-fatal error message notifies the
 *   cluster master so it can clean up job resources;
 * - a SIGTERM under kubernetesV2 while the execution is still in a
 *   running (non-'stopping') status is skipped entirely so the pod can
 *   be relocated;
 * - a repeated shutdown call optionally blocks until the in-progress
 *   shutdown completes.
 *
 * Subsystems are shut down in parallel; all of their errors are
 * collected and rethrown as a single aggregate Error after
 * 'worker:shutdown:complete' is emitted.
 *
 * @param {string} eventType - what triggered the shutdown (e.g. 'error', 'SIGTERM')
 * @param {Error} [shutdownError] - the error that caused an 'error' shutdown
 * @param {boolean} [block=true] - when already shutting down, wait for completion
 * @returns {Promise<void>}
 * @throws {Error} aggregate of any subsystem shutdown failures
 */
async shutdown(eventType, shutdownError, block = true) {
    if (eventType === 'error' && shutdownError) {
        /// Add errors to this list as needed. Errors not in this list won't cleanup resources
        const errorList = [
            'index specified in reader does not exist'
        ];
        /// Tell cluster_master that shutdown is due to a specific error
        /// Cleans up kubernetes resources. For native, kills processes
        if (errorList.includes(shutdownError.message)) {
            this.logger.warn('sent request to cluster_master to cleanup job resources.');
            await this.client.sendExecutionFinished(shutdownError.message);
        }
    }
    /// This only applies to kubernetesV2
    if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2'
        && eventType === 'SIGTERM') {
        await this.stateStorage.refresh();
        const status = await this.executionStorage.getStatus(this.exId);
        const runningStatuses = this.executionStorage.getRunningStatuses();
        this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`);
        /// This is an indication that the cluster_master did not call for this
        /// shutdown. We want to restart in this case.
        if (status !== 'stopping' && includes(runningStatuses, status)) {
            this.logger.info('Skipping shutdown to allow for relocation...');
            return;
        }
    }
    // nothing to do if already shut down or never initialized
    if (this.isShutdown)
        return;
    if (!this.isInitialized)
        return;
    if (this.isShuttingDown) {
        const msgs = [
            'execution',
            `shutdown was called for ${this.exId}`,
            'but it was already shutting down',
            block ? ', will block until done' : ''
        ];
        this.logger.debug(msgs.join(' '));
        if (block) {
            // wait for the in-progress shutdown to emit its completion event
            await waitForWorkerShutdown(this.context, 'worker:shutdown:complete');
        }
        return;
    }
    this.logger.debug(`execution shutdown was called for ex ${this.exId}`);
    // collect every subsystem failure so teardown always runs to the end
    const shutdownErrs = [];
    const pushError = (err) => {
        shutdownErrs.push(err);
    };
    // allow clients to go immediately from disconnect to offline
    this.server.isShuttingDown = true;
    // tell the scheduler to stop producing slices
    await this.scheduler.stop();
    // remove any listeners
    for (const [event, handler] of this._handlers.entries()) {
        if (handler !== null) {
            this.events.removeListener(event, handler);
            this._handlers.set(event, null);
        }
    }
    this.isShuttingDown = true;
    this.isPaused = false;
    // cancel all outstanding watchdog timers
    clearInterval(this.sliceFailureInterval);
    clearTimeout(this.workerConnectTimeoutId);
    clearTimeout(this.workerDisconnectTimeoutId);
    clearInterval(this.verifyStoresInterval);
    await this._waitForExecutionFinished();
    // shut down all subsystems in parallel, swallowing individual errors
    // into shutdownErrs so one failure can't block another's teardown
    await Promise.all([
        (async () => {
            if (!this.collectAnalytics)
                return;
            await this.slicerAnalytics.shutdown().catch(pushError);
        })(),
        (async () => {
            // the execution analytics must be shutdown
            // before the message client
            await this.executionAnalytics.shutdown().catch(pushError);
            await this.client.shutdown().catch(pushError);
        })(),
        (async () => {
            await this.scheduler.shutdown().catch(pushError);
        })(),
        (async () => {
            await this.server.shutdown().catch(pushError);
        })(),
        (async () => {
            await Promise.all([
                (async () => {
                    try {
                        await this.stateStorage.shutdown(true);
                    }
                    catch (err) {
                        pushError(err);
                    }
                })(),
                (async () => {
                    try {
                        await this.executionStorage.shutdown(true);
                    }
                    catch (err) {
                        pushError(err);
                    }
                })()
            ]);
        })(),
        (async () => {
            if (this.metrics == null)
                return;
            await this.metrics.shutdown().catch(pushError);
        })()
    ]);
    this.logger.warn(`execution controller ${this.exId} is shutdown`);
    this.isShutdown = true;
    if (shutdownErrs.length) {
        const errMsg = shutdownErrs.map((e) => e.stack).join(', and');
        const shutdownErr = new Error(`Failed to shutdown correctly: ${errMsg}`);
        // emit completion (with the error) before throwing so blocked
        // callers of shutdown() are released
        this.events.emit('worker:shutdown:complete', shutdownErr);
        await pDelay(0);
        throw shutdownErr;
    }
    this.events.emit('worker:shutdown:complete');
}
|
|
466
|
+
async _runExecution() {
|
|
467
|
+
// wait for paused
|
|
468
|
+
await pWhile(async () => {
|
|
469
|
+
if (!this.isPaused || this.isShutdown)
|
|
470
|
+
return true;
|
|
471
|
+
await pDelay(100);
|
|
472
|
+
return false;
|
|
473
|
+
});
|
|
474
|
+
this.logger.info(`starting execution ${this.exId}...`);
|
|
475
|
+
this.startTime = Date.now();
|
|
476
|
+
this.isStarted = true;
|
|
477
|
+
this._verifyStores();
|
|
478
|
+
// start creating / dispatching slices, this will block until done
|
|
479
|
+
await Promise.all([
|
|
480
|
+
this.client.sendAvailable().then(() => this.logger.debug('client.sendAvailable() promise resolved')),
|
|
481
|
+
this._runDispatch().then(() => this.logger.debug('_runDispatch() promise resolved')),
|
|
482
|
+
this.scheduler.run().then(() => this.logger.debug('scheduler.run() promise resolved'))
|
|
483
|
+
]);
|
|
484
|
+
const schedulerSuccessful = this.scheduler.isFinished && this.scheduler.slicersDone;
|
|
485
|
+
await this._waitForPendingSlices();
|
|
486
|
+
if (schedulerSuccessful && this.isDoneDispatching) {
|
|
487
|
+
this.logger.debug(`execution ${this.exId} is done processing slices`);
|
|
488
|
+
this.isDoneProcessing = true;
|
|
489
|
+
}
|
|
490
|
+
else if (!this.isShutdown) {
|
|
491
|
+
this.logger.debug(`execution ${this.exId} did not finish`);
|
|
492
|
+
}
|
|
493
|
+
else {
|
|
494
|
+
this.logger.debug(`execution ${this.exId} is exiting...`);
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
// dispatching should be pushed out into its own module
/**
 * Dispatch loop: pairs queued slices with available workers.
 *
 * Polls every 5ms. Each tick, when not paused and both a worker and a
 * slice are available, it dequeues up to workerQueueSize slices, matches
 * each to a worker, and dispatches them on the next tick; slices with no
 * worker are re-enqueued at the front of the queue. The loop ends once
 * the controller is shutting down, the execution is done, or the
 * scheduler is finished with no dispatches still pending.
 *
 * @returns {Promise<void>} resolves when dispatching is complete
 */
async _runDispatch() {
    this.isDoneDispatching = false;
    let dispatchInterval;
    // returns a boolean to indicate whether
    // dispatching should continue
    const isRunning = () => {
        if (this.isShuttingDown)
            return false;
        if (this.isExecutionDone)
            return false;
        // done only when the scheduler is finished AND nothing is in flight
        if (this.scheduler.isFinished && !this.pendingDispatches)
            return false;
        return true;
    };
    const isPaused = () => this.isPaused;
    // only dispatch when there is at least one worker AND one slice
    const canDispatch = () => {
        const workers = this.server.workerQueueSize;
        const slices = this.scheduler.queueLength;
        return workers > 0 && slices > 0;
    };
    const dequeueAndDispatch = () => {
        const reenqueue = [];
        const dispatch = [];
        const slices = this.scheduler.getSlices(this.server.workerQueueSize);
        slices.forEach((slice) => {
            const workerId = this.server.dequeueWorker(slice);
            if (!workerId) {
                // no worker available for this slice right now
                reenqueue.push(slice);
            }
            else {
                // count it as pending BEFORE the async dispatch starts
                this._addPendingDispatch();
                this._addPendingSlice();
                dispatch.push({ slice, workerId });
            }
        });
        slices.length = 0;
        if (dispatch.length > 0) {
            // defer actual dispatch so this tick stays cheap
            process.nextTick(() => {
                const promises = dispatch.map((input) => {
                    const { slice, workerId } = input;
                    return this._dispatchSlice(slice, workerId);
                });
                dispatch.length = 0;
                Promise.all(promises).catch((err) => logError(this.logger, err, 'failure to dispatch slices'));
            });
        }
        if (reenqueue.length > 0) {
            // this isn't really ideal since we adding
            // to the beginning of the queue and
            // it may end up in a recursive loop trying
            // to process that slice
            this.scheduler.enqueueSlices(reenqueue, true);
            reenqueue.length = 0;
        }
    };
    // yield once before entering the polling loop
    await pDelay(0);
    await new Promise((resolve) => {
        this.logger.debug('dispatching slices...');
        dispatchInterval = setInterval(() => {
            if (!isRunning()) {
                resolve(true);
                return;
            }
            if (isPaused())
                return;
            if (canDispatch()) {
                dequeueAndDispatch();
            }
        }, 5);
    });
    clearInterval(dispatchInterval);
    this.isDoneDispatching = true;
    this.logger.debug('done dispatching slices');
}
|
|
572
|
+
_dispatchSlice(slice, workerId) {
|
|
573
|
+
this.logger.trace(`dispatching slice ${slice.slice_id} for worker ${workerId}`);
|
|
574
|
+
return this.server
|
|
575
|
+
.dispatchSlice(slice, workerId)
|
|
576
|
+
.then((dispatched) => {
|
|
577
|
+
if (dispatched) {
|
|
578
|
+
this.logger.debug(`dispatched slice ${slice.slice_id} to worker ${workerId}`);
|
|
579
|
+
this.executionContext.onSliceDispatch(slice);
|
|
580
|
+
}
|
|
581
|
+
else {
|
|
582
|
+
this.logger.warn(`worker "${workerId}" is not available to process slice ${slice.slice_id}`);
|
|
583
|
+
this.scheduler.enqueueSlice(slice, true);
|
|
584
|
+
this._removePendingSlice();
|
|
585
|
+
}
|
|
586
|
+
this._removePendingDispatch();
|
|
587
|
+
})
|
|
588
|
+
.catch((err) => {
|
|
589
|
+
logError(this.logger, err, 'error dispatching slice');
|
|
590
|
+
this._removePendingDispatch();
|
|
591
|
+
this._removePendingSlice();
|
|
592
|
+
});
|
|
593
|
+
}
|
|
594
|
+
async _finishExecution() {
|
|
595
|
+
if (this.isExecutionFinished)
|
|
596
|
+
return;
|
|
597
|
+
this._logFinishedJob();
|
|
598
|
+
// refresh the state store index
|
|
599
|
+
// to prevent the execution from failing incorrectly
|
|
600
|
+
await this.stateStorage.refresh();
|
|
601
|
+
try {
|
|
602
|
+
await this._updateExecutionStatus();
|
|
603
|
+
}
|
|
604
|
+
catch (err) {
|
|
605
|
+
/* istanbul ignore next */
|
|
606
|
+
const error = new TSError(err, {
|
|
607
|
+
reason: `execution ${this.exId} has run to completion but the process has failed while updating the execution status, slicer will soon exit`
|
|
608
|
+
});
|
|
609
|
+
this.logger.error(error);
|
|
610
|
+
}
|
|
611
|
+
this.isExecutionFinished = true;
|
|
612
|
+
await this._endExecution();
|
|
613
|
+
}
|
|
614
|
+
/**
 * Mark the execution as done and shut the scheduler down.
 * The flag is set first so polling loops observe it immediately.
 *
 * @returns {Promise<void>}
 */
async _endExecution() {
    this.isExecutionDone = true;
    await this.scheduler.shutdown();
}
|
|
618
|
+
_updateExecutionStatsNow() {
|
|
619
|
+
this.executionContext.onExecutionStats({
|
|
620
|
+
workers: {
|
|
621
|
+
connected: this.server.onlineClientCount,
|
|
622
|
+
available: this.server.availableClientCount
|
|
623
|
+
},
|
|
624
|
+
slices: {
|
|
625
|
+
processed: this.executionAnalytics.get('processed'),
|
|
626
|
+
failed: this.executionAnalytics.get('failed')
|
|
627
|
+
}
|
|
628
|
+
});
|
|
629
|
+
}
|
|
630
|
+
/**
 * Persist the final status of this execution:
 *  - no-op when the slicer already failed (status was set elsewhere)
 *  - metadata-only update when the execution was externally stopped
 *  - "terminated" when shutdown interrupted processing
 *  - "failed" when slices ended in error or are stuck in started
 *  - "completed" otherwise
 *
 * @returns {Promise<void>}
 */
async _updateExecutionStatus() {
    // if this.slicerFailed is true, slicer has already been marked as failed
    if (this.slicerFailed) return;

    const executionStats = this.executionAnalytics.getAnalytics();

    if (!this.isDoneProcessing) {
        // if status is stopping or stopped, only update the execution metadata
        const status = await this.executionStorage.getStatus(this.exId);
        const isStopping = status === 'stopping' || status === 'stopped';
        if (isStopping) {
            this.logger.debug(`execution is set to ${status}, status will not be updated`);
            await this.executionStorage.updatePartial(this.exId, async (existing) => {
                const metaData = this.executionStorage.executionMetaData(executionStats);
                return Object.assign(existing, metaData, {
                    _updated: makeISODate()
                });
            });
            return;
        }

        // shutdown arrived before the slicer could finish its work
        const errMsg = `execution ${this.exId} received shutdown before the slicer could complete, setting status to "terminated"`;
        const metaData = this.executionStorage.executionMetaData(executionStats, errMsg);
        this.logger.error(errMsg);
        await this.executionStorage.setStatus(this.exId, 'terminated', metaData);
        return;
    }

    // count slices left in bad or incomplete states to decide
    // between "failed" and "completed"
    const [errors, started, pending] = await Promise.all([
        this.stateStorage.countByState(this.exId, SliceState.error),
        this.stateStorage.countByState(this.exId, SliceState.start),
        this.stateStorage.countByState(this.exId, SliceState.pending),
    ]);

    if (errors > 0 || started > 0) {
        const errMsg = this._formatExecutionFailure({ errors, started, pending });
        const errorMeta = this.executionStorage.executionMetaData(executionStats, errMsg);
        this.logger.error(errMsg);
        await this.executionStorage.setStatus(this.exId, 'failed', errorMeta);
        return;
    }

    const metaData = this.executionStorage.executionMetaData(executionStats);
    this.logger.info(`execution ${this.exId} has completed`);
    await this.executionStorage.setStatus(this.exId, 'completed', metaData);
}
|
|
671
|
+
_logFinishedJob() {
|
|
672
|
+
const endTime = Date.now();
|
|
673
|
+
const elapsed = endTime - (this.startTime ?? 0);
|
|
674
|
+
const time = elapsed < 1000 ? 1 : Math.round(elapsed / 1000);
|
|
675
|
+
this.executionAnalytics.set('job_duration', time);
|
|
676
|
+
if (this.collectAnalytics && this.slicerAnalytics) {
|
|
677
|
+
this.slicerAnalytics.analyzeStats();
|
|
678
|
+
}
|
|
679
|
+
this.logger.info(`execution ${this.exId} has finished in ${time} seconds`);
|
|
680
|
+
}
|
|
681
|
+
_formatExecutionFailure({ started, errors, pending }) {
|
|
682
|
+
const startedMsg = started <= 1
|
|
683
|
+
? `had ${started} slice stuck in started`
|
|
684
|
+
: `had ${started} slices stuck in started`;
|
|
685
|
+
const pendingMsg = pending <= 1
|
|
686
|
+
? `had ${pending} slice are still pending`
|
|
687
|
+
: `had ${pending} slices are still pending`;
|
|
688
|
+
const errorsMsg = errors <= 1
|
|
689
|
+
? `had ${errors} slice failure`
|
|
690
|
+
: `had ${errors} slice failures`;
|
|
691
|
+
const none = (errors + started + pending) === 0;
|
|
692
|
+
const stateMessages = [
|
|
693
|
+
started || none ? startedMsg : '',
|
|
694
|
+
pending || none ? pendingMsg : '',
|
|
695
|
+
errors || none ? errorsMsg : '',
|
|
696
|
+
].filter(Boolean);
|
|
697
|
+
return `execution: ${this.exId} ${stateMessages} during processing`;
|
|
698
|
+
}
|
|
699
|
+
async _waitForWorkersToExit() {
|
|
700
|
+
if (!this.server.onlineClientCount)
|
|
701
|
+
return;
|
|
702
|
+
const timeoutOutAt = this.workerDisconnectTimeout + Date.now();
|
|
703
|
+
const logWaitingForWorkers = throttle(() => {
|
|
704
|
+
this.logger.debug(`waiting for ${this.server.onlineClientCount} to go offline`);
|
|
705
|
+
}, 1000);
|
|
706
|
+
const checkOnlineCount = async () => {
|
|
707
|
+
if (this.isExecutionFinished) {
|
|
708
|
+
this.logger.trace('execution finished while waiting for workers to go offline');
|
|
709
|
+
return;
|
|
710
|
+
}
|
|
711
|
+
if (!this.client.ready)
|
|
712
|
+
return;
|
|
713
|
+
if (!this.server.onlineClientCount) {
|
|
714
|
+
this.logger.trace('workers all workers have disconnected');
|
|
715
|
+
return;
|
|
716
|
+
}
|
|
717
|
+
const now = Date.now();
|
|
718
|
+
if (now > timeoutOutAt) {
|
|
719
|
+
return;
|
|
720
|
+
}
|
|
721
|
+
logWaitingForWorkers();
|
|
722
|
+
await pDelay(100);
|
|
723
|
+
await checkOnlineCount();
|
|
724
|
+
};
|
|
725
|
+
await checkOnlineCount();
|
|
726
|
+
}
|
|
727
|
+
async _waitForPendingSlices() {
|
|
728
|
+
const logPendingSlices = throttle(() => {
|
|
729
|
+
this.logger.debug(`waiting for ${this.pendingSlices} slices to finish`);
|
|
730
|
+
}, 1000);
|
|
731
|
+
const checkPendingSlices = async () => {
|
|
732
|
+
if (this.isShuttingDown)
|
|
733
|
+
return;
|
|
734
|
+
if (!this.pendingSlices) {
|
|
735
|
+
this.logger.debug('all pending slices are done');
|
|
736
|
+
return;
|
|
737
|
+
}
|
|
738
|
+
if (!this.server.onlineClientCount) {
|
|
739
|
+
this.logger.warn(`clients are all offline, but there are still ${this.pendingSlices} pending slices`);
|
|
740
|
+
return;
|
|
741
|
+
}
|
|
742
|
+
logPendingSlices();
|
|
743
|
+
await pDelay(100);
|
|
744
|
+
await checkPendingSlices();
|
|
745
|
+
};
|
|
746
|
+
await checkPendingSlices();
|
|
747
|
+
}
|
|
748
|
+
_waitForExecutionFinished() {
|
|
749
|
+
const timeout = Math.round(this.shutdownTimeout * 0.8);
|
|
750
|
+
const shutdownAt = timeout + Date.now();
|
|
751
|
+
const logShuttingDown = throttle(() => {
|
|
752
|
+
this.logger.debug('shutdown is waiting for execution to finish...');
|
|
753
|
+
}, 1000);
|
|
754
|
+
const checkExecution = async () => {
|
|
755
|
+
if (this.isExecutionDone) {
|
|
756
|
+
this.logger.trace('execution finished while shutting down');
|
|
757
|
+
return null;
|
|
758
|
+
}
|
|
759
|
+
if (!this.client.ready)
|
|
760
|
+
return null;
|
|
761
|
+
const now = Date.now();
|
|
762
|
+
if (now > shutdownAt) {
|
|
763
|
+
this.logger.error(`Shutdown timeout of ${ms(timeout)} waiting for execution ${this.exId} to finish...`);
|
|
764
|
+
this.logger.debug(`Execution controller state vars at timeout:\nisExecutionDone: ${this.isExecutionDone}\nclient.ready: ${this.client.ready}\n`
|
|
765
|
+
+ `onlineClientCount: ${this.server.onlineClientCount}\nserver.isShuttingDown: ${this.server.isShuttingDown}`
|
|
766
|
+
+ `isShuttingDown: ${this.isShuttingDown}\nisShutdown: ${this.isShutdown}\n`
|
|
767
|
+
+ `isDoneDispatching: ${this.isDoneDispatching}\npendingDispatches: ${this.pendingDispatches}\n`
|
|
768
|
+
+ `scheduler.isFinished: ${this.scheduler.isFinished}\npendingSlices: ${this.pendingSlices}\n`);
|
|
769
|
+
return null;
|
|
770
|
+
}
|
|
771
|
+
logShuttingDown();
|
|
772
|
+
await pDelay(100);
|
|
773
|
+
return checkExecution();
|
|
774
|
+
};
|
|
775
|
+
return checkExecution();
|
|
776
|
+
}
|
|
777
|
+
// verify the execution can be set to running
/**
 * Validate the stored execution status before startup.
 *
 * @returns {Promise<boolean>} true when startup may proceed; false after
 *   the cluster master has been told the execution is finished (or, in
 *   kubernetesV2, when a restarted execution is not relocatable)
 */
async _verifyExecution() {
    let error;
    const terminalStatuses = this.executionStorage.getTerminalStatuses();
    const runningStatuses = this.executionStorage.getRunningStatuses();
    const status = await this.executionStorage.getStatus(this.exId);

    const invalidStateMsg = (state) => {
        const prefix = `Execution ${this.exId} was starting in ${state} status`;
        return `${prefix}, sending execution:finished event to cluster master`;
    };

    if (includes(terminalStatuses, status)) {
        error = new Error(invalidStateMsg('terminal'));
    } else if (includes(runningStatuses, status)) {
        // In the case of a running status on startup we
        // want to continue to start up. Only in V2.
        // Right now we will depend on kubernetes `crashloopbackoff` in the case of
        // an unexpected exit to the ex process. Ex: an OOM
        // NOTE: If this becomes an issue we may want to add a new state. Maybe `interrupted`
        if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') {
            // Check to see if `isRelocatable` exists.
            // Allows for older assets to work with k8sV2
            if (this.executionContext.slicer().isRelocatable) {
                this.logger.info(`Execution ${this.exId} detected to have been restarted..`);
                const relocatable = this.executionContext.slicer().isRelocatable();
                if (relocatable) {
                    this.logger.info(`Execution ${this.exId} is relocatable and will continue reinitializing...`);
                } else {
                    this.logger.error(`Execution ${this.exId} is not relocatable and will shutdown...`);
                }
                // NOTE(review): when not relocatable this returns false without
                // updating the stored status or notifying the cluster master —
                // confirm that is intentional
                return relocatable;
            }
        }
        error = new Error(invalidStateMsg('running'));
        // If in a running status the execution process
        // crashed and k8s is trying to restart the pod,
        // e.g. execution controller OOM.
        this.logger.warn(`Changing execution status from ${status} to failed`);
        await this.executionStorage.setStatus(this.exId, 'failed', this.executionStorage.executionMetaData(null, getFullErrorStack(error)));
    } else {
        // status is neither terminal nor running: safe to start
        return true;
    }

    try {
        await this.client.sendExecutionFinished(error.message);
    } finally {
        // logged even if sendExecutionFinished rejects
        logError(this.logger, error, 'Unable to verify execution on initialization');
    }
    return false;
}
|
|
829
|
+
_verifyStores() {
|
|
830
|
+
let paused = false;
|
|
831
|
+
const logPaused = throttle((storesStr) => {
|
|
832
|
+
this.logger.warn(`${storesStr} are in a invalid state, scheduler is paused`);
|
|
833
|
+
}, 10 * 1000);
|
|
834
|
+
clearInterval(this.verifyStoresInterval);
|
|
835
|
+
this.verifyStoresInterval = setInterval(() => {
|
|
836
|
+
if (this.isShuttingDown || this.isShutdown)
|
|
837
|
+
return;
|
|
838
|
+
const invalid = [];
|
|
839
|
+
try {
|
|
840
|
+
const valid = this.executionStorage.verifyClient();
|
|
841
|
+
if (!valid) {
|
|
842
|
+
invalid.push('execution');
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
catch (err) {
|
|
846
|
+
clearInterval(this.verifyStoresInterval);
|
|
847
|
+
this._terminalError(err);
|
|
848
|
+
return;
|
|
849
|
+
}
|
|
850
|
+
try {
|
|
851
|
+
const valid = this.stateStorage.verifyClient();
|
|
852
|
+
if (!valid) {
|
|
853
|
+
invalid.push('state');
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
catch (err) {
|
|
857
|
+
clearInterval(this.verifyStoresInterval);
|
|
858
|
+
this._terminalError(err);
|
|
859
|
+
return;
|
|
860
|
+
}
|
|
861
|
+
if (invalid.length) {
|
|
862
|
+
const storesStr = `elasticsearch stores ${invalid.join(', ')}`;
|
|
863
|
+
if (paused) {
|
|
864
|
+
logPaused(storesStr);
|
|
865
|
+
return;
|
|
866
|
+
}
|
|
867
|
+
this.logger.warn(`${storesStr} are in a invalid state, pausing scheduler...`);
|
|
868
|
+
paused = true;
|
|
869
|
+
this.scheduler.pause();
|
|
870
|
+
return;
|
|
871
|
+
}
|
|
872
|
+
if (paused) {
|
|
873
|
+
this.logger.info('elasticsearch stores are now in a valid state, resumming scheduler...');
|
|
874
|
+
paused = false;
|
|
875
|
+
this.scheduler.start();
|
|
876
|
+
}
|
|
877
|
+
}, 100);
|
|
878
|
+
}
|
|
879
|
+
_initSliceFailureWatchDog() {
|
|
880
|
+
const probationWindow = this.executionContext.config.probation_window;
|
|
881
|
+
let watchDogSet = false;
|
|
882
|
+
let errorCount;
|
|
883
|
+
let processedCount;
|
|
884
|
+
return async () => {
|
|
885
|
+
if (watchDogSet)
|
|
886
|
+
return;
|
|
887
|
+
watchDogSet = true;
|
|
888
|
+
const analyticsData = this.executionAnalytics.getAnalytics();
|
|
889
|
+
// keep track of how many slices have been processed and failed
|
|
890
|
+
errorCount = analyticsData.failed;
|
|
891
|
+
processedCount = analyticsData.processed;
|
|
892
|
+
await this.setFailingStatus('slice failure watch dog');
|
|
893
|
+
this.sliceFailureInterval = setInterval(() => {
|
|
894
|
+
const currentAnalyticsData = this.executionAnalytics.getAnalytics();
|
|
895
|
+
const currentErrorCount = currentAnalyticsData.failed;
|
|
896
|
+
const currentProcessedCount = currentAnalyticsData.processed;
|
|
897
|
+
const errorCountTheSame = currentErrorCount === errorCount;
|
|
898
|
+
const slicesHaveProcessedSinceError = currentProcessedCount > processedCount;
|
|
899
|
+
if (errorCountTheSame && slicesHaveProcessedSinceError) {
|
|
900
|
+
clearInterval(this.sliceFailureInterval);
|
|
901
|
+
watchDogSet = false;
|
|
902
|
+
this.sliceFailureInterval = undefined;
|
|
903
|
+
const setStatusTo = this.scheduler.recovering ? 'recovering' : 'running';
|
|
904
|
+
this.logger.info(`No slice errors have occurred within execution: ${this.exId} will be set back to '${setStatusTo}' state`);
|
|
905
|
+
this.executionStorage.setStatus(this.exId, setStatusTo)
|
|
906
|
+
.catch((err) => {
|
|
907
|
+
logError(this.logger, err, 'failure to status back to running after running');
|
|
908
|
+
});
|
|
909
|
+
return;
|
|
910
|
+
}
|
|
911
|
+
errorCount = currentErrorCount;
|
|
912
|
+
processedCount = currentProcessedCount;
|
|
913
|
+
}, probationWindow);
|
|
914
|
+
};
|
|
915
|
+
}
|
|
916
|
+
_startWorkConnectWatchDog() {
|
|
917
|
+
clearTimeout(this.workerConnectTimeoutId);
|
|
918
|
+
const timeout = this.context.sysconfig.teraslice.slicer_timeout;
|
|
919
|
+
const err = new Error(`No workers have connected to slicer in the allotted time: ${ms(timeout)}`);
|
|
920
|
+
this.workerConnectTimeoutId = setTimeout(() => {
|
|
921
|
+
clearTimeout(this.workerConnectTimeoutId);
|
|
922
|
+
if (this.isShuttingDown)
|
|
923
|
+
return;
|
|
924
|
+
if (this.workersHaveConnected)
|
|
925
|
+
return;
|
|
926
|
+
this.logger.warn(`A worker has not connected to a slicer for execution: ${this.exId}, shutting down execution`);
|
|
927
|
+
this._terminalError(err);
|
|
928
|
+
}, timeout);
|
|
929
|
+
}
|
|
930
|
+
_startWorkerDisconnectWatchDog() {
|
|
931
|
+
clearTimeout(this.workerDisconnectTimeoutId);
|
|
932
|
+
if (this.isShuttingDown)
|
|
933
|
+
return;
|
|
934
|
+
if (this.server.onlineClientCount > 0)
|
|
935
|
+
return;
|
|
936
|
+
const err = new Error(`All workers from workers from ${this.exId} have disconnected`);
|
|
937
|
+
this.workerDisconnectTimeoutId = setTimeout(() => {
|
|
938
|
+
clearTimeout(this.workerDisconnectTimeoutId);
|
|
939
|
+
if (this.isShuttingDown)
|
|
940
|
+
return;
|
|
941
|
+
if (this.server.onlineClientCount > 0)
|
|
942
|
+
return;
|
|
943
|
+
this._terminalError(err);
|
|
944
|
+
}, this.workerDisconnectTimeout);
|
|
945
|
+
}
|
|
946
|
+
_removePendingSlice() {
|
|
947
|
+
this.pendingSlices--;
|
|
948
|
+
if (this.pendingSlices < 0) {
|
|
949
|
+
this.logger.warn('a slice was possibly finished more than once');
|
|
950
|
+
this.pendingSlices = 0;
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
_addPendingSlice() {
|
|
954
|
+
if (this.pendingSlices < 0) {
|
|
955
|
+
this.logger.warn('a slice was possibly finished more than once');
|
|
956
|
+
this.pendingSlices = 0;
|
|
957
|
+
}
|
|
958
|
+
this.pendingSlices++;
|
|
959
|
+
}
|
|
960
|
+
_removePendingDispatch() {
|
|
961
|
+
this.pendingDispatches--;
|
|
962
|
+
if (this.pendingDispatches < 0) {
|
|
963
|
+
this.logger.warn('a slice was possibly dispatched more than once');
|
|
964
|
+
this.pendingDispatches = 0;
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
_addPendingDispatch() {
|
|
968
|
+
if (this.pendingDispatches < 0) {
|
|
969
|
+
this.logger.warn('a slice was possibly dispatched more than once');
|
|
970
|
+
this.pendingDispatches = 0;
|
|
971
|
+
}
|
|
972
|
+
this.pendingDispatches++;
|
|
973
|
+
}
|
|
974
|
+
/**
 * Adds all prom metrics specific to the execution_controller.
 *
 * If trying to add a new metric for the execution_controller, it belongs here.
 * @async
 * @function setupPromMetrics
 * @return {Promise<void>}
 * @link https://terascope.github.io/teraslice/docs/development/k8s#prometheus-metrics-api
 */
async setupPromMetrics() {
    // no-op when the prom metrics API is not enabled/available
    if (isPromAvailable(this.context)) {
        this.logger.info(`adding ${this.context.assignment} prom metrics...`);
        // captured so the collect() callback below can reach them without
        // relying on its own `this` (presumably bound to the gauge by
        // promMetrics — confirm against the promMetrics API)
        const { context, executionAnalytics } = this;
        await Promise.all([
            this.context.apis.foundation.promMetrics.addGauge('execution_controller_info', 'Information about Teraslice execution controller', ['arch', 'clustering_type', 'name', 'node_version', 'platform', 'teraslice_version']),
            this.context.apis.foundation.promMetrics.addGauge('slices_processed', 'Number of slices processed by all workers', [], function collect() {
                const slicesProcessed = executionAnalytics.get('processed');
                const defaultLabels = {
                    ...context.apis.foundation.promMetrics.getDefaultLabels()
                };
                this.set(defaultLabels, slicesProcessed);
            })
        ]);
        // static info gauge: the value is always 1, the labels carry the data
        this.context.apis.foundation.promMetrics.set('execution_controller_info', {
            arch: this.context.arch,
            clustering_type: this.context.sysconfig.teraslice.cluster_manager_type,
            name: this.context.sysconfig.teraslice.name,
            node_version: process.version,
            platform: this.context.platform,
            teraslice_version: `v${getPackageJSON().version}`
        }, 1);
    }
}
|
|
1007
|
+
requestListener(req, res) {
|
|
1008
|
+
if (req.url === '/health') {
|
|
1009
|
+
if (this.server.executionReady) {
|
|
1010
|
+
res.writeHead(200);
|
|
1011
|
+
res.end('Ready');
|
|
1012
|
+
}
|
|
1013
|
+
else {
|
|
1014
|
+
res.writeHead(503);
|
|
1015
|
+
res.end('Service Unavailable');
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
else {
|
|
1019
|
+
res.writeHead(501);
|
|
1020
|
+
res.end('Not Implemented');
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
1024
|
+
//# sourceMappingURL=index.js.map
|