teraslice 2.10.0 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/dist/src/interfaces.js +12 -0
  2. package/dist/src/lib/cluster/cluster_master.js +246 -0
  3. package/dist/src/lib/cluster/node_master.js +355 -0
  4. package/dist/src/lib/cluster/services/api.js +663 -0
  5. package/dist/src/lib/cluster/services/assets.js +226 -0
  6. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/index.js +192 -0
  7. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8s.js +481 -0
  8. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +414 -0
  9. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +59 -0
  10. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/utils.js +43 -0
  11. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/index.js +192 -0
  12. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/interfaces.js +2 -0
  13. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8s.js +423 -0
  14. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sDeploymentResource.js +60 -0
  15. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sJobResource.js +55 -0
  16. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.js +359 -0
  17. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sServiceResource.js +37 -0
  18. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sState.js +60 -0
  19. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/utils.js +170 -0
  20. package/dist/src/lib/cluster/services/cluster/backends/native/dispatch.js +13 -0
  21. package/dist/src/lib/cluster/services/cluster/backends/native/index.js +526 -0
  22. package/dist/src/lib/cluster/services/cluster/backends/native/messaging.js +547 -0
  23. package/dist/src/lib/cluster/services/cluster/backends/state-utils.js +26 -0
  24. package/dist/src/lib/cluster/services/cluster/index.js +17 -0
  25. package/dist/src/lib/cluster/services/execution.js +435 -0
  26. package/dist/src/lib/cluster/services/index.js +6 -0
  27. package/dist/src/lib/cluster/services/interfaces.js +2 -0
  28. package/dist/src/lib/cluster/services/jobs.js +454 -0
  29. package/dist/src/lib/config/default-sysconfig.js +26 -0
  30. package/dist/src/lib/config/index.js +22 -0
  31. package/dist/src/lib/config/schemas/system.js +360 -0
  32. package/dist/src/lib/storage/analytics.js +86 -0
  33. package/dist/src/lib/storage/assets.js +401 -0
  34. package/dist/src/lib/storage/backends/elasticsearch_store.js +494 -0
  35. package/dist/src/lib/storage/backends/mappings/analytics.js +50 -0
  36. package/dist/src/lib/storage/backends/mappings/asset.js +41 -0
  37. package/dist/src/lib/storage/backends/mappings/ex.js +62 -0
  38. package/dist/src/lib/storage/backends/mappings/job.js +38 -0
  39. package/dist/src/lib/storage/backends/mappings/state.js +38 -0
  40. package/dist/src/lib/storage/backends/s3_store.js +237 -0
  41. package/dist/src/lib/storage/execution.js +300 -0
  42. package/dist/src/lib/storage/index.js +7 -0
  43. package/dist/src/lib/storage/jobs.js +81 -0
  44. package/dist/src/lib/storage/state.js +255 -0
  45. package/dist/src/lib/utils/api_utils.js +157 -0
  46. package/dist/src/lib/utils/asset_utils.js +94 -0
  47. package/dist/src/lib/utils/date_utils.js +52 -0
  48. package/dist/src/lib/utils/encoding_utils.js +27 -0
  49. package/dist/src/lib/utils/events.js +4 -0
  50. package/dist/src/lib/utils/file_utils.js +124 -0
  51. package/dist/src/lib/utils/id_utils.js +15 -0
  52. package/dist/src/lib/utils/port_utils.js +32 -0
  53. package/dist/src/lib/workers/assets/index.js +3 -0
  54. package/dist/src/lib/workers/assets/loader-executable.js +40 -0
  55. package/dist/src/lib/workers/assets/loader.js +73 -0
  56. package/dist/src/lib/workers/assets/spawn.js +55 -0
  57. package/dist/src/lib/workers/context/execution-context.js +12 -0
  58. package/dist/src/lib/workers/context/terafoundation-context.js +8 -0
  59. package/dist/src/lib/workers/execution-controller/execution-analytics.js +188 -0
  60. package/dist/src/lib/workers/execution-controller/index.js +1024 -0
  61. package/dist/src/lib/workers/execution-controller/recovery.js +151 -0
  62. package/dist/src/lib/workers/execution-controller/scheduler.js +390 -0
  63. package/dist/src/lib/workers/execution-controller/slice-analytics.js +96 -0
  64. package/dist/src/lib/workers/helpers/job.js +80 -0
  65. package/dist/src/lib/workers/helpers/op-analytics.js +22 -0
  66. package/dist/src/lib/workers/helpers/terafoundation.js +34 -0
  67. package/dist/src/lib/workers/helpers/worker-shutdown.js +169 -0
  68. package/dist/src/lib/workers/metrics/index.js +108 -0
  69. package/dist/src/lib/workers/worker/index.js +378 -0
  70. package/dist/src/lib/workers/worker/slice.js +122 -0
  71. package/dist/test/config/schemas/system_schema-spec.js +37 -0
  72. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js +316 -0
  73. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sResource-spec.js +795 -0
  74. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-multicluster-spec.js +67 -0
  75. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-spec.js +84 -0
  76. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js +132 -0
  77. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8s-v2-spec.js +455 -0
  78. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sResource-v2-spec.js +818 -0
  79. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-multicluster-v2-spec.js +67 -0
  80. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-v2-spec.js +84 -0
  81. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/utils-v2-spec.js +320 -0
  82. package/dist/test/lib/cluster/services/cluster/backends/state-utils-spec.js +37 -0
  83. package/dist/test/node_master-spec.js +188 -0
  84. package/dist/test/services/api-spec.js +80 -0
  85. package/dist/test/services/assets-spec.js +158 -0
  86. package/dist/test/services/messaging-spec.js +440 -0
  87. package/dist/test/storage/assets_storage-spec.js +95 -0
  88. package/dist/test/storage/s3_store-spec.js +138 -0
  89. package/dist/test/test.config.js +8 -0
  90. package/dist/test/test.setup.js +6 -0
  91. package/dist/test/utils/api_utils-spec.js +86 -0
  92. package/dist/test/utils/asset_utils-spec.js +141 -0
  93. package/dist/test/utils/elastic_utils-spec.js +25 -0
  94. package/dist/test/workers/execution-controller/execution-controller-spec.js +371 -0
  95. package/dist/test/workers/execution-controller/execution-special-test-cases-spec.js +520 -0
  96. package/dist/test/workers/execution-controller/execution-test-cases-spec.js +338 -0
  97. package/dist/test/workers/execution-controller/recovery-spec.js +160 -0
  98. package/dist/test/workers/execution-controller/scheduler-spec.js +249 -0
  99. package/dist/test/workers/execution-controller/slice-analytics-spec.js +121 -0
  100. package/dist/test/workers/fixtures/ops/example-op/processor.js +20 -0
  101. package/dist/test/workers/fixtures/ops/example-op/schema.js +19 -0
  102. package/dist/test/workers/fixtures/ops/example-reader/fetcher.js +20 -0
  103. package/dist/test/workers/fixtures/ops/example-reader/schema.js +41 -0
  104. package/dist/test/workers/fixtures/ops/example-reader/slicer.js +37 -0
  105. package/dist/test/workers/fixtures/ops/new-op/processor.js +29 -0
  106. package/dist/test/workers/fixtures/ops/new-op/schema.js +18 -0
  107. package/dist/test/workers/fixtures/ops/new-reader/fetcher.js +19 -0
  108. package/dist/test/workers/fixtures/ops/new-reader/schema.js +23 -0
  109. package/dist/test/workers/fixtures/ops/new-reader/slicer.js +13 -0
  110. package/dist/test/workers/helpers/configs.js +130 -0
  111. package/dist/test/workers/helpers/execution-controller-helper.js +49 -0
  112. package/dist/test/workers/helpers/index.js +5 -0
  113. package/dist/test/workers/helpers/test-context.js +210 -0
  114. package/dist/test/workers/helpers/zip-directory.js +25 -0
  115. package/dist/test/workers/worker/slice-spec.js +333 -0
  116. package/dist/test/workers/worker/worker-spec.js +356 -0
  117. package/package.json +94 -93
  118. package/service.js +0 -0
@@ -0,0 +1,1024 @@
1
+ import ms from 'ms';
2
+ import { formatURL, ExecutionController as ExController, ClusterMaster } from '@terascope/teraslice-messaging';
3
+ import { TSError, includes, get, pDelay, getFullErrorStack, logError, pWhile, makeISODate, debounce, throttle } from '@terascope/utils';
4
+ import { isPromAvailable } from '@terascope/job-components';
5
+ import { waitForWorkerShutdown } from '../helpers/worker-shutdown.js';
6
+ import { StateStorage, ExecutionStorage, SliceState } from '../../storage/index.js';
7
+ import { makeLogger, generateWorkerId } from '../helpers/terafoundation.js';
8
+ import { ExecutionAnalytics } from './execution-analytics.js';
9
+ import { SliceAnalytics } from './slice-analytics.js';
10
+ import { Scheduler } from './scheduler.js';
11
+ import { Metrics } from '../metrics/index.js';
12
+ import { getPackageJSON } from '../../utils/file_utils.js';
13
+ export class ExecutionController {
14
+ context;
15
+ executionContext;
16
+ events;
17
+ logger;
18
+ server;
19
+ client;
20
+ stateStorage;
21
+ executionStorage;
22
+ isPaused = false;
23
+ isShutdown = false;
24
+ isShuttingDown = false;
25
+ isInitialized = false;
26
+ isStarted = false;
27
+ pendingDispatches = 0;
28
+ pendingSlices = 0;
29
+ isDoneProcessing = false;
30
+ isExecutionFinished = false;
31
+ isExecutionDone = false;
32
+ workersHaveConnected = false;
33
+ _handlers = new Map();
34
+ executionAnalytics;
35
+ scheduler;
36
+ metrics;
37
+ workerId;
38
+ exId;
39
+ shutdownTimeout;
40
+ workerDisconnectTimeout;
41
+ collectAnalytics;
42
+ slicerAnalytics;
43
+ _updateExecutionStats;
44
+ _startSliceFailureWatchDog;
45
+ workerConnectTimeoutId;
46
+ workerDisconnectTimeoutId;
47
+ sliceFailureInterval;
48
+ verifyStoresInterval;
49
+ slicerFailed = false;
50
+ startTime;
51
+ isDoneDispatching;
52
+ constructor(context, executionContext) {
53
+ const workerId = generateWorkerId(context);
54
+ // Use the bunyan logger.level() function to set the log level of context.logger equal
55
+ // to the log level of executionContext.logger.
56
+ // If a log_level was given in the job config, it will have overwritten the default
57
+ // log_level in the execution context.
58
+ context.logger.level(executionContext.logger.level());
59
+ const logger = makeLogger(context, 'execution_controller');
60
+ const events = context.apis.foundation.getSystemEvents();
61
+ const slicerPort = executionContext.config.slicer_port;
62
+ const performanceMetrics = executionContext.config.performance_metrics;
63
+ const config = context.sysconfig.teraslice;
64
+ const networkLatencyBuffer = get(config, 'network_latency_buffer');
65
+ const actionTimeout = get(config, 'action_timeout');
66
+ const workerDisconnectTimeout = get(config, 'worker_disconnect_timeout');
67
+ const nodeDisconnectTimeout = get(config, 'node_disconnect_timeout');
68
+ const shutdownTimeout = get(config, 'shutdown_timeout');
69
+ this.server = new ExController.Server({
70
+ port: slicerPort,
71
+ networkLatencyBuffer,
72
+ requestListener: this.requestListener.bind(this),
73
+ actionTimeout,
74
+ workerDisconnectTimeout,
75
+ logger
76
+ });
77
+ const clusterMasterPort = get(config, 'port');
78
+ const clusterMasterHostname = get(config, 'master_hostname');
79
+ this.client = new ClusterMaster.Client({
80
+ clusterMasterUrl: formatURL(clusterMasterHostname, clusterMasterPort),
81
+ nodeDisconnectTimeout,
82
+ networkLatencyBuffer,
83
+ actionTimeout,
84
+ exId: executionContext.exId,
85
+ connectTimeout: nodeDisconnectTimeout,
86
+ logger
87
+ });
88
+ this.executionAnalytics = new ExecutionAnalytics(context, executionContext, this.client);
89
+ this.scheduler = new Scheduler(context, executionContext);
90
+ this.metrics = performanceMetrics
91
+ ? new Metrics({
92
+ logger
93
+ })
94
+ : null;
95
+ this.exId = executionContext.exId;
96
+ this.workerId = workerId;
97
+ this.logger = logger;
98
+ this.events = events;
99
+ this.context = context;
100
+ this.executionContext = executionContext;
101
+ this.collectAnalytics = this.executionContext.config.analytics;
102
+ this.shutdownTimeout = shutdownTimeout;
103
+ this.workerDisconnectTimeout = workerDisconnectTimeout;
104
+ this.executionStorage = new ExecutionStorage(context);
105
+ this.stateStorage = new StateStorage(context);
106
+ // TODO: see if I can remove this debounce
107
+ this._updateExecutionStats = debounce(() => {
108
+ this._updateExecutionStatsNow();
109
+ }, 100, {
110
+ leading: true,
111
+ trailing: true,
112
+ maxWait: 500
113
+ });
114
+ this._startSliceFailureWatchDog = this._initSliceFailureWatchDog();
115
+ }
116
+ async initialize() {
117
+ if (this.context.sysconfig.teraslice.cluster_manager_type === 'native') {
118
+ this.logger.warn('Skipping PromMetricsAPI initialization: incompatible with native clustering.');
119
+ }
120
+ else {
121
+ const { terafoundation } = this.context.sysconfig;
122
+ const { config, exId, jobId } = this.executionContext;
123
+ await this.context.apis.foundation.promMetrics.init({
124
+ terasliceName: this.context.sysconfig.teraslice.name,
125
+ assignment: 'execution_controller',
126
+ logger: this.logger,
127
+ tf_prom_metrics_add_default: terafoundation.prom_metrics_add_default,
128
+ tf_prom_metrics_enabled: terafoundation.prom_metrics_enabled,
129
+ tf_prom_metrics_port: terafoundation.prom_metrics_port,
130
+ job_prom_metrics_add_default: config.prom_metrics_add_default,
131
+ job_prom_metrics_enabled: config.prom_metrics_enabled,
132
+ job_prom_metrics_port: config.prom_metrics_port,
133
+ labels: {
134
+ ex_id: exId,
135
+ job_id: jobId,
136
+ job_name: config.name,
137
+ assignment: 'execution_controller'
138
+ },
139
+ prefix: 'teraslice_job_',
140
+ prom_metrics_display_url: terafoundation.prom_metrics_display_url
141
+ });
142
+ await this.setupPromMetrics();
143
+ }
144
+ await Promise.all([
145
+ this.executionStorage.initialize(),
146
+ this.stateStorage.initialize(),
147
+ this.client.start()
148
+ ]);
149
+ let verified;
150
+ let verifiedErr;
151
+ try {
152
+ verified = await this._verifyExecution();
153
+ }
154
+ catch (err) {
155
+ verifiedErr = err;
156
+ }
157
+ if (!verified) {
158
+ this.isShutdown = true;
159
+ await Promise.all([
160
+ this.executionStorage.shutdown(true),
161
+ this.stateStorage.shutdown(true),
162
+ this.client.shutdown()
163
+ ]);
164
+ if (verifiedErr) {
165
+ throw verifiedErr;
166
+ }
167
+ return;
168
+ }
169
+ await this.server.start();
170
+ if (this.metrics != null) {
171
+ await this.metrics.initialize();
172
+ }
173
+ /// We set this to true later down the line. Not sure why
174
+ this.isInitialized = true;
175
+ this.server.onClientOnline((workerId) => {
176
+ clearTimeout(this.workerConnectTimeoutId);
177
+ this.workerConnectTimeoutId = undefined;
178
+ this.logger.trace(`worker ${workerId} is online`);
179
+ this.workersHaveConnected = true;
180
+ this.executionAnalytics.increment('workers_joined');
181
+ this._updateExecutionStats();
182
+ });
183
+ this.server.onClientAvailable((workerId) => {
184
+ this.logger.trace(`worker ${workerId} is available`);
185
+ this.executionAnalytics.set('workers_active', this.server.activeWorkerCount);
186
+ this.executionAnalytics.set('workers_available', this.server.availableClientCount);
187
+ this._updateExecutionStats();
188
+ });
189
+ this.server.onClientUnavailable(() => {
190
+ this.executionAnalytics.set('workers_active', this.server.activeWorkerCount);
191
+ this.executionAnalytics.set('workers_available', this.server.availableClientCount);
192
+ });
193
+ this.server.onClientDisconnect((workerId) => {
194
+ this.logger.trace(`worker ${workerId} disconnected but it may reconnect`);
195
+ this.executionAnalytics.increment('workers_disconnected');
196
+ this.executionAnalytics.set('workers_active', this.server.activeWorkerCount);
197
+ this._startWorkerDisconnectWatchDog();
198
+ this._updateExecutionStats();
199
+ });
200
+ this.server.onClientReconnect((workerId) => {
201
+ clearTimeout(this.workerDisconnectTimeoutId);
202
+ this.workerConnectTimeoutId = undefined;
203
+ this.logger.trace(`worker ${workerId} is reconnected`);
204
+ this.executionAnalytics.increment('workers_reconnected');
205
+ });
206
+ this.client.onExecutionPause(() => this.pause());
207
+ this.client.onExecutionResume(() => this.resume());
208
+ this.server.onSliceSuccess((workerId, response) => {
209
+ process.nextTick(() => {
210
+ const { slice_id: sliceId } = response.slice;
211
+ this.logger.info(`worker ${workerId} has completed its slice ${sliceId}`);
212
+ this.events.emit('slice:success', response);
213
+ this._removePendingSlice();
214
+ this._updateExecutionStats();
215
+ this.executionContext.onSliceComplete(response);
216
+ });
217
+ });
218
+ this.server.onSliceFailure((workerId, response) => {
219
+ process.nextTick(() => {
220
+ this.logger.error(`worker: ${workerId} has failure completing its slice`, response);
221
+ this.events.emit('slice:failure', response);
222
+ if (this.scheduler.canComplete()) {
223
+ this.setFailingStatus('slice failure event');
224
+ }
225
+ else if (this.scheduler.isRecovering()) {
226
+ this._terminalError(new Error('Slice failed while recovering'));
227
+ }
228
+ else {
229
+ // in persistent mode we set watchdogs to monitor
230
+ // when failing can be set back to running
231
+ this._startSliceFailureWatchDog();
232
+ }
233
+ this._removePendingSlice();
234
+ this._updateExecutionStats();
235
+ this.executionContext.onSliceComplete(response);
236
+ });
237
+ });
238
+ this._handlers.set('slicer:execution:update', (data) => {
239
+ this.logger.warn(data, 'event slicer:execution:update has been removed, used context.apis.executionContext.setMetadata(key, value): Promise<void>');
240
+ });
241
+ this._handlers.set('slicers:finished', (err) => {
242
+ if (err) {
243
+ this._terminalError(err);
244
+ }
245
+ });
246
+ this._handlers.set('recovery:failure', (err) => {
247
+ logError(this.logger, err, 'recovery finished due to failure');
248
+ this._terminalError(err);
249
+ });
250
+ for (const [event, handler] of this._handlers.entries()) {
251
+ if (handler !== null) {
252
+ this.events.on(event, handler);
253
+ }
254
+ }
255
+ if (this.collectAnalytics) {
256
+ this.slicerAnalytics = new SliceAnalytics(this.context, this.executionContext);
257
+ }
258
+ // This initializes user code, need to throw terminal error
259
+ // so it can be surfaced
260
+ try {
261
+ await this.scheduler.initialize(this.stateStorage, this.executionStorage);
262
+ }
263
+ catch (err) {
264
+ await this._terminalError(err);
265
+ throw err;
266
+ }
267
+ this.logger.info(`execution: ${this.exId} initialized execution_controller`);
268
+ this.isInitialized = true;
269
+ /// This will change the '/ready' endpoint to Ready
270
+ this.server.executionReady = true;
271
+ }
272
+ async run() {
273
+ if (!this.isInitialized)
274
+ return;
275
+ this._startWorkConnectWatchDog();
276
+ this.executionAnalytics.start();
277
+ try {
278
+ await this._runExecution();
279
+ }
280
+ catch (err) {
281
+ logError(this.logger, err, 'Run execution error');
282
+ }
283
+ this.events.emit('worker:shutdown');
284
+ await this.executionContext.shutdown();
285
+ // help the workers go offline
286
+ this.server.isShuttingDown = true;
287
+ await this._finishExecution();
288
+ try {
289
+ await Promise.all([this.client.sendExecutionFinished(), this._waitForWorkersToExit()]);
290
+ }
291
+ catch (err) {
292
+ logError(this.logger, err, 'Failure sending execution finished');
293
+ }
294
+ this.logger.debug(`execution ${this.exId} is done`);
295
+ }
296
+ async resume() {
297
+ if (!this.isPaused)
298
+ return;
299
+ this.logger.info(`execution ${this.exId} is resuming...`);
300
+ this.isPaused = false;
301
+ this.scheduler.start();
302
+ await pDelay(100);
303
+ }
304
+ async pause() {
305
+ if (this.isPaused)
306
+ return;
307
+ this.logger.info(`execution ${this.exId} is pausing...`);
308
+ this.isPaused = true;
309
+ this.scheduler.pause();
310
+ await pDelay(100);
311
+ }
312
+ async setFailingStatus(reason) {
313
+ const errMsg = `execution ${this.exId} has encountered a processing error, reason: ${reason}`;
314
+ this.logger.error(errMsg);
315
+ const executionStats = this.executionAnalytics.getAnalytics();
316
+ const errorMeta = this.executionStorage.executionMetaData(executionStats, errMsg);
317
+ try {
318
+ await this.executionStorage.setStatus(this.exId, 'failing', errorMeta);
319
+ }
320
+ catch (err) {
321
+ logError(this.logger, err, 'Failure to set execution status to "failing"');
322
+ }
323
+ }
324
+ async _terminalError(err) {
325
+ if (this.isExecutionDone)
326
+ return;
327
+ this.slicerFailed = true;
328
+ const error = new TSError(err, {
329
+ reason: `slicer for ex ${this.exId} had an error, shutting down execution`
330
+ });
331
+ this.logger.error(error);
332
+ const executionStats = this.executionAnalytics.getAnalytics();
333
+ const fullStack = getFullErrorStack(error);
334
+ const errorMeta = this.executionStorage.executionMetaData(executionStats, fullStack);
335
+ try {
336
+ await this.executionStorage.setStatus(this.exId, 'failed', errorMeta);
337
+ }
338
+ catch (_err) {
339
+ logError(this.logger, _err, 'failure setting status to failed');
340
+ }
341
+ this.logger.fatal(`execution ${this.exId} is ended because of slice failure`);
342
+ await this._endExecution();
343
+ }
344
+ async shutdown(eventType, shutdownError, block = true) {
345
+ if (eventType === 'error' && shutdownError) {
346
+ /// Add errors to this list as needed. Errors not in this list won't cleanup resources
347
+ const errorList = [
348
+ 'index specified in reader does not exist'
349
+ ];
350
+ /// Tell cluster_master that shutdown is due to a specific error
351
+ /// Cleans up kubernetes resources. For native, kills processes
352
+ if (errorList.includes(shutdownError.message)) {
353
+ this.logger.warn('sent request to cluster_master to cleanup job resources.');
354
+ await this.client.sendExecutionFinished(shutdownError.message);
355
+ }
356
+ }
357
+ /// This only applies to kubernetesV2
358
+ if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2'
359
+ && eventType === 'SIGTERM') {
360
+ await this.stateStorage.refresh();
361
+ const status = await this.executionStorage.getStatus(this.exId);
362
+ const runningStatuses = this.executionStorage.getRunningStatuses();
363
+ this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`);
364
+ /// This is an indication that the cluster_master did not call for this
365
+ /// shutdown. We want to restart in this case.
366
+ if (status !== 'stopping' && includes(runningStatuses, status)) {
367
+ this.logger.info('Skipping shutdown to allow for relocation...');
368
+ return;
369
+ }
370
+ }
371
+ if (this.isShutdown)
372
+ return;
373
+ if (!this.isInitialized)
374
+ return;
375
+ if (this.isShuttingDown) {
376
+ const msgs = [
377
+ 'execution',
378
+ `shutdown was called for ${this.exId}`,
379
+ 'but it was already shutting down',
380
+ block ? ', will block until done' : ''
381
+ ];
382
+ this.logger.debug(msgs.join(' '));
383
+ if (block) {
384
+ await waitForWorkerShutdown(this.context, 'worker:shutdown:complete');
385
+ }
386
+ return;
387
+ }
388
+ this.logger.debug(`execution shutdown was called for ex ${this.exId}`);
389
+ const shutdownErrs = [];
390
+ const pushError = (err) => {
391
+ shutdownErrs.push(err);
392
+ };
393
+ // allow clients to go immediately from disconnect to offline
394
+ this.server.isShuttingDown = true;
395
+ // tell the scheduler to stop producing slices
396
+ await this.scheduler.stop();
397
+ // remove any listeners
398
+ for (const [event, handler] of this._handlers.entries()) {
399
+ if (handler !== null) {
400
+ this.events.removeListener(event, handler);
401
+ this._handlers.set(event, null);
402
+ }
403
+ }
404
+ this.isShuttingDown = true;
405
+ this.isPaused = false;
406
+ clearInterval(this.sliceFailureInterval);
407
+ clearTimeout(this.workerConnectTimeoutId);
408
+ clearTimeout(this.workerDisconnectTimeoutId);
409
+ clearInterval(this.verifyStoresInterval);
410
+ await this._waitForExecutionFinished();
411
+ await Promise.all([
412
+ (async () => {
413
+ if (!this.collectAnalytics)
414
+ return;
415
+ await this.slicerAnalytics.shutdown().catch(pushError);
416
+ })(),
417
+ (async () => {
418
+ // the execution analytics must be shutdown
419
+ // before the message client
420
+ await this.executionAnalytics.shutdown().catch(pushError);
421
+ await this.client.shutdown().catch(pushError);
422
+ })(),
423
+ (async () => {
424
+ await this.scheduler.shutdown().catch(pushError);
425
+ })(),
426
+ (async () => {
427
+ await this.server.shutdown().catch(pushError);
428
+ })(),
429
+ (async () => {
430
+ await Promise.all([
431
+ (async () => {
432
+ try {
433
+ await this.stateStorage.shutdown(true);
434
+ }
435
+ catch (err) {
436
+ pushError(err);
437
+ }
438
+ })(),
439
+ (async () => {
440
+ try {
441
+ await this.executionStorage.shutdown(true);
442
+ }
443
+ catch (err) {
444
+ pushError(err);
445
+ }
446
+ })()
447
+ ]);
448
+ })(),
449
+ (async () => {
450
+ if (this.metrics == null)
451
+ return;
452
+ await this.metrics.shutdown().catch(pushError);
453
+ })()
454
+ ]);
455
+ this.logger.warn(`execution controller ${this.exId} is shutdown`);
456
+ this.isShutdown = true;
457
+ if (shutdownErrs.length) {
458
+ const errMsg = shutdownErrs.map((e) => e.stack).join(', and');
459
+ const shutdownErr = new Error(`Failed to shutdown correctly: ${errMsg}`);
460
+ this.events.emit('worker:shutdown:complete', shutdownErr);
461
+ await pDelay(0);
462
+ throw shutdownErr;
463
+ }
464
+ this.events.emit('worker:shutdown:complete');
465
+ }
466
+ async _runExecution() {
467
+ // wait for paused
468
+ await pWhile(async () => {
469
+ if (!this.isPaused || this.isShutdown)
470
+ return true;
471
+ await pDelay(100);
472
+ return false;
473
+ });
474
+ this.logger.info(`starting execution ${this.exId}...`);
475
+ this.startTime = Date.now();
476
+ this.isStarted = true;
477
+ this._verifyStores();
478
+ // start creating / dispatching slices, this will block until done
479
+ await Promise.all([
480
+ this.client.sendAvailable().then(() => this.logger.debug('client.sendAvailable() promise resolved')),
481
+ this._runDispatch().then(() => this.logger.debug('_runDispatch() promise resolved')),
482
+ this.scheduler.run().then(() => this.logger.debug('scheduler.run() promise resolved'))
483
+ ]);
484
+ const schedulerSuccessful = this.scheduler.isFinished && this.scheduler.slicersDone;
485
+ await this._waitForPendingSlices();
486
+ if (schedulerSuccessful && this.isDoneDispatching) {
487
+ this.logger.debug(`execution ${this.exId} is done processing slices`);
488
+ this.isDoneProcessing = true;
489
+ }
490
+ else if (!this.isShutdown) {
491
+ this.logger.debug(`execution ${this.exId} did not finish`);
492
+ }
493
+ else {
494
+ this.logger.debug(`execution ${this.exId} is exiting...`);
495
+ }
496
+ }
497
+ // dispatching should be pushed out into its own module
498
+ async _runDispatch() {
499
+ this.isDoneDispatching = false;
500
+ let dispatchInterval;
501
+ // returns a boolean to indicate whether
502
+ // dispatching should continue
503
+ const isRunning = () => {
504
+ if (this.isShuttingDown)
505
+ return false;
506
+ if (this.isExecutionDone)
507
+ return false;
508
+ if (this.scheduler.isFinished && !this.pendingDispatches)
509
+ return false;
510
+ return true;
511
+ };
512
+ const isPaused = () => this.isPaused;
513
+ const canDispatch = () => {
514
+ const workers = this.server.workerQueueSize;
515
+ const slices = this.scheduler.queueLength;
516
+ return workers > 0 && slices > 0;
517
+ };
518
+ const dequeueAndDispatch = () => {
519
+ const reenqueue = [];
520
+ const dispatch = [];
521
+ const slices = this.scheduler.getSlices(this.server.workerQueueSize);
522
+ slices.forEach((slice) => {
523
+ const workerId = this.server.dequeueWorker(slice);
524
+ if (!workerId) {
525
+ reenqueue.push(slice);
526
+ }
527
+ else {
528
+ this._addPendingDispatch();
529
+ this._addPendingSlice();
530
+ dispatch.push({ slice, workerId });
531
+ }
532
+ });
533
+ slices.length = 0;
534
+ if (dispatch.length > 0) {
535
+ process.nextTick(() => {
536
+ const promises = dispatch.map((input) => {
537
+ const { slice, workerId } = input;
538
+ return this._dispatchSlice(slice, workerId);
539
+ });
540
+ dispatch.length = 0;
541
+ Promise.all(promises).catch((err) => logError(this.logger, err, 'failure to dispatch slices'));
542
+ });
543
+ }
544
+ if (reenqueue.length > 0) {
545
+ // this isn't really ideal since we adding
546
+ // to the beginning of the queue and
547
+ // it may end up in a recursive loop trying
548
+ // to process that slice
549
+ this.scheduler.enqueueSlices(reenqueue, true);
550
+ reenqueue.length = 0;
551
+ }
552
+ };
553
+ await pDelay(0);
554
+ await new Promise((resolve) => {
555
+ this.logger.debug('dispatching slices...');
556
+ dispatchInterval = setInterval(() => {
557
+ if (!isRunning()) {
558
+ resolve(true);
559
+ return;
560
+ }
561
+ if (isPaused())
562
+ return;
563
+ if (canDispatch()) {
564
+ dequeueAndDispatch();
565
+ }
566
+ }, 5);
567
+ });
568
+ clearInterval(dispatchInterval);
569
+ this.isDoneDispatching = true;
570
+ this.logger.debug('done dispatching slices');
571
+ }
572
+ _dispatchSlice(slice, workerId) {
573
+ this.logger.trace(`dispatching slice ${slice.slice_id} for worker ${workerId}`);
574
+ return this.server
575
+ .dispatchSlice(slice, workerId)
576
+ .then((dispatched) => {
577
+ if (dispatched) {
578
+ this.logger.debug(`dispatched slice ${slice.slice_id} to worker ${workerId}`);
579
+ this.executionContext.onSliceDispatch(slice);
580
+ }
581
+ else {
582
+ this.logger.warn(`worker "${workerId}" is not available to process slice ${slice.slice_id}`);
583
+ this.scheduler.enqueueSlice(slice, true);
584
+ this._removePendingSlice();
585
+ }
586
+ this._removePendingDispatch();
587
+ })
588
+ .catch((err) => {
589
+ logError(this.logger, err, 'error dispatching slice');
590
+ this._removePendingDispatch();
591
+ this._removePendingSlice();
592
+ });
593
+ }
594
+ async _finishExecution() {
595
+ if (this.isExecutionFinished)
596
+ return;
597
+ this._logFinishedJob();
598
+ // refresh the state store index
599
+ // to prevent the execution from failing incorrectly
600
+ await this.stateStorage.refresh();
601
+ try {
602
+ await this._updateExecutionStatus();
603
+ }
604
+ catch (err) {
605
+ /* istanbul ignore next */
606
+ const error = new TSError(err, {
607
+ reason: `execution ${this.exId} has run to completion but the process has failed while updating the execution status, slicer will soon exit`
608
+ });
609
+ this.logger.error(error);
610
+ }
611
+ this.isExecutionFinished = true;
612
+ await this._endExecution();
613
+ }
614
+ async _endExecution() {
615
+ this.isExecutionDone = true;
616
+ await this.scheduler.shutdown();
617
+ }
618
+ _updateExecutionStatsNow() {
619
+ this.executionContext.onExecutionStats({
620
+ workers: {
621
+ connected: this.server.onlineClientCount,
622
+ available: this.server.availableClientCount
623
+ },
624
+ slices: {
625
+ processed: this.executionAnalytics.get('processed'),
626
+ failed: this.executionAnalytics.get('failed')
627
+ }
628
+ });
629
+ }
630
/**
 * Persist the final status of the execution to the execution store.
 *
 * Outcomes:
 *  - no-op when the slicer already failed (status was set elsewhere);
 *  - metadata-only update when the execution is stopping/stopped;
 *  - "terminated" when shutdown arrived before processing finished;
 *  - "failed" when any slices errored or are stuck in "start";
 *  - "completed" otherwise.
 * @returns {Promise<void>}
 */
async _updateExecutionStatus() {
    // if this.slicerFailed is true, slicer has already been marked as failed
    if (this.slicerFailed)
        return;
    const executionStats = this.executionAnalytics.getAnalytics();
    if (!this.isDoneProcessing) {
        // if status is stopping or stopped, only update the execution metadata
        const status = await this.executionStorage.getStatus(this.exId);
        const isStopping = status === 'stopping' || status === 'stopped';
        if (isStopping) {
            this.logger.debug(`execution is set to ${status}, status will not be updated`);
            await this.executionStorage.updatePartial(this.exId, async (existing) => {
                const metaData = this.executionStorage.executionMetaData(executionStats);
                return Object.assign(existing, metaData, {
                    _updated: makeISODate()
                });
            });
            return;
        }
        // shutdown happened before the slicer could finish: record "terminated"
        const errMsg = `execution ${this.exId} received shutdown before the slicer could complete, setting status to "terminated"`;
        const metaData = this.executionStorage.executionMetaData(executionStats, errMsg);
        this.logger.error(errMsg);
        await this.executionStorage.setStatus(this.exId, 'terminated', metaData);
        return;
    }
    // processing is done: count slice states to decide failed vs completed
    const [errors, started, pending] = await Promise.all([
        this.stateStorage.countByState(this.exId, SliceState.error),
        this.stateStorage.countByState(this.exId, SliceState.start),
        this.stateStorage.countByState(this.exId, SliceState.pending),
    ]);
    if (errors > 0 || started > 0) {
        // any errored or stuck-in-start slices fail the whole execution
        const errMsg = this._formatExecutionFailure({ errors, started, pending });
        const errorMeta = this.executionStorage.executionMetaData(executionStats, errMsg);
        this.logger.error(errMsg);
        await this.executionStorage.setStatus(this.exId, 'failed', errorMeta);
        return;
    }
    const metaData = this.executionStorage.executionMetaData(executionStats);
    this.logger.info(`execution ${this.exId} has completed`);
    await this.executionStorage.setStatus(this.exId, 'completed', metaData);
}
671
+ _logFinishedJob() {
672
+ const endTime = Date.now();
673
+ const elapsed = endTime - (this.startTime ?? 0);
674
+ const time = elapsed < 1000 ? 1 : Math.round(elapsed / 1000);
675
+ this.executionAnalytics.set('job_duration', time);
676
+ if (this.collectAnalytics && this.slicerAnalytics) {
677
+ this.slicerAnalytics.analyzeStats();
678
+ }
679
+ this.logger.info(`execution ${this.exId} has finished in ${time} seconds`);
680
+ }
681
+ _formatExecutionFailure({ started, errors, pending }) {
682
+ const startedMsg = started <= 1
683
+ ? `had ${started} slice stuck in started`
684
+ : `had ${started} slices stuck in started`;
685
+ const pendingMsg = pending <= 1
686
+ ? `had ${pending} slice are still pending`
687
+ : `had ${pending} slices are still pending`;
688
+ const errorsMsg = errors <= 1
689
+ ? `had ${errors} slice failure`
690
+ : `had ${errors} slice failures`;
691
+ const none = (errors + started + pending) === 0;
692
+ const stateMessages = [
693
+ started || none ? startedMsg : '',
694
+ pending || none ? pendingMsg : '',
695
+ errors || none ? errorsMsg : '',
696
+ ].filter(Boolean);
697
+ return `execution: ${this.exId} ${stateMessages} during processing`;
698
+ }
699
+ async _waitForWorkersToExit() {
700
+ if (!this.server.onlineClientCount)
701
+ return;
702
+ const timeoutOutAt = this.workerDisconnectTimeout + Date.now();
703
+ const logWaitingForWorkers = throttle(() => {
704
+ this.logger.debug(`waiting for ${this.server.onlineClientCount} to go offline`);
705
+ }, 1000);
706
+ const checkOnlineCount = async () => {
707
+ if (this.isExecutionFinished) {
708
+ this.logger.trace('execution finished while waiting for workers to go offline');
709
+ return;
710
+ }
711
+ if (!this.client.ready)
712
+ return;
713
+ if (!this.server.onlineClientCount) {
714
+ this.logger.trace('workers all workers have disconnected');
715
+ return;
716
+ }
717
+ const now = Date.now();
718
+ if (now > timeoutOutAt) {
719
+ return;
720
+ }
721
+ logWaitingForWorkers();
722
+ await pDelay(100);
723
+ await checkOnlineCount();
724
+ };
725
+ await checkOnlineCount();
726
+ }
727
+ async _waitForPendingSlices() {
728
+ const logPendingSlices = throttle(() => {
729
+ this.logger.debug(`waiting for ${this.pendingSlices} slices to finish`);
730
+ }, 1000);
731
+ const checkPendingSlices = async () => {
732
+ if (this.isShuttingDown)
733
+ return;
734
+ if (!this.pendingSlices) {
735
+ this.logger.debug('all pending slices are done');
736
+ return;
737
+ }
738
+ if (!this.server.onlineClientCount) {
739
+ this.logger.warn(`clients are all offline, but there are still ${this.pendingSlices} pending slices`);
740
+ return;
741
+ }
742
+ logPendingSlices();
743
+ await pDelay(100);
744
+ await checkPendingSlices();
745
+ };
746
+ await checkPendingSlices();
747
+ }
748
+ _waitForExecutionFinished() {
749
+ const timeout = Math.round(this.shutdownTimeout * 0.8);
750
+ const shutdownAt = timeout + Date.now();
751
+ const logShuttingDown = throttle(() => {
752
+ this.logger.debug('shutdown is waiting for execution to finish...');
753
+ }, 1000);
754
+ const checkExecution = async () => {
755
+ if (this.isExecutionDone) {
756
+ this.logger.trace('execution finished while shutting down');
757
+ return null;
758
+ }
759
+ if (!this.client.ready)
760
+ return null;
761
+ const now = Date.now();
762
+ if (now > shutdownAt) {
763
+ this.logger.error(`Shutdown timeout of ${ms(timeout)} waiting for execution ${this.exId} to finish...`);
764
+ this.logger.debug(`Execution controller state vars at timeout:\nisExecutionDone: ${this.isExecutionDone}\nclient.ready: ${this.client.ready}\n`
765
+ + `onlineClientCount: ${this.server.onlineClientCount}\nserver.isShuttingDown: ${this.server.isShuttingDown}`
766
+ + `isShuttingDown: ${this.isShuttingDown}\nisShutdown: ${this.isShutdown}\n`
767
+ + `isDoneDispatching: ${this.isDoneDispatching}\npendingDispatches: ${this.pendingDispatches}\n`
768
+ + `scheduler.isFinished: ${this.scheduler.isFinished}\npendingSlices: ${this.pendingSlices}\n`);
769
+ return null;
770
+ }
771
+ logShuttingDown();
772
+ await pDelay(100);
773
+ return checkExecution();
774
+ };
775
+ return checkExecution();
776
+ }
777
// verify the execution can be set to running
/**
 * Validate the stored execution status before startup.
 *
 * Returns true when the status allows a normal start. For a terminal
 * status, or a running status outside kubernetesV2, the execution is not
 * started: the error is reported to the cluster master and false is
 * returned. Under kubernetesV2 a running status may mean the pod was
 * restarted; if the slicer implements isRelocatable(), its answer decides
 * whether startup continues.
 * @returns {Promise<boolean>} whether initialization should proceed
 */
async _verifyExecution() {
    let error;
    const terminalStatuses = this.executionStorage.getTerminalStatuses();
    const runningStatuses = this.executionStorage.getRunningStatuses();
    const status = await this.executionStorage.getStatus(this.exId);
    const invalidStateMsg = (state) => {
        const prefix = `Execution ${this.exId} was starting in ${state} status`;
        return `${prefix}, sending execution:finished event to cluster master`;
    };
    if (includes(terminalStatuses, status)) {
        error = new Error(invalidStateMsg('terminal'));
    }
    else if (includes(runningStatuses, status)) {
        // In the case of a running status on startup we
        // want to continue to start up. Only in V2.
        // Right now we will depend on kubernetes `crashloopbackoff` in the case of
        // an unexpected exit to the ex process. Ex: an OOM
        // NOTE: If this becomes an issue we may want to add a new state. Maybe `interrupted`
        if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') {
            // Check to see if `isRelocatable` exists.
            // Allows for older assets to work with k8sV2
            if (this.executionContext.slicer().isRelocatable) {
                this.logger.info(`Execution ${this.exId} detected to have been restarted..`);
                const relocatable = this.executionContext.slicer().isRelocatable();
                if (relocatable) {
                    this.logger.info(`Execution ${this.exId} is relocatable and will continue reinitializing...`);
                }
                else {
                    this.logger.error(`Execution ${this.exId} is not relocatable and will shutdown...`);
                }
                return relocatable;
            }
        }
        error = new Error(invalidStateMsg('running'));
        // If in a running status the execution process
        // crashed and k8s is trying to restart the pod,
        // e.g. execution controller OOM.
        this.logger.warn(`Changing execution status from ${status} to failed`);
        await this.executionStorage.setStatus(this.exId, 'failed', this.executionStorage.executionMetaData(null, getFullErrorStack(error)));
    }
    else {
        return true;
    }
    try {
        await this.client.sendExecutionFinished(error.message);
    }
    finally {
        // log regardless of whether notifying the cluster master succeeded
        logError(this.logger, error, 'Unable to verify execution on initialization');
    }
    return false;
}
829
+ _verifyStores() {
830
+ let paused = false;
831
+ const logPaused = throttle((storesStr) => {
832
+ this.logger.warn(`${storesStr} are in a invalid state, scheduler is paused`);
833
+ }, 10 * 1000);
834
+ clearInterval(this.verifyStoresInterval);
835
+ this.verifyStoresInterval = setInterval(() => {
836
+ if (this.isShuttingDown || this.isShutdown)
837
+ return;
838
+ const invalid = [];
839
+ try {
840
+ const valid = this.executionStorage.verifyClient();
841
+ if (!valid) {
842
+ invalid.push('execution');
843
+ }
844
+ }
845
+ catch (err) {
846
+ clearInterval(this.verifyStoresInterval);
847
+ this._terminalError(err);
848
+ return;
849
+ }
850
+ try {
851
+ const valid = this.stateStorage.verifyClient();
852
+ if (!valid) {
853
+ invalid.push('state');
854
+ }
855
+ }
856
+ catch (err) {
857
+ clearInterval(this.verifyStoresInterval);
858
+ this._terminalError(err);
859
+ return;
860
+ }
861
+ if (invalid.length) {
862
+ const storesStr = `elasticsearch stores ${invalid.join(', ')}`;
863
+ if (paused) {
864
+ logPaused(storesStr);
865
+ return;
866
+ }
867
+ this.logger.warn(`${storesStr} are in a invalid state, pausing scheduler...`);
868
+ paused = true;
869
+ this.scheduler.pause();
870
+ return;
871
+ }
872
+ if (paused) {
873
+ this.logger.info('elasticsearch stores are now in a valid state, resumming scheduler...');
874
+ paused = false;
875
+ this.scheduler.start();
876
+ }
877
+ }, 100);
878
+ }
879
+ _initSliceFailureWatchDog() {
880
+ const probationWindow = this.executionContext.config.probation_window;
881
+ let watchDogSet = false;
882
+ let errorCount;
883
+ let processedCount;
884
+ return async () => {
885
+ if (watchDogSet)
886
+ return;
887
+ watchDogSet = true;
888
+ const analyticsData = this.executionAnalytics.getAnalytics();
889
+ // keep track of how many slices have been processed and failed
890
+ errorCount = analyticsData.failed;
891
+ processedCount = analyticsData.processed;
892
+ await this.setFailingStatus('slice failure watch dog');
893
+ this.sliceFailureInterval = setInterval(() => {
894
+ const currentAnalyticsData = this.executionAnalytics.getAnalytics();
895
+ const currentErrorCount = currentAnalyticsData.failed;
896
+ const currentProcessedCount = currentAnalyticsData.processed;
897
+ const errorCountTheSame = currentErrorCount === errorCount;
898
+ const slicesHaveProcessedSinceError = currentProcessedCount > processedCount;
899
+ if (errorCountTheSame && slicesHaveProcessedSinceError) {
900
+ clearInterval(this.sliceFailureInterval);
901
+ watchDogSet = false;
902
+ this.sliceFailureInterval = undefined;
903
+ const setStatusTo = this.scheduler.recovering ? 'recovering' : 'running';
904
+ this.logger.info(`No slice errors have occurred within execution: ${this.exId} will be set back to '${setStatusTo}' state`);
905
+ this.executionStorage.setStatus(this.exId, setStatusTo)
906
+ .catch((err) => {
907
+ logError(this.logger, err, 'failure to status back to running after running');
908
+ });
909
+ return;
910
+ }
911
+ errorCount = currentErrorCount;
912
+ processedCount = currentProcessedCount;
913
+ }, probationWindow);
914
+ };
915
+ }
916
+ _startWorkConnectWatchDog() {
917
+ clearTimeout(this.workerConnectTimeoutId);
918
+ const timeout = this.context.sysconfig.teraslice.slicer_timeout;
919
+ const err = new Error(`No workers have connected to slicer in the allotted time: ${ms(timeout)}`);
920
+ this.workerConnectTimeoutId = setTimeout(() => {
921
+ clearTimeout(this.workerConnectTimeoutId);
922
+ if (this.isShuttingDown)
923
+ return;
924
+ if (this.workersHaveConnected)
925
+ return;
926
+ this.logger.warn(`A worker has not connected to a slicer for execution: ${this.exId}, shutting down execution`);
927
+ this._terminalError(err);
928
+ }, timeout);
929
+ }
930
+ _startWorkerDisconnectWatchDog() {
931
+ clearTimeout(this.workerDisconnectTimeoutId);
932
+ if (this.isShuttingDown)
933
+ return;
934
+ if (this.server.onlineClientCount > 0)
935
+ return;
936
+ const err = new Error(`All workers from workers from ${this.exId} have disconnected`);
937
+ this.workerDisconnectTimeoutId = setTimeout(() => {
938
+ clearTimeout(this.workerDisconnectTimeoutId);
939
+ if (this.isShuttingDown)
940
+ return;
941
+ if (this.server.onlineClientCount > 0)
942
+ return;
943
+ this._terminalError(err);
944
+ }, this.workerDisconnectTimeout);
945
+ }
946
+ _removePendingSlice() {
947
+ this.pendingSlices--;
948
+ if (this.pendingSlices < 0) {
949
+ this.logger.warn('a slice was possibly finished more than once');
950
+ this.pendingSlices = 0;
951
+ }
952
+ }
953
+ _addPendingSlice() {
954
+ if (this.pendingSlices < 0) {
955
+ this.logger.warn('a slice was possibly finished more than once');
956
+ this.pendingSlices = 0;
957
+ }
958
+ this.pendingSlices++;
959
+ }
960
+ _removePendingDispatch() {
961
+ this.pendingDispatches--;
962
+ if (this.pendingDispatches < 0) {
963
+ this.logger.warn('a slice was possibly dispatched more than once');
964
+ this.pendingDispatches = 0;
965
+ }
966
+ }
967
+ _addPendingDispatch() {
968
+ if (this.pendingDispatches < 0) {
969
+ this.logger.warn('a slice was possibly dispatched more than once');
970
+ this.pendingDispatches = 0;
971
+ }
972
+ this.pendingDispatches++;
973
+ }
974
/**
 * Adds all prom metrics specific to the execution_controller.
 *
 * If trying to add a new metric for the execution_controller, it belongs here.
 * @async
 * @function setupPromMetrics
 * @return {Promise<void>}
 * @link https://terascope.github.io/teraslice/docs/development/k8s#prometheus-metrics-api
 */
async setupPromMetrics() {
    // no-op unless the prom metrics API is enabled for this context
    if (isPromAvailable(this.context)) {
        this.logger.info(`adding ${this.context.assignment} prom metrics...`);
        // captured for use inside the collect callback below, where `this`
        // is rebound to the gauge — do not convert collect() to an arrow fn
        const { context, executionAnalytics } = this;
        await Promise.all([
            this.context.apis.foundation.promMetrics.addGauge('execution_controller_info', 'Information about Teraslice execution controller', ['arch', 'clustering_type', 'name', 'node_version', 'platform', 'teraslice_version']),
            this.context.apis.foundation.promMetrics.addGauge('slices_processed', 'Number of slices processed by all workers', [], function collect() {
                const slicesProcessed = executionAnalytics.get('processed');
                const defaultLabels = {
                    ...context.apis.foundation.promMetrics.getDefaultLabels()
                };
                // `this` is the gauge instance supplied by the metrics API
                this.set(defaultLabels, slicesProcessed);
            })
        ]);
        // static info gauge: labels carry the values, the sample is always 1
        this.context.apis.foundation.promMetrics.set('execution_controller_info', {
            arch: this.context.arch,
            clustering_type: this.context.sysconfig.teraslice.cluster_manager_type,
            name: this.context.sysconfig.teraslice.name,
            node_version: process.version,
            platform: this.context.platform,
            teraslice_version: `v${getPackageJSON().version}`
        }, 1);
    }
}
1007
+ requestListener(req, res) {
1008
+ if (req.url === '/health') {
1009
+ if (this.server.executionReady) {
1010
+ res.writeHead(200);
1011
+ res.end('Ready');
1012
+ }
1013
+ else {
1014
+ res.writeHead(503);
1015
+ res.end('Service Unavailable');
1016
+ }
1017
+ }
1018
+ else {
1019
+ res.writeHead(501);
1020
+ res.end('Not Implemented');
1021
+ }
1022
+ }
1023
+ }
1024
+ //# sourceMappingURL=index.js.map