teraslice 0.87.1 → 0.88.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/cluster-service.js +24 -18
  2. package/dist/src/index.js +42 -0
  3. package/package.json +11 -15
  4. package/service.js +4 -6
  5. package/worker-service.js +6 -6
  6. package/index.js +0 -21
  7. package/lib/cluster/cluster_master.js +0 -164
  8. package/lib/cluster/node_master.js +0 -393
  9. package/lib/cluster/services/api.js +0 -581
  10. package/lib/cluster/services/assets.js +0 -211
  11. package/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs +0 -86
  12. package/lib/cluster/services/cluster/backends/kubernetes/index.js +0 -225
  13. package/lib/cluster/services/cluster/backends/kubernetes/jobs/execution_controller.hbs +0 -69
  14. package/lib/cluster/services/cluster/backends/kubernetes/k8s.js +0 -450
  15. package/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +0 -443
  16. package/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +0 -67
  17. package/lib/cluster/services/cluster/backends/kubernetes/utils.js +0 -58
  18. package/lib/cluster/services/cluster/backends/native/index.js +0 -611
  19. package/lib/cluster/services/cluster/backends/native/messaging.js +0 -563
  20. package/lib/cluster/services/cluster/backends/state-utils.js +0 -49
  21. package/lib/cluster/services/cluster/index.js +0 -15
  22. package/lib/cluster/services/execution.js +0 -459
  23. package/lib/cluster/services/jobs.js +0 -303
  24. package/lib/config/default-sysconfig.js +0 -47
  25. package/lib/config/index.js +0 -32
  26. package/lib/config/schemas/system.js +0 -333
  27. package/lib/processors/save_file/index.js +0 -9
  28. package/lib/processors/save_file/processor.js +0 -17
  29. package/lib/processors/save_file/schema.js +0 -17
  30. package/lib/processors/script.js +0 -130
  31. package/lib/processors/stdout/index.js +0 -9
  32. package/lib/processors/stdout/processor.js +0 -19
  33. package/lib/processors/stdout/schema.js +0 -18
  34. package/lib/storage/analytics.js +0 -106
  35. package/lib/storage/assets.js +0 -275
  36. package/lib/storage/backends/elasticsearch_store.js +0 -567
  37. package/lib/storage/backends/mappings/analytics.json +0 -49
  38. package/lib/storage/backends/mappings/asset.json +0 -40
  39. package/lib/storage/backends/mappings/ex.json +0 -55
  40. package/lib/storage/backends/mappings/job.json +0 -31
  41. package/lib/storage/backends/mappings/state.json +0 -37
  42. package/lib/storage/execution.js +0 -331
  43. package/lib/storage/index.js +0 -16
  44. package/lib/storage/jobs.js +0 -97
  45. package/lib/storage/state.js +0 -302
  46. package/lib/utils/api_utils.js +0 -173
  47. package/lib/utils/asset_utils.js +0 -117
  48. package/lib/utils/date_utils.js +0 -58
  49. package/lib/utils/encoding_utils.js +0 -29
  50. package/lib/utils/events.js +0 -7
  51. package/lib/utils/file_utils.js +0 -118
  52. package/lib/utils/id_utils.js +0 -19
  53. package/lib/utils/port_utils.js +0 -83
  54. package/lib/workers/assets/loader.js +0 -109
  55. package/lib/workers/assets/spawn.js +0 -78
  56. package/lib/workers/context/execution-context.js +0 -16
  57. package/lib/workers/context/terafoundation-context.js +0 -10
  58. package/lib/workers/execution-controller/execution-analytics.js +0 -211
  59. package/lib/workers/execution-controller/index.js +0 -1033
  60. package/lib/workers/execution-controller/recovery.js +0 -188
  61. package/lib/workers/execution-controller/scheduler.js +0 -461
  62. package/lib/workers/execution-controller/slice-analytics.js +0 -115
  63. package/lib/workers/helpers/job.js +0 -93
  64. package/lib/workers/helpers/op-analytics.js +0 -22
  65. package/lib/workers/helpers/terafoundation.js +0 -43
  66. package/lib/workers/helpers/worker-shutdown.js +0 -187
  67. package/lib/workers/metrics/index.js +0 -139
  68. package/lib/workers/worker/index.js +0 -344
  69. package/lib/workers/worker/slice.js +0 -143
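The single hunk below corresponds to the largest deletion in the list above, package/lib/cluster/services/execution.js (459 lines removed). Its header comment documents the execution status lifecycle (pending -> scheduling -> running -> [ paused -> running ] -> [ stopped | completed ], with rejected, failed and aborted as exceptional terminal states). As a rough illustration only, not code from either package version and simplified relative to the execution store's own verifyStatusUpdate checks, that lifecycle could be written as a transition table:

// Illustrative sketch, not part of the teraslice package: an approximate
// transition table for the execution _status lifecycle described in the
// removed execution.js. The authoritative rules live in the execution
// store (verifyStatusUpdate / getTerminalStatuses); names here are made up.
const EXECUTION_STATUS_TRANSITIONS = {
    pending: ['scheduling', 'rejected'],
    scheduling: ['initializing', 'failed'],
    initializing: ['running', 'failed'],
    running: ['paused', 'stopping', 'completed', 'failed', 'aborted'],
    paused: ['running', 'stopping'],
    stopping: ['stopped'],
    // terminal statuses have no outgoing transitions
    stopped: [],
    completed: [],
    rejected: [],
    failed: [],
    aborted: [],
};

function canTransition(from, to) {
    return (EXECUTION_STATUS_TRANSITIONS[from] || []).includes(to);
}

// e.g. canTransition('running', 'paused') === true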
@@ -1,459 +0,0 @@
- 'use strict';
-
- const sortBy = require('lodash/sortBy');
- const {
-     Queue,
-     TSError,
-     getFullErrorStack,
-     logError,
-     get,
-     withoutNil,
-     isEmpty,
-     isString,
-     flatten,
-     includes,
-     cloneDeep,
- } = require('@terascope/utils');
- const { setInterval } = require('timers');
- const { makeLogger } = require('../../workers/helpers/terafoundation');
-
- /**
-  * New execution result
-  * @typedef NewExecutionResult
-  * @property {string} job_id
-  * @property {string} ex_id
-  */
-
- /*
-  Execution Life Cycle for _status
-  pending -> scheduling -> running -> [ paused -> running ] -> [ stopped | completed ]
-  Exceptions
-  rejected - when an execution is rejected prior to scheduling
-  failed - when there is an error while the execution is running
-  aborted - when an execution was running at the point when the cluster shuts down
- */
-
- module.exports = function executionService(context, { clusterMasterServer }) {
-     const logger = makeLogger(context, 'execution_service');
-     const pendingExecutionQueue = new Queue();
-     const isNative = context.sysconfig.teraslice.cluster_manager_type === 'native';
-
-     let exStore;
-     let stateStore;
-     let clusterService;
-     let allocateInterval;
-     let reapInterval;
-
-     function enqueue(ex) {
-         const size = pendingExecutionQueue.size();
-         logger.debug(ex, `enqueueing execution to be processed (queue size ${size})`);
-         pendingExecutionQueue.enqueue(cloneDeep(ex));
-     }
-
-     function getClusterAnalytics() {
-         return clusterMasterServer.getClusterAnalytics();
-     }
-
-     async function waitForExecutionStatus(exId, _status) {
-         const status = _status || 'stopped';
-
-         return new Promise((resolve) => {
-             async function checkCluster() {
-                 const state = clusterService.getClusterState();
-                 const dict = Object.create(null);
-
-                 Object.values(state).forEach((node) => node.active.forEach((worker) => {
-                     dict[worker.ex_id] = true;
-                 }));
-
-                 // if found, do not resolve
-                 if (dict[exId]) {
-                     setTimeout(checkCluster, 3000);
-                     return;
-                 }
-
-                 try {
-                     await exStore.verifyStatusUpdate(exId, status);
-                     await exStore.setStatus(exId, status);
-                 } catch (err) {
-                     logError(logger, err, `failure setting execution, ${exId}, to ${status}`);
-                 } finally {
-                     resolve(true);
-                 }
-             }
-             checkCluster();
-         });
-     }
-
-     async function shutdown() {
-         logger.info('shutting down');
-
-         clearInterval(allocateInterval);
-         clearInterval(reapInterval);
-         allocateInterval = null;
-         reapInterval = null;
-
-         const query = exStore.getLivingStatuses().map((str) => `_status:${str}`).join(' OR ');
-         const executions = await exStore.search(query);
-         await Promise.all(executions.map(async (execution) => {
-             if (!isNative) return;
-             logger.warn(`marking execution ex_id: ${execution.ex_id}, job_id: ${execution.job_id} as terminated`);
-             const exId = execution.ex_id;
-             const { hostname } = context.sysconfig.teraslice;
-
-             // need to exclude sending a stop to the cluster master host; the shutdown event
-             // has already been propagated, and this can cause a condition of it waiting for
-             // stop to return when it already has, which pauses this service shutdown
-             await stopExecution(exId, null, hostname);
-             await waitForExecutionStatus(exId, 'terminated');
-         }));
-     }
-
-     function findAllWorkers() {
-         return flatten(Object.values(clusterService.getClusterState())
-             .filter((node) => node.state === 'connected')
-             .map((node) => {
-                 const workers = node.active.filter(Boolean);
-
-                 return workers.map((worker) => {
-                     worker.node_id = node.node_id;
-                     worker.hostname = node.hostname;
-                     return worker;
-                 });
-             }))
-             .filter(Boolean);
-     }
-
-     async function addWorkers(exId, workerNum) {
-         return exStore.getActiveExecution(exId)
-             .then((execution) => clusterService.addWorkers(execution, workerNum));
-     }
-
-     async function setWorkers(exId, workerNum) {
-         return exStore.getActiveExecution(exId)
-             .then((execution) => clusterService.setWorkers(execution, workerNum));
-     }
-
-     async function removeWorkers(exId, workerNum) {
-         return exStore.getActiveExecution(exId)
-             .then((execution) => clusterService.removeWorkers(execution.ex_id, workerNum));
-     }
-
-     /**
-      * Check if the execution is in a terminal status
-      *
-      * @param {import('@terascope/job-components').ExecutionConfig} execution
-      * @returns {boolean}
-      */
-     function isExecutionTerminal(execution) {
-         const terminalList = exStore.getTerminalStatuses();
-         return terminalList.find((tStat) => tStat === execution._status) != null;
-     }
-
-     // safely stop the execution without setting the ex status to stopping or stopped
-     async function finishExecution(exId, err) {
-         if (err) {
-             const error = new TSError(err, {
-                 reason: `terminal error for execution: ${exId}, shutting down execution`,
-                 context: {
-                     ex_id: exId,
-                 }
-             });
-             logger.error(error);
-         }
-         const execution = await getExecutionContext(exId);
-         const status = execution._status;
-         if (['stopping', 'stopped'].includes(status)) {
-             logger.debug(`execution ${exId} is already stopping which means there is no need to stop the execution`);
-             return;
-         }
-
-         logger.debug(`execution ${exId} finished, shutting down execution`);
-         try {
-             await clusterService.stopExecution(exId);
-         } catch (stopErr) {
-             const stopError = new TSError(stopErr, {
-                 reason: 'error finishing the execution',
-                 context: {
-                     ex_id: exId,
-                 }
-             });
-             logError(logger, stopError);
-         }
-     }
-
-     async function stopExecution(exId, timeout, excludeNode) {
-         const execution = await getExecutionContext(exId);
-         const isTerminal = isExecutionTerminal(execution);
-         if (isTerminal) {
-             logger.info(`execution ${exId} is in terminal status "${execution._status}", it cannot be stopped`);
-             return;
-         }
-
-         if (execution._status === 'stopping') {
-             logger.info('execution is already stopping...');
-             // we are kicking this off in the background, not part of the promise chain
-             waitForExecutionStatus(exId);
-             return;
-         }
-
-         logger.debug(`stopping execution ${exId}...`, withoutNil({ timeout, excludeNode }));
-         await exStore.setStatus(exId, 'stopping');
-         await clusterService.stopExecution(exId, timeout, excludeNode);
-         // we are kicking this off in the background, not part of the promise chain
-         waitForExecutionStatus(exId);
-     }
-
-     async function pauseExecution(exId) {
-         const status = 'paused';
-         const execution = await exStore.getActiveExecution(exId);
-         if (!clusterMasterServer.isClientReady(execution.ex_id)) {
-             throw new Error(`Execution ${execution.ex_id} is not available to pause`);
-         }
-         await clusterMasterServer.sendExecutionPause(exId);
-         await exStore.setStatus(exId, status);
-         return { status };
-     }
-
-     async function resumeExecution(exId) {
-         const status = 'running';
-         const execution = await exStore.getActiveExecution(exId);
-         if (!clusterMasterServer.isClientReady(execution.ex_id)) {
-             throw new Error(`Execution ${execution.ex_id} is not available to resume`);
-         }
-         await clusterMasterServer.sendExecutionResume(execution.ex_id);
-         await exStore.setStatus(execution.ex_id, status);
-         return { status };
-     }
-
-     async function getControllerStats(exId) {
-         // if no exId is provided it returns all running executions
-         const specificId = exId || false;
-         const exIds = await getRunningExecutions(exId);
-         const clients = clusterMasterServer.onlineClients.filter(({ clientId }) => {
-             if (specificId && clientId === specificId) return true;
-             return includes(exIds, clientId);
-         });
-
-         function formatResponse(msg) {
-             const payload = get(msg, 'payload', {});
-             const identifiers = {
-                 ex_id: payload.ex_id,
-                 job_id: payload.job_id,
-                 name: payload.name
-             };
-             return Object.assign(identifiers, payload.stats);
-         }
-
-         if (isEmpty(clients)) {
-             if (specificId) {
-                 throw new TSError(`Could not find active slicer for ex_id: ${specificId}`, {
-                     statusCode: 404
-                 });
-             }
-             return [];
-         }
-
-         const promises = clients.map((client) => {
-             const { clientId } = client;
-             return clusterMasterServer
-                 .sendExecutionAnalyticsRequest(clientId)
-                 .then(formatResponse);
-         });
-
-         const results = await Promise.all(promises);
-         return sortBy(results, ['name', 'started']).reverse();
-     }
-
-     /**
-      * Create a new execution context
-      *
-      * @param {string|import('@terascope/job-components').JobConfig} job
-      * @return {Promise<NewExecutionResult>}
-      */
-     async function createExecutionContext(job) {
-         const ex = await exStore.create(job);
-         enqueue(ex);
-         return { job_id: ex.job_id, ex_id: ex.ex_id };
-     }
-
-     async function getExecutionContext(exId) {
-         return exStore.get(exId)
-             .catch((err) => logError(logger, err, `error getting execution context for ex: ${exId}`));
-     }
-
-     async function getRunningExecutions(exId) {
-         let query = exStore.getRunningStatuses().map((state) => ` _status:${state} `).join('OR');
-         if (exId) query = `ex_id:"${exId}" AND (${query.trim()})`;
-         const exs = await exStore.search(query, null, null, '_created:desc');
-         return exs.map((ex) => ex.ex_id);
-     }
-
-     /**
-      * Recover the execution
-      *
-      * @param {string|import('@terascope/job-components').ExecutionConfig} exIdOrEx
-      * @param {import('@terascope/job-components').RecoveryCleanupType} [cleanupType]
-      * @return {Promise<NewExecutionResult>}
-      */
-     async function recoverExecution(exIdOrEx, cleanupType) {
-         const recoverFromEx = isString(exIdOrEx)
-             ? await getExecutionContext(exIdOrEx)
-             : cloneDeep(exIdOrEx);
-
-         const ex = await exStore.createRecoveredExecution(recoverFromEx, cleanupType);
-         enqueue(ex);
-         return { job_id: ex.job_id, ex_id: ex.ex_id };
-     }
-
-     function _executionAllocator() {
-         let allocatingExecution = false;
-         const { readyForAllocation } = clusterService;
-         return async function allocator() {
-             const canAllocate = !allocatingExecution
-                 && pendingExecutionQueue.size() > 0
-                 && readyForAllocation();
-             if (!canAllocate) return;
-
-             allocatingExecution = true;
-             let execution = pendingExecutionQueue.dequeue();
-
-             logger.info(`Scheduling execution: ${execution.ex_id}`);
-
-             try {
-                 execution = await exStore.setStatus(execution.ex_id, 'scheduling');
-
-                 execution = await clusterService.allocateSlicer(execution);
-
-                 execution = await exStore.setStatus(execution.ex_id, 'initializing', {
-                     slicer_port: execution.slicer_port,
-                     slicer_hostname: execution.slicer_hostname
-                 });
-
-                 try {
-                     await clusterService.allocateWorkers(execution, execution.workers);
-                 } catch (err) {
-                     throw new TSError(err, {
-                         reason: `Failure to allocateWorkers ${execution.ex_id}`
-                     });
-                 }
-             } catch (err) {
-                 const msg = `Failed to provision execution ${execution.ex_id}`;
-                 const error = new TSError(err, {
-                     reason: msg
-                 });
-                 logger.warn(msg);
-
-                 try {
-                     await exStore.setStatus(
-                         execution.ex_id,
-                         'failed',
-                         exStore.executionMetaData(null, getFullErrorStack(error))
-                     );
-                 } catch (failedErr) {
-                     logger.error(new TSError(err, {
-                         reason: 'Failure to set execution status to failed after provision failed'
-                     }));
-                 }
-
-                 if (context.sysconfig.teraslice.cluster_manager_type === 'kubernetes') {
-                     // Since this condition is only hit in cases where the pods
-                     // are never scheduled, all this call to stopExecution
-                     // accomplishes is to delete the k8s resources, which is
-                     // probably just the k8s job for the execution controller.
-                     // Calling delete on the worker deployment that doesn't
-                     // exist is OK.
-                     logger.warn(`Calling stopExecution on execution: ${execution.ex_id} to clean up k8s resources.`);
-                     await clusterService.stopExecution(execution.ex_id);
-                 }
-             } finally {
-                 allocatingExecution = false;
-                 allocator();
-             }
-         };
-     }
-
-     async function reapExecutions() {
-         // make sure to capture the error to avoid throwing an
-         // unhandled rejection
-         try {
-             // sometimes in development an execution gets stuck in stopping
-             // status since the process gets force killed before it
-             // can be updated to stopped.
-             const stopping = await exStore.search('_status:stopping');
-             for (const execution of stopping) {
-                 const updatedAt = new Date(execution._updated).getTime();
-                 const updatedWithTimeout = updatedAt + context.sysconfig.teraslice.shutdown_timeout;
-                 // Since we don't want to break executions that actually are "stopping"
-                 // we need to verify that the job has exceeded the shutdown timeout
-                 if (Date.now() > updatedWithTimeout) {
-                     logger.info(`stopping stuck ${execution._status} execution: ${execution.ex_id}`);
-                     await exStore.setStatus(execution.ex_id, 'stopped');
-                 }
-             }
-         } catch (err) {
-             logger.error(err, 'failure reaping executions');
-         }
-     }
-
-     async function initialize() {
-         exStore = context.stores.execution;
-         stateStore = context.stores.state;
-         if (exStore == null || stateStore == null) {
-             throw new Error('Missing required stores');
-         }
-
-         clusterService = context.services.cluster;
-         if (clusterService == null) {
-             throw new Error('Missing required services');
-         }
-
-         logger.info('execution service is initializing...');
-
-         // listen for execution finished events
-         clusterMasterServer.onExecutionFinished(finishExecution);
-
-         // call this once up front before it starts
-         // running in the background
-         await reapExecutions();
-
-         const pending = await exStore.search('_status:pending', null, 10000, '_created:asc');
-         for (const execution of pending) {
-             logger.info(`enqueuing ${execution._status} execution: ${execution.ex_id}`);
-             enqueue(execution);
-         }
-
-         const queueSize = pendingExecutionQueue.size();
-
-         if (queueSize > 0) {
-             logger.info(`execution queue initialization complete, ${queueSize} pending executions have been enqueued`);
-         } else {
-             logger.debug('execution queue initialization complete');
-         }
-
-         allocateInterval = setInterval(_executionAllocator(), 1000);
-         reapInterval = setInterval(
-             reapExecutions,
-             context.sysconfig.teraslice.shutdown_timeout || 30000
-         );
-     }
-
-     return {
-         getClusterAnalytics,
-         getControllerStats,
-         findAllWorkers,
-         shutdown,
-         initialize,
-         stopExecution,
-         pauseExecution,
-         resumeExecution,
-         recoverExecution,
-         removeWorkers,
-         addWorkers,
-         setWorkers,
-         createExecutionContext,
-         getExecutionContext,
-         isExecutionTerminal,
-         waitForExecutionStatus
-     };
- };
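One pattern worth noting in the removed file is _executionAllocator: a guard flag makes the interval-driven allocator single-flight, and the finally block both clears the flag and re-invokes the allocator so a backlog of pending executions drains without waiting for the next interval tick. A minimal sketch of that pattern, with hypothetical names and none of the status bookkeeping, looks roughly like this:

// Simplified illustration of the single-flight allocation loop above.
// makeAllocator, provisionExecution and pendingQueue are hypothetical names,
// not teraslice APIs.
function makeAllocator(pendingQueue, provisionExecution) {
    let busy = false;
    return async function allocator() {
        if (busy || pendingQueue.length === 0) return;

        busy = true;
        const execution = pendingQueue.shift();
        try {
            await provisionExecution(execution);
        } catch (err) {
            // the real service also sets the execution status to "failed" here
            console.error(`failed to provision execution ${execution.ex_id}`, err);
        } finally {
            busy = false;
            allocator(); // immediately try to drain the rest of the queue
        }
    };
}

// usage: setInterval(makeAllocator(pendingQueue, provisionExecution), 1000);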