teraslice 2.11.0 → 2.12.0

This diff reflects the changes between publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
Files changed (118)
  1. package/dist/src/interfaces.js +12 -0
  2. package/dist/src/lib/cluster/cluster_master.js +246 -0
  3. package/dist/src/lib/cluster/node_master.js +355 -0
  4. package/dist/src/lib/cluster/services/api.js +663 -0
  5. package/dist/src/lib/cluster/services/assets.js +226 -0
  6. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/index.js +192 -0
  7. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8s.js +481 -0
  8. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +414 -0
  9. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/k8sState.js +59 -0
  10. package/dist/src/lib/cluster/services/cluster/backends/kubernetes/utils.js +43 -0
  11. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/index.js +192 -0
  12. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/interfaces.js +2 -0
  13. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8s.js +423 -0
  14. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sDeploymentResource.js +60 -0
  15. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sJobResource.js +55 -0
  16. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.js +359 -0
  17. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sServiceResource.js +37 -0
  18. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sState.js +60 -0
  19. package/dist/src/lib/cluster/services/cluster/backends/kubernetesV2/utils.js +170 -0
  20. package/dist/src/lib/cluster/services/cluster/backends/native/dispatch.js +13 -0
  21. package/dist/src/lib/cluster/services/cluster/backends/native/index.js +526 -0
  22. package/dist/src/lib/cluster/services/cluster/backends/native/messaging.js +547 -0
  23. package/dist/src/lib/cluster/services/cluster/backends/state-utils.js +26 -0
  24. package/dist/src/lib/cluster/services/cluster/index.js +17 -0
  25. package/dist/src/lib/cluster/services/execution.js +435 -0
  26. package/dist/src/lib/cluster/services/index.js +6 -0
  27. package/dist/src/lib/cluster/services/interfaces.js +2 -0
  28. package/dist/src/lib/cluster/services/jobs.js +454 -0
  29. package/dist/src/lib/config/default-sysconfig.js +26 -0
  30. package/dist/src/lib/config/index.js +22 -0
  31. package/dist/src/lib/config/schemas/system.js +360 -0
  32. package/dist/src/lib/storage/analytics.js +86 -0
  33. package/dist/src/lib/storage/assets.js +401 -0
  34. package/dist/src/lib/storage/backends/elasticsearch_store.js +494 -0
  35. package/dist/src/lib/storage/backends/mappings/analytics.js +50 -0
  36. package/dist/src/lib/storage/backends/mappings/asset.js +41 -0
  37. package/dist/src/lib/storage/backends/mappings/ex.js +62 -0
  38. package/dist/src/lib/storage/backends/mappings/job.js +38 -0
  39. package/dist/src/lib/storage/backends/mappings/state.js +38 -0
  40. package/dist/src/lib/storage/backends/s3_store.js +237 -0
  41. package/dist/src/lib/storage/execution.js +300 -0
  42. package/dist/src/lib/storage/index.js +7 -0
  43. package/dist/src/lib/storage/jobs.js +81 -0
  44. package/dist/src/lib/storage/state.js +255 -0
  45. package/dist/src/lib/utils/api_utils.js +157 -0
  46. package/dist/src/lib/utils/asset_utils.js +94 -0
  47. package/dist/src/lib/utils/date_utils.js +52 -0
  48. package/dist/src/lib/utils/encoding_utils.js +27 -0
  49. package/dist/src/lib/utils/events.js +4 -0
  50. package/dist/src/lib/utils/file_utils.js +124 -0
  51. package/dist/src/lib/utils/id_utils.js +15 -0
  52. package/dist/src/lib/utils/port_utils.js +32 -0
  53. package/dist/src/lib/workers/assets/index.js +3 -0
  54. package/dist/src/lib/workers/assets/loader-executable.js +40 -0
  55. package/dist/src/lib/workers/assets/loader.js +73 -0
  56. package/dist/src/lib/workers/assets/spawn.js +55 -0
  57. package/dist/src/lib/workers/context/execution-context.js +12 -0
  58. package/dist/src/lib/workers/context/terafoundation-context.js +8 -0
  59. package/dist/src/lib/workers/execution-controller/execution-analytics.js +188 -0
  60. package/dist/src/lib/workers/execution-controller/index.js +1024 -0
  61. package/dist/src/lib/workers/execution-controller/recovery.js +151 -0
  62. package/dist/src/lib/workers/execution-controller/scheduler.js +390 -0
  63. package/dist/src/lib/workers/execution-controller/slice-analytics.js +96 -0
  64. package/dist/src/lib/workers/helpers/job.js +80 -0
  65. package/dist/src/lib/workers/helpers/op-analytics.js +22 -0
  66. package/dist/src/lib/workers/helpers/terafoundation.js +34 -0
  67. package/dist/src/lib/workers/helpers/worker-shutdown.js +169 -0
  68. package/dist/src/lib/workers/metrics/index.js +108 -0
  69. package/dist/src/lib/workers/worker/index.js +378 -0
  70. package/dist/src/lib/workers/worker/slice.js +122 -0
  71. package/dist/test/config/schemas/system_schema-spec.js +37 -0
  72. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js +316 -0
  73. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sResource-spec.js +795 -0
  74. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-multicluster-spec.js +67 -0
  75. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/k8sState-spec.js +84 -0
  76. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js +132 -0
  77. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8s-v2-spec.js +455 -0
  78. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sResource-v2-spec.js +818 -0
  79. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-multicluster-v2-spec.js +67 -0
  80. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/k8sState-v2-spec.js +84 -0
  81. package/dist/test/lib/cluster/services/cluster/backends/kubernetes/v2/utils-v2-spec.js +320 -0
  82. package/dist/test/lib/cluster/services/cluster/backends/state-utils-spec.js +37 -0
  83. package/dist/test/node_master-spec.js +188 -0
  84. package/dist/test/services/api-spec.js +80 -0
  85. package/dist/test/services/assets-spec.js +158 -0
  86. package/dist/test/services/messaging-spec.js +440 -0
  87. package/dist/test/storage/assets_storage-spec.js +95 -0
  88. package/dist/test/storage/s3_store-spec.js +138 -0
  89. package/dist/test/test.config.js +8 -0
  90. package/dist/test/test.setup.js +6 -0
  91. package/dist/test/utils/api_utils-spec.js +86 -0
  92. package/dist/test/utils/asset_utils-spec.js +141 -0
  93. package/dist/test/utils/elastic_utils-spec.js +25 -0
  94. package/dist/test/workers/execution-controller/execution-controller-spec.js +371 -0
  95. package/dist/test/workers/execution-controller/execution-special-test-cases-spec.js +520 -0
  96. package/dist/test/workers/execution-controller/execution-test-cases-spec.js +338 -0
  97. package/dist/test/workers/execution-controller/recovery-spec.js +160 -0
  98. package/dist/test/workers/execution-controller/scheduler-spec.js +249 -0
  99. package/dist/test/workers/execution-controller/slice-analytics-spec.js +121 -0
  100. package/dist/test/workers/fixtures/ops/example-op/processor.js +20 -0
  101. package/dist/test/workers/fixtures/ops/example-op/schema.js +19 -0
  102. package/dist/test/workers/fixtures/ops/example-reader/fetcher.js +20 -0
  103. package/dist/test/workers/fixtures/ops/example-reader/schema.js +41 -0
  104. package/dist/test/workers/fixtures/ops/example-reader/slicer.js +37 -0
  105. package/dist/test/workers/fixtures/ops/new-op/processor.js +29 -0
  106. package/dist/test/workers/fixtures/ops/new-op/schema.js +18 -0
  107. package/dist/test/workers/fixtures/ops/new-reader/fetcher.js +19 -0
  108. package/dist/test/workers/fixtures/ops/new-reader/schema.js +23 -0
  109. package/dist/test/workers/fixtures/ops/new-reader/slicer.js +13 -0
  110. package/dist/test/workers/helpers/configs.js +130 -0
  111. package/dist/test/workers/helpers/execution-controller-helper.js +49 -0
  112. package/dist/test/workers/helpers/index.js +5 -0
  113. package/dist/test/workers/helpers/test-context.js +210 -0
  114. package/dist/test/workers/helpers/zip-directory.js +25 -0
  115. package/dist/test/workers/worker/slice-spec.js +333 -0
  116. package/dist/test/workers/worker/worker-spec.js +356 -0
  117. package/package.json +94 -94
  118. package/service.js +0 -0
package/dist/src/interfaces.js
@@ -0,0 +1,12 @@
+ export var ProcessAssignment;
+ (function (ProcessAssignment) {
+ ProcessAssignment["node_master"] = "node_master";
+ ProcessAssignment["cluster_master"] = "cluster_master";
+ ProcessAssignment["assets_service"] = "assets_service";
+ ProcessAssignment["execution_controller"] = "execution_controller";
+ ProcessAssignment["worker"] = "worker";
+ })(ProcessAssignment || (ProcessAssignment = {}));
+ export function isProcessAssignment(value) {
+ return value in ProcessAssignment;
+ }
+ //# sourceMappingURL=interfaces.js.map
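
The new interfaces.js module defines a string enum of process assignments plus a runtime guard, isProcessAssignment, which checks membership via the enum's keys. A minimal sketch of how a consumer might use the guard; the import specifier below is illustrative, not a documented entry point:

import { ProcessAssignment, isProcessAssignment } from 'teraslice/dist/src/interfaces.js';

// Validate an assignment string from the environment before dispatching on it.
const assignment = process.env.assignment || ProcessAssignment.worker;
if (!isProcessAssignment(assignment)) {
    throw new Error(`unknown process assignment: ${assignment}`);
}
console.log(`starting ${assignment} process`);
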
package/dist/src/lib/cluster/cluster_master.js
@@ -0,0 +1,246 @@
+ import express from 'express';
+ import got from 'got';
+ import { pDelay, logError, get, parseError } from '@terascope/utils';
+ import { ClusterMaster as ClusterMasterMessaging } from '@terascope/teraslice-messaging';
+ import { isPromAvailable } from '@terascope/job-components';
+ import { makeLogger } from '../workers/helpers/terafoundation.js';
+ import { ExecutionService, ApiService, JobsService, makeClustering } from './services/index.js';
+ import { JobsStorage, ExecutionStorage, StateStorage } from '../storage/index.js';
+ export class ClusterMaster {
+ context;
+ logger;
+ running = false;
+ assetsUrl;
+ messagingServer;
+ constructor(context) {
+ this.context = context;
+ this.logger = makeLogger(context, 'cluster_master');
+ const assetsPort = process.env.assets_port;
+ this.assetsUrl = `http://127.0.0.1:${assetsPort}`;
+ }
+ async isAssetServiceUp() {
+ try {
+ const response = await got.get('status', {
+ prefixUrl: this.assetsUrl,
+ responseType: 'json',
+ throwHttpErrors: true,
+ timeout: {
+ request: 900
+ },
+ retry: {
+ limit: 0
+ }
+ });
+ return get(response, 'body.available', false);
+ }
+ catch (err) {
+ this.logger.debug(`asset service not up yet, error: ${parseError(err)}`);
+ return false;
+ }
+ }
+ async waitForAssetsService(timeoutAt) {
+ if (Date.now() > timeoutAt) {
+ return Promise.reject(new Error('Timeout waiting for asset service to come online'));
+ }
+ const isUp = await this.isAssetServiceUp();
+ if (isUp) {
+ return true;
+ }
+ await pDelay(1000);
+ return this.waitForAssetsService(timeoutAt);
+ }
+ async initialize() {
+ const clusterConfig = this.context.sysconfig.teraslice;
+ const { logger } = this;
+ try {
+ // Initialize the HTTP service for handling incoming requests.
+ const app = express();
+ this.messagingServer = new ClusterMasterMessaging.Server({
+ port: clusterConfig.port,
+ nodeDisconnectTimeout: clusterConfig.node_disconnect_timeout,
+ // setting request timeout to 5 minutes
+ serverTimeout: clusterConfig.api_response_timeout,
+ // we do this to override express final response handler
+ requestListener(req, res) {
+ // @ts-expect-error
+ app(req, res, (err) => {
+ if (err) {
+ logger.warn(err, 'unexpected server error');
+ }
+ res.setHeader('Content-Type', 'application/json');
+ res.statusCode = 500;
+ res.end(JSON.stringify({ error: 'api is not available' }));
+ });
+ },
+ networkLatencyBuffer: clusterConfig.network_latency_buffer,
+ actionTimeout: clusterConfig.action_timeout,
+ logger: this.logger,
+ });
+ const serviceOptions = {
+ assetsUrl: this.assetsUrl,
+ app,
+ clusterMasterServer: this.messagingServer
+ };
+ const executionService = new ExecutionService(this.context, serviceOptions);
+ const jobsService = new JobsService(this.context);
+ const clusterService = makeClustering(this.context, serviceOptions);
+ const apiService = new ApiService(this.context, serviceOptions);
+ const services = Object.freeze({
+ executionService,
+ jobsService,
+ clusterService,
+ apiService,
+ });
+ this.context.services = services;
+ await this.messagingServer.start();
+ this.logger.info(`cluster master listening on port ${clusterConfig.port}`);
+ const executionStorage = new ExecutionStorage(this.context);
+ const stateStorage = new StateStorage(this.context);
+ const jobsStorage = new JobsStorage(this.context);
+ await Promise.all([
+ executionStorage.initialize(),
+ stateStorage.initialize(),
+ jobsStorage.initialize()
+ ]);
+ this.context.stores = {
+ executionStorage,
+ stateStorage,
+ jobsStorage,
+ };
+ // order matters
+ await services.clusterService.initialize();
+ await services.executionService.initialize();
+ await services.jobsService.initialize();
+ this.logger.debug('services has been initialized');
+ // give the assets service a bit to come up
+ const fiveMinutes = 5 * 60 * 1000;
+ await this.waitForAssetsService(Date.now() + fiveMinutes);
+ // this needs to be last
+ await services.apiService.initialize();
+ /// initialize promClient
+ if (this.context.sysconfig.teraslice.cluster_manager_type === 'native') {
+ this.logger.warn('Skipping PromMetricsAPI initialization: incompatible with native clustering.');
+ }
+ else {
+ const { terafoundation } = this.context.sysconfig;
+ await this.context.apis.foundation.promMetrics.init({
+ terasliceName: this.context.sysconfig.teraslice.name,
+ tf_prom_metrics_add_default: terafoundation.prom_metrics_add_default,
+ tf_prom_metrics_enabled: terafoundation.prom_metrics_enabled,
+ tf_prom_metrics_port: terafoundation.prom_metrics_port,
+ logger: this.logger,
+ assignment: 'master',
+ prefix: 'teraslice_',
+ prom_metrics_display_url: terafoundation.prom_metrics_display_url
+ });
+ await this.setupPromMetrics();
+ }
+ this.logger.info('cluster master is ready!');
+ this.running = true;
+ }
+ catch (err) {
+ logError(this.logger, err, 'error during service initialization');
+ this.running = false;
+ throw err;
+ }
+ }
+ async run() {
+ return new Promise((resolve) => {
+ if (!this.running) {
+ resolve(true);
+ return;
+ }
+ const runningInterval = setInterval(() => {
+ if (!this.running) {
+ clearInterval(runningInterval);
+ resolve(true);
+ }
+ }, 1000);
+ });
+ }
+ async shutdown() {
+ this.running = false;
+ this.logger.info('cluster_master is shutting down');
+ this.messagingServer.isShuttingDown = true;
+ await Promise.all(Object.entries(this.context.services)
+ .map(async ([name, service]) => {
+ try {
+ await service.shutdown();
+ }
+ catch (err) {
+ logError(this.logger, err, `Failure to shutdown service ${name}`);
+ }
+ }));
+ await Promise.all(Object.entries(this.context.stores)
+ .map(async ([name, store]) => {
+ try {
+ await store.shutdown();
+ }
+ catch (err) {
+ logError(this.logger, err, `Failure to shutdown store ${name}`);
+ }
+ }));
+ await this.messagingServer.shutdown();
+ if (isPromAvailable(this.context)) {
+ await this.context.apis.foundation.promMetrics.shutdown();
+ }
+ }
+ /**
+ * Adds all prom metrics specific to the cluster_master.
+ *
+ * If trying to add a new metric for the cluster_master, it belongs here.
+ * @async
+ * @function setupPromMetrics
+ * @return {Promise<void>}
+ * @link https://terascope.github.io/teraslice/docs/development/k8s#prometheus-metrics-api
+ */
+ async setupPromMetrics() {
+ if (isPromAvailable(this.context)) {
+ this.logger.info(`adding ${this.context.assignment} prom metrics...`);
+ /*
+ TODO: After reviewing these metrics, I've conluded that all of these
+ can be handled by th execution controller. We might move these into the execution
+ controller metrics down the line. The master can maybe keep track of how many ex
+ controllers there are? Some sort of overview of everything and leave the specifics
+ to each ex.
+
+ */
+ await Promise.all([
+ this.context.apis.foundation.promMetrics.addGauge('master_info', 'Information about Teraslice cluster master', ['arch', 'clustering_type', 'name', 'node_version', 'platform', 'teraslice_version']),
+ this.context.apis.foundation.promMetrics.addGauge('slices_processed', 'Total slices processed across the cluster', []),
+ this.context.apis.foundation.promMetrics.addGauge('slices_failed', 'Total slices failed across the cluster', []),
+ this.context.apis.foundation.promMetrics.addGauge('slices_queued', 'Total slices queued across the cluster', []),
+ this.context.apis.foundation.promMetrics.addGauge('workers_joined', 'Total workers joined across the cluster', []),
+ this.context.apis.foundation.promMetrics.addGauge('workers_disconnected', 'Total workers disconnected across the cluster', []),
+ this.context.apis.foundation.promMetrics.addGauge('workers_reconnected', 'Total workers reconnected across the cluster', []),
+ this.context.apis.foundation.promMetrics.addGauge('controller_workers_active', 'Number of Teraslice workers actively processing slices.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_workers_available', 'Number of Teraslice workers running and waiting for work.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_workers_joined', 'Total number of Teraslice workers that have joined the execution controller for this job.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_workers_reconnected', 'Total number of Teraslice workers that have reconnected to the execution controller for this job.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_workers_disconnected', 'Total number of Teraslice workers that have disconnected from execution controller for this job.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_info', 'Information about Teraslice execution.', ['ex_id', 'job_id', 'image', 'version']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_slicers_count', 'Number of execution controllers (slicers) running for this execution.', ['ex_id', 'job_id', 'job_name']),
+ // Execution Related Metrics
+ this.context.apis.foundation.promMetrics.addGauge('execution_cpu_limit', 'CPU core limit for a Teraslice worker container.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_cpu_request', 'Requested number of CPU cores for a Teraslice worker container.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_memory_limit', 'Memory limit for Teraslice a worker container.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_memory_request', 'Requested amount of memory for a Teraslice worker container.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_status', 'Current status of the Teraslice execution.', ['ex_id', 'job_id', 'job_name', 'status']),
+ /*
+ TODO: The following gauges should be Counters. This was not done because
+ teraslice master already provided the count total for most of these metrics.
+ So setting the gauge is the only real way to gather the metrics in master.
+ Solution to convert would be setting the count in the ex process.
+ */
+ this.context.apis.foundation.promMetrics.addGauge('controller_slices_processed', 'Number of slices processed.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_slices_failed', 'Number of slices failed.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('controller_slices_queued', 'Number of slices queued for processing.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_created_timestamp_seconds', 'Execution creation time.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_updated_timestamp_seconds', 'Execution update time.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_slicers', 'Number of slicers defined on the execution.', ['ex_id', 'job_id', 'job_name']),
+ this.context.apis.foundation.promMetrics.addGauge('execution_workers', 'Number of workers defined on the execution. Note that the number of actual workers can differ from this value.', ['ex_id', 'job_id', 'job_name']),
+ ]);
+ }
+ }
+ }
+ //# sourceMappingURL=cluster_master.js.map
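
In cluster_master.js, initialization blocks on the assets service: isAssetServiceUp issues a short, non-retrying got request against the local assets port, and waitForAssetsService re-checks once per second until a deadline (five minutes after initialize starts). A standalone sketch of that wait loop using the same pDelay helper; waitUntilReady and checkFn are illustrative names, not exports of this package:

import { pDelay } from '@terascope/utils';

// Poll an async readiness check until it passes or the deadline (a ms timestamp) expires.
async function waitUntilReady(checkFn, timeoutAt, intervalMs = 1000) {
    if (Date.now() > timeoutAt) {
        throw new Error('Timeout waiting for service to come online');
    }
    if (await checkFn()) {
        return true;
    }
    await pDelay(intervalMs);
    return waitUntilReady(checkFn, timeoutAt, intervalMs);
}

// Mirrors the five-minute budget used above:
// await waitUntilReady(() => isAssetServiceUp(), Date.now() + 5 * 60 * 1000);
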
package/dist/src/lib/cluster/node_master.js
@@ -0,0 +1,355 @@
+ import ms from 'ms';
+ import { Mutex } from 'async-mutex';
+ import { getFullErrorStack, debounce, isEmpty, has } from '@terascope/utils';
+ import { makeLogger } from '../workers/helpers/terafoundation.js';
+ import { Messaging } from './services/cluster/backends/native/messaging.js';
+ import { spawnAssetLoader } from '../workers/assets/spawn.js';
+ import { safeEncode } from '../utils/encoding_utils.js';
+ import { findPort, getPorts } from '../utils/port_utils.js';
+ import { getPackageJSON } from '../utils/file_utils.js';
+ const nodeVersion = process.version;
+ const terasliceVersion = getPackageJSON().version;
+ export async function nodeMaster(context) {
+ const logger = makeLogger(context, 'node_master');
+ const configWorkerLimit = context.sysconfig.teraslice.workers;
+ const config = context.sysconfig.teraslice;
+ const events = context.apis.foundation.getSystemEvents();
+ const mutex = new Mutex();
+ const messaging = new Messaging(context, logger);
+ const host = messaging.getHostUrl();
+ const isShuttingDown = false;
+ const ports = getPorts(context);
+ logger.info(`node ${context.sysconfig._nodeName} is attempting to connect to cluster_master: ${host}`);
+ function sendNodeStateNow() {
+ if (isShuttingDown)
+ return;
+ const state = getNodeState();
+ messaging.send({
+ to: 'cluster_master',
+ message: 'node:state',
+ node_id: state.node_id,
+ payload: state
+ });
+ }
+ const sendNodeState = debounce(sendNodeStateNow, 500, { leading: false, trailing: true });
+ let pendingAllocations = 0;
+ function allocateWorkers(count, exConfig, fn) {
+ const startTime = Date.now();
+ pendingAllocations += count;
+ sendNodeStateNow();
+ const locked = mutex.isLocked() ? ' (locked)' : '';
+ logger.info(`allocating ${count} workers...${locked}`);
+ return mutex.runExclusive(async () => {
+ try {
+ await loadAssetsIfNeeded(exConfig.job, exConfig.ex_id);
+ }
+ catch (err) {
+ logger.error(`Failure to allocated assets for execution ${exConfig.ex_id}`);
+ throw err;
+ }
+ finally {
+ pendingAllocations -= count;
+ }
+ try {
+ const workers = await fn();
+ const elapsed = Date.now() - startTime;
+ if (workers.length === count) {
+ logger.info(`allocated ${workers.length} workers, took ${ms(elapsed)}`);
+ }
+ else {
+ logger.info(`allocated ${workers.length} out of the requested ${count} workers, took ${ms(elapsed)}`);
+ }
+ return workers.length;
+ }
+ catch (err) {
+ logger.error(`Failure to allocate workers for execution ${exConfig.ex_id}`);
+ throw err;
+ }
+ });
+ }
+ function canAllocateWorkers(requestedWorkers) {
+ const numOfCurrentWorkers = Object.keys(context.cluster.workers).length;
+ // if there is an over allocation, send back rest to be enqueued
+ if (configWorkerLimit < numOfCurrentWorkers + requestedWorkers) {
+ return configWorkerLimit - numOfCurrentWorkers > 0;
+ }
+ return true;
+ }
+ messaging.registerChildOnlineHook(sendNodeState);
+ messaging.register({
+ event: 'network:connect',
+ callback: () => {
+ logger.info(`node has successfully connected to: ${host}`);
+ const state = getNodeState();
+ messaging.send({
+ to: 'cluster_master', message: 'node:online', node_id: state.node_id, payload: state
+ });
+ }
+ });
+ messaging.register({
+ event: 'network:disconnect',
+ callback: () => logger.info(`node has disconnected from: ${host}`)
+ });
+ messaging.register({
+ event: 'network:error',
+ callback: (err) => logger.warn(err, `Attempting to connect to cluster_master: ${host}`)
+ });
+ messaging.register({
+ event: 'cluster:execution_controller:create',
+ // TODO: type this
+ callback: (createSlicerRequest) => {
+ const createSlicerMsg = createSlicerRequest.payload;
+ logger.info(`starting execution_controller for execution ${createSlicerMsg.ex_id}...`);
+ allocateWorkers(1, createSlicerMsg, async () => {
+ const controllerContext = {
+ assignment: 'execution_controller',
+ NODE_TYPE: 'execution_controller',
+ EX: safeEncode(createSlicerMsg.job),
+ job: createSlicerMsg.job,
+ node_id: context.sysconfig._nodeName,
+ ex_id: createSlicerMsg.ex_id,
+ job_id: createSlicerMsg.job_id,
+ slicer_port: createSlicerMsg.slicer_port
+ };
+ logger.trace('starting a execution controller', controllerContext);
+ return context.apis.foundation.startWorkers(1, controllerContext);
+ })
+ .then(() => messaging.respond(createSlicerRequest))
+ .catch((error) => {
+ messaging.respond(createSlicerRequest, {
+ error: getFullErrorStack(error),
+ });
+ });
+ }
+ });
+ messaging.register({
+ event: 'cluster:workers:create',
+ callback: (createWorkerRequest) => {
+ const createWorkerMsg = createWorkerRequest.payload;
+ const requestedWorkers = createWorkerMsg.workers;
+ logger.info(`starting ${requestedWorkers} workers for execution ${createWorkerMsg.ex_id}...`);
+ if (!canAllocateWorkers(requestedWorkers)) {
+ logger.warn(`worker is overallocated, maximum number of workers of ${configWorkerLimit}`);
+ messaging.respond(createWorkerRequest, {
+ payload: {
+ createdWorkers: 0,
+ }
+ });
+ return;
+ }
+ allocateWorkers(requestedWorkers, createWorkerMsg, async () => {
+ let newWorkers = requestedWorkers;
+ const numOfCurrentWorkers = Object.keys(context.cluster.workers).length;
+ // if there is an over allocation, send back rest to be enqueued
+ if (configWorkerLimit < numOfCurrentWorkers + requestedWorkers) {
+ newWorkers = configWorkerLimit - numOfCurrentWorkers;
+ logger.warn(`worker allocation request would exceed maximum number of workers of ${configWorkerLimit}`);
+ logger.warn(`reducing allocation to ${newWorkers} workers.`);
+ }
+ let workers = [];
+ if (newWorkers > 0) {
+ logger.trace(`starting ${newWorkers} workers`, createWorkerMsg.ex_id);
+ workers = context.apis.foundation.startWorkers(newWorkers, {
+ NODE_TYPE: 'worker',
+ EX: safeEncode(createWorkerMsg.job),
+ assignment: 'worker',
+ node_id: context.sysconfig._nodeName,
+ job: createWorkerMsg.job,
+ ex_id: createWorkerMsg.ex_id,
+ job_id: createWorkerMsg.job_id
+ });
+ }
+ return workers;
+ })
+ .then((createdWorkers) => messaging.respond(createWorkerRequest, {
+ payload: {
+ createdWorkers,
+ }
+ }))
+ .catch(() => messaging.respond(createWorkerRequest, {
+ payload: {
+ createdWorkers: 0,
+ }
+ }));
+ }
+ });
+ messaging.register({ event: 'cluster:node:state', callback: () => sendNodeState() });
+ // this fires when entire server will be shutdown
+ events.once('terafoundation:shutdown', () => {
+ logger.debug('received shutdown notice from terafoundation');
+ const filterFn = () => context.cluster.workers;
+ const isActionCompleteFn = () => isEmpty(getNodeState().active);
+ shutdownProcesses({}, filterFn, isActionCompleteFn, true);
+ });
+ messaging.register({
+ event: 'cluster:execution:stop',
+ callback: (networkMsg) => {
+ const exId = networkMsg.ex_id;
+ logger.debug(`received cluster execution stop for execution ${exId}`);
+ const filterFn = () => {
+ return Object.values(context.cluster.workers)
+ .filter((worker) => {
+ return worker.ex_id === exId;
+ });
+ };
+ function actionCompleteFn() {
+ const children = getNodeState().active;
+ const workers = children.filter((worker) => worker.ex_id === exId);
+ logger.debug(`waiting for ${workers.length} to stop for ex: ${exId}`);
+ return workers.length === 0;
+ }
+ shutdownProcesses(networkMsg, filterFn, actionCompleteFn);
+ }
+ });
+ messaging.register({
+ event: 'cluster:workers:remove',
+ callback: (networkMsg) => {
+ const numberToRemove = networkMsg.payload.workers;
+ const children = getNodeState().active;
+ const startingWorkerCount = children.filter((worker) => worker.ex_id === networkMsg.ex_id && worker.assignment === 'worker').length;
+ const filterFn = () => children.filter((worker) => worker.ex_id === networkMsg.ex_id && worker.assignment === 'worker').slice(0, numberToRemove);
+ function actionCompleteFn() {
+ const childWorkers = getNodeState().active;
+ const currentWorkersForJob = childWorkers.filter((worker) => worker.ex_id === networkMsg.ex_id && worker.assignment === 'worker').length;
+ return currentWorkersForJob + numberToRemove <= startingWorkerCount;
+ }
+ shutdownProcesses(networkMsg, filterFn, actionCompleteFn);
+ }
+ });
+ // used to find an open port for slicer
+ messaging.register({
+ event: 'cluster:node:get_port',
+ callback: async (msg) => {
+ const port = await findPort(ports);
+ logger.debug(`assigning port ${port} for new job`);
+ messaging.respond(msg, { port });
+ }
+ });
+ messaging.register({
+ event: 'cluster:error:terminal',
+ callback: () => {
+ logger.error('terminal error in cluster_master, flushing logs and shutting down');
+ logger.flush()
+ .then(() => process.exit(0));
+ }
+ });
+ messaging.register({
+ event: 'child:exit',
+ callback: () => sendNodeState()
+ });
+ function getAssetsFromJob(jobStr) {
+ const job = typeof jobStr === 'string' ? JSON.parse(jobStr) : jobStr;
+ return job.assets || [];
+ }
+ async function loadAssetsIfNeeded(job, exId) {
+ const assets = getAssetsFromJob(job);
+ if (!assets.length)
+ return;
+ logger.info(`node ${context.sysconfig._nodeName} is checking assets for job, exId: ${exId}`);
+ await spawnAssetLoader(assets, context);
+ }
+ function shutdownWorkers(signal, filterFn) {
+ const allWorkersForJob = filterFn();
+ allWorkersForJob.forEach((worker) => {
+ const workerID = worker.worker_id || worker.id;
+ if (has(context.cluster.workers, workerID)) {
+ const clusterWorker = context.cluster.workers[workerID];
+ const processId = clusterWorker.process.pid;
+ if (clusterWorker.isDead())
+ return;
+ // if the worker has already been sent a SIGTERM signal it should send a SIGKILL
+ logger.warn(`sending ${signal} to process ${processId}, assignment: ${worker.assignment}, ex_id: ${worker.ex_id}`);
+ clusterWorker.kill(signal);
+ }
+ });
+ }
+ function shutdownProcesses(message, filterFn, isActionCompleteFn, onlySigKill = false) {
+ const intervalTime = 200;
+ const needsResponse = message.response && message.to;
+ // give a little extra time to finish shutting down
+ let stopTime = config.shutdown_timeout + 3000;
+ if (!onlySigKill) {
+ shutdownWorkers('SIGTERM', filterFn);
+ }
+ const stop = setInterval(() => {
+ if (isActionCompleteFn()) {
+ clearInterval(stop);
+ if (needsResponse)
+ messaging.respond(message);
+ }
+ if (stopTime <= 0) {
+ clearInterval(stop);
+ shutdownWorkers('SIGKILL', filterFn);
+ if (needsResponse)
+ messaging.respond(message);
+ }
+ stopTime -= intervalTime;
+ }, intervalTime);
+ }
+ function getNodeState() {
+ const nodeId = context.sysconfig._nodeName;
+ const state = {
+ node_id: nodeId,
+ hostname: context.sysconfig.teraslice.hostname,
+ pid: process.pid,
+ node_version: nodeVersion,
+ teraslice_version: terasliceVersion,
+ total: context.sysconfig.teraslice.workers,
+ state: 'connected'
+ };
+ const clusterWorkers = context.cluster.workers;
+ const active = [];
+ Object.values(clusterWorkers).forEach((worker) => {
+ const child = {
+ worker_id: worker.id,
+ assignment: worker.assignment,
+ pid: worker.process.pid
+ };
+ if (worker.ex_id) {
+ child.ex_id = worker.ex_id;
+ }
+ if (worker.job_id) {
+ child.job_id = worker.job_id;
+ }
+ if (worker.assets) {
+ child.assets = worker.assets.map((asset) => asset.id);
+ }
+ active.push(child);
+ });
+ const total = state.total;
+ state.available = total - active.length - pendingAllocations;
+ state.active = active;
+ return state;
+ }
+ messaging.listen({
+ query: {
+ node_id: context.sysconfig._nodeName
+ }
+ });
+ if (context.sysconfig.teraslice.master) {
+ logger.debug(`node ${context.sysconfig._nodeName} is creating the cluster_master`);
+ const [clusterMaster] = context.apis.foundation.startWorkers(1, {
+ assignment: 'cluster_master',
+ assets_port: ports.assetsPort,
+ node_id: context.sysconfig._nodeName
+ });
+ clusterMaster.on('exit', (code) => {
+ if (code !== 0) {
+ throw Error(`Cluster master has shutdown with exit code ${code}!`);
+ }
+ });
+ logger.debug(`node ${context.sysconfig._nodeName} is creating assets endpoint on port ${ports.assetsPort}`);
+ const [assetService] = context.apis.foundation.startWorkers(1, {
+ assignment: 'assets_service',
+ // key needs to be called port to bypass cluster port sharing
+ port: ports.assetsPort,
+ node_id: context.sysconfig._nodeName
+ });
+ assetService.on('exit', (code) => {
+ if (code !== 0) {
+ throw Error(`Asset Service has shutdown with exit code ${code}!`);
+ }
+ });
+ }
+ }
+ //# sourceMappingURL=node_master.js.map
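
node_master.js stops child processes in two phases: shutdownProcesses sends SIGTERM via shutdownWorkers, polls every 200ms for completion, and escalates to SIGKILL once the configured shutdown_timeout (plus a 3 second buffer) is exhausted. A standalone sketch of that escalation loop; shutdownWithEscalation, signalWorkers, and isDone are illustrative names standing in for shutdownProcesses, shutdownWorkers, and isActionCompleteFn:

// Send SIGTERM, poll for completion, and escalate to SIGKILL when the budget runs out.
function shutdownWithEscalation(signalWorkers, isDone, shutdownTimeoutMs) {
    const intervalMs = 200;
    let remaining = shutdownTimeoutMs + 3000; // a little extra time to finish shutting down
    signalWorkers('SIGTERM');
    const timer = setInterval(() => {
        if (isDone()) {
            clearInterval(timer);
            return;
        }
        if (remaining <= 0) {
            clearInterval(timer);
            signalWorkers('SIGKILL');
            return;
        }
        remaining -= intervalMs;
    }, intervalMs);
}

// e.g. shutdownWithEscalation((sig) => shutdownWorkers(sig, filterFn), isActionCompleteFn, config.shutdown_timeout);
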