@onlineapps/service-wrapper 2.1.85 → 2.1.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/runtime-defaults.json +7 -8
- package/package.json +1 -1
- package/src/ServiceWrapper.js +106 -133
|
@@ -12,19 +12,18 @@
|
|
|
12
12
|
"heartbeatInterval": 30000
|
|
13
13
|
},
|
|
14
14
|
"infrastructureGate": {
|
|
15
|
-
"maxWaitMs":
|
|
16
|
-
"checkIntervalMs":
|
|
17
|
-
},
|
|
18
|
-
"infrastructureVerify": {
|
|
19
|
-
"maxRetries": 12,
|
|
20
|
-
"baseDelayMs": 5000,
|
|
21
|
-
"maxDelayMs": 30000,
|
|
22
|
-
"queueCheckTimeoutMs": 5000
|
|
15
|
+
"maxWaitMs": 30000,
|
|
16
|
+
"checkIntervalMs": 2000
|
|
23
17
|
},
|
|
24
18
|
"monitoring": {
|
|
25
19
|
"enabled": true,
|
|
26
20
|
"metrics": ["requests", "errors", "duration"]
|
|
27
21
|
},
|
|
22
|
+
"startupAlerts": {
|
|
23
|
+
"enabled": true,
|
|
24
|
+
"cooldownMs": 600000,
|
|
25
|
+
"stateFile": "/tmp/oa_startup_failure_state.json"
|
|
26
|
+
},
|
|
28
27
|
"logging": {
|
|
29
28
|
"enabled": true,
|
|
30
29
|
"level": "info",
|
package/package.json
CHANGED
package/src/ServiceWrapper.js
CHANGED
|
@@ -423,6 +423,14 @@ class ServiceWrapper {
|
|
|
423
423
|
|
|
424
424
|
// Cleanup before restart
|
|
425
425
|
await this._cleanupBeforeRestart();
|
|
426
|
+
|
|
427
|
+
// Startup failure alert (works without MQ/monitoring)
|
|
428
|
+
try {
|
|
429
|
+
await this._maybeSendStartupFailureAlert({ phase, phaseName, error, isTransient });
|
|
430
|
+
} catch (alertErr) {
|
|
431
|
+
// Alerts must never block restart; log and continue.
|
|
432
|
+
console.warn('[StartupAlerts] Failed to send startup failure alert:', alertErr.message);
|
|
433
|
+
}
|
|
426
434
|
|
|
427
435
|
if (isTransient) {
|
|
428
436
|
// Přechodná chyba → restart může pomoci
|
|
@@ -431,11 +439,85 @@ class ServiceWrapper {
|
|
|
431
439
|
} else {
|
|
432
440
|
// Trvalá chyba → restart nepomůže
|
|
433
441
|
console.error(`[FÁZE ${phase}] Permanent error - fix required, no restart`);
|
|
434
|
-
// TODO: Send alert (email, Slack, etc.)
|
|
435
442
|
process.exit(1);
|
|
436
443
|
}
|
|
437
444
|
}
|
|
438
445
|
|
|
446
|
+
/**
|
|
447
|
+
* Send startup-failure alert via SMTP (independent of MQ).
|
|
448
|
+
* Uses service-common sendMonitoringFailFallbackEmail (SMTP config via INFRA_REPORT_* env).
|
|
449
|
+
* Throttled by a persistent state file (survives container restarts).
|
|
450
|
+
*
|
|
451
|
+
* @private
|
|
452
|
+
*/
|
|
453
|
+
async _maybeSendStartupFailureAlert({ phase, phaseName, error, isTransient }) {
|
|
454
|
+
const cfg = this.config.wrapper?.startupAlerts;
|
|
455
|
+
if (!cfg || cfg.enabled !== true) {
|
|
456
|
+
return;
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
const cooldownMs = cfg.cooldownMs;
|
|
460
|
+
const stateFile = cfg.stateFile;
|
|
461
|
+
if (typeof cooldownMs !== 'number' || Number.isNaN(cooldownMs) || cooldownMs <= 0) {
|
|
462
|
+
throw new Error(`[StartupAlerts] Invalid configuration - wrapper.startupAlerts.cooldownMs must be a positive number, got: ${cooldownMs}`);
|
|
463
|
+
}
|
|
464
|
+
if (typeof stateFile !== 'string' || stateFile.trim() === '') {
|
|
465
|
+
throw new Error(`[StartupAlerts] Invalid configuration - wrapper.startupAlerts.stateFile must be a non-empty string, got: ${stateFile}`);
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
const fs = require('fs');
|
|
469
|
+
const path = require('path');
|
|
470
|
+
const serviceName = this.config.service?.name || 'unnamed-service';
|
|
471
|
+
const now = Date.now();
|
|
472
|
+
|
|
473
|
+
let state = { lastAlertAt: 0, failureCount: 0, lastFailureAt: 0 };
|
|
474
|
+
try {
|
|
475
|
+
if (fs.existsSync(stateFile)) {
|
|
476
|
+
const raw = fs.readFileSync(stateFile, 'utf8');
|
|
477
|
+
state = { ...state, ...(JSON.parse(raw) || {}) };
|
|
478
|
+
} else {
|
|
479
|
+
// Ensure parent dir exists (best-effort)
|
|
480
|
+
const dir = path.dirname(stateFile);
|
|
481
|
+
try { fs.mkdirSync(dir, { recursive: true }); } catch (_) { /* ignore */ }
|
|
482
|
+
}
|
|
483
|
+
} catch (e) {
|
|
484
|
+
// If state cannot be read, continue with defaults (alerting must not crash init)
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
state.failureCount = (state.failureCount || 0) + 1;
|
|
488
|
+
state.lastFailureAt = now;
|
|
489
|
+
|
|
490
|
+
const shouldSend = !state.lastAlertAt || (now - state.lastAlertAt) >= cooldownMs;
|
|
491
|
+
if (!shouldSend) {
|
|
492
|
+
// Persist updated counters anyway
|
|
493
|
+
try { fs.writeFileSync(stateFile, JSON.stringify(state), 'utf8'); } catch (_) { /* ignore */ }
|
|
494
|
+
return;
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
const { sendMonitoringFailFallbackEmail } = require('@onlineapps/service-common');
|
|
498
|
+
const transientLabel = isTransient ? 'TRANSIENT' : 'PERMANENT';
|
|
499
|
+
const subject = `[StartupFailure] ${serviceName} phase ${phase} (${transientLabel})`;
|
|
500
|
+
const text = [
|
|
501
|
+
`Service: ${serviceName}`,
|
|
502
|
+
`Phase: ${phase} (${phaseName})`,
|
|
503
|
+
`Type: ${transientLabel}`,
|
|
504
|
+
`Failure count (container): ${state.failureCount}`,
|
|
505
|
+
`Timestamp: ${new Date(now).toISOString()}`,
|
|
506
|
+
`Error: ${error?.message || 'unknown error'}`
|
|
507
|
+
].join('\n');
|
|
508
|
+
const html = `<pre>${text}</pre>`;
|
|
509
|
+
|
|
510
|
+
const sent = await sendMonitoringFailFallbackEmail(subject, text, html);
|
|
511
|
+
state.lastAlertAt = now;
|
|
512
|
+
try { fs.writeFileSync(stateFile, JSON.stringify(state), 'utf8'); } catch (_) { /* ignore */ }
|
|
513
|
+
|
|
514
|
+
if (sent) {
|
|
515
|
+
this.logger?.info('[StartupAlerts] ✓ Startup failure alert sent', { service: serviceName, phase, transient: isTransient });
|
|
516
|
+
} else {
|
|
517
|
+
this.logger?.warn('[StartupAlerts] Startup failure alert not sent (SMTP config missing or send failed)', { service: serviceName, phase });
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
|
|
439
521
|
async initialize() {
|
|
440
522
|
if (this.isInitialized) {
|
|
441
523
|
// Logger might not be initialized yet, use console as fallback
|
|
@@ -935,143 +1017,34 @@ class ServiceWrapper {
|
|
|
935
1017
|
* 1. Service has certificate (validated + registered) - checked in _startWorkflowListenersIfReady()
|
|
936
1018
|
* 2. Infrastructure is ready - checked in this method
|
|
937
1019
|
*
|
|
938
|
-
*
|
|
939
|
-
*
|
|
940
|
-
*
|
|
1020
|
+
* SEPARATION OF CONCERNS:
|
|
1021
|
+
* - Business services MUST NOT check infrastructure queues directly.
|
|
1022
|
+
* - Business services MUST trust Registry's infrastructure:health:all flag.
|
|
1023
|
+
* - Infrastructure services are responsible for their own queue health.
|
|
1024
|
+
* - If infrastructure is not ready within timeout, business service exits (Docker restarts it).
|
|
941
1025
|
*
|
|
942
|
-
* Uses exponential backoff retry mechanism if infrastructure not ready.
|
|
943
1026
|
* @private
|
|
944
1027
|
*/
|
|
945
1028
|
async _verifyInfrastructureReady(serviceName) {
|
|
946
|
-
|
|
947
|
-
const baseDelay = this.config.wrapper?.infrastructureVerify?.baseDelayMs;
|
|
948
|
-
const maxDelay = this.config.wrapper?.infrastructureVerify?.maxDelayMs;
|
|
949
|
-
const queueCheckTimeoutMs = this.config.wrapper?.infrastructureVerify?.queueCheckTimeoutMs;
|
|
950
|
-
|
|
951
|
-
if (typeof maxRetries !== 'number' || Number.isNaN(maxRetries) || maxRetries <= 0) {
|
|
952
|
-
throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.maxRetries must be a positive number, got: ${maxRetries}`);
|
|
953
|
-
}
|
|
954
|
-
if (typeof baseDelay !== 'number' || Number.isNaN(baseDelay) || baseDelay <= 0) {
|
|
955
|
-
throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.baseDelayMs must be a positive number, got: ${baseDelay}`);
|
|
956
|
-
}
|
|
957
|
-
if (typeof maxDelay !== 'number' || Number.isNaN(maxDelay) || maxDelay <= 0) {
|
|
958
|
-
throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.maxDelayMs must be a positive number, got: ${maxDelay}`);
|
|
959
|
-
}
|
|
960
|
-
if (typeof queueCheckTimeoutMs !== 'number' || Number.isNaN(queueCheckTimeoutMs) || queueCheckTimeoutMs <= 0) {
|
|
961
|
-
throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.queueCheckTimeoutMs must be a positive number, got: ${queueCheckTimeoutMs}`);
|
|
962
|
-
}
|
|
963
|
-
|
|
964
|
-
// Required infrastructure queues that must exist before business queues can be created
|
|
965
|
-
const requiredInfrastructureQueues = [
|
|
966
|
-
'workflow.init', // Gateway responsibility (workflow entrypoint)
|
|
967
|
-
'workflow.control', // Gateway responsibility (shared control-flow steps)
|
|
968
|
-
'registry.register', // Registry responsibility
|
|
969
|
-
];
|
|
970
|
-
|
|
971
|
-
// Optional infrastructure queues (may not exist if services are not running)
|
|
972
|
-
const optionalInfrastructureQueues = [
|
|
973
|
-
'validation.requests', // Validator responsibility
|
|
974
|
-
'workflow.completed', // Delivery Dispatcher responsibility
|
|
975
|
-
'workflow.failed' // Delivery Dispatcher responsibility
|
|
976
|
-
];
|
|
1029
|
+
this.logger?.info('[InfrastructureReady] Checking infrastructure status via Registry (trusting infrastructure:health:all flag)...');
|
|
977
1030
|
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
})
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
const channel = transport._queueChannel;
|
|
997
|
-
if (channel.closed) {
|
|
998
|
-
throw new Error('MQ channel is closed - cannot verify infrastructure queues');
|
|
999
|
-
}
|
|
1000
|
-
|
|
1001
|
-
const missingQueues = [];
|
|
1002
|
-
for (const queueName of requiredInfrastructureQueues) {
|
|
1003
|
-
try {
|
|
1004
|
-
// Add timeout to prevent hanging on checkQueue
|
|
1005
|
-
const checkPromise = channel.checkQueue(queueName);
|
|
1006
|
-
const checkTimeoutPromise = new Promise((_, reject) => {
|
|
1007
|
-
setTimeout(() => {
|
|
1008
|
-
reject(new Error(`checkQueue timeout after ${queueCheckTimeoutMs}ms`));
|
|
1009
|
-
}, queueCheckTimeoutMs);
|
|
1010
|
-
});
|
|
1011
|
-
|
|
1012
|
-
await Promise.race([checkPromise, checkTimeoutPromise]);
|
|
1013
|
-
this.logger?.info(`[InfrastructureVerify] ✓ Queue exists: ${queueName}`);
|
|
1014
|
-
} catch (checkErr) {
|
|
1015
|
-
if (checkErr.code === 404 || checkErr.message.includes('timeout')) {
|
|
1016
|
-
missingQueues.push(queueName);
|
|
1017
|
-
this.logger?.warn(`[InfrastructureVerify] ✗ Queue missing or timeout: ${queueName} (${checkErr.message})`);
|
|
1018
|
-
} else {
|
|
1019
|
-
// Other error (e.g., channel closed) - treat as missing
|
|
1020
|
-
missingQueues.push(queueName);
|
|
1021
|
-
this.logger?.warn(`[InfrastructureVerify] ✗ Cannot check queue ${queueName}: ${checkErr.message}`);
|
|
1022
|
-
}
|
|
1023
|
-
}
|
|
1024
|
-
}
|
|
1025
|
-
|
|
1026
|
-
if (missingQueues.length > 0) {
|
|
1027
|
-
const queueDetails = missingQueues.map((queueName) => {
|
|
1028
|
-
const owner = INFRA_QUEUE_OWNERS[queueName] || 'responsible infrastructure service';
|
|
1029
|
-
return `${queueName} (owner: ${owner})`;
|
|
1030
|
-
});
|
|
1031
|
-
throw new Error(
|
|
1032
|
-
`[InfrastructureVerify] Required RabbitMQ queue(s) missing: ${queueDetails.join(', ')}. ` +
|
|
1033
|
-
'Infrastructure service(s) responsible for these queues are not ready yet. ' +
|
|
1034
|
-
'Resolve the infrastructure issue and restart the business service.'
|
|
1035
|
-
);
|
|
1036
|
-
}
|
|
1037
|
-
|
|
1038
|
-
// Step 3: Log optional queues status (for debugging)
|
|
1039
|
-
for (const queueName of optionalInfrastructureQueues) {
|
|
1040
|
-
try {
|
|
1041
|
-
const checkPromise = channel.checkQueue(queueName);
|
|
1042
|
-
const checkTimeoutPromise = new Promise((_, reject) => {
|
|
1043
|
-
setTimeout(() => reject(new Error('timeout')), queueCheckTimeoutMs);
|
|
1044
|
-
});
|
|
1045
|
-
await Promise.race([checkPromise, checkTimeoutPromise]);
|
|
1046
|
-
this.logger?.info(`[InfrastructureVerify] ✓ Optional queue exists: ${queueName}`);
|
|
1047
|
-
} catch (checkErr) {
|
|
1048
|
-
if (checkErr.code === 404 || checkErr.message.includes('timeout')) {
|
|
1049
|
-
this.logger?.debug(`[InfrastructureVerify] Optional queue missing or timeout (not critical): ${queueName}`);
|
|
1050
|
-
}
|
|
1051
|
-
}
|
|
1052
|
-
}
|
|
1053
|
-
|
|
1054
|
-
// All checks passed!
|
|
1055
|
-
this.logger?.info('[InfrastructureVerify] ✓ Infrastructure verification complete - all required queues exist');
|
|
1056
|
-
return;
|
|
1057
|
-
|
|
1058
|
-
} catch (error) {
|
|
1059
|
-
if (attempt === maxRetries) {
|
|
1060
|
-
// Final attempt failed - throw error
|
|
1061
|
-
this.logger?.error('[InfrastructureVerify] ✗ Infrastructure verification failed after all retries', {
|
|
1062
|
-
error: error.message,
|
|
1063
|
-
stack: error.stack
|
|
1064
|
-
});
|
|
1065
|
-
throw new Error(`Infrastructure verification failed for ${serviceName}: ${error.message}`);
|
|
1066
|
-
}
|
|
1067
|
-
|
|
1068
|
-
// Not final attempt - continue retry loop
|
|
1069
|
-
const delay = Math.min(baseDelay * Math.pow(2, attempt - 1), maxDelay);
|
|
1070
|
-
this.logger?.warn(`[InfrastructureVerify] Verification failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms...`, {
|
|
1071
|
-
error: error.message
|
|
1072
|
-
});
|
|
1073
|
-
await new Promise(resolve => setTimeout(resolve, delay));
|
|
1074
|
-
}
|
|
1031
|
+
try {
|
|
1032
|
+
// Single check - if infra not ready, fail fast and let Docker restart
|
|
1033
|
+
await this._waitForInfrastructureGate('business queue setup');
|
|
1034
|
+
this.logger?.info('[InfrastructureReady] ✓ Infrastructure is ready (infrastructure:health:all = true)');
|
|
1035
|
+
} catch (error) {
|
|
1036
|
+
// Infrastructure not ready - fail fast with clear message
|
|
1037
|
+
this.logger?.error('[InfrastructureReady] ✗ Infrastructure not ready - service will exit', {
|
|
1038
|
+
error: error.message,
|
|
1039
|
+
serviceName,
|
|
1040
|
+
action: 'Docker will restart the service. Check infrastructure services (Gateway, Registry, Validator, Delivery, Monitoring).'
|
|
1041
|
+
});
|
|
1042
|
+
throw new Error(
|
|
1043
|
+
`[InfrastructureReady] Infrastructure not ready for ${serviceName}. ` +
|
|
1044
|
+
'Registry reports infrastructure:health:all != true. ' +
|
|
1045
|
+
'Check infrastructure services and their logs. ' +
|
|
1046
|
+
'This service will exit and Docker will restart it.'
|
|
1047
|
+
);
|
|
1075
1048
|
}
|
|
1076
1049
|
}
|
|
1077
1050
|
|