@onlineapps/service-wrapper 2.1.85 → 2.1.87

This diff shows the content changes between package versions publicly released to a supported registry, and is provided for informational purposes only.
@@ -12,19 +12,18 @@
     "heartbeatInterval": 30000
   },
   "infrastructureGate": {
-    "maxWaitMs": 300000,
-    "checkIntervalMs": 5000
-  },
-  "infrastructureVerify": {
-    "maxRetries": 12,
-    "baseDelayMs": 5000,
-    "maxDelayMs": 30000,
-    "queueCheckTimeoutMs": 5000
+    "maxWaitMs": 30000,
+    "checkIntervalMs": 2000
   },
   "monitoring": {
     "enabled": true,
     "metrics": ["requests", "errors", "duration"]
   },
+  "startupAlerts": {
+    "enabled": true,
+    "cooldownMs": 600000,
+    "stateFile": "/tmp/oa_startup_failure_state.json"
+  },
   "logging": {
     "enabled": true,
     "level": "info",
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@onlineapps/service-wrapper",
-  "version": "2.1.85",
+  "version": "2.1.87",
   "description": "Thin orchestration layer for microservices - delegates all infrastructure concerns to specialized connectors",
   "main": "src/index.js",
   "scripts": {
@@ -423,6 +423,14 @@ class ServiceWrapper {
 
     // Cleanup before restart
     await this._cleanupBeforeRestart();
+
+    // Startup failure alert (works without MQ/monitoring)
+    try {
+      await this._maybeSendStartupFailureAlert({ phase, phaseName, error, isTransient });
+    } catch (alertErr) {
+      // Alerts must never block restart; log and continue.
+      console.warn('[StartupAlerts] Failed to send startup failure alert:', alertErr.message);
+    }
 
     if (isTransient) {
       // Transient error → restart may help
@@ -431,11 +439,85 @@ class ServiceWrapper {
     } else {
       // Permanent error → restart will not help
       console.error(`[FÁZE ${phase}] Permanent error - fix required, no restart`);
-      // TODO: Send alert (email, Slack, etc.)
       process.exit(1);
     }
   }
 
+  /**
+   * Send startup-failure alert via SMTP (independent of MQ).
+   * Uses service-common sendMonitoringFailFallbackEmail (SMTP config via INFRA_REPORT_* env).
+   * Throttled by a persistent state file (survives container restarts).
+   *
+   * @private
+   */
+  async _maybeSendStartupFailureAlert({ phase, phaseName, error, isTransient }) {
+    const cfg = this.config.wrapper?.startupAlerts;
+    if (!cfg || cfg.enabled !== true) {
+      return;
+    }
+
+    const cooldownMs = cfg.cooldownMs;
+    const stateFile = cfg.stateFile;
+    if (typeof cooldownMs !== 'number' || Number.isNaN(cooldownMs) || cooldownMs <= 0) {
+      throw new Error(`[StartupAlerts] Invalid configuration - wrapper.startupAlerts.cooldownMs must be a positive number, got: ${cooldownMs}`);
+    }
+    if (typeof stateFile !== 'string' || stateFile.trim() === '') {
+      throw new Error(`[StartupAlerts] Invalid configuration - wrapper.startupAlerts.stateFile must be a non-empty string, got: ${stateFile}`);
+    }
+
+    const fs = require('fs');
+    const path = require('path');
+    const serviceName = this.config.service?.name || 'unnamed-service';
+    const now = Date.now();
+
+    let state = { lastAlertAt: 0, failureCount: 0, lastFailureAt: 0 };
+    try {
+      if (fs.existsSync(stateFile)) {
+        const raw = fs.readFileSync(stateFile, 'utf8');
+        state = { ...state, ...(JSON.parse(raw) || {}) };
+      } else {
+        // Ensure parent dir exists (best-effort)
+        const dir = path.dirname(stateFile);
+        try { fs.mkdirSync(dir, { recursive: true }); } catch (_) { /* ignore */ }
+      }
+    } catch (e) {
+      // If state cannot be read, continue with defaults (alerting must not crash init)
+    }
+
+    state.failureCount = (state.failureCount || 0) + 1;
+    state.lastFailureAt = now;
+
+    const shouldSend = !state.lastAlertAt || (now - state.lastAlertAt) >= cooldownMs;
+    if (!shouldSend) {
+      // Persist updated counters anyway
+      try { fs.writeFileSync(stateFile, JSON.stringify(state), 'utf8'); } catch (_) { /* ignore */ }
+      return;
+    }
+
+    const { sendMonitoringFailFallbackEmail } = require('@onlineapps/service-common');
+    const transientLabel = isTransient ? 'TRANSIENT' : 'PERMANENT';
+    const subject = `[StartupFailure] ${serviceName} phase ${phase} (${transientLabel})`;
+    const text = [
+      `Service: ${serviceName}`,
+      `Phase: ${phase} (${phaseName})`,
+      `Type: ${transientLabel}`,
+      `Failure count (container): ${state.failureCount}`,
+      `Timestamp: ${new Date(now).toISOString()}`,
+      `Error: ${error?.message || 'unknown error'}`
+    ].join('\n');
+    const html = `<pre>${text}</pre>`;
+
+    const sent = await sendMonitoringFailFallbackEmail(subject, text, html);
+    state.lastAlertAt = now;
+    try { fs.writeFileSync(stateFile, JSON.stringify(state), 'utf8'); } catch (_) { /* ignore */ }
+
+    if (sent) {
+      this.logger?.info('[StartupAlerts] ✓ Startup failure alert sent', { service: serviceName, phase, transient: isTransient });
+    } else {
+      this.logger?.warn('[StartupAlerts] Startup failure alert not sent (SMTP config missing or send failed)', { service: serviceName, phase });
+    }
+  }
+
   async initialize() {
     if (this.isInitialized) {
       // Logger might not be initialized yet, use console as fallback
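
For reference, the throttle state that _maybeSendStartupFailureAlert persists at wrapper.startupAlerts.stateFile is a small JSON document containing exactly the three fields used above; an illustrative snapshot (values invented for the example) would be:

    {
      "lastAlertAt": 1733137200000,
      "failureCount": 3,
      "lastFailureAt": 1733137260000
    }

With cooldownMs set to 600000, a new e-mail is sent only once Date.now() - lastAlertAt reaches ten minutes; in between, failure counters are still updated and written back, so crash loops keep being counted while alerts are suppressed.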
@@ -935,143 +1017,34 @@ class ServiceWrapper {
    * 1. Service has certificate (validated + registered) - checked in _startWorkflowListenersIfReady()
    * 2. Infrastructure is ready - checked in this method
    *
-   * Checks:
-   * 1. All infrastructure services are UP (via Redis key infrastructure:health:all)
-   * 2. Required infrastructure queues exist (workflow.init, registry.register, etc.)
+   * SEPARATION OF CONCERNS:
+   * - Business services MUST NOT check infrastructure queues directly.
+   * - Business services MUST trust Registry's infrastructure:health:all flag.
+   * - Infrastructure services are responsible for their own queue health.
+   * - If infrastructure is not ready within timeout, business service exits (Docker restarts it).
    *
-   * Uses exponential backoff retry mechanism if infrastructure not ready.
    * @private
    */
   async _verifyInfrastructureReady(serviceName) {
-    const maxRetries = this.config.wrapper?.infrastructureVerify?.maxRetries;
-    const baseDelay = this.config.wrapper?.infrastructureVerify?.baseDelayMs;
-    const maxDelay = this.config.wrapper?.infrastructureVerify?.maxDelayMs;
-    const queueCheckTimeoutMs = this.config.wrapper?.infrastructureVerify?.queueCheckTimeoutMs;
-
-    if (typeof maxRetries !== 'number' || Number.isNaN(maxRetries) || maxRetries <= 0) {
-      throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.maxRetries must be a positive number, got: ${maxRetries}`);
-    }
-    if (typeof baseDelay !== 'number' || Number.isNaN(baseDelay) || baseDelay <= 0) {
-      throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.baseDelayMs must be a positive number, got: ${baseDelay}`);
-    }
-    if (typeof maxDelay !== 'number' || Number.isNaN(maxDelay) || maxDelay <= 0) {
-      throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.maxDelayMs must be a positive number, got: ${maxDelay}`);
-    }
-    if (typeof queueCheckTimeoutMs !== 'number' || Number.isNaN(queueCheckTimeoutMs) || queueCheckTimeoutMs <= 0) {
-      throw new Error(`[InfrastructureVerify] Invalid configuration - wrapper.infrastructureVerify.queueCheckTimeoutMs must be a positive number, got: ${queueCheckTimeoutMs}`);
-    }
-
-    // Required infrastructure queues that must exist before business queues can be created
-    const requiredInfrastructureQueues = [
-      'workflow.init',     // Gateway responsibility (workflow entrypoint)
-      'workflow.control',  // Gateway responsibility (shared control-flow steps)
-      'registry.register', // Registry responsibility
-    ];
-
-    // Optional infrastructure queues (may not exist if services are not running)
-    const optionalInfrastructureQueues = [
-      'validation.requests', // Validator responsibility
-      'workflow.completed',  // Delivery Dispatcher responsibility
-      'workflow.failed'      // Delivery Dispatcher responsibility
-    ];
+    this.logger?.info('[InfrastructureReady] Checking infrastructure status via Registry (trusting infrastructure:health:all flag)...');
 
-    this.logger?.info('[InfrastructureVerify] Starting infrastructure readiness verification...', {
-      maxRetries,
-      baseDelay,
-      requiredQueues: requiredInfrastructureQueues
-    });
-
-    for (let attempt = 1; attempt <= maxRetries; attempt++) {
-      try {
-        await this._waitForInfrastructureGate('business queue setup');
-
-        // Step 2: Verify required infrastructure queues exist
-        this.logger?.info('[InfrastructureVerify] Verifying required infrastructure queues exist...');
-
-        const transport = this.mqClient._transport;
-        if (!transport || !transport._queueChannel) {
-          throw new Error('MQ transport not initialized - cannot verify infrastructure queues');
-        }
-
-        const channel = transport._queueChannel;
-        if (channel.closed) {
-          throw new Error('MQ channel is closed - cannot verify infrastructure queues');
-        }
-
-        const missingQueues = [];
-        for (const queueName of requiredInfrastructureQueues) {
-          try {
-            // Add timeout to prevent hanging on checkQueue
-            const checkPromise = channel.checkQueue(queueName);
-            const checkTimeoutPromise = new Promise((_, reject) => {
-              setTimeout(() => {
-                reject(new Error(`checkQueue timeout after ${queueCheckTimeoutMs}ms`));
-              }, queueCheckTimeoutMs);
-            });
-
-            await Promise.race([checkPromise, checkTimeoutPromise]);
-            this.logger?.info(`[InfrastructureVerify] ✓ Queue exists: ${queueName}`);
-          } catch (checkErr) {
-            if (checkErr.code === 404 || checkErr.message.includes('timeout')) {
-              missingQueues.push(queueName);
-              this.logger?.warn(`[InfrastructureVerify] ✗ Queue missing or timeout: ${queueName} (${checkErr.message})`);
-            } else {
-              // Other error (e.g., channel closed) - treat as missing
-              missingQueues.push(queueName);
-              this.logger?.warn(`[InfrastructureVerify] ✗ Cannot check queue ${queueName}: ${checkErr.message}`);
-            }
-          }
-        }
-
-        if (missingQueues.length > 0) {
-          const queueDetails = missingQueues.map((queueName) => {
-            const owner = INFRA_QUEUE_OWNERS[queueName] || 'responsible infrastructure service';
-            return `${queueName} (owner: ${owner})`;
-          });
-          throw new Error(
-            `[InfrastructureVerify] Required RabbitMQ queue(s) missing: ${queueDetails.join(', ')}. ` +
-            'Infrastructure service(s) responsible for these queues are not ready yet. ' +
-            'Resolve the infrastructure issue and restart the business service.'
-          );
-        }
-
-        // Step 3: Log optional queues status (for debugging)
-        for (const queueName of optionalInfrastructureQueues) {
-          try {
-            const checkPromise = channel.checkQueue(queueName);
-            const checkTimeoutPromise = new Promise((_, reject) => {
-              setTimeout(() => reject(new Error('timeout')), queueCheckTimeoutMs);
-            });
-            await Promise.race([checkPromise, checkTimeoutPromise]);
-            this.logger?.info(`[InfrastructureVerify] ✓ Optional queue exists: ${queueName}`);
-          } catch (checkErr) {
-            if (checkErr.code === 404 || checkErr.message.includes('timeout')) {
-              this.logger?.debug(`[InfrastructureVerify] Optional queue missing or timeout (not critical): ${queueName}`);
-            }
-          }
-        }
-
-        // All checks passed!
-        this.logger?.info('[InfrastructureVerify] ✓ Infrastructure verification complete - all required queues exist');
-        return;
-
-      } catch (error) {
-        if (attempt === maxRetries) {
-          // Final attempt failed - throw error
-          this.logger?.error('[InfrastructureVerify] ✗ Infrastructure verification failed after all retries', {
-            error: error.message,
-            stack: error.stack
-          });
-          throw new Error(`Infrastructure verification failed for ${serviceName}: ${error.message}`);
-        }
-
-        // Not final attempt - continue retry loop
-        const delay = Math.min(baseDelay * Math.pow(2, attempt - 1), maxDelay);
-        this.logger?.warn(`[InfrastructureVerify] Verification failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms...`, {
-          error: error.message
-        });
-        await new Promise(resolve => setTimeout(resolve, delay));
-      }
+    try {
+      // Single check - if infra not ready, fail fast and let Docker restart
+      await this._waitForInfrastructureGate('business queue setup');
+      this.logger?.info('[InfrastructureReady] ✓ Infrastructure is ready (infrastructure:health:all = true)');
+    } catch (error) {
+      // Infrastructure not ready - fail fast with clear message
+      this.logger?.error('[InfrastructureReady] Infrastructure not ready - service will exit', {
+        error: error.message,
+        serviceName,
+        action: 'Docker will restart the service. Check infrastructure services (Gateway, Registry, Validator, Delivery, Monitoring).'
+      });
+      throw new Error(
+        `[InfrastructureReady] Infrastructure not ready for ${serviceName}. ` +
+        'Registry reports infrastructure:health:all != true. ' +
+        'Check infrastructure services and their logs. ' +
+        'This service will exit and Docker will restart it.'
+      );
     }
   }
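
The simplified implementation leans entirely on _waitForInfrastructureGate, whose body is not part of this diff. Judging from the infrastructureGate settings (maxWaitMs, checkIntervalMs) and the Registry flag referenced in the comment above, such a gate amounts to a bounded poll of the infrastructure:health:all key. A minimal illustrative sketch, assuming a Redis-like client that exposes get(), and not the package's actual code:

    // Illustrative sketch only - the real _waitForInfrastructureGate is not shown in this diff.
    // Assumes redisClient.get(key) resolves to the stored string ('true' when Registry marks all infra healthy).
    async function waitForInfrastructureGate(redisClient, { maxWaitMs, checkIntervalMs }, context) {
      const deadline = Date.now() + maxWaitMs;
      while (Date.now() < deadline) {
        const healthy = await redisClient.get('infrastructure:health:all');
        if (healthy === 'true') {
          return; // Registry reports all infrastructure services as ready
        }
        // Not ready yet - wait one interval and re-check
        await new Promise((resolve) => setTimeout(resolve, checkIntervalMs));
      }
      throw new Error(`Infrastructure gate timed out after ${maxWaitMs}ms (${context})`);
    }

With the new defaults (maxWaitMs: 30000, checkIntervalMs: 2000) a business service gives up after roughly 15 checks and exits, relying on Docker's restart policy instead of the removed in-process exponential backoff.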