@checkstack/healthcheck-backend 0.3.5 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/package.json +2 -1
- package/src/index.ts +11 -6
- package/src/queue-executor.test.ts +26 -5
- package/src/queue-executor.ts +71 -34
package/CHANGELOG.md
CHANGED

@@ -1,5 +1,36 @@
 # @checkstack/healthcheck-backend
 
+## 0.4.0
+
+### Minor Changes
+
+- 18fa8e3: Add notification suppression toggle for maintenance windows
+
+  **New Feature:** When creating or editing a maintenance window, you can now enable "Suppress health notifications" to prevent health status change notifications from being sent for affected systems while the maintenance is active (in_progress status). This is useful for planned downtime where health alerts are expected and would otherwise create noise.
+
+  **Changes:**
+
+  - Added `suppressNotifications` field to maintenance schema
+  - Added new service-to-service API `hasActiveMaintenanceWithSuppression`
+  - Healthcheck queue executor now checks for suppression before sending notifications
+  - MaintenanceEditor UI includes new toggle checkbox
+
+  **Bug Fix:** Fixed migration system to correctly set PostgreSQL search_path when running plugin migrations. Previously, migrations could fail with "relation does not exist" errors because the schema context wasn't properly set.
+
+### Patch Changes
+
+- db9b37c: Fixed 500 errors on healthcheck `getHistory` and `getDetailedHistory` endpoints caused by the scoped database proxy not handling Drizzle's `$count()` utility method.
+
+  **Root Cause:** The `$count()` method returns a Promise directly (not a query builder), bypassing the chain-replay mechanism used for schema isolation. This caused queries to run without the proper `search_path`, resulting in database errors.
+
+  **Changes:**
+
+  - Added explicit `$count` method handling in `scoped-db.ts` to wrap count operations in transactions with proper schema isolation
+  - Wrapped `$count` return values with `Number()` in healthcheck service to handle BigInt serialization
+
+- Updated dependencies [18fa8e3]
+  - @checkstack/maintenance-common@0.4.0
+
 ## 0.3.5
 
 ### Patch Changes
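Note on the `$count` patch above: `scoped-db.ts` is not part of this package, so the fix itself does not appear in this diff. As a rough, hypothetical sketch of the approach the changelog describes (names and structure are assumptions, not the actual implementation), the scoped proxy would route `$count` through a transaction that pins `search_path` and coerce the result to a plain number:

```ts
import { sql } from "drizzle-orm";
import type { NodePgDatabase } from "drizzle-orm/node-postgres";

// Hypothetical sketch, not the actual scoped-db.ts code: run Drizzle's $count
// inside a transaction with the plugin schema pinned on search_path, then
// coerce the result with Number() so BigInt values serialize cleanly.
async function scopedCount(
  db: NodePgDatabase,
  schemaName: string,
  source: Parameters<NodePgDatabase["$count"]>[0],
): Promise<number> {
  return db.transaction(async (tx) => {
    // SET LOCAL scopes the search_path change to this transaction only.
    await tx.execute(sql.raw(`SET LOCAL search_path TO "${schemaName}"`));
    const count = await tx.$count(source);
    return Number(count);
  });
}
```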
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/healthcheck-backend",
-  "version": "0.3.5",
+  "version": "0.4.0",
   "type": "module",
   "main": "src/index.ts",
   "scripts": {
@@ -15,6 +15,7 @@
     "@checkstack/catalog-common": "workspace:*",
     "@checkstack/healthcheck-common": "workspace:*",
     "@checkstack/integration-backend": "workspace:*",
+    "@checkstack/maintenance-common": "workspace:*",
     "@checkstack/queue-api": "workspace:*",
     "@checkstack/signal-common": "workspace:*",
     "@checkstack/command-backend": "workspace:*",
package/src/index.ts
CHANGED

@@ -22,6 +22,7 @@ import { createHealthCheckRouter } from "./router";
 import { HealthCheckService } from "./service";
 import { catalogHooks } from "@checkstack/catalog-backend";
 import { CatalogApi } from "@checkstack/catalog-common";
+import { MaintenanceApi } from "@checkstack/maintenance-common";
 import { healthCheckHooks } from "./hooks";
 import { registerSearchProvider } from "@checkstack/command-backend";
 import { resolveRoute } from "@checkstack/common";
@@ -59,7 +60,7 @@ export default createBackendPlugin({
 
     // Register hooks as integration events
     const integrationEvents = env.getExtensionPoint(
-      integrationEventExtensionPoint
+      integrationEventExtensionPoint,
     );
 
     integrationEvents.registerEvent(
@@ -71,7 +72,7 @@ export default createBackendPlugin({
         category: "Health",
         payloadSchema: systemDegradedPayloadSchema,
       },
-      pluginMetadata
+      pluginMetadata,
     );
 
     integrationEvents.registerEvent(
@@ -82,7 +83,7 @@ export default createBackendPlugin({
         category: "Health",
         payloadSchema: systemHealthyPayloadSchema,
       },
-      pluginMetadata
+      pluginMetadata,
     );
 
     env.registerInit({
@@ -112,6 +113,9 @@ export default createBackendPlugin({
       // Create catalog client for notification delegation
      const catalogClient = rpcClient.forPlugin(CatalogApi);
 
+      // Create maintenance client for notification suppression checks
+      const maintenanceClient = rpcClient.forPlugin(MaintenanceApi);
+
       // Setup queue-based health check worker
       await setupHealthCheckWorker({
         db: database,
@@ -121,12 +125,13 @@ export default createBackendPlugin({
         queueManager,
         signalService,
         catalogClient,
+        maintenanceClient,
         getEmitHook: () => storedEmitHook,
       });
 
       const healthCheckRouter = createHealthCheckRouter(
         database as NodePgDatabase<typeof schema>,
-        healthCheckRegistry
+        healthCheckRegistry,
       );
       rpc.registerRouter(healthCheckRouter, healthCheckContract);
 
@@ -181,11 +186,11 @@ export default createBackendPlugin({
       catalogHooks.systemDeleted,
       async (payload) => {
        logger.debug(
-          `Cleaning up health check associations for deleted system: ${payload.systemId}`
+          `Cleaning up health check associations for deleted system: ${payload.systemId}`,
        );
        await service.removeAllSystemAssociations(payload.systemId);
       },
-      { mode: "work-queue", workerGroup: "system-cleanup" }
+      { mode: "work-queue", workerGroup: "system-cleanup" },
     );
 
     logger.debug("✅ Health Check Backend afterPluginsReady complete.");
package/src/queue-executor.test.ts
CHANGED

@@ -75,6 +75,23 @@ const createMockCatalogClient = () => ({
   createView: mock(async () => ({})),
 });
 
+// Helper to create mock maintenance client for notification suppression checks
+const createMockMaintenanceClient = () => ({
+  hasActiveMaintenanceWithSuppression: mock(async () => ({
+    suppressed: false,
+  })),
+  // Other methods not used in queue-executor
+  listMaintenances: mock(async () => ({ maintenances: [] })),
+  getMaintenance: mock(async () => null),
+  getMaintenancesForSystem: mock(async () => []),
+  getBulkMaintenancesForSystems: mock(async () => ({ maintenances: {} })),
+  createMaintenance: mock(async () => ({})),
+  updateMaintenance: mock(async () => ({})),
+  addUpdate: mock(async () => ({})),
+  closeMaintenance: mock(async () => ({})),
+  deleteMaintenance: mock(async () => ({ success: true })),
+});
+
 describe("Queue-Based Health Check Executor", () => {
   describe("scheduleHealthCheck", () => {
     it("should enqueue a health check with delay and deterministic jobId", async () => {
@@ -127,6 +144,7 @@ describe("Queue-Based Health Check Executor", () => {
       const mockLogger = createMockLogger();
       const mockQueueManager = createMockQueueManager();
       const mockCatalogClient = createMockCatalogClient();
+      const mockMaintenanceClient = createMockMaintenanceClient();
 
       await setupHealthCheckWorker({
         db: mockDb as unknown as Parameters<
@@ -143,11 +161,14 @@ describe("Queue-Based Health Check Executor", () => {
         catalogClient: mockCatalogClient as unknown as Parameters<
           typeof setupHealthCheckWorker
         >[0]["catalogClient"],
+        maintenanceClient: mockMaintenanceClient as unknown as Parameters<
+          typeof setupHealthCheckWorker
+        >[0]["maintenanceClient"],
         getEmitHook: () => undefined,
       });
 
       expect(mockLogger.debug).toHaveBeenCalledWith(
-        expect.stringContaining("Health Check Worker subscribed")
+        expect.stringContaining("Health Check Worker subscribed"),
       );
     });
   });
@@ -210,10 +231,10 @@ describe("Queue-Based Health Check Executor", () => {
       });
 
       expect(mockLogger.debug).toHaveBeenCalledWith(
-        "Bootstrapping 2 health checks"
+        "Bootstrapping 2 health checks",
       );
       expect(mockLogger.debug).toHaveBeenCalledWith(
-        "✅ Bootstrapped 2 health checks"
+        "✅ Bootstrapped 2 health checks",
       );
     });
 
@@ -236,10 +257,10 @@ describe("Queue-Based Health Check Executor", () => {
       });
 
       expect(mockLogger.debug).toHaveBeenCalledWith(
-        "Bootstrapping 0 health checks"
+        "Bootstrapping 0 health checks",
      );
       expect(mockLogger.debug).toHaveBeenCalledWith(
-        "✅ Bootstrapped 0 health checks"
+        "✅ Bootstrapped 0 health checks",
       );
     });
   });
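For readers following the suppression feature: the mock above and the call in `notifyStateChange` (next file) imply a client surface roughly like the following. This is inferred from the diff only; the authoritative contract is `MaintenanceApi` in `@checkstack/maintenance-common`, which is not included here.

```ts
// Inferred from the mock and call site in this diff; not the real contract.
type InferredMaintenanceClient = {
  hasActiveMaintenanceWithSuppression(input: {
    systemId: string;
  }): Promise<{ suppressed: boolean }>;
  // ...plus the maintenance CRUD methods mocked above, unused by the executor
};
```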
package/src/queue-executor.ts
CHANGED

@@ -20,12 +20,14 @@ import {
   type HealthCheckStatus,
 } from "@checkstack/healthcheck-common";
 import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
+import { MaintenanceApi } from "@checkstack/maintenance-common";
 import { resolveRoute, type InferClient } from "@checkstack/common";
 import { HealthCheckService } from "./service";
 import { healthCheckHooks } from "./hooks";
 
 type Db = NodePgDatabase<typeof schema>;
 type CatalogClient = InferClient<typeof CatalogApi>;
+type MaintenanceClient = InferClient<typeof MaintenanceApi>;
 
 /**
  * Payload for health check queue jobs
@@ -74,7 +76,7 @@ export async function scheduleHealthCheck(props: {
   const jobId = `healthcheck:${payload.configId}:${payload.systemId}`;
 
   logger?.debug(
-    `Scheduling recurring health check ${jobId} with interval ${intervalSeconds}s, startDelay ${startDelay}s`
+    `Scheduling recurring health check ${jobId} with interval ${intervalSeconds}s, startDelay ${startDelay}s`,
   );
 
   return queue.scheduleRecurring(payload, {
@@ -87,21 +89,48 @@
 
 /**
  * Notify system subscribers about a health state change.
+ * Skips notification if the system has active maintenance with suppression enabled.
  */
 async function notifyStateChange(props: {
   systemId: string;
   previousStatus: HealthCheckStatus;
   newStatus: HealthCheckStatus;
   catalogClient: CatalogClient;
+  maintenanceClient: MaintenanceClient;
   logger: Logger;
 }): Promise<void> {
-  const { systemId, previousStatus, newStatus, catalogClient, logger } = props;
+  const {
+    systemId,
+    previousStatus,
+    newStatus,
+    catalogClient,
+    maintenanceClient,
+    logger,
+  } = props;
 
   // Only notify on actual state changes
   if (newStatus === previousStatus) {
     return;
   }
 
+  // Check if notifications should be suppressed due to active maintenance
+  try {
+    const { suppressed } =
+      await maintenanceClient.hasActiveMaintenanceWithSuppression({ systemId });
+    if (suppressed) {
+      logger.debug(
+        `Skipping notification for ${systemId}: active maintenance with suppression enabled`,
+      );
+      return;
+    }
+  } catch (error) {
+    // Log but continue with notification - suppression check failure shouldn't block notifications
+    logger.warn(
+      `Failed to check maintenance suppression for ${systemId}, proceeding with notification:`,
+      error,
+    );
+  }
+
   const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
   const isDegraded = newStatus === "degraded";
   const isUnhealthy = newStatus === "unhealthy";
@@ -143,13 +172,13 @@ async function notifyStateChange(props: {
       includeGroupSubscribers: true,
     });
     logger.debug(
-      `Notified subscribers: ${previousStatus} → ${newStatus} for system ${systemId}`
+      `Notified subscribers: ${previousStatus} → ${newStatus} for system ${systemId}`,
    );
   } catch (error) {
     // Log but don't fail the operation - notifications are best-effort
     logger.warn(
       `Failed to notify subscribers for health state change on system ${systemId}:`,
-      error
+      error,
     );
   }
 }
@@ -165,6 +194,7 @@ async function executeHealthCheckJob(props: {
   logger: Logger;
   signalService: SignalService;
   catalogClient: CatalogClient;
+  maintenanceClient: MaintenanceClient;
   getEmitHook: () => EmitHookFn | undefined;
 }): Promise<void> {
   const {
@@ -175,6 +205,7 @@ async function executeHealthCheckJob(props: {
     logger,
     signalService,
     catalogClient,
+    maintenanceClient,
     getEmitHook,
   } = props;
   const { configId, systemId } = payload;
@@ -201,20 +232,20 @@ async function executeHealthCheckJob(props: {
     .from(systemHealthChecks)
     .innerJoin(
       healthCheckConfigurations,
-      eq(systemHealthChecks.configurationId, healthCheckConfigurations.id)
+      eq(systemHealthChecks.configurationId, healthCheckConfigurations.id),
     )
     .where(
       and(
         eq(systemHealthChecks.systemId, systemId),
         eq(systemHealthChecks.configurationId, configId),
-        eq(systemHealthChecks.enabled, true)
-      )
+        eq(systemHealthChecks.enabled, true),
+      ),
     );
 
   // If configuration not found or disabled, exit without rescheduling
   if (!configRow) {
     logger.debug(
-      `Health check ${configId} for system ${systemId} not found or disabled, not rescheduling`
+      `Health check ${configId} for system ${systemId} not found or disabled, not rescheduling`,
     );
     return;
   }
@@ -234,7 +265,7 @@ async function executeHealthCheckJob(props: {
   const strategy = registry.getStrategy(configRow.strategyId);
   if (!strategy) {
     logger.warn(
-      `Strategy ${configRow.strategyId} not found for config ${configId}`
+      `Strategy ${configRow.strategyId} not found for config ${configId}`,
     );
     return;
   }
@@ -244,7 +275,7 @@ async function executeHealthCheckJob(props: {
   let connectedClient;
   try {
     connectedClient = await strategy.createClient(
-      configRow.config as Record<string, unknown>
+      configRow.config as Record<string, unknown>,
     );
   } catch (error) {
     // Connection failed
@@ -268,7 +299,7 @@ async function executeHealthCheckJob(props: {
     });
 
     logger.debug(
-      `Health check ${configId} for system ${systemId} failed: ${errorMessage}`
+      `Health check ${configId} for system ${systemId} failed: ${errorMessage}`,
     );
 
     // Broadcast failure signal
@@ -289,6 +320,7 @@ async function executeHealthCheckJob(props: {
       previousStatus,
       newStatus: newState.status,
       catalogClient,
+      maintenanceClient,
      logger,
     });
   }
@@ -307,11 +339,11 @@ async function executeHealthCheckJob(props: {
   try {
     for (const collectorEntry of collectors) {
       const registered = collectorRegistry.getCollector(
-        collectorEntry.collectorId
+        collectorEntry.collectorId,
       );
       if (!registered) {
         logger.warn(
-          `Collector ${collectorEntry.collectorId} not found, skipping`
+          `Collector ${collectorEntry.collectorId} not found, skipping`,
         );
         continue;
       }
@@ -342,7 +374,7 @@ async function executeHealthCheckJob(props: {
       const assertions = collectorEntry.assertions;
       const failedAssertion = evaluateAssertions(
         assertions,
-        collectorResult.result as Record<string, unknown>
+        collectorResult.result as Record<string, unknown>,
       );
       if (failedAssertion) {
         hasCollectorError = true;
@@ -351,7 +383,7 @@ async function executeHealthCheckJob(props: {
         } ${failedAssertion.value ?? ""}`;
         errorMessage = `Assertion failed: ${assertionFailed}`;
         logger.debug(
-          `Collector ${storageKey} assertion failed: ${errorMessage}`
+          `Collector ${storageKey} assertion failed: ${errorMessage}`,
         );
       }
     }
@@ -409,7 +441,7 @@ async function executeHealthCheckJob(props: {
   });
 
   logger.debug(
-    `Ran health check ${configId} for system ${systemId}: ${result.status}`
+    `Ran health check ${configId} for system ${systemId}: ${result.status}`,
   );
 
   // Broadcast enriched signal for realtime frontend updates (e.g., terminal feed)
@@ -430,6 +462,7 @@ async function executeHealthCheckJob(props: {
     previousStatus,
     newStatus: newState.status,
     catalogClient,
+    maintenanceClient,
     logger,
   });
 
@@ -442,13 +475,13 @@ async function executeHealthCheckJob(props: {
         systemId,
         previousStatus,
         healthyChecks: newState.checkStatuses.filter(
-          (c) => c.status === "healthy"
+          (c) => c.status === "healthy",
         ).length,
         totalChecks: newState.checkStatuses.length,
         timestamp: new Date().toISOString(),
       });
       logger.debug(
-        `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`
+        `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
       );
     } else if (
       previousStatus === "healthy" &&
@@ -460,13 +493,13 @@ async function executeHealthCheckJob(props: {
         previousStatus,
         newStatus: newState.status,
         healthyChecks: newState.checkStatuses.filter(
-          (c) => c.status === "healthy"
+          (c) => c.status === "healthy",
         ).length,
         totalChecks: newState.checkStatuses.length,
         timestamp: new Date().toISOString(),
       });
       logger.debug(
-        `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`
+        `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
       );
     }
   }
@@ -476,7 +509,7 @@ async function executeHealthCheckJob(props: {
   } catch (error) {
     logger.error(
       `Failed to execute health check ${configId} for system ${systemId}`,
-      error
+      error,
     );
 
     // Store failure (no latencyMs for failures)
@@ -523,6 +556,7 @@ async function executeHealthCheckJob(props: {
       previousStatus,
       newStatus: newState.status,
       catalogClient,
+      maintenanceClient,
      logger,
     });
 
@@ -535,13 +569,13 @@ async function executeHealthCheckJob(props: {
         systemId,
         previousStatus,
         healthyChecks: newState.checkStatuses.filter(
-          (c) => c.status === "healthy"
+          (c) => c.status === "healthy",
         ).length,
         totalChecks: newState.checkStatuses.length,
         timestamp: new Date().toISOString(),
       });
       logger.debug(
-        `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`
+        `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
       );
     } else if (
       previousStatus === "healthy" &&
@@ -553,13 +587,13 @@ async function executeHealthCheckJob(props: {
         previousStatus,
         newStatus: newState.status,
         healthyChecks: newState.checkStatuses.filter(
-          (c) => c.status === "healthy"
+          (c) => c.status === "healthy",
         ).length,
         totalChecks: newState.checkStatuses.length,
         timestamp: new Date().toISOString(),
       });
       logger.debug(
-        `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`
+        `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
      );
     }
   }
@@ -577,6 +611,7 @@ export async function setupHealthCheckWorker(props: {
   queueManager: QueueManager;
   signalService: SignalService;
   catalogClient: CatalogClient;
+  maintenanceClient: MaintenanceClient;
   getEmitHook: () => EmitHookFn | undefined;
 }): Promise<void> {
   const {
@@ -587,6 +622,7 @@ export async function setupHealthCheckWorker(props: {
     queueManager,
     signalService,
     catalogClient,
+    maintenanceClient,
     getEmitHook,
   } = props;
 
@@ -604,13 +640,14 @@ export async function setupHealthCheckWorker(props: {
         logger,
         signalService,
         catalogClient,
+        maintenanceClient,
         getEmitHook,
       });
     },
     {
       consumerGroup: WORKER_GROUP,
       maxRetries: 0, // Health checks should not retry on failure
-    }
+    },
   );
 
   logger.debug("🎯 Health Check Worker subscribed to queue");
@@ -636,7 +673,7 @@ export async function bootstrapHealthChecks(props: {
     .from(systemHealthChecks)
     .innerJoin(
       healthCheckConfigurations,
-      eq(systemHealthChecks.configurationId, healthCheckConfigurations.id)
+      eq(systemHealthChecks.configurationId, healthCheckConfigurations.id),
     )
     .where(eq(systemHealthChecks.enabled, true));
 
@@ -671,7 +708,7 @@ export async function bootstrapHealthChecks(props: {
     let startDelay = 0;
     if (lastRun) {
       const elapsedSeconds = Math.floor(
-        (Date.now() - lastRun.getTime()) / 1000
+        (Date.now() - lastRun.getTime()) / 1000,
       );
       if (elapsedSeconds < check.interval) {
         // Not overdue yet - schedule with remaining time
@@ -683,11 +720,11 @@ export async function bootstrapHealthChecks(props: {
           check.systemId
         } - lastRun: ${lastRun.toISOString()}, elapsed: ${elapsedSeconds}s, interval: ${
          check.interval
-        }s, startDelay: ${startDelay}s`
+        }s, startDelay: ${startDelay}s`,
       );
     } else {
       logger.debug(
-        `Health check ${check.configId}:${check.systemId} - no lastRun found, running immediately`
+        `Health check ${check.configId}:${check.systemId} - no lastRun found, running immediately`,
       );
     }
 
@@ -711,12 +748,12 @@ export async function bootstrapHealthChecks(props: {
   const allRecurringJobs = await queue.listRecurringJobs();
   const expectedJobIds = new Set(
     enabledChecks.map(
-      (check) => `healthcheck:${check.configId}:${check.systemId}`
-    )
+      (check) => `healthcheck:${check.configId}:${check.systemId}`,
+    ),
   );
 
   const orphanedJobs = allRecurringJobs.filter(
-    (jobId) => jobId.startsWith("healthcheck:") && !expectedJobIds.has(jobId)
+    (jobId) => jobId.startsWith("healthcheck:") && !expectedJobIds.has(jobId),
   );
 
   for (const jobId of orphanedJobs) {
@@ -726,7 +763,7 @@ export async function bootstrapHealthChecks(props: {
 
   if (orphanedJobs.length > 0) {
     logger.info(
-      `🧹 Cleaned up ${orphanedJobs.length} orphaned health check jobs`
+      `🧹 Cleaned up ${orphanedJobs.length} orphaned health check jobs`,
     );
   }
 }
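Taken together, the wiring added in this release amounts to roughly the following, abbreviated from the index.ts and queue-executor.ts hunks above; props that this diff does not show are elided and marked as such.

```ts
// Abbreviated sketch of the 0.4.0 wiring shown in the hunks above.
const maintenanceClient = rpcClient.forPlugin(MaintenanceApi);

await setupHealthCheckWorker({
  db: database,
  // ...other props not visible in this diff (registry, logger, etc.)
  queueManager,
  signalService,
  catalogClient,
  maintenanceClient, // new: lets notifyStateChange skip suppressed systems
  getEmitHook: () => storedEmitHook,
});
```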