@checkstack/healthcheck-backend 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/package.json +13 -13
- package/src/queue-executor.test.ts +1 -1
- package/src/queue-executor.ts +169 -102
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,35 @@
|
|
|
1
1
|
# @checkstack/healthcheck-backend
|
|
2
2
|
|
|
3
|
+
## 0.10.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- 869b4ab: ## Health Check Execution Improvements
|
|
8
|
+
|
|
9
|
+
### Breaking Changes (backend-api)
|
|
10
|
+
|
|
11
|
+
- `HealthCheckStrategy.createClient()` now accepts `unknown` instead of `TConfig` due to TypeScript contravariance constraints. Implementations should use `this.config.validate(config)` to narrow the type.
|
|
12
|
+
|
|
13
|
+
### Features
|
|
14
|
+
|
|
15
|
+
- **Platform-level hard timeout**: The executor now wraps the entire health check execution (connection + all collectors) in a single timeout, ensuring checks never hang indefinitely.
|
|
16
|
+
- **Parallel collector execution**: Collectors now run in parallel using `Promise.allSettled()`, improving performance while ensuring all collectors complete regardless of individual failures.
|
|
17
|
+
- **Base strategy config schema**: All strategy configs now extend `baseStrategyConfigSchema`, which provides a standardized `timeout` field (default 30s, minimum 100ms).
|
|
18
|
+
|
|
19
|
+
### Fixes
|
|
20
|
+
|
|
21
|
+
- Fixed a bug where the HTTP and Jenkins strategies cleared their timeouts before the full response body had been read.
|
|
22
|
+
- Simplified registry type signatures by using default type parameters.
|
|
23
|
+
|
|
24
|
+
### Patch Changes
|
|
25
|
+
|
|
26
|
+
- Updated dependencies [869b4ab]
|
|
27
|
+
- @checkstack/backend-api@0.8.0
|
|
28
|
+
- @checkstack/catalog-backend@0.2.13
|
|
29
|
+
- @checkstack/command-backend@0.1.11
|
|
30
|
+
- @checkstack/integration-backend@0.1.11
|
|
31
|
+
- @checkstack/queue-api@0.2.5
|
|
32
|
+
|
|
3
33
|
## 0.9.0
|
|
4
34
|
|
|
5
35
|
### Minor Changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@checkstack/healthcheck-backend",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.10.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "src/index.ts",
|
|
6
6
|
"scripts": {
|
|
@@ -10,17 +10,17 @@
|
|
|
10
10
|
"lint:code": "eslint . --max-warnings 0"
|
|
11
11
|
},
|
|
12
12
|
"dependencies": {
|
|
13
|
-
"@checkstack/backend-api": "0.
|
|
14
|
-
"@checkstack/catalog-backend": "0.2.
|
|
15
|
-
"@checkstack/catalog-common": "1.2.
|
|
16
|
-
"@checkstack/command-backend": "0.1.
|
|
17
|
-
"@checkstack/common": "0.6.
|
|
18
|
-
"@checkstack/healthcheck-common": "0.8.
|
|
19
|
-
"@checkstack/incident-common": "0.4.
|
|
20
|
-
"@checkstack/integration-backend": "0.1.
|
|
21
|
-
"@checkstack/maintenance-common": "0.4.
|
|
22
|
-
"@checkstack/queue-api": "0.2.
|
|
23
|
-
"@checkstack/signal-common": "0.1.
|
|
13
|
+
"@checkstack/backend-api": "0.7.0",
|
|
14
|
+
"@checkstack/catalog-backend": "0.2.12",
|
|
15
|
+
"@checkstack/catalog-common": "1.2.7",
|
|
16
|
+
"@checkstack/command-backend": "0.1.10",
|
|
17
|
+
"@checkstack/common": "0.6.2",
|
|
18
|
+
"@checkstack/healthcheck-common": "0.8.2",
|
|
19
|
+
"@checkstack/incident-common": "0.4.3",
|
|
20
|
+
"@checkstack/integration-backend": "0.1.10",
|
|
21
|
+
"@checkstack/maintenance-common": "0.4.5",
|
|
22
|
+
"@checkstack/queue-api": "0.2.4",
|
|
23
|
+
"@checkstack/signal-common": "0.1.6",
|
|
24
24
|
"@hono/zod-validator": "^0.7.6",
|
|
25
25
|
"drizzle-orm": "^0.45.1",
|
|
26
26
|
"hono": "^4.0.0",
|
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
"devDependencies": {
|
|
31
31
|
"@checkstack/drizzle-helper": "0.0.3",
|
|
32
32
|
"@checkstack/scripts": "0.1.1",
|
|
33
|
-
"@checkstack/test-utils-backend": "0.1.
|
|
33
|
+
"@checkstack/test-utils-backend": "0.1.10",
|
|
34
34
|
"@checkstack/tsconfig": "0.0.3",
|
|
35
35
|
"@orpc/server": "^1.13.2",
|
|
36
36
|
"@types/bun": "^1.0.0",
|
package/src/queue-executor.ts
CHANGED
|
@@ -5,6 +5,9 @@ import {
|
|
|
5
5
|
type CollectorRegistry,
|
|
6
6
|
evaluateAssertions,
|
|
7
7
|
type SafeDatabase,
|
|
8
|
+
type BaseStrategyConfig,
|
|
9
|
+
type ConnectedClient,
|
|
10
|
+
type TransportClient,
|
|
8
11
|
} from "@checkstack/backend-api";
|
|
9
12
|
import { QueueManager } from "@checkstack/queue-api";
|
|
10
13
|
import {
|
|
@@ -305,24 +308,175 @@ async function executeHealthCheckJob(props: {
|
|
|
305
308
|
return;
|
|
306
309
|
}
|
|
307
310
|
|
|
308
|
-
//
|
|
311
|
+
// Extract timeout from strategy config for platform-level enforcement
|
|
312
|
+
const strategyConfig = configRow.config as unknown as BaseStrategyConfig;
|
|
313
|
+
const executionTimeout = strategyConfig.timeout ?? 60_000;
|
|
314
|
+
|
|
315
|
+
// Execute health check using createClient pattern with unified hard timeout
|
|
309
316
|
const start = performance.now();
|
|
310
|
-
let
|
|
317
|
+
let connectionTimeMs: number | undefined;
|
|
318
|
+
let connectedClient:
|
|
319
|
+
| ConnectedClient<TransportClient<never, unknown>>
|
|
320
|
+
| undefined;
|
|
321
|
+
const collectors = configRow.collectors ?? [];
|
|
322
|
+
const collectorResults: Record<string, unknown> = {};
|
|
323
|
+
let hasCollectorError = false;
|
|
324
|
+
let errorMessage: string | undefined;
|
|
325
|
+
|
|
311
326
|
try {
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
327
|
+
// Platform-level hard timeout wrapping the entire execution sequence
|
|
328
|
+
await Promise.race([
|
|
329
|
+
(async () => {
|
|
330
|
+
// 1. Establish connection
|
|
331
|
+
connectedClient = await strategy.createClient(strategyConfig);
|
|
332
|
+
connectionTimeMs = Math.round(performance.now() - start);
|
|
333
|
+
|
|
334
|
+
// 2. Execute collectors in parallel
|
|
335
|
+
const collectorPromises = collectors.map(async (collectorEntry) => {
|
|
336
|
+
const registered = collectorRegistry.getCollector(
|
|
337
|
+
collectorEntry.collectorId,
|
|
338
|
+
);
|
|
339
|
+
if (!registered) {
|
|
340
|
+
logger.warn(
|
|
341
|
+
`Collector ${collectorEntry.collectorId} not found, skipping`,
|
|
342
|
+
);
|
|
343
|
+
return { storageKey: collectorEntry.id, skipped: true };
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
const storageKey = collectorEntry.id;
|
|
347
|
+
|
|
348
|
+
try {
|
|
349
|
+
const collectorResult = await registered.collector.execute({
|
|
350
|
+
config: collectorEntry.config,
|
|
351
|
+
client: connectedClient!.client,
|
|
352
|
+
pluginId: configRow.strategyId,
|
|
353
|
+
});
|
|
354
|
+
|
|
355
|
+
// Check for collector-level error
|
|
356
|
+
let collectorError: string | undefined;
|
|
357
|
+
if (collectorResult.error) {
|
|
358
|
+
collectorError = collectorResult.error;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Evaluate per-collector assertions
|
|
362
|
+
let assertionFailed: string | undefined;
|
|
363
|
+
if (
|
|
364
|
+
collectorEntry.assertions &&
|
|
365
|
+
collectorEntry.assertions.length > 0 &&
|
|
366
|
+
collectorResult.result
|
|
367
|
+
) {
|
|
368
|
+
const failedAssertion = evaluateAssertions(
|
|
369
|
+
collectorEntry.assertions,
|
|
370
|
+
collectorResult.result as Record<string, unknown>,
|
|
371
|
+
);
|
|
372
|
+
if (failedAssertion) {
|
|
373
|
+
assertionFailed = `${failedAssertion.field} ${
|
|
374
|
+
failedAssertion.operator
|
|
375
|
+
} ${failedAssertion.value ?? ""}`;
|
|
376
|
+
logger.debug(
|
|
377
|
+
`Collector ${storageKey} assertion failed: ${assertionFailed}`,
|
|
378
|
+
);
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
// Strip ephemeral fields before storage
|
|
383
|
+
const strippedResult = stripEphemeralFields(
|
|
384
|
+
collectorResult.result as Record<string, unknown>,
|
|
385
|
+
registered.collector.result.schema,
|
|
386
|
+
);
|
|
387
|
+
|
|
388
|
+
return {
|
|
389
|
+
storageKey,
|
|
390
|
+
skipped: false,
|
|
391
|
+
success: true,
|
|
392
|
+
collectorError,
|
|
393
|
+
assertionFailed,
|
|
394
|
+
result: {
|
|
395
|
+
_collectorId: collectorEntry.collectorId,
|
|
396
|
+
_assertionFailed: assertionFailed,
|
|
397
|
+
...strippedResult,
|
|
398
|
+
},
|
|
399
|
+
};
|
|
400
|
+
} catch (error) {
|
|
401
|
+
const errorStr =
|
|
402
|
+
error instanceof Error ? error.message : String(error);
|
|
403
|
+
logger.debug(`Collector ${storageKey} failed: ${errorStr}`);
|
|
404
|
+
return {
|
|
405
|
+
storageKey,
|
|
406
|
+
skipped: false,
|
|
407
|
+
success: false,
|
|
408
|
+
error: errorStr,
|
|
409
|
+
result: {
|
|
410
|
+
_collectorId: collectorEntry.collectorId,
|
|
411
|
+
_assertionFailed: undefined,
|
|
412
|
+
error: errorStr,
|
|
413
|
+
},
|
|
414
|
+
};
|
|
415
|
+
}
|
|
416
|
+
});
|
|
417
|
+
|
|
418
|
+
// Wait for all collectors to complete
|
|
419
|
+
const settledResults = await Promise.allSettled(collectorPromises);
|
|
420
|
+
|
|
421
|
+
// Process results from all collectors
|
|
422
|
+
for (const settled of settledResults) {
|
|
423
|
+
if (settled.status === "rejected") {
|
|
424
|
+
// This shouldn't happen since we catch errors above, but handle it
|
|
425
|
+
hasCollectorError = true;
|
|
426
|
+
if (!errorMessage) errorMessage = String(settled.reason);
|
|
427
|
+
continue;
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
const result = settled.value;
|
|
431
|
+
if (result.skipped) continue;
|
|
432
|
+
|
|
433
|
+
// Store the result
|
|
434
|
+
collectorResults[result.storageKey] = result.result;
|
|
435
|
+
|
|
436
|
+
// Track errors
|
|
437
|
+
if (
|
|
438
|
+
!result.success ||
|
|
439
|
+
result.collectorError ||
|
|
440
|
+
result.assertionFailed
|
|
441
|
+
) {
|
|
442
|
+
hasCollectorError = true;
|
|
443
|
+
if (!errorMessage) {
|
|
444
|
+
errorMessage =
|
|
445
|
+
result.error ||
|
|
446
|
+
result.collectorError ||
|
|
447
|
+
(result.assertionFailed
|
|
448
|
+
? `Assertion failed: ${result.assertionFailed}`
|
|
449
|
+
: undefined);
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
})(),
|
|
454
|
+
new Promise<never>((_, reject) =>
|
|
455
|
+
setTimeout(
|
|
456
|
+
() =>
|
|
457
|
+
reject(
|
|
458
|
+
new Error(`Execution timeout after ${executionTimeout}ms`),
|
|
459
|
+
),
|
|
460
|
+
executionTimeout,
|
|
461
|
+
),
|
|
462
|
+
),
|
|
463
|
+
]);
|
|
315
464
|
} catch (error) {
|
|
316
|
-
// Connection failed
|
|
317
465
|
const latencyMs = Math.round(performance.now() - start);
|
|
318
|
-
const
|
|
319
|
-
error instanceof Error ? error.message :
|
|
466
|
+
const caughtError =
|
|
467
|
+
error instanceof Error ? error.message : String(error);
|
|
468
|
+
|
|
469
|
+
// Use a specific error message if available, otherwise use the caught error
|
|
470
|
+
const finalError = errorMessage || caughtError;
|
|
320
471
|
|
|
321
472
|
const result = {
|
|
322
473
|
status: "unhealthy" as const,
|
|
323
474
|
latencyMs,
|
|
324
|
-
message:
|
|
325
|
-
metadata: {
|
|
475
|
+
message: finalError,
|
|
476
|
+
metadata: {
|
|
477
|
+
connected: !!connectedClient,
|
|
478
|
+
error: finalError,
|
|
479
|
+
},
|
|
326
480
|
};
|
|
327
481
|
|
|
328
482
|
await db.insert(healthCheckRuns).values({
|
|
@@ -333,7 +487,6 @@ async function executeHealthCheckJob(props: {
|
|
|
333
487
|
result: { ...result } as Record<string, unknown>,
|
|
334
488
|
});
|
|
335
489
|
|
|
336
|
-
// Trigger incremental hourly aggregation
|
|
337
490
|
await incrementHourlyAggregate({
|
|
338
491
|
db,
|
|
339
492
|
systemId,
|
|
@@ -346,10 +499,9 @@ async function executeHealthCheckJob(props: {
|
|
|
346
499
|
});
|
|
347
500
|
|
|
348
501
|
logger.debug(
|
|
349
|
-
`Health check ${configId} for system ${systemId} failed: ${
|
|
502
|
+
`Health check ${configId} for system ${systemId} failed: ${finalError}`,
|
|
350
503
|
);
|
|
351
504
|
|
|
352
|
-
// Broadcast failure signal
|
|
353
505
|
await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
|
|
354
506
|
systemId,
|
|
355
507
|
systemName,
|
|
@@ -359,7 +511,6 @@ async function executeHealthCheckJob(props: {
|
|
|
359
511
|
latencyMs: result.latencyMs,
|
|
360
512
|
});
|
|
361
513
|
|
|
362
|
-
// Check and notify state change
|
|
363
514
|
const newState = await service.getSystemHealthStatus(systemId);
|
|
364
515
|
if (newState.status !== previousStatus) {
|
|
365
516
|
await notifyStateChange({
|
|
@@ -374,98 +525,14 @@ async function executeHealthCheckJob(props: {
|
|
|
374
525
|
}
|
|
375
526
|
|
|
376
527
|
return;
|
|
377
|
-
}
|
|
378
|
-
|
|
379
|
-
const connectionTimeMs = Math.round(performance.now() - start);
|
|
380
|
-
|
|
381
|
-
// Execute collectors
|
|
382
|
-
const collectors = configRow.collectors ?? [];
|
|
383
|
-
const collectorResults: Record<string, unknown> = {};
|
|
384
|
-
let hasCollectorError = false;
|
|
385
|
-
let errorMessage: string | undefined;
|
|
386
|
-
|
|
387
|
-
try {
|
|
388
|
-
for (const collectorEntry of collectors) {
|
|
389
|
-
const registered = collectorRegistry.getCollector(
|
|
390
|
-
collectorEntry.collectorId,
|
|
391
|
-
);
|
|
392
|
-
if (!registered) {
|
|
393
|
-
logger.warn(
|
|
394
|
-
`Collector ${collectorEntry.collectorId} not found, skipping`,
|
|
395
|
-
);
|
|
396
|
-
continue;
|
|
397
|
-
}
|
|
398
|
-
|
|
399
|
-
// Use the collector's UUID as the storage key
|
|
400
|
-
const storageKey = collectorEntry.id;
|
|
401
|
-
|
|
528
|
+
} finally {
|
|
529
|
+
if (connectedClient) {
|
|
402
530
|
try {
|
|
403
|
-
|
|
404
|
-
config: collectorEntry.config,
|
|
405
|
-
client: connectedClient.client,
|
|
406
|
-
pluginId: configRow.strategyId,
|
|
407
|
-
});
|
|
408
|
-
|
|
409
|
-
// Check for collector-level error
|
|
410
|
-
if (collectorResult.error) {
|
|
411
|
-
hasCollectorError = true;
|
|
412
|
-
errorMessage = collectorResult.error;
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
// Evaluate per-collector assertions
|
|
416
|
-
let assertionFailed: string | undefined;
|
|
417
|
-
if (
|
|
418
|
-
collectorEntry.assertions &&
|
|
419
|
-
collectorEntry.assertions.length > 0 &&
|
|
420
|
-
collectorResult.result
|
|
421
|
-
) {
|
|
422
|
-
const assertions = collectorEntry.assertions;
|
|
423
|
-
const failedAssertion = evaluateAssertions(
|
|
424
|
-
assertions,
|
|
425
|
-
collectorResult.result as Record<string, unknown>,
|
|
426
|
-
);
|
|
427
|
-
if (failedAssertion) {
|
|
428
|
-
hasCollectorError = true;
|
|
429
|
-
assertionFailed = `${failedAssertion.field} ${
|
|
430
|
-
failedAssertion.operator
|
|
431
|
-
} ${failedAssertion.value ?? ""}`;
|
|
432
|
-
errorMessage = `Assertion failed: ${assertionFailed}`;
|
|
433
|
-
logger.debug(
|
|
434
|
-
`Collector ${storageKey} assertion failed: ${errorMessage}`,
|
|
435
|
-
);
|
|
436
|
-
}
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
// Strip ephemeral fields (like HTTP body) before storage to save space
|
|
440
|
-
const strippedResult = stripEphemeralFields(
|
|
441
|
-
collectorResult.result as Record<string, unknown>,
|
|
442
|
-
registered.collector.result.schema,
|
|
443
|
-
);
|
|
444
|
-
|
|
445
|
-
// Store result under the collector's UUID, with collector type and assertion metadata
|
|
446
|
-
collectorResults[storageKey] = {
|
|
447
|
-
_collectorId: collectorEntry.collectorId, // Store the type for frontend schema linking
|
|
448
|
-
_assertionFailed: assertionFailed, // null if no assertion failed
|
|
449
|
-
...strippedResult,
|
|
450
|
-
};
|
|
531
|
+
connectedClient.close();
|
|
451
532
|
} catch (error) {
|
|
452
|
-
|
|
453
|
-
errorMessage = error instanceof Error ? error.message : String(error);
|
|
454
|
-
collectorResults[storageKey] = {
|
|
455
|
-
_collectorId: collectorEntry.collectorId,
|
|
456
|
-
_assertionFailed: undefined,
|
|
457
|
-
error: errorMessage,
|
|
458
|
-
};
|
|
459
|
-
logger.debug(`Collector ${storageKey} failed: ${errorMessage}`);
|
|
533
|
+
logger.warn(`Failed to close connection: ${error}`);
|
|
460
534
|
}
|
|
461
535
|
}
|
|
462
|
-
} finally {
|
|
463
|
-
// Clean up connection
|
|
464
|
-
try {
|
|
465
|
-
connectedClient.close();
|
|
466
|
-
} catch {
|
|
467
|
-
// Ignore close errors
|
|
468
|
-
}
|
|
469
536
|
}
|
|
470
537
|
|
|
471
538
|
// Determine health status based on collector results
|