@fjall/components-infrastructure 2.16.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/lambda-assets/cert-generator/asset/index.js +14 -1
- package/dist/lib/patterns/aws/clickhouseDatabase.d.ts +17 -0
- package/dist/lib/patterns/aws/clickhouseDatabase.js +16 -0
- package/dist/lib/patterns/aws/compute.d.ts +2 -2
- package/dist/lib/patterns/aws/computeEcs.d.ts +12 -1
- package/dist/lib/patterns/aws/computeEcs.js +57 -0
- package/dist/lib/patterns/aws/computeEcsTypes.d.ts +51 -1
- package/dist/lib/patterns/aws/database.d.ts +19 -1
- package/dist/lib/patterns/aws/database.js +21 -2
- package/dist/lib/resources/aws/compute/ec2GracefulTerminationHandler.js +13 -3
- package/dist/lib/resources/aws/compute/ecs.js +14 -3
- package/dist/lib/resources/aws/compute/ecsConstants.d.ts +2 -0
- package/dist/lib/resources/aws/compute/ecsConstants.js +4 -0
- package/dist/lib/resources/aws/compute/ecsTaskDefinition.d.ts +2 -0
- package/dist/lib/resources/aws/compute/ecsTaskDefinition.js +13 -4
- package/dist/lib/resources/aws/compute/ecsTypes.d.ts +17 -1
- package/dist/lib/resources/aws/compute/ecsValidation.d.ts +7 -0
- package/dist/lib/resources/aws/compute/ecsValidation.js +10 -0
- package/dist/lib/resources/aws/compute/lambda.js +20 -2
- package/dist/lib/resources/aws/compute/persistentDataVolume.js +5 -1
- package/dist/lib/resources/aws/database/rdsInstance.d.ts +19 -0
- package/dist/lib/resources/aws/database/rdsInstance.js +13 -1
- package/dist/lib/resources/aws/messaging/sns.d.ts +5 -0
- package/dist/lib/resources/aws/messaging/sns.js +7 -1
- package/dist/lib/resources/aws/messaging/sqs.d.ts +6 -0
- package/dist/lib/resources/aws/messaging/sqs.js +10 -2
- package/dist/lib/resources/aws/monitoring/clickhouseAlarms.d.ts +10 -15
- package/dist/lib/resources/aws/monitoring/clickhouseAlarms.js +34 -56
- package/dist/lib/resources/aws/monitoring/ecsAlarms.js +5 -1
- package/dist/lib/resources/aws/monitoring/index.d.ts +2 -0
- package/dist/lib/resources/aws/monitoring/index.js +2 -0
- package/dist/lib/resources/aws/monitoring/logPatternAlarms.d.ts +55 -0
- package/dist/lib/resources/aws/monitoring/logPatternAlarms.js +74 -0
- package/dist/lib/resources/aws/monitoring/metricNamespaces.d.ts +13 -0
- package/dist/lib/resources/aws/monitoring/metricNamespaces.js +12 -0
- package/package.json +5 -7
|
@@ -13,6 +13,13 @@ import { ScalingType } from "./ecsTypes.js";
|
|
|
13
13
|
* layer consumers never see a `migrations` field, so duplicating the
|
|
14
14
|
* validation here would be unreachable.
|
|
15
15
|
*
|
|
16
|
+
* Same applies to `service.awaitMigrationsFrom`: it is a patterns-layer
|
|
17
|
+
* cross-service ordering knob, resolved into a `node.addDependency(...)` edge
|
|
18
|
+
* (`wireServiceMigrationDependencies` in `computeEcs.ts`) BEFORE reaching the
|
|
19
|
+
* resources layer. It is not a field on `EcsServiceProps`, so a direct
|
|
20
|
+
* `new EcsCluster(...)` consumer cannot pass it — there is no resources-layer
|
|
21
|
+
* code path to validate.
|
|
22
|
+
*
|
|
16
23
|
* @param props - The cluster props to validate
|
|
17
24
|
* @throws Error if validation fails
|
|
18
25
|
*/
|
|
@@ -94,6 +101,9 @@ export function validateEcsClusterProps(props) {
|
|
|
94
101
|
if (max !== undefined && (max < 100 || max > 200)) {
|
|
95
102
|
throw new Error(`Service '${service.name}': deployment.maxHealthyPercent must be between 100 and 200 (got ${max}).`);
|
|
96
103
|
}
|
|
104
|
+
if (min !== undefined && max !== undefined && min > max) {
|
|
105
|
+
throw new Error(`Service '${service.name}': deployment.minHealthyPercent (${min}) must be <= maxHealthyPercent (${max}).`);
|
|
106
|
+
}
|
|
97
107
|
if (min === 100 && max === 100) {
|
|
98
108
|
throw new Error(`Service '${service.name}': deployment.minHealthyPercent and maxHealthyPercent cannot both be 100 ` +
|
|
99
109
|
"(no capacity to drain or expand — deploys would never roll forward).");
|
|
@@ -3,12 +3,12 @@ import { SingletonFunction as singletonFunction, Function, Code, Architecture, F
|
|
|
3
3
|
import { FactName } from "aws-cdk-lib/region-info";
|
|
4
4
|
import path from "node:path";
|
|
5
5
|
import { fileURLToPath } from "node:url";
|
|
6
|
+
import { createHash } from "node:crypto";
|
|
6
7
|
import { SqsEventSource, DynamoEventSource, S3EventSource } from "aws-cdk-lib/aws-lambda-event-sources";
|
|
7
8
|
import { EventType } from "aws-cdk-lib/aws-s3";
|
|
8
9
|
import { PolicyStatement, Effect } from "aws-cdk-lib/aws-iam";
|
|
9
10
|
import { RetentionDays } from "aws-cdk-lib/aws-logs";
|
|
10
11
|
import { LogGroup } from "../logging/logGroup.js";
|
|
11
|
-
import { v4 as uuid } from "uuid";
|
|
12
12
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
13
13
|
import { resolveImportedSecret } from "../secrets/index.js";
|
|
14
14
|
import { toPascalCase } from "../../../utils/capitaliseString.js";
|
|
@@ -65,11 +65,29 @@ const SECRETS_EXTENSION = {
|
|
|
65
65
|
* mis-tune alarms relative to runtime behaviour.
|
|
66
66
|
*/
|
|
67
67
|
const LAMBDA_DEFAULT_TIMEOUT_SECONDS = 300;
|
|
68
|
+
/**
|
|
69
|
+
* Stable, deterministic uuid for a SingletonFunction so its logical ID
|
|
70
|
+
* (`SingletonLambda${uuid-without-dashes}`) does not drift across synths. A
|
|
71
|
+
* random default would recreate the singleton — and re-run any custom resource
|
|
72
|
+
* fronted by it — on every deploy.
|
|
73
|
+
*/
|
|
74
|
+
function deriveStableSingletonUuid(scope, id) {
|
|
75
|
+
const hash = createHash("sha256")
|
|
76
|
+
.update(`${scope.node.path}/${id}`)
|
|
77
|
+
.digest("hex");
|
|
78
|
+
return [
|
|
79
|
+
hash.slice(0, 8),
|
|
80
|
+
hash.slice(8, 12),
|
|
81
|
+
hash.slice(12, 16),
|
|
82
|
+
hash.slice(16, 20),
|
|
83
|
+
hash.slice(20, 32)
|
|
84
|
+
].join("-");
|
|
85
|
+
}
|
|
68
86
|
export class SingletonFunction extends singletonFunction {
|
|
69
87
|
constructor(scope, id, props) {
|
|
70
88
|
super(scope, id, {
|
|
71
89
|
...props,
|
|
72
|
-
uuid: props.uuid ?? uuid(),
|
|
90
|
+
uuid: props.uuid ?? deriveStableSingletonUuid(scope, id),
|
|
73
91
|
timeout: Duration.seconds(props.timeout ?? LAMBDA_DEFAULT_TIMEOUT_SECONDS),
|
|
74
92
|
description: props.lambdaDescription ?? `${id} singleton lambda`,
|
|
75
93
|
runtime: props.runtime,
|
|
@@ -105,7 +105,11 @@ export class PersistentDataVolume extends Construct {
|
|
|
105
105
|
Tags.of(this.volume).add(PERSISTENT_DATA_VOLUME_TAG_STACK_ID, Aws.STACK_ID);
|
|
106
106
|
this.queue = new SQSQueue(this, `${id}Queue`, {
|
|
107
107
|
visibilityTimeout: QUEUE_VISIBILITY_TIMEOUT_SECONDS,
|
|
108
|
-
deadLetterQueue: { enabled: true, maxReceiveCount: 5 }
|
|
108
|
+
deadLetterQueue: { enabled: true, maxReceiveCount: 5 },
|
|
109
|
+
// Transient volume-attach signals — no durable state (the EBS volume
|
|
110
|
+
// itself is SNAPSHOT above). Pinned DESTROY (now also the SQSQueue wrapper
|
|
111
|
+
// default) so a replacing deploy reclaims this queue + DLQ, not orphans.
|
|
112
|
+
removalPolicy: "DESTROY"
|
|
109
113
|
});
|
|
110
114
|
const sourcePath = path.resolve(__dirname, LAUNCHING_LAMBDA_SOURCE_FILE);
|
|
111
115
|
const source = readFileSync(sourcePath, "utf-8");
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { Duration } from "aws-cdk-lib";
|
|
2
2
|
import { Connections, type IConnectable, type IVpc } from "aws-cdk-lib/aws-ec2";
|
|
3
3
|
import { type IInstanceEngine } from "aws-cdk-lib/aws-rds";
|
|
4
|
+
import { type IGrantable, type Grant } from "aws-cdk-lib/aws-iam";
|
|
4
5
|
import { Construct } from "constructs";
|
|
5
6
|
import { SecurityGroup } from "../networking/securityGroup.js";
|
|
6
7
|
import { Secret } from "../secrets/index.js";
|
|
@@ -31,6 +32,15 @@ interface RdsProps {
|
|
|
31
32
|
encryption?: EncryptionConfig;
|
|
32
33
|
publiclyAccessible?: boolean;
|
|
33
34
|
deletionProtection?: boolean;
|
|
35
|
+
/**
|
|
36
|
+
* Enable RDS IAM database authentication on the instance. Opt-in; defaults to
|
|
37
|
+
* off (undefined → CDK omits the property, leaving existing consumers' synth
|
|
38
|
+
* unchanged). When true, IAM principals granted via {@link grantIamConnect}
|
|
39
|
+
* connect with short-lived `rds-db:connect` tokens instead of a stored
|
|
40
|
+
* password — password auth keeps working in parallel. See ADR
|
|
41
|
+
* decisions/2026-06-17-rls-role-auth-and-launch-gating.md.
|
|
42
|
+
*/
|
|
43
|
+
iamAuthentication?: boolean;
|
|
34
44
|
/** ARN or identifier of DB instance snapshot to restore from */
|
|
35
45
|
snapshotIdentifier?: string;
|
|
36
46
|
/** Username from the snapshot (required when restoring from snapshot to reset password) */
|
|
@@ -70,6 +80,15 @@ export declare class RdsInstance extends Construct implements IConnectable {
|
|
|
70
80
|
}>;
|
|
71
81
|
getDatabaseName(): string;
|
|
72
82
|
getConnectionString(): string;
|
|
83
|
+
/**
|
|
84
|
+
* Grant an IAM principal permission to connect as `dbUsername` via RDS IAM
|
|
85
|
+
* database authentication. Requires the instance to be created with
|
|
86
|
+
* `iamAuthentication: true`. Delegates to the L2 `grantConnect`, which scopes
|
|
87
|
+
* `rds-db:connect` to the exact `dbuser:<dbiResourceId>/<dbUsername>` ARN —
|
|
88
|
+
* never a bare wildcard. See ADR
|
|
89
|
+
* decisions/2026-06-17-rls-role-auth-and-launch-gating.md.
|
|
90
|
+
*/
|
|
91
|
+
grantIamConnect(grantee: IGrantable, dbUsername: string): Grant;
|
|
73
92
|
static build(id: string, props: RdsProps): (sb: StackBuilder) => Construct;
|
|
74
93
|
}
|
|
75
94
|
export {};
|
|
@@ -136,7 +136,8 @@ export class RdsInstance extends Construct {
|
|
|
136
136
|
deletionProtection: props.deletionProtection ?? true,
|
|
137
137
|
preferredMaintenanceWindow: props.preferredMaintenanceWindow ??
|
|
138
138
|
RDS_DEFAULTS.PREFERRED_MAINTENANCE_WINDOW,
|
|
139
|
-
publiclyAccessible: props.publiclyAccessible ?? false
|
|
139
|
+
publiclyAccessible: props.publiclyAccessible ?? false,
|
|
140
|
+
iamAuthentication: props.iamAuthentication
|
|
140
141
|
};
|
|
141
142
|
if (props.snapshotIdentifier) {
|
|
142
143
|
// Create from snapshot
|
|
@@ -328,6 +329,17 @@ exports.handler = async (event) => {
|
|
|
328
329
|
getConnectionString() {
|
|
329
330
|
return `${this.engineConfig.family}://${this.getHostEndpoint()}:${this.getHostPort()}/${this.getDatabaseName()}`;
|
|
330
331
|
}
|
|
332
|
+
/**
|
|
333
|
+
* Grant an IAM principal permission to connect as `dbUsername` via RDS IAM
|
|
334
|
+
* database authentication. Requires the instance to be created with
|
|
335
|
+
* `iamAuthentication: true`. Delegates to the L2 `grantConnect`, which scopes
|
|
336
|
+
* `rds-db:connect` to the exact `dbuser:<dbiResourceId>/<dbUsername>` ARN —
|
|
337
|
+
* never a bare wildcard. See ADR
|
|
338
|
+
* decisions/2026-06-17-rls-role-auth-and-launch-gating.md.
|
|
339
|
+
*/
|
|
340
|
+
grantIamConnect(grantee, dbUsername) {
|
|
341
|
+
return this.database.grantConnect(grantee, dbUsername);
|
|
342
|
+
}
|
|
331
343
|
static build(id, props) {
|
|
332
344
|
return (sb) => {
|
|
333
345
|
const newProps = {
|
|
@@ -7,6 +7,11 @@ export interface SNSTopicProps {
|
|
|
7
7
|
displayName?: string;
|
|
8
8
|
fifo?: boolean;
|
|
9
9
|
contentBasedDeduplication?: boolean;
|
|
10
|
+
/**
|
|
11
|
+
* Removal policy for the topic. Defaults to DESTROY — a topic is a transient
|
|
12
|
+
* fan-out medium with no durable state (see the constructor). Pass "RETAIN"
|
|
13
|
+
* only for a topic that must survive stack deletion.
|
|
14
|
+
*/
|
|
10
15
|
removalPolicy?: RemovalPolicyString;
|
|
11
16
|
}
|
|
12
17
|
export declare class SNSTopic extends Construct {
|
|
@@ -22,7 +22,13 @@ export class SNSTopic extends Construct {
|
|
|
22
22
|
? (props.contentBasedDeduplication ?? true)
|
|
23
23
|
: undefined
|
|
24
24
|
});
|
|
25
|
-
|
|
25
|
+
// An SNS topic is a transient fan-out medium: it holds no durable state
|
|
26
|
+
// (subscriptions are re-created by the next deploy), so the wrapper defaults
|
|
27
|
+
// to DESTROY — deliberately NOT the env-aware production->RETAIN of
|
|
28
|
+
// data-bearing wrappers (S3, EventBus). RETAIN here buys no protection and
|
|
29
|
+
// orphans the topic on a parent construct's logical-ID churn (the same leak
|
|
30
|
+
// class fixed for SQS). Durable topics opt into RETAIN via props.removalPolicy.
|
|
31
|
+
this.topic.applyRemovalPolicy(toRemovalPolicy(props.removalPolicy ?? "DESTROY"));
|
|
26
32
|
new CfnOutput(this, `${id}TopicArn`, {
|
|
27
33
|
key: `${id}TopicArn`,
|
|
28
34
|
value: this.topic.topicArn,
|
|
@@ -60,6 +60,12 @@ export interface SQSQueueProps {
|
|
|
60
60
|
contentBasedDeduplication?: boolean;
|
|
61
61
|
fifoThroughputLimit?: "perQueue" | "perMessageGroupId";
|
|
62
62
|
deduplicationScope?: "queue" | "messageGroup";
|
|
63
|
+
/**
|
|
64
|
+
* Removal policy for the queue (and its auto-created DLQ, which tracks it).
|
|
65
|
+
* Defaults to DESTROY — a queue is a transient work medium (see the
|
|
66
|
+
* constructor). Pass "RETAIN" only for a queue whose contents are
|
|
67
|
+
* irreplaceable and must survive stack deletion.
|
|
68
|
+
*/
|
|
63
69
|
removalPolicy?: RemovalPolicyString;
|
|
64
70
|
}
|
|
65
71
|
export declare class SQSQueue extends Construct {
|
|
@@ -83,6 +83,14 @@ export class SQSQueue extends Construct {
|
|
|
83
83
|
this.id = id;
|
|
84
84
|
// Sanitise id for CloudFormation output keys (must be alphanumeric)
|
|
85
85
|
const outputName = toPascalCase(id);
|
|
86
|
+
// SQS queues are transient work mediums: their contents (job messages,
|
|
87
|
+
// failed-delivery copies) regenerate from a source of truth held elsewhere
|
|
88
|
+
// (Postgres, the producing schedule/rule), so the wrapper defaults to
|
|
89
|
+
// DESTROY — deliberately NOT the env-aware production->RETAIN of
|
|
90
|
+
// data-bearing wrappers (S3, EventBus). RETAIN here buys no protection and
|
|
91
|
+
// orphans the queue on a parent construct's logical-ID churn (the prod
|
|
92
|
+
// orphan leak). Durable queues opt into RETAIN via props.removalPolicy.
|
|
93
|
+
const resolvedRemovalPolicy = toRemovalPolicy(props.removalPolicy ?? "DESTROY");
|
|
86
94
|
const isFifo = props.queueType === "fifo";
|
|
87
95
|
const queueName = props.queueName
|
|
88
96
|
? isFifo
|
|
@@ -110,7 +118,7 @@ export class SQSQueue extends Construct {
|
|
|
110
118
|
fifo: isFifo,
|
|
111
119
|
encryption: toEncryption(props.encryption),
|
|
112
120
|
retentionPeriod: Duration.days(SQS_LIMITS.DEAD_LETTER_QUEUE.DEFAULT_RETENTION_DAYS),
|
|
113
|
-
removalPolicy:
|
|
121
|
+
removalPolicy: resolvedRemovalPolicy
|
|
114
122
|
});
|
|
115
123
|
deadLetterQueue = {
|
|
116
124
|
queue: this.dlq,
|
|
@@ -156,7 +164,7 @@ export class SQSQueue extends Construct {
|
|
|
156
164
|
deduplicationScope: isFifo
|
|
157
165
|
? toDeduplicationScope(props.deduplicationScope)
|
|
158
166
|
: undefined,
|
|
159
|
-
removalPolicy:
|
|
167
|
+
removalPolicy: resolvedRemovalPolicy
|
|
160
168
|
});
|
|
161
169
|
new CfnOutput(this, `${outputName}QueueUrl`, {
|
|
162
170
|
key: `${outputName}QueueUrl`,
|
|
@@ -29,32 +29,27 @@ export interface ClickHouseAlarmsProps {
|
|
|
29
29
|
asgName: string;
|
|
30
30
|
alarmTopic: ITopic;
|
|
31
31
|
/**
|
|
32
|
-
*
|
|
33
|
-
* emits `serverLogger.warn("ClickHouse", "Stuck merge detected")` when
|
|
34
|
-
* `system.merges` shows a merge elapsed > 30 min.
|
|
35
|
-
*/
|
|
36
|
-
webappLogGroup: ILogGroup;
|
|
37
|
-
/**
|
|
38
|
-
* Backup-task log group. Required to wire the backup-failure alarm —
|
|
32
|
+
* Backup-task log group. When present, wires the backup-failure alarm —
|
|
39
33
|
* `BACKUP DATABASE … TO S3(…)` emits `AccessDenied` / `S3Exception` lines
|
|
40
34
|
* when the IAM grant or bucket policy is misconfigured (silent before the
|
|
41
35
|
* alarm landed; the daily backup task exited non-zero with no signal).
|
|
36
|
+
* Omitted when `backupSchedule: false` — no backup task, no log group.
|
|
42
37
|
*/
|
|
43
|
-
backupTaskLogGroup: ILogGroup;
|
|
38
|
+
backupTaskLogGroup?: ILogGroup;
|
|
44
39
|
config?: ClickHouseAlarmThresholds;
|
|
45
40
|
}
|
|
46
41
|
/**
|
|
47
|
-
* Single-node ClickHouse posture alarms. Covers host-level CPU + (optional)
|
|
48
|
-
* memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus
|
|
49
|
-
*
|
|
42
|
+
* Single-node ClickHouse host-posture alarms. Covers host-level CPU + (optional)
|
|
43
|
+
* memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus the
|
|
44
|
+
* backup-failure log alarm when a backup-task log group is supplied:
|
|
50
45
|
*
|
|
51
|
-
* - **Stuck merges** — `client.ts` polls `system.merges` every 5 min and logs
|
|
52
|
-
* `serverLogger.warn("ClickHouse", "Stuck merge detected")` when elapsed
|
|
53
|
-
* exceeds 30 min. The metric filter on the webapp log group emits a count
|
|
54
|
-
* metric per match; the alarm fires on Sum >= 1 over 5 min × 2 evaluations.
|
|
55
46
|
* - **Backup failures** — `AccessDenied` or `S3Exception` from the backup
|
|
56
47
|
* task's BACKUP DATABASE TO S3 statement. Closes the silent-failure mode
|
|
57
48
|
* that masked the original IAM-grant misconfiguration (see
|
|
58
49
|
* `designs/2026-04-27-clickhouse-backup-iam-role.md`).
|
|
50
|
+
*
|
|
51
|
+
* The stuck-merge alarm — a `"Stuck merge detected"` line emitted by the webapp
|
|
52
|
+
* app process, not this construct — lives on the app service's declarative
|
|
53
|
+
* `logAlarms` instead; it is an app-log alarm, not a database-host concern.
|
|
59
54
|
*/
|
|
60
55
|
export declare function createClickHouseAlarms(props: ClickHouseAlarmsProps): Alarm[];
|
|
@@ -4,23 +4,23 @@ import { SnsAction } from "aws-cdk-lib/aws-cloudwatch-actions";
|
|
|
4
4
|
import { Metric } from "aws-cdk-lib/aws-cloudwatch";
|
|
5
5
|
import { FilterPattern, MetricFilter } from "aws-cdk-lib/aws-logs";
|
|
6
6
|
import { ALARM_DEFAULTS, registerAlarm, buildAlarmDescription } from "./alarmDefaults.js";
|
|
7
|
-
|
|
7
|
+
import { METRIC_NAMESPACE } from "./metricNamespaces.js";
|
|
8
8
|
/**
|
|
9
|
-
* Single-node ClickHouse posture alarms. Covers host-level CPU + (optional)
|
|
10
|
-
* memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus
|
|
11
|
-
*
|
|
9
|
+
* Single-node ClickHouse host-posture alarms. Covers host-level CPU + (optional)
|
|
10
|
+
* memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus the
|
|
11
|
+
* backup-failure log alarm when a backup-task log group is supplied:
|
|
12
12
|
*
|
|
13
|
-
* - **Stuck merges** — `client.ts` polls `system.merges` every 5 min and logs
|
|
14
|
-
* `serverLogger.warn("ClickHouse", "Stuck merge detected")` when elapsed
|
|
15
|
-
* exceeds 30 min. The metric filter on the webapp log group emits a count
|
|
16
|
-
* metric per match; the alarm fires on Sum >= 1 over 5 min × 2 evaluations.
|
|
17
13
|
* - **Backup failures** — `AccessDenied` or `S3Exception` from the backup
|
|
18
14
|
* task's BACKUP DATABASE TO S3 statement. Closes the silent-failure mode
|
|
19
15
|
* that masked the original IAM-grant misconfiguration (see
|
|
20
16
|
* `designs/2026-04-27-clickhouse-backup-iam-role.md`).
|
|
17
|
+
*
|
|
18
|
+
* The stuck-merge alarm — a `"Stuck merge detected"` line emitted by the webapp
|
|
19
|
+
* app process, not this construct — lives on the app service's declarative
|
|
20
|
+
* `logAlarms` instead; it is an app-log alarm, not a database-host concern.
|
|
21
21
|
*/
|
|
22
22
|
export function createClickHouseAlarms(props) {
|
|
23
|
-
const { scope, instanceRole, asgName, alarmTopic, webappLogGroup, backupTaskLogGroup, config = {} } = props;
|
|
23
|
+
const { scope, instanceRole, asgName, alarmTopic, backupTaskLogGroup, config = {} } = props;
|
|
24
24
|
const alarms = [];
|
|
25
25
|
const snsAction = new SnsAction(alarmTopic);
|
|
26
26
|
const cpuAlarm = new Alarm(scope, "ClickHouseCpuAlarm", {
|
|
@@ -87,53 +87,31 @@ export function createClickHouseAlarms(props) {
|
|
|
87
87
|
treatMissingData: TreatMissingData.NOT_BREACHING
|
|
88
88
|
});
|
|
89
89
|
registerAlarm(diskCriticalAlarm, snsAction, alarms);
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
filterPattern: FilterPattern.literal('"Stuck merge detected"'),
|
|
96
|
-
metricValue: "1",
|
|
97
|
-
defaultValue: 0
|
|
98
|
-
});
|
|
99
|
-
const stuckMergeAlarm = new Alarm(scope, "ClickHouseStuckMergeAlarm", {
|
|
100
|
-
alarmDescription: buildAlarmDescription("ClickHouse merge stuck > 30 min — investigate parts pressure or replica health", undefined),
|
|
101
|
-
metric: new Metric({
|
|
102
|
-
namespace: CLICKHOUSE_METRIC_NAMESPACE,
|
|
103
|
-
metricName: stuckMergeMetricName,
|
|
104
|
-
period: Duration.minutes(5),
|
|
105
|
-
statistic: "Sum"
|
|
106
|
-
}),
|
|
107
|
-
threshold: 1,
|
|
108
|
-
evaluationPeriods: 2,
|
|
109
|
-
datapointsToAlarm: 2,
|
|
110
|
-
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
|
|
111
|
-
treatMissingData: TreatMissingData.NOT_BREACHING
|
|
112
|
-
});
|
|
113
|
-
registerAlarm(stuckMergeAlarm, snsAction, alarms);
|
|
114
|
-
const backupFailureMetricName = "ClickHouseBackupFailureCount";
|
|
115
|
-
new MetricFilter(scope, "ClickHouseBackupFailureMetricFilter", {
|
|
116
|
-
logGroup: backupTaskLogGroup,
|
|
117
|
-
metricNamespace: CLICKHOUSE_METRIC_NAMESPACE,
|
|
118
|
-
metricName: backupFailureMetricName,
|
|
119
|
-
filterPattern: FilterPattern.anyTerm("AccessDenied", "S3Exception"),
|
|
120
|
-
metricValue: "1",
|
|
121
|
-
defaultValue: 0
|
|
122
|
-
});
|
|
123
|
-
const backupFailureAlarm = new Alarm(scope, "ClickHouseBackupFailureAlarm", {
|
|
124
|
-
alarmDescription: buildAlarmDescription(`ClickHouse BACKUP TO S3 emitted AccessDenied/S3Exception — verify instance role '${instanceRole.roleName}' grant on backup bucket`, undefined),
|
|
125
|
-
metric: new Metric({
|
|
126
|
-
namespace: CLICKHOUSE_METRIC_NAMESPACE,
|
|
90
|
+
if (backupTaskLogGroup !== undefined) {
|
|
91
|
+
const backupFailureMetricName = "ClickHouseBackupFailureCount";
|
|
92
|
+
new MetricFilter(scope, "ClickHouseBackupFailureMetricFilter", {
|
|
93
|
+
logGroup: backupTaskLogGroup,
|
|
94
|
+
metricNamespace: METRIC_NAMESPACE.CLICKHOUSE,
|
|
127
95
|
metricName: backupFailureMetricName,
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
96
|
+
filterPattern: FilterPattern.anyTerm("AccessDenied", "S3Exception"),
|
|
97
|
+
metricValue: "1",
|
|
98
|
+
defaultValue: 0
|
|
99
|
+
});
|
|
100
|
+
const backupFailureAlarm = new Alarm(scope, "ClickHouseBackupFailureAlarm", {
|
|
101
|
+
alarmDescription: buildAlarmDescription(`ClickHouse BACKUP TO S3 emitted AccessDenied/S3Exception — verify instance role '${instanceRole.roleName}' grant on backup bucket`, undefined),
|
|
102
|
+
metric: new Metric({
|
|
103
|
+
namespace: METRIC_NAMESPACE.CLICKHOUSE,
|
|
104
|
+
metricName: backupFailureMetricName,
|
|
105
|
+
period: Duration.hours(1),
|
|
106
|
+
statistic: "Sum"
|
|
107
|
+
}),
|
|
108
|
+
threshold: 1,
|
|
109
|
+
evaluationPeriods: 1,
|
|
110
|
+
datapointsToAlarm: 1,
|
|
111
|
+
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
|
|
112
|
+
treatMissingData: TreatMissingData.NOT_BREACHING
|
|
113
|
+
});
|
|
114
|
+
registerAlarm(backupFailureAlarm, snsAction, alarms);
|
|
115
|
+
}
|
|
138
116
|
return alarms;
|
|
139
117
|
}
|
|
@@ -43,7 +43,11 @@ export function createEcsServiceAlarms(props) {
|
|
|
43
43
|
evaluationPeriods: 2,
|
|
44
44
|
datapointsToAlarm: 2,
|
|
45
45
|
comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD,
|
|
46
|
-
|
|
46
|
+
// RunningTaskCount (Container Insights) is sparse, so missing data must
|
|
47
|
+
// not breach — otherwise metric gaps false-alarm healthy services and
|
|
48
|
+
// scaled-to-zero workers alarm permanently. A real count < threshold
|
|
49
|
+
// still fires.
|
|
50
|
+
treatMissingData: TreatMissingData.NOT_BREACHING
|
|
47
51
|
});
|
|
48
52
|
registerAlarm(runningTasksAlarm, snsAction, alarms);
|
|
49
53
|
}
|
|
@@ -4,3 +4,5 @@ export { createRdsAlarms, type RdsAlarmThresholds, type RdsAlarmsProps } from ".
|
|
|
4
4
|
export { createLambdaAlarms, type LambdaAlarmThresholds, type LambdaAlarmsProps } from "./lambdaAlarms.js";
|
|
5
5
|
export { createScheduleAlarms, type ScheduleAlarmThresholds, type CreateScheduleAlarmsProps } from "./scheduleAlarms.js";
|
|
6
6
|
export { createClickHouseAlarms, type ClickHouseAlarmThresholds, type ClickHouseAlarmsProps } from "./clickhouseAlarms.js";
|
|
7
|
+
export { createLogPatternAlarms, type LogPatternAlarmSpec, type LogPatternAlarmsProps } from "./logPatternAlarms.js";
|
|
8
|
+
export { METRIC_NAMESPACE, type MetricNamespace } from "./metricNamespaces.js";
|
|
@@ -4,3 +4,5 @@ export { createRdsAlarms } from "./rdsAlarms.js";
|
|
|
4
4
|
export { createLambdaAlarms } from "./lambdaAlarms.js";
|
|
5
5
|
export { createScheduleAlarms } from "./scheduleAlarms.js";
|
|
6
6
|
export { createClickHouseAlarms } from "./clickhouseAlarms.js";
|
|
7
|
+
export { createLogPatternAlarms } from "./logPatternAlarms.js";
|
|
8
|
+
export { METRIC_NAMESPACE } from "./metricNamespaces.js";
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { Duration } from "aws-cdk-lib";
|
|
2
|
+
import { Alarm } from "aws-cdk-lib/aws-cloudwatch";
|
|
3
|
+
import { type ILogGroup } from "aws-cdk-lib/aws-logs";
|
|
4
|
+
import type { ITopic } from "aws-cdk-lib/aws-sns";
|
|
5
|
+
import type { Construct } from "constructs";
|
|
6
|
+
/**
|
|
7
|
+
* One log-pattern alarm: a CloudWatch metric filter over a log group plus the
|
|
8
|
+
* alarm that fires when the pattern is matched. Declarative — callers describe
|
|
9
|
+
* the match with plain strings; the raw CDK `FilterPattern` is built inside the
|
|
10
|
+
* primitive, so the spec never leaks raw CDK at the call site.
|
|
11
|
+
*
|
|
12
|
+
* Exactly one of `literal` / `anyTerms` must be set.
|
|
13
|
+
*/
|
|
14
|
+
export interface LogPatternAlarmSpec {
|
|
15
|
+
/** Stable logical-ID stem; the MetricFilter + Alarm derive their construct IDs from it. */
|
|
16
|
+
idStem: string;
|
|
17
|
+
/** CloudWatch metric name emitted by the filter and read by the alarm. */
|
|
18
|
+
metricName: string;
|
|
19
|
+
/** Responder-facing alarm description (the applicationId tag is appended automatically). */
|
|
20
|
+
description: string;
|
|
21
|
+
/** Match this exact CloudWatch Logs filter pattern (e.g. `'"Stuck merge detected"'`). */
|
|
22
|
+
literal?: string;
|
|
23
|
+
/** Match if ANY of these terms appears (CloudWatch Logs OR semantics). */
|
|
24
|
+
anyTerms?: string[];
|
|
25
|
+
/** Metric namespace for this spec; falls back to the props-level default. */
|
|
26
|
+
metricNamespace?: string;
|
|
27
|
+
/** Sum-over-period threshold the alarm fires at. Default 1 (first match). */
|
|
28
|
+
threshold?: number;
|
|
29
|
+
/** Metric period. Default 1 minute (fast fail-closed detection). */
|
|
30
|
+
period?: Duration;
|
|
31
|
+
/** Consecutive periods evaluated. Default 1. */
|
|
32
|
+
evaluationPeriods?: number;
|
|
33
|
+
/** Datapoints within the window that must breach. Default = evaluationPeriods. */
|
|
34
|
+
datapointsToAlarm?: number;
|
|
35
|
+
}
|
|
36
|
+
export interface LogPatternAlarmsProps {
|
|
37
|
+
scope: Construct;
|
|
38
|
+
/** Log group the metric filters attach to. */
|
|
39
|
+
logGroup: ILogGroup;
|
|
40
|
+
/** SNS topic alarms notify on both ALARM and OK transitions. */
|
|
41
|
+
alarmTopic: ITopic;
|
|
42
|
+
/** Default metric namespace for specs that do not override it per-entry. */
|
|
43
|
+
metricNamespace?: string;
|
|
44
|
+
/** The alarms to create. */
|
|
45
|
+
specs: LogPatternAlarmSpec[];
|
|
46
|
+
/** Application ID for webhook-to-application alarm mapping. */
|
|
47
|
+
applicationId?: string;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Create CloudWatch log-pattern alarms — a MetricFilter + Alarm per spec —
|
|
51
|
+
* wired to the SNS topic. Sibling to `createEcsServiceAlarms`: same conventions
|
|
52
|
+
* (`registerAlarm` for ALARM+OK actions, applicationId tagging via
|
|
53
|
+
* `tagAlarmsWithApplicationId`). Reusable across every factory consumer.
|
|
54
|
+
*/
|
|
55
|
+
export declare function createLogPatternAlarms(props: LogPatternAlarmsProps): Alarm[];
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { Duration } from "aws-cdk-lib";
|
|
2
|
+
import { Alarm, ComparisonOperator, Metric, TreatMissingData } from "aws-cdk-lib/aws-cloudwatch";
|
|
3
|
+
import { SnsAction } from "aws-cdk-lib/aws-cloudwatch-actions";
|
|
4
|
+
import { FilterPattern, MetricFilter } from "aws-cdk-lib/aws-logs";
|
|
5
|
+
import { buildAlarmDescription, registerAlarm, tagAlarmsWithApplicationId } from "./alarmDefaults.js";
|
|
6
|
+
/**
|
|
7
|
+
* Validate a spec carries exactly one matcher, a usable id stem, and a
|
|
8
|
+
* resolvable namespace; return the built filter pattern + resolved namespace.
|
|
9
|
+
* Throws (not a `Result`) — this is synth-time CDK construction, where a
|
|
10
|
+
* malformed spec must fail the synth loudly rather than emit a dead alarm.
|
|
11
|
+
*/
|
|
12
|
+
function resolveSpec(spec, defaultNamespace) {
|
|
13
|
+
if (spec.idStem.trim() === "") {
|
|
14
|
+
throw new Error("logPatternAlarm spec requires a non-empty idStem");
|
|
15
|
+
}
|
|
16
|
+
const hasLiteral = spec.literal !== undefined;
|
|
17
|
+
const hasAnyTerms = spec.anyTerms !== undefined;
|
|
18
|
+
if (hasLiteral === hasAnyTerms) {
|
|
19
|
+
throw new Error(`logPatternAlarm spec '${spec.idStem}' must set exactly one of 'literal' or 'anyTerms'`);
|
|
20
|
+
}
|
|
21
|
+
if (hasAnyTerms &&
|
|
22
|
+
spec.anyTerms !== undefined &&
|
|
23
|
+
spec.anyTerms.length === 0) {
|
|
24
|
+
throw new Error(`logPatternAlarm spec '${spec.idStem}' has an empty 'anyTerms' array`);
|
|
25
|
+
}
|
|
26
|
+
const namespace = spec.metricNamespace ?? defaultNamespace;
|
|
27
|
+
if (namespace === undefined || namespace.trim() === "") {
|
|
28
|
+
throw new Error(`logPatternAlarm spec '${spec.idStem}' has no metric namespace — set spec.metricNamespace or props.metricNamespace`);
|
|
29
|
+
}
|
|
30
|
+
const filterPattern = spec.literal !== undefined
|
|
31
|
+
? FilterPattern.literal(spec.literal)
|
|
32
|
+
: FilterPattern.anyTerm(...(spec.anyTerms ?? []));
|
|
33
|
+
return { filterPattern, namespace };
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Create CloudWatch log-pattern alarms — a MetricFilter + Alarm per spec —
|
|
37
|
+
* wired to the SNS topic. Sibling to `createEcsServiceAlarms`: same conventions
|
|
38
|
+
* (`registerAlarm` for ALARM+OK actions, applicationId tagging via
|
|
39
|
+
* `tagAlarmsWithApplicationId`). Reusable across every factory consumer.
|
|
40
|
+
*/
|
|
41
|
+
export function createLogPatternAlarms(props) {
|
|
42
|
+
const { scope, logGroup, alarmTopic, metricNamespace, specs, applicationId } = props;
|
|
43
|
+
const alarms = [];
|
|
44
|
+
const snsAction = new SnsAction(alarmTopic);
|
|
45
|
+
for (const spec of specs) {
|
|
46
|
+
const { filterPattern, namespace } = resolveSpec(spec, metricNamespace);
|
|
47
|
+
new MetricFilter(scope, `${spec.idStem}MetricFilter`, {
|
|
48
|
+
logGroup,
|
|
49
|
+
metricNamespace: namespace,
|
|
50
|
+
metricName: spec.metricName,
|
|
51
|
+
filterPattern,
|
|
52
|
+
metricValue: "1",
|
|
53
|
+
defaultValue: 0
|
|
54
|
+
});
|
|
55
|
+
const evaluationPeriods = spec.evaluationPeriods ?? 1;
|
|
56
|
+
const alarm = new Alarm(scope, `${spec.idStem}Alarm`, {
|
|
57
|
+
alarmDescription: buildAlarmDescription(spec.description, applicationId),
|
|
58
|
+
metric: new Metric({
|
|
59
|
+
namespace,
|
|
60
|
+
metricName: spec.metricName,
|
|
61
|
+
period: spec.period ?? Duration.minutes(1),
|
|
62
|
+
statistic: "Sum"
|
|
63
|
+
}),
|
|
64
|
+
threshold: spec.threshold ?? 1,
|
|
65
|
+
evaluationPeriods,
|
|
66
|
+
datapointsToAlarm: spec.datapointsToAlarm ?? evaluationPeriods,
|
|
67
|
+
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
|
|
68
|
+
treatMissingData: TreatMissingData.NOT_BREACHING
|
|
69
|
+
});
|
|
70
|
+
registerAlarm(alarm, snsAction, alarms);
|
|
71
|
+
}
|
|
72
|
+
tagAlarmsWithApplicationId(alarms, applicationId);
|
|
73
|
+
return alarms;
|
|
74
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Single source of truth for Fjall CloudWatch metric namespaces.
|
|
3
|
+
*
|
|
4
|
+
* Log-pattern metric filters and their alarms read from these namespaces. The
|
|
5
|
+
* webapp app service declares RLS alarms under `WEBAPP` and relocates the
|
|
6
|
+
* ClickHouse stuck-merge alarm under `CLICKHOUSE` (the log line is app-emitted,
|
|
7
|
+
* so it is an app-log alarm carrying a database-domain namespace).
|
|
8
|
+
*/
|
|
9
|
+
export declare const METRIC_NAMESPACE: {
|
|
10
|
+
readonly CLICKHOUSE: "Fjall/ClickHouse";
|
|
11
|
+
readonly WEBAPP: "Fjall/WebApp";
|
|
12
|
+
};
|
|
13
|
+
export type MetricNamespace = (typeof METRIC_NAMESPACE)[keyof typeof METRIC_NAMESPACE];
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Single source of truth for Fjall CloudWatch metric namespaces.
|
|
3
|
+
*
|
|
4
|
+
* Log-pattern metric filters and their alarms read from these namespaces. The
|
|
5
|
+
* webapp app service declares RLS alarms under `WEBAPP` and relocates the
|
|
6
|
+
* ClickHouse stuck-merge alarm under `CLICKHOUSE` (the log line is app-emitted,
|
|
7
|
+
* so it is an app-log alarm carrying a database-domain namespace).
|
|
8
|
+
*/
|
|
9
|
+
export const METRIC_NAMESPACE = {
|
|
10
|
+
CLICKHOUSE: "Fjall/ClickHouse",
|
|
11
|
+
WEBAPP: "Fjall/WebApp"
|
|
12
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@fjall/components-infrastructure",
|
|
3
|
-
"version": "2.16.0",
|
|
3
|
+
"version": "2.18.0",
|
|
4
4
|
"license": "SEE LICENSE IN LICENSE",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -53,7 +53,6 @@
|
|
|
53
53
|
"@peculiar/x509": "1.14.0",
|
|
54
54
|
"@types/aws-lambda": "^8.10.161",
|
|
55
55
|
"@types/node": "^25.6.0",
|
|
56
|
-
"@types/uuid": "^11.0.0",
|
|
57
56
|
"@typescript-eslint/eslint-plugin": "^8.59.1",
|
|
58
57
|
"@typescript-eslint/parser": "^8.59.1",
|
|
59
58
|
"eslint": "^10.2.1",
|
|
@@ -63,10 +62,9 @@
|
|
|
63
62
|
},
|
|
64
63
|
"dependencies": {
|
|
65
64
|
"@aws-sdk/client-organizations": "^3.1038.0",
|
|
66
|
-
"@fjall/generator": "^2.
|
|
67
|
-
"@fjall/util": "^2.
|
|
68
|
-
"constructs": "^10.0.0",
|
|
69
|
-
"uuid": "^14.0.0"
|
|
65
|
+
"@fjall/generator": "^2.18.0",
|
|
66
|
+
"@fjall/util": "^2.18.0",
|
|
67
|
+
"constructs": "^10.0.0"
|
|
70
68
|
},
|
|
71
69
|
"overrides": {
|
|
72
70
|
"@smithy/core": "2.5.5"
|
|
@@ -79,5 +77,5 @@
|
|
|
79
77
|
"engines": {
|
|
80
78
|
"node": ">=18.0.0"
|
|
81
79
|
},
|
|
82
|
-
"gitHead": "
|
|
80
|
+
"gitHead": "37008ca5469398c42a09e6babc8cc4192ab938b2"
|
|
83
81
|
}
|