@fjall/components-infrastructure 2.16.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/lib/lambda-assets/cert-generator/asset/index.js +14 -1
  2. package/dist/lib/patterns/aws/clickhouseDatabase.d.ts +17 -0
  3. package/dist/lib/patterns/aws/clickhouseDatabase.js +16 -0
  4. package/dist/lib/patterns/aws/compute.d.ts +2 -2
  5. package/dist/lib/patterns/aws/computeEcs.d.ts +12 -1
  6. package/dist/lib/patterns/aws/computeEcs.js +57 -0
  7. package/dist/lib/patterns/aws/computeEcsTypes.d.ts +51 -1
  8. package/dist/lib/patterns/aws/database.d.ts +19 -1
  9. package/dist/lib/patterns/aws/database.js +21 -2
  10. package/dist/lib/resources/aws/compute/ec2GracefulTerminationHandler.js +13 -3
  11. package/dist/lib/resources/aws/compute/ecs.js +14 -3
  12. package/dist/lib/resources/aws/compute/ecsConstants.d.ts +2 -0
  13. package/dist/lib/resources/aws/compute/ecsConstants.js +4 -0
  14. package/dist/lib/resources/aws/compute/ecsTaskDefinition.d.ts +2 -0
  15. package/dist/lib/resources/aws/compute/ecsTaskDefinition.js +13 -4
  16. package/dist/lib/resources/aws/compute/ecsTypes.d.ts +17 -1
  17. package/dist/lib/resources/aws/compute/ecsValidation.d.ts +7 -0
  18. package/dist/lib/resources/aws/compute/ecsValidation.js +10 -0
  19. package/dist/lib/resources/aws/compute/lambda.js +20 -2
  20. package/dist/lib/resources/aws/compute/persistentDataVolume.js +5 -1
  21. package/dist/lib/resources/aws/database/rdsInstance.d.ts +19 -0
  22. package/dist/lib/resources/aws/database/rdsInstance.js +13 -1
  23. package/dist/lib/resources/aws/messaging/sns.d.ts +5 -0
  24. package/dist/lib/resources/aws/messaging/sns.js +7 -1
  25. package/dist/lib/resources/aws/messaging/sqs.d.ts +6 -0
  26. package/dist/lib/resources/aws/messaging/sqs.js +10 -2
  27. package/dist/lib/resources/aws/monitoring/clickhouseAlarms.d.ts +10 -15
  28. package/dist/lib/resources/aws/monitoring/clickhouseAlarms.js +34 -56
  29. package/dist/lib/resources/aws/monitoring/ecsAlarms.js +5 -1
  30. package/dist/lib/resources/aws/monitoring/index.d.ts +2 -0
  31. package/dist/lib/resources/aws/monitoring/index.js +2 -0
  32. package/dist/lib/resources/aws/monitoring/logPatternAlarms.d.ts +55 -0
  33. package/dist/lib/resources/aws/monitoring/logPatternAlarms.js +74 -0
  34. package/dist/lib/resources/aws/monitoring/metricNamespaces.d.ts +13 -0
  35. package/dist/lib/resources/aws/monitoring/metricNamespaces.js +12 -0
  36. package/package.json +5 -7
@@ -13,6 +13,13 @@ import { ScalingType } from "./ecsTypes.js";
13
13
  * layer consumers never see a `migrations` field, so duplicating the
14
14
  * validation here would be unreachable.
15
15
  *
16
+ * Same applies to `service.awaitMigrationsFrom`: it is a patterns-layer
17
+ * cross-service ordering knob, resolved into a `node.addDependency(...)` edge
18
+ * (`wireServiceMigrationDependencies` in `computeEcs.ts`) BEFORE reaching the
19
+ * resources layer. It is not a field on `EcsServiceProps`, so a direct
20
+ * `new EcsCluster(...)` consumer cannot pass it — there is no resources-layer
21
+ * code path to validate.
22
+ *
16
23
  * @param props - The cluster props to validate
17
24
  * @throws Error if validation fails
18
25
  */
@@ -94,6 +101,9 @@ export function validateEcsClusterProps(props) {
94
101
  if (max !== undefined && (max < 100 || max > 200)) {
95
102
  throw new Error(`Service '${service.name}': deployment.maxHealthyPercent must be between 100 and 200 (got ${max}).`);
96
103
  }
104
+ if (min !== undefined && max !== undefined && min > max) {
105
+ throw new Error(`Service '${service.name}': deployment.minHealthyPercent (${min}) must be <= maxHealthyPercent (${max}).`);
106
+ }
97
107
  if (min === 100 && max === 100) {
98
108
  throw new Error(`Service '${service.name}': deployment.minHealthyPercent and maxHealthyPercent cannot both be 100 ` +
99
109
  "(no capacity to drain or expand — deploys would never roll forward).");
@@ -3,12 +3,12 @@ import { SingletonFunction as singletonFunction, Function, Code, Architecture, F
3
3
  import { FactName } from "aws-cdk-lib/region-info";
4
4
  import path from "node:path";
5
5
  import { fileURLToPath } from "node:url";
6
+ import { createHash } from "node:crypto";
6
7
  import { SqsEventSource, DynamoEventSource, S3EventSource } from "aws-cdk-lib/aws-lambda-event-sources";
7
8
  import { EventType } from "aws-cdk-lib/aws-s3";
8
9
  import { PolicyStatement, Effect } from "aws-cdk-lib/aws-iam";
9
10
  import { RetentionDays } from "aws-cdk-lib/aws-logs";
10
11
  import { LogGroup } from "../logging/logGroup.js";
11
- import { v4 as uuid } from "uuid";
12
12
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
13
13
  import { resolveImportedSecret } from "../secrets/index.js";
14
14
  import { toPascalCase } from "../../../utils/capitaliseString.js";
@@ -65,11 +65,29 @@ const SECRETS_EXTENSION = {
65
65
  * mis-tune alarms relative to runtime behaviour.
66
66
  */
67
67
  const LAMBDA_DEFAULT_TIMEOUT_SECONDS = 300;
68
+ /**
69
+ * Stable, deterministic uuid for a SingletonFunction so its logical ID
70
+ * (`SingletonLambda${uuid-without-dashes}`) does not drift across synths. A
71
+ * random default would recreate the singleton — and re-run any custom resource
72
+ * fronted by it — on every deploy.
73
+ */
74
+ function deriveStableSingletonUuid(scope, id) {
75
+ const hash = createHash("sha256")
76
+ .update(`${scope.node.path}/${id}`)
77
+ .digest("hex");
78
+ return [
79
+ hash.slice(0, 8),
80
+ hash.slice(8, 12),
81
+ hash.slice(12, 16),
82
+ hash.slice(16, 20),
83
+ hash.slice(20, 32)
84
+ ].join("-");
85
+ }
68
86
  export class SingletonFunction extends singletonFunction {
69
87
  constructor(scope, id, props) {
70
88
  super(scope, id, {
71
89
  ...props,
72
- uuid: props.uuid ?? uuid(),
90
+ uuid: props.uuid ?? deriveStableSingletonUuid(scope, id),
73
91
  timeout: Duration.seconds(props.timeout ?? LAMBDA_DEFAULT_TIMEOUT_SECONDS),
74
92
  description: props.lambdaDescription ?? `${id} singleton lambda`,
75
93
  runtime: props.runtime,
@@ -105,7 +105,11 @@ export class PersistentDataVolume extends Construct {
105
105
  Tags.of(this.volume).add(PERSISTENT_DATA_VOLUME_TAG_STACK_ID, Aws.STACK_ID);
106
106
  this.queue = new SQSQueue(this, `${id}Queue`, {
107
107
  visibilityTimeout: QUEUE_VISIBILITY_TIMEOUT_SECONDS,
108
- deadLetterQueue: { enabled: true, maxReceiveCount: 5 }
108
+ deadLetterQueue: { enabled: true, maxReceiveCount: 5 },
109
+ // Transient volume-attach signals — no durable state (the EBS volume
110
+ // itself is SNAPSHOT above). Pinned DESTROY (now also the SQSQueue wrapper
111
+ // default) so a replacing deploy reclaims this queue + DLQ, not orphans.
112
+ removalPolicy: "DESTROY"
109
113
  });
110
114
  const sourcePath = path.resolve(__dirname, LAUNCHING_LAMBDA_SOURCE_FILE);
111
115
  const source = readFileSync(sourcePath, "utf-8");
@@ -1,6 +1,7 @@
1
1
  import { Duration } from "aws-cdk-lib";
2
2
  import { Connections, type IConnectable, type IVpc } from "aws-cdk-lib/aws-ec2";
3
3
  import { type IInstanceEngine } from "aws-cdk-lib/aws-rds";
4
+ import { type IGrantable, type Grant } from "aws-cdk-lib/aws-iam";
4
5
  import { Construct } from "constructs";
5
6
  import { SecurityGroup } from "../networking/securityGroup.js";
6
7
  import { Secret } from "../secrets/index.js";
@@ -31,6 +32,15 @@ interface RdsProps {
31
32
  encryption?: EncryptionConfig;
32
33
  publiclyAccessible?: boolean;
33
34
  deletionProtection?: boolean;
35
+ /**
36
+ * Enable RDS IAM database authentication on the instance. Opt-in; defaults to
37
+ * off (undefined → CDK omits the property, leaving existing consumers' synth
38
+ * unchanged). When true, IAM principals granted via {@link grantIamConnect}
39
+ * connect with short-lived `rds-db:connect` tokens instead of a stored
40
+ * password — password auth keeps working in parallel. See ADR
41
+ * decisions/2026-06-17-rls-role-auth-and-launch-gating.md.
42
+ */
43
+ iamAuthentication?: boolean;
34
44
  /** ARN or identifier of DB instance snapshot to restore from */
35
45
  snapshotIdentifier?: string;
36
46
  /** Username from the snapshot (required when restoring from snapshot to reset password) */
@@ -70,6 +80,15 @@ export declare class RdsInstance extends Construct implements IConnectable {
70
80
  }>;
71
81
  getDatabaseName(): string;
72
82
  getConnectionString(): string;
83
+ /**
84
+ * Grant an IAM principal permission to connect as `dbUsername` via RDS IAM
85
+ * database authentication. Requires the instance to be created with
86
+ * `iamAuthentication: true`. Delegates to the L2 `grantConnect`, which scopes
87
+ * `rds-db:connect` to the exact `dbuser:<dbiResourceId>/<dbUsername>` ARN —
88
+ * never a bare wildcard. See ADR
89
+ * decisions/2026-06-17-rls-role-auth-and-launch-gating.md.
90
+ */
91
+ grantIamConnect(grantee: IGrantable, dbUsername: string): Grant;
73
92
  static build(id: string, props: RdsProps): (sb: StackBuilder) => Construct;
74
93
  }
75
94
  export {};
@@ -136,7 +136,8 @@ export class RdsInstance extends Construct {
136
136
  deletionProtection: props.deletionProtection ?? true,
137
137
  preferredMaintenanceWindow: props.preferredMaintenanceWindow ??
138
138
  RDS_DEFAULTS.PREFERRED_MAINTENANCE_WINDOW,
139
- publiclyAccessible: props.publiclyAccessible ?? false
139
+ publiclyAccessible: props.publiclyAccessible ?? false,
140
+ iamAuthentication: props.iamAuthentication
140
141
  };
141
142
  if (props.snapshotIdentifier) {
142
143
  // Create from snapshot
@@ -328,6 +329,17 @@ exports.handler = async (event) => {
328
329
  getConnectionString() {
329
330
  return `${this.engineConfig.family}://${this.getHostEndpoint()}:${this.getHostPort()}/${this.getDatabaseName()}`;
330
331
  }
332
+ /**
333
+ * Grant an IAM principal permission to connect as `dbUsername` via RDS IAM
334
+ * database authentication. Requires the instance to be created with
335
+ * `iamAuthentication: true`. Delegates to the L2 `grantConnect`, which scopes
336
+ * `rds-db:connect` to the exact `dbuser:<dbiResourceId>/<dbUsername>` ARN —
337
+ * never a bare wildcard. See ADR
338
+ * decisions/2026-06-17-rls-role-auth-and-launch-gating.md.
339
+ */
340
+ grantIamConnect(grantee, dbUsername) {
341
+ return this.database.grantConnect(grantee, dbUsername);
342
+ }
331
343
  static build(id, props) {
332
344
  return (sb) => {
333
345
  const newProps = {
@@ -7,6 +7,11 @@ export interface SNSTopicProps {
7
7
  displayName?: string;
8
8
  fifo?: boolean;
9
9
  contentBasedDeduplication?: boolean;
10
+ /**
11
+ * Removal policy for the topic. Defaults to DESTROY — a topic is a transient
12
+ * fan-out medium with no durable state (see the constructor). Pass "RETAIN"
13
+ * only for a topic that must survive stack deletion.
14
+ */
10
15
  removalPolicy?: RemovalPolicyString;
11
16
  }
12
17
  export declare class SNSTopic extends Construct {
@@ -22,7 +22,13 @@ export class SNSTopic extends Construct {
22
22
  ? (props.contentBasedDeduplication ?? true)
23
23
  : undefined
24
24
  });
25
- this.topic.applyRemovalPolicy(toRemovalPolicy(props.removalPolicy));
25
+ // An SNS topic is a transient fan-out medium: it holds no durable state
26
+ // (subscriptions are re-created by the next deploy), so the wrapper defaults
27
+ // to DESTROY — deliberately NOT the env-aware production->RETAIN of
28
+ // data-bearing wrappers (S3, EventBus). RETAIN here buys no protection and
29
+ // orphans the topic on a parent construct's logical-ID churn (the same leak
30
+ // class fixed for SQS). Durable topics opt into RETAIN via props.removalPolicy.
31
+ this.topic.applyRemovalPolicy(toRemovalPolicy(props.removalPolicy ?? "DESTROY"));
26
32
  new CfnOutput(this, `${id}TopicArn`, {
27
33
  key: `${id}TopicArn`,
28
34
  value: this.topic.topicArn,
@@ -60,6 +60,12 @@ export interface SQSQueueProps {
60
60
  contentBasedDeduplication?: boolean;
61
61
  fifoThroughputLimit?: "perQueue" | "perMessageGroupId";
62
62
  deduplicationScope?: "queue" | "messageGroup";
63
+ /**
64
+ * Removal policy for the queue (and its auto-created DLQ, which tracks it).
65
+ * Defaults to DESTROY — a queue is a transient work medium (see the
66
+ * constructor). Pass "RETAIN" only for a queue whose contents are
67
+ * irreplaceable and must survive stack deletion.
68
+ */
63
69
  removalPolicy?: RemovalPolicyString;
64
70
  }
65
71
  export declare class SQSQueue extends Construct {
@@ -83,6 +83,14 @@ export class SQSQueue extends Construct {
83
83
  this.id = id;
84
84
  // Sanitise id for CloudFormation output keys (must be alphanumeric)
85
85
  const outputName = toPascalCase(id);
86
+ // SQS queues are transient work mediums: their contents (job messages,
87
+ // failed-delivery copies) regenerate from a source of truth held elsewhere
88
+ // (Postgres, the producing schedule/rule), so the wrapper defaults to
89
+ // DESTROY — deliberately NOT the env-aware production->RETAIN of
90
+ // data-bearing wrappers (S3, EventBus). RETAIN here buys no protection and
91
+ // orphans the queue on a parent construct's logical-ID churn (the prod
92
+ // orphan leak). Durable queues opt into RETAIN via props.removalPolicy.
93
+ const resolvedRemovalPolicy = toRemovalPolicy(props.removalPolicy ?? "DESTROY");
86
94
  const isFifo = props.queueType === "fifo";
87
95
  const queueName = props.queueName
88
96
  ? isFifo
@@ -110,7 +118,7 @@ export class SQSQueue extends Construct {
110
118
  fifo: isFifo,
111
119
  encryption: toEncryption(props.encryption),
112
120
  retentionPeriod: Duration.days(SQS_LIMITS.DEAD_LETTER_QUEUE.DEFAULT_RETENTION_DAYS),
113
- removalPolicy: toRemovalPolicy(props.removalPolicy)
121
+ removalPolicy: resolvedRemovalPolicy
114
122
  });
115
123
  deadLetterQueue = {
116
124
  queue: this.dlq,
@@ -156,7 +164,7 @@ export class SQSQueue extends Construct {
156
164
  deduplicationScope: isFifo
157
165
  ? toDeduplicationScope(props.deduplicationScope)
158
166
  : undefined,
159
- removalPolicy: toRemovalPolicy(props.removalPolicy)
167
+ removalPolicy: resolvedRemovalPolicy
160
168
  });
161
169
  new CfnOutput(this, `${outputName}QueueUrl`, {
162
170
  key: `${outputName}QueueUrl`,
@@ -29,32 +29,27 @@ export interface ClickHouseAlarmsProps {
29
29
  asgName: string;
30
30
  alarmTopic: ITopic;
31
31
  /**
32
- * Webapp log group. Required to wire the stuck-merge alarm — `client.ts`
33
- * emits `serverLogger.warn("ClickHouse", "Stuck merge detected")` when
34
- * `system.merges` shows a merge elapsed > 30 min.
35
- */
36
- webappLogGroup: ILogGroup;
37
- /**
38
- * Backup-task log group. Required to wire the backup-failure alarm —
32
+ * Backup-task log group. When present, wires the backup-failure alarm —
39
33
  * `BACKUP DATABASE … TO S3(…)` emits `AccessDenied` / `S3Exception` lines
40
34
  * when the IAM grant or bucket policy is misconfigured (silent before the
41
35
  * alarm landed; the daily backup task exited non-zero with no signal).
36
+ * Omitted when `backupSchedule: false` — no backup task, no log group.
42
37
  */
43
- backupTaskLogGroup: ILogGroup;
38
+ backupTaskLogGroup?: ILogGroup;
44
39
  config?: ClickHouseAlarmThresholds;
45
40
  }
46
41
  /**
47
- * Single-node ClickHouse posture alarms. Covers host-level CPU + (optional)
48
- * memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus
49
- * two log-driven alarms:
42
+ * Single-node ClickHouse host-posture alarms. Covers host-level CPU + (optional)
43
+ * memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus the
44
+ * backup-failure log alarm when a backup-task log group is supplied:
50
45
  *
51
- * - **Stuck merges** — `client.ts` polls `system.merges` every 5 min and logs
52
- * `serverLogger.warn("ClickHouse", "Stuck merge detected")` when elapsed
53
- * exceeds 30 min. The metric filter on the webapp log group emits a count
54
- * metric per match; the alarm fires on Sum >= 1 over 5 min × 2 evaluations.
55
46
  * - **Backup failures** — `AccessDenied` or `S3Exception` from the backup
56
47
  * task's BACKUP DATABASE TO S3 statement. Closes the silent-failure mode
57
48
  * that masked the original IAM-grant misconfiguration (see
58
49
  * `designs/2026-04-27-clickhouse-backup-iam-role.md`).
50
+ *
51
+ * The stuck-merge alarm — a `"Stuck merge detected"` line emitted by the webapp
52
+ * app process, not this construct — lives on the app service's declarative
53
+ * `logAlarms` instead; it is an app-log alarm, not a database-host concern.
59
54
  */
60
55
  export declare function createClickHouseAlarms(props: ClickHouseAlarmsProps): Alarm[];
@@ -4,23 +4,23 @@ import { SnsAction } from "aws-cdk-lib/aws-cloudwatch-actions";
4
4
  import { Metric } from "aws-cdk-lib/aws-cloudwatch";
5
5
  import { FilterPattern, MetricFilter } from "aws-cdk-lib/aws-logs";
6
6
  import { ALARM_DEFAULTS, registerAlarm, buildAlarmDescription } from "./alarmDefaults.js";
7
- const CLICKHOUSE_METRIC_NAMESPACE = "Fjall/ClickHouse";
7
+ import { METRIC_NAMESPACE } from "./metricNamespaces.js";
8
8
  /**
9
- * Single-node ClickHouse posture alarms. Covers host-level CPU + (optional)
10
- * memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus
11
- * two log-driven alarms:
9
+ * Single-node ClickHouse host-posture alarms. Covers host-level CPU + (optional)
10
+ * memory and disk via the CloudWatch Agent metric namespace `CWAgent`, plus the
11
+ * backup-failure log alarm when a backup-task log group is supplied:
12
12
  *
13
- * - **Stuck merges** — `client.ts` polls `system.merges` every 5 min and logs
14
- * `serverLogger.warn("ClickHouse", "Stuck merge detected")` when elapsed
15
- * exceeds 30 min. The metric filter on the webapp log group emits a count
16
- * metric per match; the alarm fires on Sum >= 1 over 5 min × 2 evaluations.
17
13
  * - **Backup failures** — `AccessDenied` or `S3Exception` from the backup
18
14
  * task's BACKUP DATABASE TO S3 statement. Closes the silent-failure mode
19
15
  * that masked the original IAM-grant misconfiguration (see
20
16
  * `designs/2026-04-27-clickhouse-backup-iam-role.md`).
17
+ *
18
+ * The stuck-merge alarm — a `"Stuck merge detected"` line emitted by the webapp
19
+ * app process, not this construct — lives on the app service's declarative
20
+ * `logAlarms` instead; it is an app-log alarm, not a database-host concern.
21
21
  */
22
22
  export function createClickHouseAlarms(props) {
23
- const { scope, instanceRole, asgName, alarmTopic, webappLogGroup, backupTaskLogGroup, config = {} } = props;
23
+ const { scope, instanceRole, asgName, alarmTopic, backupTaskLogGroup, config = {} } = props;
24
24
  const alarms = [];
25
25
  const snsAction = new SnsAction(alarmTopic);
26
26
  const cpuAlarm = new Alarm(scope, "ClickHouseCpuAlarm", {
@@ -87,53 +87,31 @@ export function createClickHouseAlarms(props) {
87
87
  treatMissingData: TreatMissingData.NOT_BREACHING
88
88
  });
89
89
  registerAlarm(diskCriticalAlarm, snsAction, alarms);
90
- const stuckMergeMetricName = "ClickHouseStuckMergeCount";
91
- new MetricFilter(scope, "ClickHouseStuckMergeMetricFilter", {
92
- logGroup: webappLogGroup,
93
- metricNamespace: CLICKHOUSE_METRIC_NAMESPACE,
94
- metricName: stuckMergeMetricName,
95
- filterPattern: FilterPattern.literal('"Stuck merge detected"'),
96
- metricValue: "1",
97
- defaultValue: 0
98
- });
99
- const stuckMergeAlarm = new Alarm(scope, "ClickHouseStuckMergeAlarm", {
100
- alarmDescription: buildAlarmDescription("ClickHouse merge stuck > 30 min — investigate parts pressure or replica health", undefined),
101
- metric: new Metric({
102
- namespace: CLICKHOUSE_METRIC_NAMESPACE,
103
- metricName: stuckMergeMetricName,
104
- period: Duration.minutes(5),
105
- statistic: "Sum"
106
- }),
107
- threshold: 1,
108
- evaluationPeriods: 2,
109
- datapointsToAlarm: 2,
110
- comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
111
- treatMissingData: TreatMissingData.NOT_BREACHING
112
- });
113
- registerAlarm(stuckMergeAlarm, snsAction, alarms);
114
- const backupFailureMetricName = "ClickHouseBackupFailureCount";
115
- new MetricFilter(scope, "ClickHouseBackupFailureMetricFilter", {
116
- logGroup: backupTaskLogGroup,
117
- metricNamespace: CLICKHOUSE_METRIC_NAMESPACE,
118
- metricName: backupFailureMetricName,
119
- filterPattern: FilterPattern.anyTerm("AccessDenied", "S3Exception"),
120
- metricValue: "1",
121
- defaultValue: 0
122
- });
123
- const backupFailureAlarm = new Alarm(scope, "ClickHouseBackupFailureAlarm", {
124
- alarmDescription: buildAlarmDescription(`ClickHouse BACKUP TO S3 emitted AccessDenied/S3Exception — verify instance role '${instanceRole.roleName}' grant on backup bucket`, undefined),
125
- metric: new Metric({
126
- namespace: CLICKHOUSE_METRIC_NAMESPACE,
90
+ if (backupTaskLogGroup !== undefined) {
91
+ const backupFailureMetricName = "ClickHouseBackupFailureCount";
92
+ new MetricFilter(scope, "ClickHouseBackupFailureMetricFilter", {
93
+ logGroup: backupTaskLogGroup,
94
+ metricNamespace: METRIC_NAMESPACE.CLICKHOUSE,
127
95
  metricName: backupFailureMetricName,
128
- period: Duration.hours(1),
129
- statistic: "Sum"
130
- }),
131
- threshold: 1,
132
- evaluationPeriods: 1,
133
- datapointsToAlarm: 1,
134
- comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
135
- treatMissingData: TreatMissingData.NOT_BREACHING
136
- });
137
- registerAlarm(backupFailureAlarm, snsAction, alarms);
96
+ filterPattern: FilterPattern.anyTerm("AccessDenied", "S3Exception"),
97
+ metricValue: "1",
98
+ defaultValue: 0
99
+ });
100
+ const backupFailureAlarm = new Alarm(scope, "ClickHouseBackupFailureAlarm", {
101
+ alarmDescription: buildAlarmDescription(`ClickHouse BACKUP TO S3 emitted AccessDenied/S3Exception — verify instance role '${instanceRole.roleName}' grant on backup bucket`, undefined),
102
+ metric: new Metric({
103
+ namespace: METRIC_NAMESPACE.CLICKHOUSE,
104
+ metricName: backupFailureMetricName,
105
+ period: Duration.hours(1),
106
+ statistic: "Sum"
107
+ }),
108
+ threshold: 1,
109
+ evaluationPeriods: 1,
110
+ datapointsToAlarm: 1,
111
+ comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
112
+ treatMissingData: TreatMissingData.NOT_BREACHING
113
+ });
114
+ registerAlarm(backupFailureAlarm, snsAction, alarms);
115
+ }
138
116
  return alarms;
139
117
  }
@@ -43,7 +43,11 @@ export function createEcsServiceAlarms(props) {
43
43
  evaluationPeriods: 2,
44
44
  datapointsToAlarm: 2,
45
45
  comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD,
46
- treatMissingData: TreatMissingData.BREACHING
46
+ // RunningTaskCount (Container Insights) is sparse, so missing data must
47
+ // not breach — otherwise metric gaps false-alarm healthy services and
48
+ // scaled-to-zero workers alarm permanently. A real count < threshold
49
+ // still fires.
50
+ treatMissingData: TreatMissingData.NOT_BREACHING
47
51
  });
48
52
  registerAlarm(runningTasksAlarm, snsAction, alarms);
49
53
  }
@@ -4,3 +4,5 @@ export { createRdsAlarms, type RdsAlarmThresholds, type RdsAlarmsProps } from ".
4
4
  export { createLambdaAlarms, type LambdaAlarmThresholds, type LambdaAlarmsProps } from "./lambdaAlarms.js";
5
5
  export { createScheduleAlarms, type ScheduleAlarmThresholds, type CreateScheduleAlarmsProps } from "./scheduleAlarms.js";
6
6
  export { createClickHouseAlarms, type ClickHouseAlarmThresholds, type ClickHouseAlarmsProps } from "./clickhouseAlarms.js";
7
+ export { createLogPatternAlarms, type LogPatternAlarmSpec, type LogPatternAlarmsProps } from "./logPatternAlarms.js";
8
+ export { METRIC_NAMESPACE, type MetricNamespace } from "./metricNamespaces.js";
@@ -4,3 +4,5 @@ export { createRdsAlarms } from "./rdsAlarms.js";
4
4
  export { createLambdaAlarms } from "./lambdaAlarms.js";
5
5
  export { createScheduleAlarms } from "./scheduleAlarms.js";
6
6
  export { createClickHouseAlarms } from "./clickhouseAlarms.js";
7
+ export { createLogPatternAlarms } from "./logPatternAlarms.js";
8
+ export { METRIC_NAMESPACE } from "./metricNamespaces.js";
@@ -0,0 +1,55 @@
1
+ import { Duration } from "aws-cdk-lib";
2
+ import { Alarm } from "aws-cdk-lib/aws-cloudwatch";
3
+ import { type ILogGroup } from "aws-cdk-lib/aws-logs";
4
+ import type { ITopic } from "aws-cdk-lib/aws-sns";
5
+ import type { Construct } from "constructs";
6
+ /**
7
+ * One log-pattern alarm: a CloudWatch metric filter over a log group plus the
8
+ * alarm that fires when the pattern is matched. Declarative — callers describe
9
+ * the match with plain strings; the raw CDK `FilterPattern` is built inside the
10
+ * primitive, so the spec never leaks raw CDK at the call site.
11
+ *
12
+ * Exactly one of `literal` / `anyTerms` must be set.
13
+ */
14
+ export interface LogPatternAlarmSpec {
15
+ /** Stable logical-ID stem; the MetricFilter + Alarm derive their construct IDs from it. */
16
+ idStem: string;
17
+ /** CloudWatch metric name emitted by the filter and read by the alarm. */
18
+ metricName: string;
19
+ /** Responder-facing alarm description (the applicationId tag is appended automatically). */
20
+ description: string;
21
+ /** Match this exact CloudWatch Logs filter pattern (e.g. `'"Stuck merge detected"'`). */
22
+ literal?: string;
23
+ /** Match if ANY of these terms appears (CloudWatch Logs OR semantics). */
24
+ anyTerms?: string[];
25
+ /** Metric namespace for this spec; falls back to the props-level default. */
26
+ metricNamespace?: string;
27
+ /** Sum-over-period threshold the alarm fires at. Default 1 (first match). */
28
+ threshold?: number;
29
+ /** Metric period. Default 1 minute (fast fail-closed detection). */
30
+ period?: Duration;
31
+ /** Consecutive periods evaluated. Default 1. */
32
+ evaluationPeriods?: number;
33
+ /** Datapoints within the window that must breach. Default = evaluationPeriods. */
34
+ datapointsToAlarm?: number;
35
+ }
36
+ export interface LogPatternAlarmsProps {
37
+ scope: Construct;
38
+ /** Log group the metric filters attach to. */
39
+ logGroup: ILogGroup;
40
+ /** SNS topic alarms notify on both ALARM and OK transitions. */
41
+ alarmTopic: ITopic;
42
+ /** Default metric namespace for specs that do not override it per-entry. */
43
+ metricNamespace?: string;
44
+ /** The alarms to create. */
45
+ specs: LogPatternAlarmSpec[];
46
+ /** Application ID for webhook-to-application alarm mapping. */
47
+ applicationId?: string;
48
+ }
49
+ /**
50
+ * Create CloudWatch log-pattern alarms — a MetricFilter + Alarm per spec —
51
+ * wired to the SNS topic. Sibling to `createEcsServiceAlarms`: same conventions
52
+ * (`registerAlarm` for ALARM+OK actions, applicationId tagging via
53
+ * `tagAlarmsWithApplicationId`). Reusable across every factory consumer.
54
+ */
55
+ export declare function createLogPatternAlarms(props: LogPatternAlarmsProps): Alarm[];
@@ -0,0 +1,74 @@
1
+ import { Duration } from "aws-cdk-lib";
2
+ import { Alarm, ComparisonOperator, Metric, TreatMissingData } from "aws-cdk-lib/aws-cloudwatch";
3
+ import { SnsAction } from "aws-cdk-lib/aws-cloudwatch-actions";
4
+ import { FilterPattern, MetricFilter } from "aws-cdk-lib/aws-logs";
5
+ import { buildAlarmDescription, registerAlarm, tagAlarmsWithApplicationId } from "./alarmDefaults.js";
6
+ /**
7
+ * Validate a spec carries exactly one matcher, a usable id stem, and a
8
+ * resolvable namespace; return the built filter pattern + resolved namespace.
9
+ * Throws (not a `Result`) — this is synth-time CDK construction, where a
10
+ * malformed spec must fail the synth loudly rather than emit a dead alarm.
11
+ */
12
+ function resolveSpec(spec, defaultNamespace) {
13
+ if (spec.idStem.trim() === "") {
14
+ throw new Error("logPatternAlarm spec requires a non-empty idStem");
15
+ }
16
+ const hasLiteral = spec.literal !== undefined;
17
+ const hasAnyTerms = spec.anyTerms !== undefined;
18
+ if (hasLiteral === hasAnyTerms) {
19
+ throw new Error(`logPatternAlarm spec '${spec.idStem}' must set exactly one of 'literal' or 'anyTerms'`);
20
+ }
21
+ if (hasAnyTerms &&
22
+ spec.anyTerms !== undefined &&
23
+ spec.anyTerms.length === 0) {
24
+ throw new Error(`logPatternAlarm spec '${spec.idStem}' has an empty 'anyTerms' array`);
25
+ }
26
+ const namespace = spec.metricNamespace ?? defaultNamespace;
27
+ if (namespace === undefined || namespace.trim() === "") {
28
+ throw new Error(`logPatternAlarm spec '${spec.idStem}' has no metric namespace — set spec.metricNamespace or props.metricNamespace`);
29
+ }
30
+ const filterPattern = spec.literal !== undefined
31
+ ? FilterPattern.literal(spec.literal)
32
+ : FilterPattern.anyTerm(...(spec.anyTerms ?? []));
33
+ return { filterPattern, namespace };
34
+ }
35
+ /**
36
+ * Create CloudWatch log-pattern alarms — a MetricFilter + Alarm per spec —
37
+ * wired to the SNS topic. Sibling to `createEcsServiceAlarms`: same conventions
38
+ * (`registerAlarm` for ALARM+OK actions, applicationId tagging via
39
+ * `tagAlarmsWithApplicationId`). Reusable across every factory consumer.
40
+ */
41
+ export function createLogPatternAlarms(props) {
42
+ const { scope, logGroup, alarmTopic, metricNamespace, specs, applicationId } = props;
43
+ const alarms = [];
44
+ const snsAction = new SnsAction(alarmTopic);
45
+ for (const spec of specs) {
46
+ const { filterPattern, namespace } = resolveSpec(spec, metricNamespace);
47
+ new MetricFilter(scope, `${spec.idStem}MetricFilter`, {
48
+ logGroup,
49
+ metricNamespace: namespace,
50
+ metricName: spec.metricName,
51
+ filterPattern,
52
+ metricValue: "1",
53
+ defaultValue: 0
54
+ });
55
+ const evaluationPeriods = spec.evaluationPeriods ?? 1;
56
+ const alarm = new Alarm(scope, `${spec.idStem}Alarm`, {
57
+ alarmDescription: buildAlarmDescription(spec.description, applicationId),
58
+ metric: new Metric({
59
+ namespace,
60
+ metricName: spec.metricName,
61
+ period: spec.period ?? Duration.minutes(1),
62
+ statistic: "Sum"
63
+ }),
64
+ threshold: spec.threshold ?? 1,
65
+ evaluationPeriods,
66
+ datapointsToAlarm: spec.datapointsToAlarm ?? evaluationPeriods,
67
+ comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
68
+ treatMissingData: TreatMissingData.NOT_BREACHING
69
+ });
70
+ registerAlarm(alarm, snsAction, alarms);
71
+ }
72
+ tagAlarmsWithApplicationId(alarms, applicationId);
73
+ return alarms;
74
+ }
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Single source of truth for Fjall CloudWatch metric namespaces.
3
+ *
4
+ * Log-pattern metric filters and their alarms read from these namespaces. The
5
+ * webapp app service declares RLS alarms under `WEBAPP` and relocates the
6
+ * ClickHouse stuck-merge alarm under `CLICKHOUSE` (the log line is app-emitted,
7
+ * so it is an app-log alarm carrying a database-domain namespace).
8
+ */
9
+ export declare const METRIC_NAMESPACE: {
10
+ readonly CLICKHOUSE: "Fjall/ClickHouse";
11
+ readonly WEBAPP: "Fjall/WebApp";
12
+ };
13
+ export type MetricNamespace = (typeof METRIC_NAMESPACE)[keyof typeof METRIC_NAMESPACE];
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Single source of truth for Fjall CloudWatch metric namespaces.
3
+ *
4
+ * Log-pattern metric filters and their alarms read from these namespaces. The
5
+ * webapp app service declares RLS alarms under `WEBAPP` and relocates the
6
+ * ClickHouse stuck-merge alarm under `CLICKHOUSE` (the log line is app-emitted,
7
+ * so it is an app-log alarm carrying a database-domain namespace).
8
+ */
9
+ export const METRIC_NAMESPACE = {
10
+ CLICKHOUSE: "Fjall/ClickHouse",
11
+ WEBAPP: "Fjall/WebApp"
12
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fjall/components-infrastructure",
3
- "version": "2.16.0",
3
+ "version": "2.18.0",
4
4
  "license": "SEE LICENSE IN LICENSE",
5
5
  "type": "module",
6
6
  "bin": {
@@ -53,7 +53,6 @@
53
53
  "@peculiar/x509": "1.14.0",
54
54
  "@types/aws-lambda": "^8.10.161",
55
55
  "@types/node": "^25.6.0",
56
- "@types/uuid": "^11.0.0",
57
56
  "@typescript-eslint/eslint-plugin": "^8.59.1",
58
57
  "@typescript-eslint/parser": "^8.59.1",
59
58
  "eslint": "^10.2.1",
@@ -63,10 +62,9 @@
63
62
  },
64
63
  "dependencies": {
65
64
  "@aws-sdk/client-organizations": "^3.1038.0",
66
- "@fjall/generator": "^2.16.0",
67
- "@fjall/util": "^2.16.0",
68
- "constructs": "^10.0.0",
69
- "uuid": "^14.0.0"
65
+ "@fjall/generator": "^2.18.0",
66
+ "@fjall/util": "^2.18.0",
67
+ "constructs": "^10.0.0"
70
68
  },
71
69
  "overrides": {
72
70
  "@smithy/core": "2.5.5"
@@ -79,5 +77,5 @@
79
77
  "engines": {
80
78
  "node": ">=18.0.0"
81
79
  },
82
- "gitHead": "2383b19f1e7db980ae603a6c75dca2c61b7a1d42"
80
+ "gitHead": "37008ca5469398c42a09e6babc8cc4192ab938b2"
83
81
  }