@rulebricks/cli 2.1.7 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/README.md +51 -16
  2. package/cluster-setup/aws/README.md +96 -47
  3. package/cluster-setup/aws/check-aws-access.sh +216 -52
  4. package/cluster-setup/aws/parameters.json +13 -0
  5. package/cluster-setup/aws/rulebricks-cluster.cfn.yaml +355 -0
  6. package/cluster-setup/azure/README.md +103 -55
  7. package/cluster-setup/azure/check-aks-prereqs.sh +236 -56
  8. package/cluster-setup/azure/parameters.json +30 -0
  9. package/cluster-setup/azure/rulebricks-cluster.bicep +546 -0
  10. package/cluster-setup/gcp/README.md +51 -34
  11. package/cluster-setup/gcp/check-gke-prereqs.sh +222 -60
  12. package/dist/commands/backup.d.ts +5 -0
  13. package/dist/commands/backup.js +104 -0
  14. package/dist/commands/deploy.d.ts +3 -1
  15. package/dist/commands/deploy.js +226 -326
  16. package/dist/commands/destroy.d.ts +1 -1
  17. package/dist/commands/destroy.js +73 -123
  18. package/dist/commands/init.d.ts +5 -1
  19. package/dist/commands/init.js +78 -54
  20. package/dist/commands/list.d.ts +1 -0
  21. package/dist/commands/list.js +74 -0
  22. package/dist/commands/open.d.ts +1 -1
  23. package/dist/commands/open.js +4 -12
  24. package/dist/commands/redeploy.d.ts +6 -0
  25. package/dist/commands/redeploy.js +310 -0
  26. package/dist/commands/restore.d.ts +5 -0
  27. package/dist/commands/restore.js +338 -0
  28. package/dist/commands/status.js +62 -49
  29. package/dist/commands/upgrade.js +74 -51
  30. package/dist/components/DNSWaitScreen.d.ts +5 -1
  31. package/dist/components/DNSWaitScreen.js +47 -41
  32. package/dist/components/Wizard/WizardContext.d.ts +157 -36
  33. package/dist/components/Wizard/WizardContext.js +872 -160
  34. package/dist/components/Wizard/steps/CloudProviderStep.js +192 -107
  35. package/dist/components/Wizard/steps/DomainStep.js +5 -24
  36. package/dist/components/Wizard/steps/ExternalServicesStep.d.ts +6 -0
  37. package/dist/components/Wizard/steps/ExternalServicesStep.js +645 -0
  38. package/dist/components/Wizard/steps/FeatureConfigStep.d.ts +2 -1
  39. package/dist/components/Wizard/steps/FeatureConfigStep.js +739 -425
  40. package/dist/components/Wizard/steps/FeaturesStep.js +31 -35
  41. package/dist/components/Wizard/steps/ObservabilityStep.d.ts +6 -0
  42. package/dist/components/Wizard/steps/ObservabilityStep.js +137 -0
  43. package/dist/components/Wizard/steps/ReviewStep.d.ts +2 -1
  44. package/dist/components/Wizard/steps/ReviewStep.js +56 -12
  45. package/dist/components/Wizard/steps/StorageStep.d.ts +9 -0
  46. package/dist/components/Wizard/steps/StorageStep.js +592 -0
  47. package/dist/components/Wizard/steps/SupabaseCredentialsStep.js +20 -21
  48. package/dist/components/Wizard/steps/VersionStep.js +45 -23
  49. package/dist/components/Wizard/steps/index.d.ts +3 -3
  50. package/dist/components/Wizard/steps/index.js +3 -3
  51. package/dist/components/common/CommandApproval.d.ts +12 -0
  52. package/dist/components/common/CommandApproval.js +91 -0
  53. package/dist/components/common/DeploymentPicker.d.ts +14 -0
  54. package/dist/components/common/DeploymentPicker.js +16 -0
  55. package/dist/components/common/index.d.ts +2 -0
  56. package/dist/components/common/index.js +2 -0
  57. package/dist/index.js +94 -62
  58. package/dist/lib/cloudCli.d.ts +134 -63
  59. package/dist/lib/cloudCli.js +512 -220
  60. package/dist/lib/clusterSetupDefaults.d.ts +30 -0
  61. package/dist/lib/clusterSetupDefaults.js +64 -0
  62. package/dist/lib/commandApproval.d.ts +26 -0
  63. package/dist/lib/commandApproval.js +114 -0
  64. package/dist/lib/config.d.ts +12 -10
  65. package/dist/lib/config.js +91 -33
  66. package/dist/lib/configFixtures.d.ts +5 -0
  67. package/dist/lib/configFixtures.js +513 -0
  68. package/dist/lib/deploymentHealth.d.ts +32 -0
  69. package/dist/lib/deploymentHealth.js +157 -0
  70. package/dist/lib/dns.d.ts +1 -1
  71. package/dist/lib/dns.js +19 -1
  72. package/dist/lib/dns.test.d.ts +1 -0
  73. package/dist/lib/dns.test.js +27 -0
  74. package/dist/lib/dockerHub.d.ts +12 -1
  75. package/dist/lib/dockerHub.js +18 -8
  76. package/dist/lib/helm.d.ts +4 -0
  77. package/dist/lib/helm.js +16 -0
  78. package/dist/lib/helmValues.d.ts +25 -0
  79. package/dist/lib/helmValues.js +1841 -289
  80. package/dist/lib/helmValues.test.d.ts +1 -0
  81. package/dist/lib/helmValues.test.js +1012 -0
  82. package/dist/lib/htpasswd.d.ts +1 -0
  83. package/dist/lib/htpasswd.js +15 -0
  84. package/dist/lib/kubernetes.d.ts +124 -17
  85. package/dist/lib/kubernetes.js +576 -145
  86. package/dist/lib/secrets.d.ts +23 -0
  87. package/dist/lib/secrets.js +158 -0
  88. package/dist/lib/validateValues.d.ts +31 -0
  89. package/dist/lib/validateValues.js +253 -0
  90. package/dist/lib/versions.d.ts +82 -11
  91. package/dist/lib/versions.js +131 -31
  92. package/dist/lib/versions.test.d.ts +1 -0
  93. package/dist/lib/versions.test.js +81 -0
  94. package/dist/lib/wizardSteps.d.ts +14 -0
  95. package/dist/lib/wizardSteps.js +23 -0
  96. package/dist/lib/workloadIdentity.d.ts +26 -0
  97. package/dist/lib/workloadIdentity.js +323 -0
  98. package/dist/lib/workloadIdentity.test.d.ts +1 -0
  99. package/dist/lib/workloadIdentity.test.js +57 -0
  100. package/dist/types/index.d.ts +1860 -164
  101. package/dist/types/index.js +518 -295
  102. package/package.json +9 -4
  103. package/schema/values.schema.json +1934 -0
  104. package/cluster-setup/aws/cluster.yaml +0 -33
  105. package/cluster-setup/azure/main.bicep +0 -282
  106. package/cluster-setup/azure/main.parameters.json +0 -21
  107. package/dist/components/Wizard/steps/CredentialsStep.d.ts +0 -6
  108. package/dist/components/Wizard/steps/CredentialsStep.js +0 -22
  109. package/dist/components/Wizard/steps/DeploymentModeStep.d.ts +0 -5
  110. package/dist/components/Wizard/steps/DeploymentModeStep.js +0 -26
  111. package/dist/components/Wizard/steps/TierStep.d.ts +0 -6
  112. package/dist/components/Wizard/steps/TierStep.js +0 -29
  113. package/dist/lib/terraform.d.ts +0 -66
  114. package/dist/lib/terraform.js +0 -754
  115. package/terraform/aws/main.tf +0 -355
  116. package/terraform/azure/main.tf +0 -371
  117. package/terraform/gcp/main.tf +0 -407
@@ -1,7 +1,103 @@
1
- import { TIER_CONFIGS, isSupportedDnsProvider, getLoggingDestinationLabel, } from "../types/index.js";
1
+ import { getReleaseName, isSupportedDnsProvider, validateRemoteWriteConfig, } from "../types/index.js";
2
2
  import { saveHelmValues, getHelmValuesPath } from "./config.js";
3
+ import { assertValidHelmValues } from "./validateValues.js";
4
+ import { SUPABASE_POSTGRES_IMAGE_REPOSITORY, SUPABASE_POSTGRES_IMAGE_TAG, DEFAULT_IMAGE_REGISTRY, IMAGE_REPOSITORIES, IMAGE_DIGESTS, KAFKA_PROXY_IMAGE, } from "./versions.js";
5
+ import { createHmac } from "crypto";
3
6
  import fs from "fs/promises";
4
7
  import YAML from "yaml";
/**
 * Returns the names of the Kubernetes Secrets the CLI creates in k8s secret
 * mode. The value generator (which fills in the secretRef fields) and
 * src/lib/secrets.ts (which creates the Secrets) both use this so the two
 * sides always agree.
 *
 * Every name is prefixed with the Helm release name rather than config.name.
 * Most chart consumers only read the secretRef value, but a few templates
 * hardcode the canonical <release>-* name (for example
 * templates/migration-job.yaml derives DB_PASSWORD from
 * `{{ .Release.Name }}-supabase-db`). Using the release name keeps the CLI a
 * drop-in for the unmodified chart.
 *
 * @param {{ name: string }} config - deployment config; only `name` is read.
 * @returns {{ app: string, db: string, dbBootstrap: string, jwt: string,
 *   dashboard: string, realtime: string, smtp: string }} secret names by role.
 */
export function deploymentSecretNames(config) {
  const releaseName = getReleaseName(config.name);
  // Suffix appended to the release name for each secret, in output order.
  const suffixByRole = [
    ["app", "app-secrets"],
    ["db", "supabase-db"],
    ["dbBootstrap", "supabase-db-bootstrap"],
    ["jwt", "supabase-jwt"],
    ["dashboard", "supabase-dashboard"],
    ["realtime", "supabase-realtime"],
    ["smtp", "supabase-smtp"],
  ];
  return Object.fromEntries(
    suffixByRole.map(([role, suffix]) => [role, `${releaseName}-${suffix}`]),
  );
}
// Baseline Kafka topic partition counts. These are a structural contract, not
// user-facing sizing knobs (tiers were removed). The same numbers must line up
// in three places: the kafka.provisioning topic partitions,
// rulebricks.hps.workers.solutionPartitions (the worker-fleet concurrency
// ceiling the chart cross-checks), and the worker KEDA maxReplicaCount
// (validated to be <= solutionPartitions). They mirror the Helm chart defaults;
// operators who need something else tune the chart values directly. Because a
// partition count can never be lowered, the solution topic is given generous
// headroom from the start - idle partitions are effectively free.
const SOLUTION_TOPIC_PARTITIONS = 128;
const LOGS_TOPIC_PARTITIONS = 24;

// Replication factor for the RPC and log topics. RPC traffic is transient and
// latency-sensitive (with acks=-1 the HPS producer would otherwise wait for
// full ISR replication), and the in-cluster broker is single-replica by default.
const TOPIC_REPLICATION_FACTOR = 1;

// The chart schema requires global.version to be empty or a semantic version.
// The CLI normally pins a real version, but migrated/legacy configs may carry
// "latest"; that would fail chart validation, so such values are omitted and
// the chart falls back to its own default.
const SEMVER_PATTERN = /^\d+\.\d+\.\d+(-[0-9A-Za-z.-]+)?$/;

// Default batching for the decision-log archive that ClickHouse reads: flush a
// gzipped NDJSON file at ~64 MiB (uncompressed) or after 5 minutes, whichever
// happens first. Users can override these in their Helm values.
//
// max_bytes MUST remain well below the Vector pod memory limit
// (vector.resources.limits.memory in the chart). The object-storage sink holds
// the entire uncompressed batch in memory until it flushes, so a batch at or
// above the pod limit is OOMKilled before any blob is written - silently
// disabling decision-log export. 64 MiB leaves comfortable headroom under the
// chart's 1 GiB Vector limit while still producing large, scan-efficient files.
const DECISION_LOG_BATCH = {
  max_bytes: 64 * 1024 * 1024, // 67108864 bytes
  timeout_secs: 5 * 60, // 300 seconds
};

// Fixed iat/exp claims (Unix seconds) stamped into the derived Supabase JWTs.
const SUPABASE_JWT_ISSUED_AT = 1641769200; // 2022-01-09T23:00:00Z
const SUPABASE_JWT_EXPIRES_AT = 4102444800; // 2100-01-01T00:00:00Z
// VRL program that coerces the Kafka decision-log envelope into the ClickHouse
// column types. It is kept as a genuine multi-line string (rather than a chart
// `{{ include }}`) so YAML.stringify / Helm's toYaml emit a block scalar. A
// templated single-line include ends up in a single-quoted YAML scalar whose
// newlines are folded into spaces, which puts every statement on one line and
// breaks VRL parsing. Keep in sync with rulebricks.vector.normalizeLogs.
const VECTOR_NORMALIZE_LOGS_VRL = (() => {
  // `.field = to_string(.field) ?? <fallback>`; fallback is raw VRL source text.
  const asString = (field, fallback) =>
    `.${field} = to_string(.${field}) ?? ${fallback}`;
  const statements = [
    // Replace the event with the parsed JSON payload when .message parses.
    "parsed, err = parse_json(string!(.message))",
    "if err == null {",
    " . = parsed",
    "}",
    '.timestamp = parse_timestamp!(to_string(.timestamp) ?? to_string(now()), format: "%+")',
    asString("api_key", '""'),
    asString("user_id", "null"),
    asString("environment", "null"),
    asString("ip", "null"),
    asString("method", "null"),
    asString("url", '""'),
    ".status = to_int(.status) ?? 0",
    asString("rule_name", "null"),
    asString("rule_id", "null"),
    asString("rule_slug", "null"),
    asString("rule_version", "null"),
    asString("operation", "null"),
    asString("level", '"info"'),
    asString("error", "null"),
    asString("trace_id", "null"),
    asString("span_id", "null"),
    asString("request", '"null"'),
    asString("response", '"null"'),
    asString("decision", '"{}"'),
    asString("params", '"{}"'),
  ];
  return statements.join("\n");
})();
/**
 * Builds the strftime-templated object-storage key prefix under which
 * decision-log files are written, partitioned by year/month/day/hour.
 *
 * The base path comes from config.storage.paths.decisionLogs, with leading and
 * trailing slashes removed. When the path is unset, empty, or consists only of
 * slashes, "decision-logs" is used so the prefix never starts with "/".
 *
 * @param {object} config - deployment config; reads storage.paths.decisionLogs.
 * @returns {string} prefix ending in "/", e.g.
 *   "decision-logs/year=%Y/month=%m/day=%d/hour=%H/".
 */
function decisionLogPathPrefix(config) {
  const defaultPath = "decision-logs";
  const configuredPath = config.storage?.paths?.decisionLogs || defaultPath;
  // Apply the fallback again after trimming: a value such as "/" would
  // otherwise collapse to "" and produce a key prefix with a leading slash.
  const basePath = configuredPath.replace(/^\/+|\/+$/g, "") || defaultPath;
  return `${basePath}/year=%Y/month=%m/day=%d/hour=%H/`;
}
5
101
  /**
6
102
  * Generates Vector sink configuration based on logging settings
7
103
  */
@@ -10,84 +106,85 @@ function generateVectorSinks(config) {
10
106
  // Console sink is always enabled
11
107
  console: {
12
108
  type: "console",
13
- inputs: ["kafka"],
109
+ inputs: ["normalize_logs"],
14
110
  encoding: {
15
111
  codec: "json",
16
112
  },
17
113
  },
18
114
  };
19
- // Add external sink if configured
20
- if (config.features.logging.sink !== "console" &&
21
- config.features.logging.sink !== "pending") {
22
- const { sink, bucket, region } = config.features.logging;
23
- switch (sink) {
24
- // Cloud Storage sinks
115
+ if (config.storage) {
116
+ const storage = config.storage;
117
+ switch (config.storage.provider) {
25
118
  case "s3":
26
- sinks.s3 = {
119
+ sinks.decision_logs = {
27
120
  type: "aws_s3",
28
- inputs: ["kafka"],
29
- bucket: bucket,
30
- region: region,
31
- key_prefix: "rulebricks/logs/%Y/%m/%d/",
121
+ inputs: ["normalize_logs"],
122
+ bucket: storage.bucket,
123
+ region: storage.region,
124
+ key_prefix: decisionLogPathPrefix(config),
125
+ filename_extension: "ndjson",
32
126
  compression: "gzip",
33
- encoding: {
34
- codec: "json",
35
- },
127
+ encoding: { codec: "json" },
128
+ framing: { method: "newline_delimited" },
129
+ batch: { ...DECISION_LOG_BATCH },
36
130
  };
37
131
  break;
38
- case "azure-blob":
39
- if (!bucket) {
40
- throw new Error("Azure Blob logging requires a storage account.");
41
- }
42
- const azureBlobSink = {
132
+ case "azure-blob": {
133
+ const sink = {
43
134
  type: "azure_blob",
44
- inputs: ["kafka"],
45
- account_name: bucket,
46
- container_name: config.features.logging.azureBlobContainer || "rulebricks-logs",
47
- blob_prefix: "rulebricks/logs/%Y/%m/%d/",
135
+ inputs: ["normalize_logs"],
136
+ account_name: storage.bucket,
137
+ container_name: storage.azureBlobContainer || "rulebricks",
138
+ blob_prefix: decisionLogPathPrefix(config),
139
+ // azure_blob has no filename_extension (unlike aws_s3/gcs); it always
140
+ // writes ".log" (".log.gz" when compressed). ClickHouse globs on *.gz.
48
141
  compression: "gzip",
49
- encoding: {
50
- codec: "json",
51
- },
142
+ encoding: { codec: "json" },
143
+ framing: { method: "newline_delimited" },
144
+ batch: { ...DECISION_LOG_BATCH },
52
145
  };
53
- if (config.features.logging.cloudAuthMode === "secret") {
54
- if (!config.features.logging.azureBlobConnectionStringSecretRef) {
55
- throw new Error("Azure Blob connection string auth requires a secret ref.");
56
- }
57
- azureBlobSink.connection_string = "${AZURE_STORAGE_CONNECTION_STRING}";
146
+ if (config.storage.cloudAuthMode === "secret") {
147
+ sink.connection_string = "${AZURE_STORAGE_CONNECTION_STRING}";
58
148
  }
59
149
  else {
60
- if (!config.features.logging.azureBlobClientId ||
61
- !config.features.logging.azureBlobTenantId) {
62
- throw new Error("Azure Blob workload identity requires client ID and tenant ID.");
63
- }
64
- azureBlobSink.auth = {
150
+ sink.auth = {
65
151
  azure_credential_kind: "workload_identity",
66
- client_id: config.features.logging.azureBlobClientId,
67
- tenant_id: config.features.logging.azureBlobTenantId,
152
+ client_id: config.storage.azureBlobClientId,
153
+ tenant_id: config.storage.azureBlobTenantId,
68
154
  token_file_path: "/var/run/secrets/azure/tokens/azure-identity-token",
69
155
  };
70
156
  }
71
- sinks.azure_blob = azureBlobSink;
157
+ sinks.decision_logs = sink;
72
158
  break;
159
+ }
73
160
  case "gcs":
74
- sinks.gcs = {
161
+ sinks.decision_logs = {
75
162
  type: "gcp_cloud_storage",
76
- inputs: ["kafka"],
77
- bucket: bucket,
78
- key_prefix: "rulebricks/logs/%Y/%m/%d/",
163
+ inputs: ["normalize_logs"],
164
+ bucket: storage.bucket,
165
+ key_prefix: decisionLogPathPrefix(config),
166
+ filename_extension: "ndjson",
79
167
  compression: "gzip",
80
- encoding: {
81
- codec: "json",
82
- },
168
+ encoding: { codec: "json" },
169
+ framing: { method: "newline_delimited" },
170
+ batch: { ...DECISION_LOG_BATCH },
83
171
  };
84
172
  break;
173
+ }
174
+ }
175
+ // Add external logging-platform sink if configured. Decision logs always go
176
+ // to object storage via the decision_logs sink above; this is an additional
177
+ // platform destination (Datadog, Splunk, etc.).
178
+ if (config.features.logging.sink !== "console" &&
179
+ config.features.logging.sink !== "pending") {
180
+ const { sink, bucket, region } = config.features.logging;
181
+ switch (sink) {
85
182
  // Logging platform sinks
86
183
  // For platforms, bucket is repurposed for API key/token, region for site/URL
87
184
  case "datadog":
88
185
  sinks.datadog = {
89
186
  type: "datadog_logs",
90
- inputs: ["kafka"],
187
+ inputs: ["normalize_logs"],
91
188
  default_api_key: bucket, // API key stored in bucket field
92
189
  site: region || "datadoghq.com", // Site stored in region field
93
190
  compression: "gzip",
@@ -99,7 +196,7 @@ function generateVectorSinks(config) {
99
196
  case "splunk":
100
197
  sinks.splunk = {
101
198
  type: "splunk_hec_logs",
102
- inputs: ["kafka"],
199
+ inputs: ["normalize_logs"],
103
200
  endpoint: region, // URL stored in region field
104
201
  default_token: bucket, // HEC token stored in bucket field
105
202
  compression: "gzip",
@@ -114,7 +211,7 @@ function generateVectorSinks(config) {
114
211
  const esConfig = JSON.parse(bucket || "{}");
115
212
  sinks.elasticsearch = {
116
213
  type: "elasticsearch",
117
- inputs: ["kafka"],
214
+ inputs: ["normalize_logs"],
118
215
  endpoints: [esConfig.url],
119
216
  bulk: {
120
217
  index: esConfig.index || "rulebricks-logs",
@@ -134,7 +231,7 @@ function generateVectorSinks(config) {
134
231
  // Fallback if JSON parsing fails
135
232
  sinks.elasticsearch = {
136
233
  type: "elasticsearch",
137
- inputs: ["kafka"],
234
+ inputs: ["normalize_logs"],
138
235
  endpoints: [bucket],
139
236
  bulk: {
140
237
  index: region || "rulebricks-logs",
@@ -145,7 +242,7 @@ function generateVectorSinks(config) {
145
242
  case "loki":
146
243
  sinks.loki = {
147
244
  type: "loki",
148
- inputs: ["kafka"],
245
+ inputs: ["normalize_logs"],
149
246
  endpoint: bucket, // Loki URL stored in bucket field
150
247
  labels: {
151
248
  app: "rulebricks",
@@ -159,7 +256,7 @@ function generateVectorSinks(config) {
159
256
  case "newrelic":
160
257
  sinks.newrelic = {
161
258
  type: "new_relic",
162
- inputs: ["kafka"],
259
+ inputs: ["normalize_logs"],
163
260
  license_key: bucket, // License key stored in bucket field
164
261
  account_id: region, // Account ID stored in region field
165
262
  api: "logs",
@@ -172,7 +269,7 @@ function generateVectorSinks(config) {
172
269
  case "axiom":
173
270
  sinks.axiom = {
174
271
  type: "axiom",
175
- inputs: ["kafka"],
272
+ inputs: ["normalize_logs"],
176
273
  token: bucket, // API token stored in bucket field
177
274
  dataset: region || "rulebricks", // Dataset stored in region field
178
275
  compression: "gzip",
@@ -186,20 +283,32 @@ function generateVectorSinks(config) {
186
283
  return sinks;
187
284
  }
188
285
  function generateVectorEnv(config) {
189
- const env = [
190
- {
191
- name: "KAFKA_BOOTSTRAP_SERVERS",
286
+ // Kafka connection settings come from the templated vector-kafka-env ConfigMap
287
+ // so the in-cluster vs external (and bridge) decision lives in one place.
288
+ const configMapKeys = [
289
+ "KAFKA_BOOTSTRAP_SERVERS",
290
+ "KAFKA_TLS_ENABLED",
291
+ "KAFKA_SASL_ENABLED",
292
+ "KAFKA_SASL_MECHANISM",
293
+ "KAFKA_LOG_TOPIC",
294
+ ];
295
+ const env = configMapKeys.map((key) => ({
296
+ name: key,
297
+ valueFrom: { configMapKeyRef: { name: "vector-kafka-env", key } },
298
+ }));
299
+ // SASL credentials (inline PLAIN/SCRAM). Optional so in-cluster/token-auth
300
+ // deploys work without the secret existing.
301
+ for (const key of ["KAFKA_SASL_USERNAME", "KAFKA_SASL_PASSWORD"]) {
302
+ env.push({
303
+ name: key,
192
304
  valueFrom: {
193
- configMapKeyRef: {
194
- name: "vector-kafka-env",
195
- key: "KAFKA_BOOTSTRAP_SERVERS",
196
- },
305
+ secretKeyRef: { name: "vector-kafka-credentials", key, optional: true },
197
306
  },
198
- },
199
- ];
200
- const azureBlobSecretRef = config.features.logging.azureBlobConnectionStringSecretRef;
201
- if (config.features.logging.sink === "azure-blob" &&
202
- config.features.logging.cloudAuthMode === "secret" &&
307
+ });
308
+ }
309
+ const azureBlobSecretRef = config.storage?.azureBlobConnectionStringSecretRef;
310
+ if (config.storage?.provider === "azure-blob" &&
311
+ config.storage.cloudAuthMode === "secret" &&
203
312
  azureBlobSecretRef) {
204
313
  env.push({
205
314
  name: "AZURE_STORAGE_CONNECTION_STRING",
@@ -211,20 +320,20 @@ function generateVectorEnv(config) {
211
320
  return env;
212
321
  }
213
322
  function generateVectorServiceAccount(config) {
323
+ // AWS uses EKS Pod Identity: NO eks.amazonaws.com/role-arn annotation - the
324
+ // CLI's workload-identity step creates a namespace-scoped association for this
325
+ // SA (to a role granting both the object-storage and MSK access Vector needs).
326
+ // Azure/GCP still annotate the SA, which is how their workload identity binds.
214
327
  const annotations = {};
215
- if (config.features.logging.sink === "s3" && config.features.logging.awsIamRoleArn) {
216
- annotations["eks.amazonaws.com/role-arn"] =
217
- config.features.logging.awsIamRoleArn;
218
- }
219
- if (config.features.logging.sink === "azure-blob" &&
220
- config.features.logging.cloudAuthMode !== "secret" &&
221
- config.features.logging.azureBlobClientId) {
328
+ if (config.storage?.provider === "azure-blob" &&
329
+ config.storage.cloudAuthMode !== "secret" &&
330
+ config.storage.azureBlobClientId) {
222
331
  annotations["azure.workload.identity/client-id"] =
223
- config.features.logging.azureBlobClientId;
332
+ config.storage.azureBlobClientId;
224
333
  }
225
- if (config.features.logging.sink === "gcs" && config.features.logging.gcpServiceAccountEmail) {
334
+ if (config.storage?.provider === "gcs" && config.storage.gcpServiceAccountEmail) {
226
335
  annotations["iam.gke.io/gcp-service-account"] =
227
- config.features.logging.gcpServiceAccountEmail;
336
+ config.storage.gcpServiceAccountEmail;
228
337
  }
229
338
  return {
230
339
  create: true,
@@ -233,9 +342,11 @@ function generateVectorServiceAccount(config) {
233
342
  };
234
343
  }
235
344
  function generateVectorPodLabels(config) {
236
- const labels = {};
237
- if (config.features.logging.sink === "azure-blob" &&
238
- config.features.logging.cloudAuthMode !== "secret") {
345
+ const labels = {
346
+ "rulebricks.com/workload-group": "infrastructure",
347
+ };
348
+ if (config.storage?.provider === "azure-blob" &&
349
+ config.storage.cloudAuthMode !== "secret") {
239
350
  labels["azure.workload.identity/use"] = "true";
240
351
  }
241
352
  return labels;
@@ -258,6 +369,48 @@ function secretKeySelector(ref) {
258
369
  key: ref.key,
259
370
  };
260
371
  }
/**
 * Serializes a value to JSON and returns it base64url-encoded.
 *
 * @param {*} value - any JSON-serializable value.
 * @returns {string} base64url text of the JSON representation.
 */
function base64UrlJson(value) {
  const json = JSON.stringify(value);
  const bytes = Buffer.from(json);
  return bytes.toString("base64url");
}
/**
 * Produces a Supabase API key for the given role.
 *
 * Self-hosted Supabase derives the anon and service_role API keys from the JWT
 * secret: each key is an HS256 JWT (role: anon / service_role) signed with it.
 * https://supabase.com/docs/guides/self-hosting/self-hosted-auth-keys
 *
 * @param {string} role - value of the `role` claim.
 * @param {string} secret - JWT secret used as the HMAC-SHA256 key.
 * @returns {string} compact JWT: `<header>.<payload>.<signature>`.
 */
export function signSupabaseJwt(role, secret) {
  const claims = {
    role,
    iss: "supabase",
    iat: SUPABASE_JWT_ISSUED_AT,
    exp: SUPABASE_JWT_EXPIRES_AT,
  };
  const signingInput = [
    base64UrlJson({ alg: "HS256", typ: "JWT" }),
    base64UrlJson(claims),
  ].join(".");
  const mac = createHmac("sha256", secret);
  mac.update(signingInput);
  return `${signingInput}.${mac.digest("base64url")}`;
}
/**
 * Derives the two secrets Supabase Realtime needs from the JWT secret:
 * SECRET_KEY_BASE (signs/encrypts its tokens) and DB_ENC_KEY (encrypts tenant
 * DB credentials). Both are HMAC-SHA256 outputs keyed by the JWT secret over a
 * fixed label, so they stay stable across redeploys with no extra state to
 * persist and remain anchored to the one root secret the operator manages.
 *
 * @param {string} jwtSecret - Supabase JWT secret used as the HMAC key.
 * @returns {{ secretKeyBase: string, dbEncKey: string }} secretKeyBase is the
 *   64-character hex digest; dbEncKey is the first 16 hex characters of its
 *   digest (Realtime requires exactly 16 bytes).
 */
export function deriveRealtimeSecrets(jwtSecret) {
  const hexMac = (label) =>
    createHmac("sha256", jwtSecret).update(label).digest("hex");
  return {
    secretKeyBase: hexMac("supabase-realtime-secret-key-base"),
    dbEncKey: hexMac("supabase-realtime-db-enc-key").slice(0, 16),
  };
}
/**
 * Cleans a remote_write URL: removes every ASCII control character
 * (U+0000-U+001F and U+007F) wherever it appears, then trims surrounding
 * whitespace. The usual culprit is a trailing carriage return picked up when
 * the URL is pasted from a CRLF file or captured from command output; a stray
 * "\r" corrupts the URL the Prometheus operator hands to remote_write.
 *
 * @param {string} url - raw URL text.
 * @returns {string} the URL without control characters or outer whitespace.
 */
function sanitizeRemoteWriteUrl(url) {
  // eslint-disable-next-line no-control-regex
  const controlCharacters = /[\u0000-\u001F\u007F]/g;
  const withoutControlCharacters = url.replace(controlCharacters, "");
  return withoutControlCharacters.trim();
}
261
414
  function generateRemoteWriteSpec(config) {
262
415
  if (config.features.monitoring.destination === "local-grafana") {
263
416
  return [];
@@ -265,11 +418,18 @@ function generateRemoteWriteSpec(config) {
265
418
  const remoteWrite = config.features.monitoring.remoteWrite;
266
419
  if (!remoteWrite) {
267
420
  return config.features.monitoring.remoteWriteUrl
268
- ? [{ url: config.features.monitoring.remoteWriteUrl }]
421
+ ? [{ url: sanitizeRemoteWriteUrl(config.features.monitoring.remoteWriteUrl) }]
269
422
  : [];
270
423
  }
424
+ // Enforce the same per-destination/auth requirements the wizard and Zod
425
+ // schema do. This is unreachable for CLI-generated configs (they are gated
426
+ // earlier) but guards hand-edited values and keeps one source of truth.
427
+ const remoteWriteErrors = validateRemoteWriteConfig(remoteWrite);
428
+ if (remoteWriteErrors.length > 0) {
429
+ throw new Error(remoteWriteErrors.join(" "));
430
+ }
271
431
  const base = {
272
- url: remoteWrite.url,
432
+ url: sanitizeRemoteWriteUrl(remoteWrite.url),
273
433
  };
274
434
  switch (remoteWrite.destination) {
275
435
  case "aws-amp":
@@ -294,12 +454,125 @@ function generateRemoteWriteSpec(config) {
294
454
  return [base];
295
455
  }
296
456
  }
/**
 * Reports whether the ClickStack observability bundle is enabled.
 * Reads config.features.observability.clickstack.enabled and treats a missing
 * (null/undefined) setting as enabled.
 *
 * @param {object} config - deployment config with a `features` object.
 * @returns {boolean} the configured flag, or true when it is not set.
 */
function isClickStackEnabled(config) {
  const { observability } = config.features;
  const configuredFlag = observability?.clickstack?.enabled;
  return configuredFlag ?? true;
}
/**
 * Builds the `clickstack` Helm values (ClickHouse settings, HyperDX, the OTel
 * collector agent/gateway, and FerretDB with its Postgres backend).
 *
 * @param {boolean} enabled - written to the top-level `enabled` flag and to the
 *   hyperdx, hyperdx.ingress, collector.agent, ferretdb and
 *   ferretdb.persistence `enabled` flags.
 * @param {object} config - deployment config; reads
 *   features.observability.clickstack.telemetryRetentionDays (default 7) and
 *   imageRegistry (default DEFAULT_IMAGE_REGISTRY).
 * @param {string} storageClass - storage class name for FerretDB persistence.
 * @param {object} infrastructurePodLabels - pod labels attached, by reference,
 *   to hyperdx, the collector agent and gateway, and ferretdb.
 * @param {Array} operationalDaemonSetTolerations - tolerations for the
 *   collector agent.
 * @returns {object} the clickstack values object.
 */
function generateClickStackValues(enabled, config, storageClass, infrastructurePodLabels, operationalDaemonSetTolerations) {
  const clickstack = config.features.observability?.clickstack;
  const telemetryRetentionDays = clickstack?.telemetryRetentionDays ?? 7;
  // NOTE(review): clickstack.clickHouseStorageSize (default "100Gi") used to be
  // read into a local here but was never written into the returned values, so
  // the dead local was removed. Confirm the ClickHouse volume size is applied
  // elsewhere; this function does not emit it.

  // Registry host for the clickstack images. The clickstack subchart routes
  // images through its own helper, so the split { registry, repository } shape
  // lets global.imageRegistry + digest pinning flow through.
  const registry = config.imageRegistry || DEFAULT_IMAGE_REGISTRY;

  // Image reference for one IMAGE_REPOSITORIES entry.
  const imageFor = (name) => ({
    registry,
    repository: IMAGE_REPOSITORIES[name].repository,
    tag: IMAGE_REPOSITORIES[name].tag,
    pullPolicy: "IfNotPresent",
  });

  // Fresh requests/limits object per call: the returned objects must not be
  // shared between components.
  const resources = (requestCpu, requestMemory, limitCpu, limitMemory) => ({
    requests: { cpu: requestCpu, memory: requestMemory },
    limits: { cpu: limitCpu, memory: limitMemory },
  });

  return {
    enabled,
    clickhouse: {
      database: "otel",
      username: "rulebricks",
      existingSecret: "",
      existingSecretKey: "admin-password",
      retentionDays: telemetryRetentionDays,
      ttl: "",
    },
    hyperdx: {
      enabled,
      image: imageFor("hyperdx"),
      resources: resources("250m", "512Mi", "1000m", "1Gi"),
      ingress: {
        enabled,
        className: "traefik",
        hostname: "",
        allowedIPs: [],
      },
      podLabels: infrastructurePodLabels,
    },
    collector: {
      image: imageFor("clickstackOtelCollector"),
      memoryLimitMiB: 800,
      agent: {
        enabled,
        securityContext: {
          runAsUser: 0,
          runAsGroup: 0,
        },
        resources: resources("100m", "256Mi", "500m", "512Mi"),
        tolerations: operationalDaemonSetTolerations,
        podLabels: infrastructurePodLabels,
      },
      gateway: {
        replicas: 1,
        resources: resources("250m", "512Mi", "2000m", "1Gi"),
        podLabels: infrastructurePodLabels,
      },
    },
    ferretdb: {
      enabled,
      image: imageFor("ferretdb"),
      postgresImage: imageFor("postgresDocumentdb"),
      auth: {
        username: "hyperdx",
        password: "",
        existingSecret: "",
        existingSecretKey: "password",
      },
      persistence: {
        enabled,
        size: "10Gi",
        storageClassName: storageClass,
      },
      resources: {
        ferretdb: resources("100m", "256Mi", "500m", "512Mi"),
        postgres: resources("250m", "512Mi", "1000m", "1Gi"),
      },
      podLabels: infrastructurePodLabels,
      podAnnotations: {
        "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
      },
    },
  };
}
297
570
  function generatePrometheusServiceAccount(config) {
571
+ // AWS (AMP remote write) uses EKS Pod Identity - the association is created by
572
+ // the CLI's workload-identity step, so no eks.amazonaws.com/role-arn annotation.
573
+ // Azure Monitor still annotates the SA for its workload identity.
298
574
  const annotations = {};
299
575
  const remoteWrite = config.features.monitoring.remoteWrite;
300
- if (remoteWrite?.destination === "aws-amp" && remoteWrite.awsRoleArn) {
301
- annotations["eks.amazonaws.com/role-arn"] = remoteWrite.awsRoleArn;
302
- }
303
576
  if (remoteWrite?.destination === "azure-monitor" &&
304
577
  remoteWrite.authType === "workload-identity" &&
305
578
  remoteWrite.clientId) {
@@ -343,8 +616,16 @@ function generateAzureMonitorRemoteWrite(remoteWrite, base) {
343
616
  if (!remoteWrite.clientId || !remoteWrite.tenantId) {
344
617
  throw new Error("Azure Monitor remote_write workload identity requires client ID and tenant ID.");
345
618
  }
346
- azureAd.workloadIdentity = {
347
- clientId: remoteWrite.clientId,
619
+ // The prometheus-operator AzureAD schema supports only managedIdentity,
620
+ // oauth, and sdk (there is no "workloadIdentity" field - emitting it makes
621
+ // the operator reject the whole remoteWrite with "must provide Azure Managed
622
+ // Identity or Azure OAuth or Azure SDK", which silently prevents the
623
+ // Prometheus StatefulSet from being created). For AKS workload identity we
624
+ // use the Azure SDK credential: it reads the projected token + AZURE_CLIENT_ID
625
+ // injected by the workload-identity webhook (driven by the prometheus
626
+ // ServiceAccount's azure.workload.identity/client-id annotation and the
627
+ // azure.workload.identity/use pod label), so only the tenant ID is needed here.
628
+ azureAd.sdk = {
348
629
  tenantId: remoteWrite.tenantId,
349
630
  };
350
631
  }
@@ -392,57 +673,690 @@ function generateGenericRemoteWrite(remoteWrite, base) {
392
673
  return base;
393
674
  }
394
675
  /**
395
- * Generates Kafka extra environment variables for tuning
676
+ * Generates the Kafka broker config map (Kafka.spec.kafka.config for Strimzi).
677
+ * These are the former KAFKA_CFG_* tuning env vars, as their Kafka property
678
+ * names. Kept in lockstep with the chart's kafka.config.
679
+ */
680
function generateKafkaConfig() {
  // Broker properties are listed as ordered [name, value] pairs so the emitted
  // map keeps a stable key order. Every value is a string.
  const brokerProperties = [
    ["auto.create.topics.enable", "true"],
    ["log.retention.hours", "24"],
    ["num.partitions", "12"],
    ["num.network.threads", "8"],
    ["num.io.threads", "8"],
    ["socket.send.buffer.bytes", "1048576"],
    ["socket.receive.buffer.bytes", "1048576"],
    ["socket.request.max.bytes", "209715200"],
    // Broker-wide max record size; must exceed every per-topic max.message.bytes.
    ["message.max.bytes", "2097152"],
    ["replica.fetch.max.bytes", "4194304"],
    // Broker-wide default retention; the application topics carry tighter caps.
    ["log.retention.bytes", "536870912"],
    ["log.segment.bytes", "1073741824"],
    ["num.replica.fetchers", "4"],
    ["queued.max.requests", "10000"],
    ["replica.socket.receive.buffer.bytes", "1048576"],
    ["log.cleaner.dedupe.buffer.size", "268435456"],
    ["log.cleaner.io.buffer.size", "1048576"],
  ];
  // A fresh object is built on every call, so callers may mutate the result.
  return Object.fromEntries(brokerProperties);
}
703
/**
 * Effective Kafka topic prefix as HPS/Vector/KEDA will see it.
 *
 * Mirrors generateAppLogging: in-cluster Kafka runs UNPREFIXED (dedicated
 * broker, and prefixing would desync chart-side consumers from producers);
 * external Kafka uses the explicit prefix, falling back to the chart default.
 *
 * @param {object} config - Deployment configuration.
 * @returns {string} "" for in-cluster Kafka; otherwise the configured prefix
 *   (an explicit "" disables prefixing) or "com.rulebricks." when none is set.
 */
function effectiveTopicPrefix(config) {
  if (!isExternalKafka(config)) {
    return "";
  }
  const ext = config.externalServices?.kafka?.external ?? {};
  // `??` (not `!== undefined`) so an empty YAML key, which parses to null, also
  // falls back to the default instead of being interpolated into topic names
  // as the literal string "null".
  return ext.topicPrefix ?? "com.rulebricks.";
}
716
/**
 * Explicit topic management for in-cluster Kafka.
 *
 * Generates the kafka.provisioning block consumed by BOTH the subchart
 * provisioning Job (creates topics) and the chart's kafka-topic-align Job
 * (idempotently converges pre-existing topics on upgrade). Topic names are
 * derived from the SAME prefix written to app.logging.kafkaTopicPrefix - the
 * chart fails the render if these ever diverge.
 *
 * Sizing policy (baseline constants, mirroring the chart defaults):
 * - solution/solution-response: SOLUTION_TOPIC_PARTITIONS (the worker-fleet
 *   concurrency CEILING; partitions can never be decreased, workers are sized
 *   separately by the cluster autoscaler). RF stays 1: RPC traffic is transient
 *   and latency-sensitive, and the HPS producer's acks=-1 would otherwise wait
 *   on full ISR replication.
 * - logs: LOGS_TOPIC_PARTITIONS (durable data feeding the Vector -> object
 *   storage pipeline).
 *
 * @param {object} config - Deployment configuration.
 * @returns {Array<{name: string, partitions: number, replicas: number, config: object}>}
 *   Topic definitions, or an empty array for customer-managed external brokers.
 */
function generateKafkaTopics(config) {
  // External MSK IAM: the chart's kafka-topic-provision Job creates these on the
  // managed broker (through the proxy bridge), so they must be populated here -
  // MSK Serverless won't auto-create them. Other external brokers (SCRAM / Event
  // Hubs / GCP, no bridge) a plain client can reach stay customer-managed.
  if (isExternalKafka(config) && !kafkaUsesBridge(config)) {
    return [];
  }
  const prefix = effectiveTopicPrefix(config);
  // Each RPC topic gets its OWN config object. Sharing one instance between
  // the two entries would let a mutation of one leak into the other, and
  // reference-tracking YAML serializers would emit an anchor/alias pair for it
  // instead of two plain maps.
  const buildRpcTopic = (suffix) => ({
    name: `${prefix}${suffix}`,
    partitions: SOLUTION_TOPIC_PARTITIONS,
    replicas: TOPIC_REPLICATION_FACTOR,
    config: {
      "retention.ms": "300000",
      "segment.ms": "300000",
      "segment.bytes": "67108864",
      "retention.bytes": "67108864",
      "max.message.bytes": "2097152",
    },
  });
  return [
    buildRpcTopic("solution"),
    buildRpcTopic("solution-response"),
    {
      name: `${prefix}logs`,
      partitions: LOGS_TOPIC_PARTITIONS,
      replicas: TOPIC_REPLICATION_FACTOR,
      config: {
        "retention.ms": "86400000",
        "retention.bytes": "268435456",
        "max.message.bytes": "2097152",
      },
    },
  ];
}
775
/**
 * Soft pod anti-affinity for workers: prefer (weight 50) nodes that are not
 * already hosting pods labeled rulebricks.com/workload-group=infrastructure.
 *
 * @returns {{podAntiAffinity: object}} A fresh affinity fragment on each call.
 */
function generateWorkerPodAntiAffinity() {
  const infrastructureSelector = {
    matchExpressions: [
      {
        key: "rulebricks.com/workload-group",
        operator: "In",
        values: ["infrastructure"],
      },
    ],
  };
  const avoidInfrastructureNodes = {
    weight: 50,
    podAffinityTerm: {
      labelSelector: infrastructureSelector,
      // Anti-affinity is evaluated per node.
      topologyKey: "kubernetes.io/hostname",
    },
  };
  return {
    podAntiAffinity: {
      preferredDuringSchedulingIgnoredDuringExecution: [avoidInfrastructureNodes],
    },
  };
}
798
/**
 * Assembles a scheduling fragment from optional tolerations and affinity.
 * Falsy arguments are left out entirely (an empty array is truthy and kept).
 *
 * @param {Array<object>} [tolerations] - Pod tolerations.
 * @param {object} [affinity] - Pod affinity block.
 * @returns {{affinity?: object, tolerations?: Array<object>}}
 */
function generateScheduling(tolerations, affinity) {
  const scheduling = {};
  // Insertion order (affinity, then tolerations) is kept for stable output.
  if (affinity) {
    scheduling.affinity = affinity;
  }
  if (tolerations) {
    scheduling.tolerations = tolerations;
  }
  return scheduling;
}
804
/**
 * Burst-pool scheduling, always on.
 *
 * Cluster-setup provisions a dedicated worker pool labeled and tainted
 * rulebricks.com/pool=burst (one big Deallocate-parked node on Azure or an
 * on-demand nodegroup on AWS). Workers tolerate the taint and SOFTLY prefer
 * the label. On clusters without such a pool both are inert, so BYO clusters
 * schedule exactly as before - zero configuration required either way.
 */

/** Toleration that lets workers land on nodes tainted rulebricks.com/pool=burst:NoSchedule. */
const BURST_POOL_TOLERATION = {
  key: "rulebricks.com/pool",
  operator: "Equal",
  value: "burst",
  effect: "NoSchedule",
};

/** Preferred (weight 100) node-affinity term selecting nodes labeled rulebricks.com/pool=burst. */
const BURST_POOL_NODE_PREFERENCE = {
  weight: 100,
  preference: {
    matchExpressions: [
      {
        key: "rulebricks.com/pool",
        operator: "In",
        values: ["burst"],
      },
    ],
  },
};
826
/**
 * Builds the backup values block.
 *
 * Backups are only switched on when the database is self-hosted, Postgres is
 * not external, and config.backup.enabled is exactly true.
 *
 * @param {object} config - Deployment configuration.
 * @returns {{enabled: boolean, schedule: string, retentionDays: number}}
 */
function generateBackupValues(config) {
  const backup = config.backup;
  const isSelfHosted = config.database.type === "self-hosted";
  const postgresIsExternal = config.externalServices?.postgres?.mode === "external";
  // The backup CronJob streams pg_dump from the running DB (using supabase.db.image)
  // and uploads it with rclone, so no backup-specific image is needed here. The
  // chart default rclone image applies unless overridden in values.
  return {
    enabled: isSelfHosted && !postgresIsExternal && backup?.enabled === true,
    schedule: backup?.schedule || "0 2 * * *",
    retentionDays: backup?.retentionDays || 7,
  };
}
839
/**
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True when externalServices.redis.mode is "external".
 */
function isExternalRedis(config) {
  const redisMode = config.externalServices?.redis?.mode;
  return redisMode === "external";
}
842
/**
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True when externalServices.kafka.mode is "external".
 */
function isExternalKafka(config) {
  const kafkaMode = config.externalServices?.kafka?.mode;
  return kafkaMode === "external";
}
845
/**
 * Whether the Vector kafka-proxy bridge sidecar is required.
 *
 * Only AWS MSK IAM needs it: Vector's kafka source can't speak token
 * mechanisms, while Azure Event Hubs and GCP both use SASL PLAIN/SCRAM that
 * Vector handles directly.
 *
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True for external Kafka with the "aws-msk-iam" preset or
 *   the "aws-iam" SASL mechanism.
 */
function kafkaUsesBridge(config) {
  if (isExternalKafka(config)) {
    const external = config.externalServices?.kafka?.external;
    const hasMskIamPreset = external?.preset === "aws-msk-iam";
    return hasMskIamPreset || external?.sasl?.mechanism === "aws-iam";
  }
  return false;
}
856
/**
 * Whether Vector's kafka source connects with a direct PLAIN/SCRAM credential
 * and therefore needs username/password.
 *
 * This mirrors the vector-kafka-env ConfigMap, which only sets
 * KAFKA_SASL_ENABLED=true for external, non-token, non-bridge mechanisms (and
 * where vector-kafka-credentials is populated). For in-cluster, bridge, and
 * token-auth paths SASL is disabled, so username and password MUST be omitted:
 * an empty env default (${VAR:-}) renders unquoted via Helm's toYaml and Vector
 * reads the value as YAML null, which it rejects at startup ("invalid type:
 * unit value, expected any valid TOML value").
 *
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True only for external, non-bridge Kafka whose SASL
 *   mechanism is set and is neither "aws-iam" nor "oauthbearer".
 */
function kafkaUsesDirectSasl(config) {
  if (!isExternalKafka(config) || kafkaUsesBridge(config)) {
    return false;
  }
  const saslMechanism = config.externalServices?.kafka?.external?.sasl?.mechanism;
  // Token-based mechanisms never carry a username/password pair.
  const tokenMechanisms = ["aws-iam", "oauthbearer"];
  return Boolean(saslMechanism) && !tokenMechanisms.includes(saslMechanism);
}
876
+ /**
877
+ * Builds the rulebricks.redis block: in-cluster sizing when embedded, or
878
+ * external connection settings when the user points at managed Redis.
396
879
  */
397
- function generateKafkaExtraEnvVars() {
880
function generateRedisBlock(config, storageClass, infrastructurePodLabels, coreScheduling) {
  if (isExternalRedis(config)) {
    // Managed Redis: disable the in-cluster instance and pass the connection
    // settings through. Optional credentials are only emitted when set.
    const source = config.externalServices?.redis?.external ?? {};
    const external = {
      host: source.host ?? "",
      port: source.port ?? 6379,
      tls: { enabled: source.tls ?? false },
      ...(source.password ? { password: source.password } : {}),
      ...(source.existingSecret
        ? {
            existingSecret: source.existingSecret,
            existingSecretKey: source.existingSecretKey || "redis-password",
          }
        : {}),
      ...(source.httpApi?.enabled
        ? {
            httpApi: {
              enabled: true,
              url: source.httpApi.url ?? "",
              token: source.httpApi.token ?? "",
            },
          }
        : {}),
    };
    return { enabled: false, external };
  }
  // In-cluster Redis. Sizing (resources, persistence size) falls back to the
  // chart defaults; only the deployment-specific storage class is set here.
  return {
    podLabels: infrastructurePodLabels,
    ...coreScheduling,
    persistence: {
      enabled: true,
      storageClass,
    },
  };
}
918
/**
 * Builds the valkeyAdmin and redisExporter blocks from config.features.cache.
 *
 * @param {object} config - Deployment configuration (reads features.cache and domain).
 * @param {object} infrastructurePodLabels - Labels applied to both components.
 * @returns {{valkeyAdmin: object, redisExporter: object}}
 */
function generateCacheObservabilityBlock(config, infrastructurePodLabels) {
  const cacheFeatures = config.features.cache;
  const admin = cacheFeatures?.valkeyAdmin;
  const exporter = cacheFeatures?.redisExporter;
  const exposeViaIngress = admin?.exposure === "ingress";

  // A hostname is only meaningful when the admin UI is exposed via ingress.
  let ingressHostname = "";
  if (exposeViaIngress) {
    ingressHostname = admin?.hostname || `valkey.${config.domain}`;
  }

  const valkeyAdmin = {
    enabled: admin?.enabled ?? false,
    exposure: admin?.exposure ?? "internal",
    podLabels: infrastructurePodLabels,
    ingress: {
      enabled: exposeViaIngress,
      hostname: ingressHostname,
      basicAuth: {
        users: admin?.basicAuthUsers ?? [],
        existingSecret: admin?.basicAuthExistingSecret ?? "",
      },
      allowedIPs: admin?.allowedIPs ?? [],
    },
  };
  const redisExporter = {
    // The exporter defaults to on unless explicitly configured otherwise.
    enabled: exporter?.enabled ?? true,
    podLabels: infrastructurePodLabels,
  };
  return { valkeyAdmin, redisExporter };
}
946
/**
 * Builds the kafkaExporter block.
 *
 * An explicit features.cache.kafkaExporter.enabled always wins; when it is
 * unset the exporter defaults to on for in-cluster Kafka and off for external.
 *
 * @param {object} config - Deployment configuration.
 * @param {object} infrastructurePodLabels - Labels applied to the exporter pods.
 * @returns {{enabled: boolean, podLabels: object, brokers: string}}
 */
function generateKafkaExporterBlock(config, infrastructurePodLabels) {
  const externalKafka = isExternalKafka(config);
  const explicitlyRequested = config.features.cache?.kafkaExporter?.enabled;
  // Brokers are only passed for external Kafka; otherwise left empty.
  let brokers = "";
  if (externalKafka) {
    brokers = config.externalServices?.kafka?.external?.brokers ?? "";
  }
  return {
    enabled: explicitlyRequested ?? !externalKafka,
    podLabels: infrastructurePodLabels,
    brokers,
  };
}
957
/**
 * Builds the rulebricks.app.logging block. Decision logging is always enabled;
 * external Kafka adds brokers + SSL/SASL, while embedded auto-discovers the
 * in-cluster Kafka service.
 *
 * @param {object} config - Deployment configuration.
 * @returns {object} Logging values. For external Kafka, `kafkaTopicPrefix` and
 *   `kafkaSasl` are present only when configured.
 */
function generateAppLogging(config) {
  if (!isExternalKafka(config)) {
    return {
      enabled: true,
      kafkaBrokers: "", // Auto-discover from Kafka subchart
      kafkaTopic: "logs",
      // The in-cluster app/HPS produce to unprefixed topics (logs, solution,
      // solution-response). The chart default prefix ("com.rulebricks.") is meant
      // for shared/managed Kafka collision avoidance, but when applied here it
      // makes the chart-side consumers diverge from the producers: Vector would
      // subscribe to "com.rulebricks.logs" (no data) and the KEDA worker trigger
      // would watch "com.rulebricks.solution" (no lag signal). Disable prefixing
      // for the dedicated in-cluster broker so everything lines up.
      kafkaTopicPrefix: "",
    };
  }
  const ext = config.externalServices?.kafka?.external ?? {};
  const logging = {
    enabled: true,
    kafkaBrokers: ext.brokers ?? "",
    kafkaTopic: ext.topic || "logs",
    kafkaSsl: ext.ssl ?? false,
  };
  // Topic prefix: emit only when explicitly provided (incl. "" to disable). When
  // omitted, the chart default (com.rulebricks.) applies via value merge. A null
  // prefix (an empty YAML key) counts as omitted: emitting `kafkaTopicPrefix:
  // null` would not be an explicit prefix.
  if (ext.topicPrefix !== undefined && ext.topicPrefix !== null) {
    logging.kafkaTopicPrefix = ext.topicPrefix;
  }
  if (ext.sasl?.mechanism) {
    // Copy only the SASL fields that are actually set, in a stable order.
    const sasl = { mechanism: ext.sasl.mechanism };
    for (const field of ["region", "username", "password", "existingSecret"]) {
      if (ext.sasl[field]) {
        sasl[field] = ext.sasl[field];
      }
    }
    logging.kafkaSasl = sasl;
  }
  return logging;
}
1004
/**
 * HPS service account.
 *
 * When external Kafka uses MSK IAM, HPS authenticates to the broker with its
 * pod's cloud identity - under EKS Pod Identity that comes from a
 * namespace-scoped association (created by the CLI's workload-identity step
 * for the `<release>-hps` SA), NOT an eks.amazonaws.com/role-arn annotation.
 * We only CREATE the SA here so the association has a subject to bind.
 *
 * @param {object} config - Deployment configuration.
 * @returns {{create: boolean, annotations: object}} `create` is true only when
 *   the Kafka bridge is in use; annotations are always empty.
 */
function generateHpsServiceAccount(config) {
  const needsServiceAccount = kafkaUsesBridge(config) ? true : false;
  return { create: needsServiceAccount, annotations: {} };
}
1017
/**
 * Top-level kafkaBridge block consumed by the Vector env ConfigMap. Only enabled
 * for AWS MSK IAM, where a kafka-proxy sidecar fronts the brokers for Vector.
 *
 * @param {object} config - Deployment configuration.
 * @returns {object} `{ enabled: false }` when no bridge is needed, otherwise the
 *   full bridge settings (missing string fields default to "").
 */
function generateKafkaBridge(config) {
  if (kafkaUsesBridge(config)) {
    const external = config.externalServices?.kafka?.external ?? {};
    const { sasl, brokers, identity } = external;
    return {
      enabled: true,
      provider: "aws",
      region: sasl?.region ?? "",
      brokers: brokers ?? "",
      localPort: 19092,
      image: KAFKA_PROXY_IMAGE,
      awsRoleArn: identity?.awsRoleArn ?? "",
    };
  }
  return { enabled: false };
}
1036
/**
 * kafka-proxy sidecar for the Vector pod (AWS MSK IAM). Maps each upstream
 * broker to a sequential local port and authenticates with the pod's IRSA role.
 *
 * @param {object} config - Deployment configuration.
 * @returns {Array<object>|undefined} A single-element container list, or
 *   undefined when the bridge is not in use or no brokers are configured.
 */
function generateVectorExtraContainers(config) {
  if (!kafkaUsesBridge(config)) {
    return undefined;
  }
  const external = config.externalServices?.kafka?.external ?? {};
  // Brokers arrive as a comma-separated string; blank entries are dropped.
  const upstreamBrokers = [];
  for (const entry of (external.brokers ?? "").split(",")) {
    const broker = entry.trim();
    if (broker) {
      upstreamBrokers.push(broker);
    }
  }
  if (upstreamBrokers.length === 0) {
    return undefined;
  }
  const firstLocalPort = 19092;
  const args = ["server"];
  const ports = [];
  // Broker i is exposed on 127.0.0.1:(firstLocalPort + i).
  upstreamBrokers.forEach((broker, index) => {
    const localPort = firstLocalPort + index;
    args.push(`--bootstrap-server-mapping=${broker},127.0.0.1:${localPort}`);
    ports.push({ containerPort: localPort });
  });
  args.push(
    "--tls-enable",
    "--sasl-enable",
    "--sasl-method=AWS_MSK_IAM",
    `--sasl-aws-region=${external.sasl?.region ?? ""}`,
  );
  return [
    {
      name: "kafka-proxy",
      image: KAFKA_PROXY_IMAGE,
      args,
      ports,
    },
  ];
}
1068
// VRL program for the Vector agent's `app_logs` remap transform. It parses JSON
// app/HPS log lines, lifts trace_id/span_id for logs<->traces correlation, and
// flattens useful Kubernetes metadata onto the event root. Kept in sync with
// charts/.../values.yaml vector-agent.customConfig.transforms.
const VECTOR_APP_LOGS_VRL = [
  // Structured-log handling: only applied when .message parses to a JSON object.
  'parsed, err = parse_json(to_string(.message) ?? "")',
  "if err == null && is_object(parsed) {",
  " .log = parsed",
  " .trace_id = parsed.trace_id",
  " .span_id = parsed.span_id",
  ' if exists(parsed.level) { .level = to_string(parsed.level) ?? "info" }',
  "}",
  // Kubernetes metadata: [event field, source field under .kubernetes].
  ...[
    ["pod", "pod_name"],
    ["namespace", "pod_namespace"],
    ["container", "container_name"],
    ["node", "pod_node_name"],
  ].map(([field, source]) => `.${field} = .kubernetes.${source}`),
].join("\n");
418
1084
  /**
419
- * Generates Helm values from the deployment configuration
1085
+ * global.tracing block (in-cluster OTel Collector -> pluggable trace backend).
1086
+ * Emits the destination-specific sub-block (elastic | otlp | azure-monitor) and
1087
+ * returns undefined when tracing is disabled so it is omitted entirely.
420
1088
  */
421
- export async function generateHelmValues(config, options = {}) {
422
- const tierConfig = TIER_CONFIGS[config.tier];
423
- const { tlsEnabled = true } = options;
1089
function generateTracingGlobal(config) {
  const tracing = config.features.tracing;
  // Disabled tracing yields undefined so the caller can omit the block entirely.
  if (!tracing?.enabled) {
    return undefined;
  }
  const destination = tracing.destination ?? "elastic";
  const collectorImage = IMAGE_REPOSITORIES.opentelemetryCollector;
  const shared = {
    enabled: true,
    destination,
    samplingRatio: tracing.samplingRatio ?? 1,
    // RB image dict for the parent chart's otel-collector deployment. The
    // rulebricks.image helper requires image.repository and applies
    // global.imageRegistry to the host.
    collector: {
      image: {
        registry: config.imageRegistry || DEFAULT_IMAGE_REGISTRY,
        repository: collectorImage.repository,
        tag: collectorImage.tag,
      },
    },
  };

  switch (destination) {
    case "elastic": {
      const source = tracing.elastic ?? {};
      const authMode = source.authMode ?? "secret-token";
      const elastic = {
        endpoint: source.endpoint ?? "",
        authMode,
        tlsInsecureSkipVerify: false,
      };
      // Only the credential matching the selected auth mode is emitted.
      if (authMode === "secret-token" && source.secretToken) {
        elastic.secretToken = source.secretToken;
      }
      if (authMode === "api-key" && source.apiKey) {
        elastic.apiKey = source.apiKey;
      }
      return { ...shared, elastic };
    }
    case "otlp": {
      const source = tracing.otlp ?? {};
      const authMode = source.authMode ?? "none";
      const otlp = {
        endpoint: source.endpoint ?? "",
        authMode,
        tlsInsecureSkipVerify: source.tlsInsecureSkipVerify ?? false,
      };
      if (authMode === "bearer" && source.token) {
        otlp.token = source.token;
      }
      if (authMode === "api-key" && source.apiKey) {
        otlp.apiKey = source.apiKey;
      }
      if (authMode === "header") {
        otlp.headerName = source.headerName ?? "Authorization";
        if (source.headerValue) {
          otlp.headerValue = source.headerValue;
        }
      }
      // Extra headers are passed through only when at least one is defined.
      if (source.headers && Object.keys(source.headers).length > 0) {
        otlp.headers = source.headers;
      }
      return { ...shared, otlp };
    }
    default: {
      // Any other destination is treated as azure-monitor.
      const source = tracing.azureMonitor ?? {};
      return {
        ...shared,
        azureMonitor: { connectionString: source.connectionString ?? "" },
      };
    }
  }
}
1155
/**
 * traefik.tracing block: makes Traefik the root span and propagates the W3C
 * traceparent to backends. Empty object when tracing is disabled.
 *
 * @param {object} config - Deployment configuration.
 * @param {string} releaseName - Helm release name, used to address the collector service.
 * @returns {object} `{}` when neither ClickStack nor tracing is enabled,
 *   otherwise the OTLP/HTTP exporter settings.
 */
function generateTraefikTracing(config, releaseName) {
  const tracingActive = isClickStackEnabled(config) || config.features.tracing?.enabled;
  if (!tracingActive) {
    return {};
  }
  const collectorEndpoint = `http://${releaseName}-otel-collector:4318/v1/traces`;
  return {
    otlp: {
      enabled: true,
      http: {
        enabled: true,
        endpoint: collectorEndpoint,
      },
    },
  };
}
1172
/**
 * vector-agent block: a second Vector deployment (role Agent / DaemonSet) that
 * tails all pod logs and ships them to a customer-managed log backend
 * (Elasticsearch by default; Loki and a generic HTTP sink are also supported).
 * Decision logs are unaffected (they stay in ClickHouse via the `vector`
 * aggregator).
 *
 * @param {object} config - Deployment configuration (reads features.logging.appLogs).
 * @param {object} podLabels - Labels applied to the agent pods.
 * @param {Array<object>} [tolerations] - Tolerations applied to the agent pods.
 * @returns {object} `{ enabled: false }` when app logs are disabled, otherwise
 *   the full agent values including a single configured sink.
 */
function generateVectorAgent(config, podLabels, tolerations) {
  const appLogs = config.features.logging.appLogs;
  if (!appLogs?.enabled) {
    return { enabled: false };
  }

  // Resolves the configured destination to a [sinkName, sinkConfig] pair.
  // Anything other than "loki" or "generic" falls back to Elasticsearch.
  const resolveSink = () => {
    switch (appLogs.destination ?? "elasticsearch") {
      case "loki": {
        const loki = appLogs.loki ?? {};
        return [
          "loki",
          {
            type: "loki",
            inputs: ["app_logs"],
            endpoint: loki.endpoint,
            labels: loki.labels ?? {
              app: "rulebricks",
              namespace: "{{ namespace }}",
              pod: "{{ pod }}",
              container: "{{ container }}",
            },
            encoding: { codec: "json" },
          },
        ];
      }
      case "generic": {
        const generic = appLogs.generic ?? {};
        const httpSink = {
          type: "http",
          inputs: ["app_logs"],
          uri: generic.endpoint,
          method: "post",
          encoding: { codec: "json" },
        };
        if (generic.authHeader) {
          httpSink.request = { headers: { Authorization: generic.authHeader } };
        }
        return ["generic_http", httpSink];
      }
      default: {
        const es = appLogs.elasticsearch ?? {};
        const esSink = {
          type: "elasticsearch",
          inputs: ["app_logs"],
          endpoints: [es.endpoint],
          mode: "bulk",
          bulk: { index: es.index || "rulebricks-app-logs" },
          tls: { verify_certificate: es.verifyCertificate ?? true },
        };
        const authMode = es.authMode ?? "basic";
        if (authMode === "basic") {
          esSink.auth = { strategy: "basic", user: es.username, password: es.password };
        } else if (authMode === "api-key") {
          esSink.request = { headers: { Authorization: `ApiKey ${es.apiKey}` } };
        }
        return ["elasticsearch", esSink];
      }
    }
  };
  const [sinkName, sink] = resolveSink();

  const customConfig = {
    data_dir: "/vector-data-dir",
    sources: {
      kubernetes_logs: {
        type: "kubernetes_logs",
        // Skip both Vector deployments: the aggregator
        // (app.kubernetes.io/name=vector) re-emits decision logs on stdout
        // (those belong in ClickHouse, not the app-log backend) and the agent
        // itself (vector-agent) to avoid a self-scrape loop.
        extra_label_selector: "app.kubernetes.io/name notin (vector,vector-agent)",
      },
    },
    transforms: {
      app_logs: {
        type: "remap",
        inputs: ["kubernetes_logs"],
        source: VECTOR_APP_LOGS_VRL,
      },
    },
    sinks: { [sinkName]: sink },
  };

  return {
    enabled: true,
    role: "Agent",
    podLabels,
    // Follow active worker pools without tolerating shutdown, out-of-service,
    // or unreachable node taints.
    tolerations,
    resources: {
      requests: { cpu: "100m", memory: "256Mi" },
      limits: { cpu: "500m", memory: "512Mi" },
    },
    customConfig,
  };
}
1267
+ /**
1268
+ * Builds Helm values from the deployment configuration.
1269
+ */
1270
+ export function buildHelmValues(config, options = {}) {
1271
+ if (config.database.type === "self-hosted" &&
1272
+ !config.database.supabaseJwtSecret) {
1273
+ throw new Error("Self-hosted Supabase is missing a JWT secret. Run `rulebricks redeploy <name>` to regenerate deployment credentials, or set database.supabaseJwtSecret in config.yaml.");
1274
+ }
1275
+ if (config.features.ai.enabled && !config.features.ai.openaiApiKey) {
1276
+ throw new Error("AI features are enabled but the OpenAI API key is missing. Run `rulebricks redeploy <name>` and enter your OpenAI API key, or disable AI features in config.yaml.");
1277
+ }
1278
+ const { tlsEnabled = true, secretMode = "inline" } = options;
424
1279
  const useLocalGrafana = config.features.monitoring.destination === "local-grafana";
425
1280
  // Determine if external-dns should be enabled
426
1281
  const externalDnsEnabled = config.dns.autoManage && isSupportedDnsProvider(config.dns.provider);
427
- // Determine storage class based on provider
428
- // Note: GCP uses "hyperdisk-balanced" because C4A instances only support Hyperdisk (not Persistent Disk)
429
- const storageClass = config.infrastructure.provider === "aws"
430
- ? "gp3"
431
- : config.infrastructure.provider === "gcp"
432
- ? "hyperdisk-balanced"
433
- : config.infrastructure.provider === "azure"
434
- ? "managed-premium"
435
- : "gp3";
436
- // ARM64 tolerations for GKE C4A nodes (and other ARM64 providers)
437
- // GKE automatically taints ARM64 nodes with kubernetes.io/arch=arm64:NoSchedule
438
- const arm64Tolerations = [
439
- {
440
- key: "kubernetes.io/arch",
441
- operator: "Equal",
442
- value: "arm64",
443
- effect: "NoSchedule",
444
- },
1282
+ const gcpDiskType = config.infrastructure.nodeArchitecture === "amd64"
1283
+ ? "pd-balanced"
1284
+ : "hyperdisk-balanced";
1285
+ // Prefer the live cluster's StorageClass. Provider defaults are only a
1286
+ // fallback for legacy configs that predate capability scanning.
1287
+ const storageClass = config.infrastructure.storageClass ||
1288
+ (config.infrastructure.provider === "aws"
1289
+ ? "gp3"
1290
+ : config.infrastructure.provider === "gcp"
1291
+ ? gcpDiskType
1292
+ : config.infrastructure.provider === "azure"
1293
+ ? "managed-premium"
1294
+ : "gp3");
1295
+ const shouldApplyArm64Toleration = config.infrastructure.arm64TolerationRequired ?? false;
1296
+ const architectureTolerations = shouldApplyArm64Toleration
1297
+ ? [
1298
+ {
1299
+ key: "kubernetes.io/arch",
1300
+ operator: "Equal",
1301
+ value: "arm64",
1302
+ effect: "NoSchedule",
1303
+ },
1304
+ ]
1305
+ : undefined;
1306
+ const coreScheduling = generateScheduling(architectureTolerations);
1307
+ // Workers always tolerate + softly prefer the optional burst pool
1308
+ // (rulebricks.com/pool=burst). The preference is soft, so clusters without a
1309
+ // burst pool schedule workers on ordinary capacity exactly as before.
1310
+ const workerTolerations = [
1311
+ ...(architectureTolerations ?? []),
1312
+ BURST_POOL_TOLERATION,
445
1313
  ];
1314
+ const operationalDaemonSetTolerations = workerTolerations;
1315
+ const workerScheduling = generateScheduling(workerTolerations, {
1316
+ ...generateWorkerPodAntiAffinity(),
1317
+ nodeAffinity: {
1318
+ preferredDuringSchedulingIgnoredDuringExecution: [
1319
+ BURST_POOL_NODE_PREFERENCE,
1320
+ ],
1321
+ },
1322
+ });
1323
+ const infrastructurePodLabels = {
1324
+ "rulebricks.com/workload-group": "infrastructure",
1325
+ };
1326
+ const applicationPodLabels = {
1327
+ "rulebricks.com/workload-group": "application",
1328
+ };
1329
+ const productVersion = config.version;
1330
+ // Scheduling priority tiers. The chart creates release-scoped
1331
+ // PriorityClasses (<release>-critical / <release>-burst); stateful
1332
+ // infrastructure references the critical class so it can always preempt
1333
+ // burst workers to reschedule, and workers reference the burst class so
1334
+ // they are strictly the first preemption victims. Subchart values cannot
1335
+ // template release names, so the CLI emits them as literals.
1336
+ const releaseName = getReleaseName(config.name);
1337
+ const criticalPriorityClass = `${releaseName}-critical`;
1338
+ const burstPriorityClass = `${releaseName}-burst`;
1339
+ // Subcharts that don't honor global.imagePullSecrets (keda, strimzi, traefik,
1340
+ // vector) need the pull secret on their own key so their pods can pull the
1341
+ // private docker.io/rulebricks/* images from index.docker.io.
1342
+ const rulebricksPullSecret = [{ name: `${releaseName}-regcred` }];
1343
+ // Registry host for every image. Empty config.imageRegistry => docker.io. When
1344
+ // set, the host is rewritten into global.imageRegistry (which kube-prometheus-stack
1345
+ // and our subcharts honor) and into each of the six Tier-2 charts' own image
1346
+ // keys below, always keeping the rulebricks/<name> path.
1347
+ const reg = config.imageRegistry || DEFAULT_IMAGE_REGISTRY;
1348
+ const clickStackEnabled = isClickStackEnabled(config);
1349
+ const clickStackConfig = config.features.observability?.clickstack;
1350
+ const clickHouseStorageSize = clickStackConfig?.clickHouseStorageSize ?? "100Gi";
1351
+ // Distributed tracing (self-hosted only). Lives under global so the
1352
+ // rulebricks subchart deployments can read it; the collector + traefik are
1353
+ // wired below from the same source.
1354
+ const tracingGlobal = clickStackEnabled ? undefined : generateTracingGlobal(config);
1355
+ // Never let the cluster-autoscaler evict single-replica stateful pods
1356
+ // during node scale-down; an evicted broker/db stalls the whole pipeline.
1357
+ const safeToEvictAnnotations = {
1358
+ "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
1359
+ };
446
1360
  // Build global.supabase configuration
447
1361
  const supabaseGlobalConfig = config.database.type === "supabase-cloud"
448
1362
  ? {
@@ -452,27 +1366,51 @@ export async function generateHelmValues(config, options = {}) {
452
1366
  accessToken: config.database.supabaseAccessToken || undefined,
453
1367
  projectRef: config.database.supabaseProjectRef || undefined,
454
1368
  }
455
- : {
456
- jwtSecret: config.database.supabaseJwtSecret || undefined,
457
- anonKey: undefined,
458
- serviceKey: undefined,
1369
+ : (() => {
1370
+ const jwtSecret = config.database.supabaseJwtSecret || "";
1371
+ return {
1372
+ jwtSecret: jwtSecret || undefined,
1373
+ anonKey: jwtSecret ? signSupabaseJwt("anon", jwtSecret) : undefined,
1374
+ serviceKey: jwtSecret
1375
+ ? signSupabaseJwt("service_role", jwtSecret)
1376
+ : undefined,
1377
+ };
1378
+ })();
1379
+ // Always emit email configuration so auth pods receive template/subject env
1380
+ // vars regardless of Helm merge order. Custom values take precedence over
1381
+ // built-in defaults when explicitly enabled.
1382
+ const customEmails = config.features.customEmails;
1383
+ if (customEmails?.enabled &&
1384
+ customEmails.subjects &&
1385
+ customEmails.templates) {
1386
+ supabaseGlobalConfig.emails = {
1387
+ subjects: {
1388
+ invite: customEmails.subjects.invite,
1389
+ confirmation: customEmails.subjects.confirmation,
1390
+ recovery: customEmails.subjects.recovery,
1391
+ emailChange: customEmails.subjects.emailChange,
1392
+ },
1393
+ templates: {
1394
+ invite: customEmails.templates.invite,
1395
+ confirmation: customEmails.templates.confirmation,
1396
+ recovery: customEmails.templates.recovery,
1397
+ emailChange: customEmails.templates.emailChange,
1398
+ },
459
1399
  };
460
- // Add custom email templates if enabled
461
- if (config.features.customEmails?.enabled &&
462
- config.features.customEmails.subjects &&
463
- config.features.customEmails.templates) {
1400
+ }
1401
+ else {
464
1402
  supabaseGlobalConfig.emails = {
465
1403
  subjects: {
466
- invite: config.features.customEmails.subjects.invite,
467
- confirmation: config.features.customEmails.subjects.confirmation,
468
- recovery: config.features.customEmails.subjects.recovery,
469
- emailChange: config.features.customEmails.subjects.emailChange,
1404
+ invite: "Join your team on Rulebricks",
1405
+ confirmation: "Confirm Your Email",
1406
+ recovery: "Reset Your Password",
1407
+ emailChange: "Confirm Email Change",
470
1408
  },
471
1409
  templates: {
472
- invite: config.features.customEmails.templates.invite,
473
- confirmation: config.features.customEmails.templates.confirmation,
474
- recovery: config.features.customEmails.templates.recovery,
475
- emailChange: config.features.customEmails.templates.emailChange,
1410
+ invite: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/invite.html",
1411
+ confirmation: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/verify.html",
1412
+ recovery: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/password_change.html",
1413
+ emailChange: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/email_change.html",
476
1414
  },
477
1415
  };
478
1416
  }
@@ -485,7 +1423,30 @@ export async function generateHelmValues(config, options = {}) {
485
1423
  email: config.adminEmail,
486
1424
  tlsEnabled,
487
1425
  licenseKey: config.licenseKey,
1426
+ // Pull secret for the private docker.io/rulebricks/* images. References the
1427
+ // license registry secret <release>-regcred (index.docker.io, authed by the
1428
+ // license PAT). kube-prometheus-stack + cert-manager honor this global value;
1429
+ // keda, traefik, vector and the strimzi operator each get the same secret on
1430
+ // their own key below.
1431
+ imagePullSecrets: [{ name: `${releaseName}-regcred` }],
1432
+ // Single registry-host override (empty => docker.io/rulebricks/*). Honored by
1433
+ // kube-prometheus-stack and our subcharts; the CLI also rewrites the host into
1434
+ // the other Tier-2 charts' native image keys below.
1435
+ ...(config.imageRegistry ? { imageRegistry: config.imageRegistry } : {}),
1436
+ // Generated name->sha256 digest map (empty until the helm repo's mirror
1437
+ // pipeline populates IMAGE_DIGESTS). When a name is present the chart image
1438
+ // helper pins @sha256 instead of :tag.
1439
+ imageDigests: IMAGE_DIGESTS,
1440
+ ...(productVersion && SEMVER_PATTERN.test(productVersion)
1441
+ ? { version: productVersion }
1442
+ : {}),
488
1443
  externalDnsEnabled,
1444
+ // Scheduling priority tiers (the chart renders release-scoped
1445
+ // <release>-critical and <release>-burst PriorityClasses).
1446
+ priorityClasses: { enabled: true },
1447
+ clickstack: {
1448
+ enabled: clickStackEnabled,
1449
+ },
489
1450
  // SMTP Configuration
490
1451
  smtp: {
491
1452
  host: config.smtp.host,
@@ -516,62 +1477,164 @@ export async function generateHelmValues(config, options = {}) {
516
1477
  : {
517
1478
  enabled: false,
518
1479
  },
1480
+ storage: config.storage
1481
+ ? {
1482
+ // One provider, one identity, one bucket/container. decision-logs and
1483
+ // db-backups are key prefixes under paths.* within it.
1484
+ provider: config.storage.provider,
1485
+ bucket: config.storage.bucket,
1486
+ region: config.storage.region,
1487
+ s3: {
1488
+ iamRoleArn: config.storage.awsIamRoleArn || "",
1489
+ existingSecret: { name: "" },
1490
+ },
1491
+ azure: {
1492
+ authMode: config.storage.cloudAuthMode === "secret"
1493
+ ? "connection-string"
1494
+ : "workload-identity",
1495
+ clientId: config.storage.azureBlobClientId || "",
1496
+ tenantId: config.storage.azureBlobTenantId || "",
1497
+ container: config.storage.azureBlobContainer || "",
1498
+ connectionStringSecretRef: config.storage.azureBlobConnectionStringSecretRef || {
1499
+ name: "",
1500
+ key: "",
1501
+ },
1502
+ },
1503
+ gcp: {
1504
+ serviceAccountEmail: config.storage.gcpServiceAccountEmail || "",
1505
+ },
1506
+ paths: {
1507
+ decisionLogs: config.storage.paths?.decisionLogs || "decision-logs",
1508
+ dbBackups: config.storage.paths?.dbBackups || "db-backups",
1509
+ },
1510
+ }
1511
+ : undefined,
1512
+ // Distributed tracing (omitted entirely when disabled).
1513
+ ...(tracingGlobal ? { tracing: tracingGlobal } : {}),
519
1514
  },
1515
+ clickstack: generateClickStackValues(clickStackEnabled, config, storageClass, infrastructurePodLabels, operationalDaemonSetTolerations),
1516
+ backup: generateBackupValues(config),
520
1517
  // =============================================================================
521
1518
  // RULEBRICKS APPLICATION STACK
522
1519
  // =============================================================================
523
1520
  rulebricks: {
524
- app: {
525
- ...(config.appVersion
526
- ? {
527
- image: {
528
- repository: "index.docker.io/rulebricks/app",
529
- tag: config.appVersion,
530
- pullPolicy: "IfNotPresent",
531
- },
532
- }
533
- : {}),
534
- replicaCount: tierConfig.appReplicas,
535
- resources: tierConfig.appResources,
536
- tolerations: arm64Tolerations,
537
- // Logging configuration
538
- logging: {
1521
+ metrics: {
1522
+ enabled: true,
1523
+ serviceMonitor: {
539
1524
  enabled: true,
540
- kafkaBrokers: "", // Auto-discover from Kafka subchart
541
- kafkaTopic: "logs",
542
- loggingDestination: getLoggingDestinationLabel(config.features.logging.sink),
1525
+ interval: "30s",
1526
+ scrapeTimeout: "10s",
1527
+ },
1528
+ app: {
1529
+ path: "/api/metrics",
543
1530
  },
1531
+ hps: {
1532
+ path: "/metrics",
1533
+ },
1534
+ worker: {
1535
+ path: "/metrics",
1536
+ port: 3000,
1537
+ },
1538
+ },
1539
+ app: {
1540
+ image: {
1541
+ // Split shape: the rulebricks-chart.image helper applies
1542
+ // global.imageRegistry to the host + digest pinning. The host NEVER
1543
+ // goes in repository.
1544
+ registry: reg,
1545
+ repository: IMAGE_REPOSITORIES.app,
1546
+ pullPolicy: "IfNotPresent",
1547
+ },
1548
+ // Replica count and resources fall back to the chart defaults.
1549
+ podLabels: infrastructurePodLabels,
1550
+ ...coreScheduling,
1551
+ // Logging configuration (in-cluster auto-discovery or external Kafka)
1552
+ logging: generateAppLogging(config),
544
1553
  },
545
1554
  // HPS (High Performance Server)
546
1555
  hps: {
547
1556
  enabled: true,
548
- ...(config.hpsVersion
549
- ? {
550
- image: {
551
- repository: "index.docker.io/rulebricks/hps",
552
- tag: config.hpsVersion,
553
- pullPolicy: "Always",
554
- },
555
- }
556
- : {}),
557
- replicas: tierConfig.hpsReplicas,
558
- resources: tierConfig.hpsResources,
559
- tolerations: arm64Tolerations,
1557
+ image: {
1558
+ // Split shape (see app.image): host comes from global.imageRegistry via
1559
+ // the rulebricks-chart.image helper, never baked into repository.
1560
+ registry: reg,
1561
+ repository: IMAGE_REPOSITORIES.hps,
1562
+ pullPolicy: "Always",
1563
+ },
1564
+ // Replica count and resources fall back to the chart defaults.
1565
+ podLabels: applicationPodLabels,
1566
+ ...coreScheduling,
1567
+ // Gather-plane autoscaling: HPS parses every chunk response, so its
1568
+ // capacity scales with request rate (load testing showed a fixed
1569
+ // gather plane plateaus throughput while workers idle). Conservative
1570
+ // one-pod-at-a-time scaling - each scale event rebalances the
1571
+ // response consumer group and can time out in-flight requests. Only the
1572
+ // enable flag is set here; min/max and thresholds use the chart
1573
+ // defaults.
1574
+ keda: {
1575
+ enabled: true,
1576
+ },
1577
+ // Warm the hps/worker images onto active worker-capable nodes so burst
1578
+ // scale-outs skip the image pull without targeting shutdown nodes.
1579
+ imagePrepull: {
1580
+ enabled: true,
1581
+ tolerations: operationalDaemonSetTolerations,
1582
+ },
1583
+ extraEnv: [
1584
+ // FLOW_CHUNK_MAX_ITEMS is the #1 throughput dial. Each chunk is one
1585
+ // Kafka round-trip (gather -> solution -> worker -> solution-response
1586
+ // -> gather), so throughput ~= (broker messages/sec) x (payloads per
1587
+ // message). Bigger chunks = fewer messages per solution = less broker
1588
+ // and coordination overhead. Benchmarks: 10 -> 50 gave +27%, and on
1589
+ // small payloads 100 -> 1000 gave another ~1.6x (22k -> 35k sol/s),
1590
+ // until the bottleneck moved off the broker onto worker CPU.
1591
+ // 500 keeps typical bulk requests to 1-2 messages. The byte bound
1592
+ // (CHUNK_MAX_BYTES, default 256 KiB in HPS) caps message size
1593
+ // regardless, so large payloads stay under Kafka's 2 MiB
1594
+ // max.message.bytes. High-throughput, small-payload deployments can
1595
+ // raise this much higher (and CHUNK_MAX_BYTES with it); the only costs
1596
+ // are per-request latency (one worker processes a whole chunk) and the
1597
+ // 2 MiB cap on the larger response message (avg output x chunk size
1598
+ // must stay < 2 MiB, so lower this for output-heavy flows).
1599
+ { name: "FLOW_CHUNK_MAX_ITEMS", value: "500" },
1600
+ ],
1601
+ // Service account (annotated with the MSK IAM role for external Kafka)
1602
+ serviceAccount: generateHpsServiceAccount(config),
560
1603
  // HPS Workers with KEDA autoscaling
561
1604
  workers: {
562
1605
  enabled: true,
563
- replicas: tierConfig.hpsWorkerReplicas.min,
1606
+ // Workers consume the solution topic directly, so under external MSK
1607
+ // IAM they need their own cloud identity - not the shared/default SA.
1608
+ // Same rule as HPS: a dedicated `<release>-hps-worker` SA (no role-arn
1609
+ // annotation) that the CLI's workload-identity step binds to the Kafka
1610
+ // role via Pod Identity.
1611
+ serviceAccount: generateHpsServiceAccount(config),
1612
+ // Partition count of the solution request topic (also exported to
1613
+ // HPS as MAX_WORKERS). Must match kafka.provisioning above; it is
1614
+ // the fleet-concurrency ceiling, NOT a worker count. Replica count
1615
+ // and resources fall back to the chart defaults.
1616
+ solutionPartitions: SOLUTION_TOPIC_PARTITIONS,
564
1617
  keda: {
565
1618
  enabled: true,
566
- minReplicaCount: tierConfig.hpsWorkerReplicas.min,
567
- maxReplicaCount: tierConfig.hpsWorkerReplicas.max,
568
- pollingInterval: 10,
1619
+ // Poll fast so bursts are detected within seconds; the chart's
1620
+ // ScaledObject defaults add exponential scale-up (double every
1621
+ // 15s) and smooth scale-down (5-min window, -25%/min) behavior.
1622
+ // min/max replica counts fall back to the chart defaults.
1623
+ pollingInterval: 5,
569
1624
  cooldownPeriod: 300,
1625
+ // Lag is measured in MESSAGES; with chunked bulk dispatch each
1626
+ // message is a bounded unit of work (~50-150ms), so 50 messages
1627
+ // approximates 5-8s of backlog for a single worker - one replica
1628
+ // is added per ~5s of fleet backlog, biasing toward early
1629
+ // scale-out for bursty traffic.
570
1630
  lagThreshold: 50,
571
1631
  cpuThreshold: 25,
572
1632
  },
573
- resources: tierConfig.hpsWorkerResources,
574
- tolerations: arm64Tolerations,
1633
+ podLabels: applicationPodLabels,
1634
+ // Burst tier: first preemption victims, so critical infrastructure
1635
+ // can always reschedule during an aggressive scale-out.
1636
+ priorityClassName: burstPriorityClass,
1637
+ ...workerScheduling,
575
1638
  },
576
1639
  },
577
1640
  // Ingress configuration
@@ -580,74 +1643,138 @@ export async function generateHelmValues(config, options = {}) {
580
1643
  className: "traefik",
581
1644
  paths: [{ path: "/", pathType: "Prefix" }],
582
1645
  },
583
- // Redis configuration
584
- redis: {
585
- resources: tierConfig.redisResources,
586
- tolerations: arm64Tolerations,
587
- persistence: {
588
- enabled: true,
589
- size: tierConfig.redisPersistenceSize,
590
- storageClass: storageClass,
591
- },
592
- },
1646
+ // Redis configuration (in-cluster sizing or external connection settings)
1647
+ redis: generateRedisBlock(config, storageClass, infrastructurePodLabels, coreScheduling),
1648
+ cache: generateCacheObservabilityBlock(config, infrastructurePodLabels),
1649
+ kafkaExporter: generateKafkaExporterBlock(config, infrastructurePodLabels),
593
1650
  },
594
1651
  // =============================================================================
595
1652
  // KAFKA (Message Queue)
596
1653
  // =============================================================================
597
1654
  kafka: {
598
- enabled: true,
599
- // KRaft mode (no Zookeeper)
600
- kraft: {
1655
+ enabled: !isExternalKafka(config),
1656
+ // Apache Kafka version (must be one the bundled DHI Strimzi operator
1657
+ // supports; DHI strimzi 1.0.1 ships Kafka 4.2.0).
1658
+ version: "4.2.0",
1659
+ // Single combined controller+broker node (KRaft, no ZooKeeper).
1660
+ replicas: TOPIC_REPLICATION_FACTOR,
1661
+ storage: {
1662
+ size: "20Gi",
1663
+ class: storageClass,
1664
+ },
1665
+ // Critical tier: the broker must always be able to preempt burst workers.
1666
+ priorityClassName: criticalPriorityClass,
1667
+ config: generateKafkaConfig(),
1668
+ jvm: {
1669
+ xms: "1g",
1670
+ xmx: "1g",
1671
+ extraOpts: {
1672
+ UseZGC: "true",
1673
+ AlwaysPreTouch: "true",
1674
+ MaxDirectMemorySize: "256M",
1675
+ },
1676
+ },
1677
+ metrics: {
601
1678
  enabled: true,
1679
+ serviceMonitor: { enabled: true },
602
1680
  },
603
- zookeeper: {
604
- enabled: false,
1681
+ // Topics, reconciled by the Strimzi Topic Operator (KafkaTopic CRs) for the
1682
+ // in-cluster broker, or created by the kafka-topic-provision Job for an
1683
+ // external MSK IAM broker.
1684
+ topics: generateKafkaTopics(config),
1685
+ // When false, the chart never creates topics on an external broker - the
1686
+ // operator manages them (and the workload role needs no CreateTopic).
1687
+ provisioning: {
1688
+ enabled: config.externalServices?.kafka?.external?.provisionTopics ?? true,
605
1689
  },
606
- // Kafka broker configuration
607
- overrideConfiguration: {
608
- "auto.create.topics.enable": "true",
609
- "log.retention.hours": "24",
610
- "default.replication.factor": String(tierConfig.kafkaReplication),
611
- "offsets.topic.replication.factor": String(tierConfig.kafkaReplication),
612
- "num.partitions": String(tierConfig.hpsWorkerReplicas.max), // Match max workers for parallel consumption
613
- },
614
- controller: {
615
- replicaCount: tierConfig.kafkaReplication,
616
- resources: tierConfig.kafkaResources,
617
- tolerations: arm64Tolerations,
618
- persistence: {
1690
+ },
1691
+ // Strimzi operator: pull secret so the operator pod pulls the private
1692
+ // rulebricks/* image from index.docker.io.
1693
+ "strimzi-kafka-operator": {
1694
+ image: { imagePullSecrets: rulebricksPullSecret },
1695
+ },
1696
+ // =============================================================================
1697
+ // VECTOR KAFKA BRIDGE (AWS MSK IAM token auth)
1698
+ // =============================================================================
1699
+ kafkaBridge: generateKafkaBridge(config),
1700
+ clickhouse: {
1701
+ enabled: true,
1702
+ // Critical tier: single replica must preempt burst workers to
1703
+ // reschedule; never autoscaler-evicted on scale-down.
1704
+ priorityClassName: criticalPriorityClass,
1705
+ podAnnotations: safeToEvictAnnotations,
1706
+ auth: {
1707
+ username: "rulebricks",
1708
+ password: "",
1709
+ existingSecret: '{{ printf "%s-clickhouse-credentials" .Release.Name }}',
1710
+ existingSecretKey: "admin-password",
1711
+ },
1712
+ persistence: clickStackEnabled
1713
+ ? {
619
1714
  enabled: true,
620
- size: tierConfig.kafkaStorage,
621
1715
  storageClass: storageClass,
1716
+ size: clickHouseStorageSize,
1717
+ }
1718
+ : { enabled: false },
1719
+ resources: clickStackEnabled
1720
+ ? {
1721
+ requests: { cpu: "1000m", memory: "4Gi" },
1722
+ limits: { cpu: "4", memory: "12Gi" },
1723
+ }
1724
+ : {
1725
+ requests: { cpu: "500m", memory: "2Gi" },
1726
+ limits: { cpu: "2", memory: "6Gi" },
622
1727
  },
623
- heapOpts: tierConfig.kafkaHeapOpts,
624
- extraEnvVars: generateKafkaExtraEnvVars(),
1728
+ serviceAccount: {
1729
+ create: true,
1730
+ annotations: {},
625
1731
  },
626
- listeners: {
627
- client: {
628
- protocol: "PLAINTEXT",
629
- },
630
- controller: {
631
- protocol: "PLAINTEXT",
632
- },
633
- interbroker: {
634
- protocol: "PLAINTEXT",
1732
+ metrics: {
1733
+ enabled: true,
1734
+ serviceMonitor: {
1735
+ enabled: true,
635
1736
  },
636
1737
  },
1738
+ queryLimits: {
1739
+ maxMemoryUsage: 4294967296,
1740
+ maxThreads: 4,
1741
+ maxExecutionTime: 120,
1742
+ maxRowsToRead: 50000000,
1743
+ readOverflowMode: "break",
1744
+ },
1745
+ otelQueryLimits: {
1746
+ maxMemoryUsage: 4294967296,
1747
+ maxThreads: 8,
1748
+ maxExecutionTime: 120,
1749
+ },
1750
+ otelDatabase: "otel",
1751
+ // config.d / users.d / the decision-log view are rendered by the parent
1752
+ // chart's clickhouse templates (no longer passed as Bitnami subchart values).
637
1753
  },
638
1754
  // =============================================================================
639
1755
  // TRAEFIK (Ingress Controller)
640
1756
  // =============================================================================
641
1757
  traefik: {
642
1758
  enabled: true,
1759
+ // traefik has no global.imageRegistry path: set registry + repository
1760
+ // directly (host = reg, rulebricks/* path).
1761
+ image: {
1762
+ registry: reg,
1763
+ repository: IMAGE_REPOSITORIES.traefik,
1764
+ },
1765
+ deployment: {
1766
+ imagePullSecrets: rulebricksPullSecret,
1767
+ },
643
1768
  ingressClass: {
644
1769
  name: "traefik",
645
1770
  },
646
- tolerations: arm64Tolerations,
1771
+ ...coreScheduling,
647
1772
  autoscaling: {
648
1773
  enabled: true,
649
1774
  minReplicas: 1,
650
- maxReplicas: 2,
1775
+ // Headroom for colocated clients pushing multi-hundred-RPS bulk
1776
+ // traffic through the ingress.
1777
+ maxReplicas: 4,
651
1778
  },
652
1779
  resources: {
653
1780
  requests: {
@@ -670,11 +1797,26 @@ export async function generateHelmValues(config, options = {}) {
670
1797
  websecure: {
671
1798
  port: 8443,
672
1799
  exposedPort: 443,
673
- tls: {
674
- enabled: tlsEnabled,
1800
+ // traefik 41.x moved per-entrypoint TLS under ports.<name>.http.tls
1801
+ // (the old ports.<name>.tls location is rejected by the chart schema).
1802
+ http: {
1803
+ tls: {
1804
+ enabled: tlsEnabled,
1805
+ },
1806
+ },
1807
+ },
1808
+ },
1809
+ metrics: {
1810
+ prometheus: {
1811
+ enabled: true,
1812
+ serviceMonitor: {
1813
+ enabled: false,
675
1814
  },
676
1815
  },
677
1816
  },
1817
+ // OTLP tracing: ingress becomes the root span and propagates traceparent
1818
+ // to backends. Empty object when tracing is disabled.
1819
+ tracing: generateTraefikTracing(config, releaseName),
678
1820
  persistence: {
679
1821
  enabled: false,
680
1822
  },
@@ -684,7 +1826,29 @@ export async function generateHelmValues(config, options = {}) {
684
1826
  // =============================================================================
685
1827
  keda: {
686
1828
  enabled: true,
687
- tolerations: arm64Tolerations,
1829
+ imagePullSecrets: rulebricksPullSecret,
1830
+ // keda reads global.image.registry (NOT global.imageRegistry) for the host;
1831
+ // set it plus the rulebricks/* repositories for all three sub-images.
1832
+ global: {
1833
+ image: {
1834
+ registry: reg,
1835
+ },
1836
+ },
1837
+ image: {
1838
+ keda: {
1839
+ registry: reg,
1840
+ repository: IMAGE_REPOSITORIES.keda,
1841
+ },
1842
+ metricsApiServer: {
1843
+ registry: reg,
1844
+ repository: IMAGE_REPOSITORIES.kedaMetricsApiServer,
1845
+ },
1846
+ webhooks: {
1847
+ registry: reg,
1848
+ repository: IMAGE_REPOSITORIES.kedaAdmissionWebhooks,
1849
+ },
1850
+ },
1851
+ ...coreScheduling,
688
1852
  crds: {
689
1853
  install: false, // CRDs managed in parent chart
690
1854
  },
@@ -694,13 +1858,41 @@ export async function generateHelmValues(config, options = {}) {
694
1858
  // =============================================================================
695
1859
  "cert-manager": {
696
1860
  enabled: tlsEnabled,
697
- installCRDs: false, // CRDs managed in parent chart
698
- tolerations: arm64Tolerations,
1861
+ // CRDs managed in parent chart (cert-manager v1.15+ uses crds.enabled,
1862
+ // not the deprecated installCRDs flag).
1863
+ crds: { enabled: false },
1864
+ // cert-manager prepends image.registry to image.repository, so set both per
1865
+ // component (host = reg, rulebricks/cert-manager-* path).
1866
+ image: {
1867
+ registry: reg,
1868
+ repository: IMAGE_REPOSITORIES.certManagerController,
1869
+ },
1870
+ ...coreScheduling,
699
1871
  webhook: {
700
- tolerations: arm64Tolerations,
1872
+ image: {
1873
+ registry: reg,
1874
+ repository: IMAGE_REPOSITORIES.certManagerWebhook,
1875
+ },
1876
+ ...coreScheduling,
701
1877
  },
702
1878
  cainjector: {
703
- tolerations: arm64Tolerations,
1879
+ image: {
1880
+ registry: reg,
1881
+ repository: IMAGE_REPOSITORIES.certManagerCainjector,
1882
+ },
1883
+ ...coreScheduling,
1884
+ },
1885
+ startupapicheck: {
1886
+ image: {
1887
+ registry: reg,
1888
+ repository: IMAGE_REPOSITORIES.certManagerStartupapicheck,
1889
+ },
1890
+ },
1891
+ acmesolver: {
1892
+ image: {
1893
+ registry: reg,
1894
+ repository: IMAGE_REPOSITORIES.certManagerAcmesolver,
1895
+ },
704
1896
  },
705
1897
  },
706
1898
  // Cluster Issuer for Let's Encrypt
@@ -714,12 +1906,20 @@ export async function generateHelmValues(config, options = {}) {
714
1906
  // =============================================================================
715
1907
  vector: {
716
1908
  enabled: true,
1909
+ // vector's image.repository is the FULL path including host (no separate
1910
+ // registry field), so the reg host is prefixed here.
1911
+ image: {
1912
+ repository: `${reg}/${IMAGE_REPOSITORIES.vector}`,
1913
+ pullSecrets: rulebricksPullSecret,
1914
+ },
717
1915
  role: "Stateless-Aggregator",
718
- replicas: tierConfig.vectorReplicas,
719
- resources: tierConfig.vectorResources,
720
- tolerations: arm64Tolerations,
1916
+ // Replica count and resources fall back to the chart defaults.
1917
+ ...coreScheduling,
721
1918
  serviceAccount: generateVectorServiceAccount(config),
722
1919
  podLabels: generateVectorPodLabels(config),
1920
+ ...(generateVectorExtraContainers(config)
1921
+ ? { extraContainers: generateVectorExtraContainers(config) }
1922
+ : {}),
723
1923
  service: {
724
1924
  enabled: true,
725
1925
  ports: [{ name: "api", port: 8686, protocol: "TCP", targetPort: 8686 }],
@@ -731,90 +1931,274 @@ export async function generateHelmValues(config, options = {}) {
731
1931
  kafka: {
732
1932
  type: "kafka",
733
1933
  bootstrap_servers: "${KAFKA_BOOTSTRAP_SERVERS:-rulebricks-kafka:9092}",
734
- topics: ["logs"],
1934
+ // KAFKA_LOG_TOPIC carries the namespace prefix (e.g. com.rulebricks.logs).
1935
+ topics: ["${KAFKA_LOG_TOPIC:-logs}"],
735
1936
  group_id: "vector-consumers",
736
1937
  auto_offset_reset: "latest",
1938
+ // TLS + SASL driven by env from vector-kafka-env (disabled for
1939
+ // in-cluster Kafka and the kafka-proxy bridge path).
1940
+ tls: { enabled: "${KAFKA_TLS_ENABLED:-false}" },
1941
+ sasl: {
1942
+ enabled: "${KAFKA_SASL_ENABLED:-false}",
1943
+ mechanism: "${KAFKA_SASL_MECHANISM:-PLAIN}",
1944
+ // username/password are only emitted for external Kafka using a
1945
+ // direct PLAIN/SCRAM credential (where vector-kafka-credentials is
1946
+ // populated). Emitting them with an empty default would render as
1947
+ // YAML null and crash Vector at config load; omitting the keys
1948
+ // leaves them unset (valid) whenever SASL is disabled.
1949
+ ...(kafkaUsesDirectSasl(config)
1950
+ ? {
1951
+ username: "${KAFKA_SASL_USERNAME}",
1952
+ password: "${KAFKA_SASL_PASSWORD}",
1953
+ }
1954
+ : {}),
1955
+ },
1956
+ },
1957
+ },
1958
+ transforms: {
1959
+ normalize_logs: {
1960
+ type: "remap",
1961
+ inputs: ["kafka"],
1962
+ source: VECTOR_NORMALIZE_LOGS_VRL,
737
1963
  },
738
1964
  },
739
1965
  sinks: generateVectorSinks(config),
740
1966
  },
741
1967
  },
742
1968
  // =============================================================================
1969
+ // VECTOR AGENT (Application / container logs -> Elasticsearch)
1970
+ // =============================================================================
1971
+ "vector-agent": clickStackEnabled
1972
+ ? { enabled: false }
1973
+ : {
1974
+ ...generateVectorAgent(config, infrastructurePodLabels, operationalDaemonSetTolerations),
1975
+ // Full-path repository (see vector above) + pull secret.
1976
+ image: {
1977
+ repository: `${reg}/${IMAGE_REPOSITORIES.vector}`,
1978
+ pullSecrets: rulebricksPullSecret,
1979
+ },
1980
+ },
1981
+ // =============================================================================
743
1982
  // SUPABASE (Self-hosted Database)
744
1983
  // =============================================================================
745
1984
  supabase: {
746
1985
  enabled: config.database.type === "self-hosted",
747
1986
  ...(config.database.type === "self-hosted"
748
- ? {
749
- secret: {
750
- db: {
751
- username: "postgres",
752
- password: config.database.supabaseDbPassword,
753
- database: "postgres",
1987
+ ? (() => {
1988
+ // External managed Postgres (AWS RDS / Azure Flexible Server): the
1989
+ // self-hosted Supabase services run against it instead of the
1990
+ // bundled in-cluster database.
1991
+ const pgExt = config.externalServices?.postgres?.mode === "external"
1992
+ ? config.externalServices?.postgres?.external
1993
+ : undefined;
1994
+ return {
1995
+ secret: {
1996
+ db: {
1997
+ username: "postgres",
1998
+ // Shared service-role password (authenticator / auth_admin /
1999
+ // replication_admin). With an external DB the bootstrap hook
2000
+ // sets the roles to this same value.
2001
+ password: config.database.supabaseDbPassword,
2002
+ database: pgExt?.database || "postgres",
2003
+ },
2004
+ dashboard: {
2005
+ username: config.database.supabaseDashboardUser || "supabase",
2006
+ password: config.database.supabaseDashboardPass,
2007
+ },
2008
+ jwt: {
2009
+ secret: config.database.supabaseJwtSecret,
2010
+ },
2011
+ // SECRET_KEY_BASE / DB_ENC_KEY, derived from the JWT secret
2012
+ // (stable across redeploys). The chart no longer ships defaults.
2013
+ realtime: deriveRealtimeSecrets(config.database.supabaseJwtSecret || ""),
754
2014
  },
755
- dashboard: {
756
- username: config.database.supabaseDashboardUser || "supabase",
757
- password: config.database.supabaseDashboardPass,
2015
+ ...(pgExt
2016
+ ? {
2017
+ // One switch: enabling externalDatabase disables the bundled
2018
+ // Postgres and runs the bootstrap hook to initialize the
2019
+ // managed instance. db.enabled=false is explicit so chart
2020
+ // schema rules keyed off it hold.
2021
+ db: { enabled: false },
2022
+ externalDatabase: {
2023
+ enabled: true,
2024
+ host: pgExt.host ?? "",
2025
+ port: pgExt.port ?? 5432,
2026
+ bootstrap: {
2027
+ enabled: pgExt.bootstrap?.enabled ?? true,
2028
+ masterUsername: pgExt.bootstrap?.masterUsername ?? "postgres",
2029
+ masterPassword: pgExt.bootstrap?.masterPassword ?? "",
2030
+ appRole: pgExt.bootstrap?.appRole ?? "postgres",
2031
+ },
2032
+ },
2033
+ }
2034
+ : {
2035
+ db: {
2036
+ // Explicit so chart schema rules that key off
2037
+ // supabase.db.enabled (e.g. Database Backup Storage
2038
+ // Validation) hold without relying on subchart-default
2039
+ // coalescing.
2040
+ enabled: true,
2041
+ image: {
2042
+ // Split shape: the supabase.image helper applies
2043
+ // global.imageRegistry to the host. Host never in repository.
2044
+ registry: reg,
2045
+ repository: SUPABASE_POSTGRES_IMAGE_REPOSITORY,
2046
+ tag: SUPABASE_POSTGRES_IMAGE_TAG,
2047
+ pullPolicy: "IfNotPresent",
2048
+ },
2049
+ podLabels: infrastructurePodLabels,
2050
+ // Critical tier: the primary datastore must preempt burst
2051
+ // workers to reschedule; never autoscaler-evicted.
2052
+ // Resources and persistence size fall back to chart
2053
+ // defaults.
2054
+ priorityClassName: criticalPriorityClass,
2055
+ podAnnotations: safeToEvictAnnotations,
2056
+ ...coreScheduling,
2057
+ persistence: {
2058
+ enabled: true,
2059
+ storageClassName: storageClass,
2060
+ },
2061
+ },
2062
+ }),
2063
+ auth: {
2064
+ // Explicit public URLs so GoTrue never falls back to the
2065
+ // in-cluster Kong service name when global.domain propagation
2066
+ // is lost (e.g. after manual patching or partial upgrades).
2067
+ siteUrl: `https://${config.domain}`,
2068
+ externalUrl: `https://supabase.${config.domain}`,
2069
+ ...coreScheduling,
758
2070
  },
759
- jwt: {
760
- secret: config.database.supabaseJwtSecret,
2071
+ rest: {
2072
+ ...coreScheduling,
761
2073
  },
762
- },
763
- db: {
764
- resources: tierConfig.dbResources,
765
- tolerations: arm64Tolerations,
766
- persistence: {
767
- enabled: true,
768
- size: tierConfig.dbPersistenceSize,
769
- storageClassName: storageClass,
2074
+ realtime: {
2075
+ ...coreScheduling,
770
2076
  },
771
- },
772
- auth: {
773
- tolerations: arm64Tolerations,
774
- },
775
- rest: {
776
- tolerations: arm64Tolerations,
777
- },
778
- realtime: {
779
- tolerations: arm64Tolerations,
780
- },
781
- meta: {
782
- tolerations: arm64Tolerations,
783
- },
784
- kong: {
785
- tolerations: arm64Tolerations,
786
- ingress: {
787
- enabled: true,
788
- className: "traefik",
789
- annotations: {},
2077
+ meta: {
2078
+ ...coreScheduling,
790
2079
  },
791
- },
792
- studio: {
793
- tolerations: arm64Tolerations,
794
- },
795
- }
2080
+ kong: {
2081
+ ...coreScheduling,
2082
+ ingress: {
2083
+ enabled: true,
2084
+ className: "traefik",
2085
+ // The supabase subchart's kong ingress does NOT emit Traefik's
2086
+ // router.entrypoints/router.tls annotations the way the app
2087
+ // ingress does — without them Traefik only builds a web (HTTP)
2088
+ // router, so https://supabase.<domain> 404s and the app can't
2089
+ // reach Supabase. Inject them via the subchart's annotations
2090
+ // passthrough (kong/ingress.yaml ranges over these), matching
2091
+ // charts/rulebricks/templates/ingress.yaml.
2092
+ annotations: {
2093
+ "traefik.ingress.kubernetes.io/router.entrypoints": tlsEnabled ? "websecure" : "web",
2094
+ "traefik.ingress.kubernetes.io/router.tls": tlsEnabled
2095
+ ? "true"
2096
+ : "false",
2097
+ },
2098
+ },
2099
+ },
2100
+ studio: {
2101
+ ...coreScheduling,
2102
+ },
2103
+ };
2104
+ })()
796
2105
  : {}),
797
2106
  },
798
2107
  // =============================================================================
799
2108
  // MONITORING
800
2109
  // =============================================================================
801
2110
  monitoring: {
802
- enabled: config.features.monitoring.enabled,
2111
+ enabled: true,
803
2112
  },
804
2113
  "kube-prometheus-stack": {
805
- enabled: config.features.monitoring.enabled,
2114
+ enabled: true,
2115
+ // kube-prometheus-stack honors the parent global.imageRegistry for the host
2116
+ // automatically; the CLI sets the rulebricks/* repository defaults (and the
2117
+ // reg host explicitly) for every sub-image so a bare helm install also pulls
2118
+ // rulebricks/*.
806
2119
  alertmanager: {
807
2120
  enabled: false,
2121
+ alertmanagerSpec: {
2122
+ image: {
2123
+ registry: reg,
2124
+ repository: IMAGE_REPOSITORIES.alertmanager,
2125
+ },
2126
+ },
2127
+ },
2128
+ prometheusOperator: {
2129
+ image: {
2130
+ registry: reg,
2131
+ repository: IMAGE_REPOSITORIES.prometheusOperator,
2132
+ },
2133
+ prometheusConfigReloader: {
2134
+ image: {
2135
+ registry: reg,
2136
+ repository: IMAGE_REPOSITORIES.prometheusConfigReloader,
2137
+ },
2138
+ },
2139
+ admissionWebhooks: {
2140
+ patch: {
2141
+ image: {
2142
+ registry: reg,
2143
+ repository: IMAGE_REPOSITORIES.kubeWebhookCertgen,
2144
+ },
2145
+ },
2146
+ },
2147
+ },
2148
+ "kube-state-metrics": {
2149
+ image: {
2150
+ registry: reg,
2151
+ repository: IMAGE_REPOSITORIES.kubeStateMetrics,
2152
+ },
2153
+ },
2154
+ "prometheus-node-exporter": {
2155
+ image: {
2156
+ registry: reg,
2157
+ repository: IMAGE_REPOSITORIES.nodeExporter,
2158
+ },
808
2159
  },
809
2160
  grafana: {
810
2161
  enabled: useLocalGrafana,
2162
+ image: {
2163
+ registry: reg,
2164
+ repository: IMAGE_REPOSITORIES.grafana,
2165
+ },
2166
+ // Dashboard sidecar imports the provisioned Rulebricks dashboards
2167
+ // (ConfigMaps labeled grafana_dashboard="1") when in-cluster Grafana
2168
+ // is enabled.
2169
+ sidecar: {
2170
+ image: {
2171
+ registry: reg,
2172
+ repository: IMAGE_REPOSITORIES.k8sSidecar,
2173
+ },
2174
+ ...(useLocalGrafana
2175
+ ? {
2176
+ dashboards: {
2177
+ enabled: true,
2178
+ label: "grafana_dashboard",
2179
+ labelValue: "1",
2180
+ searchNamespace: "ALL",
2181
+ folderAnnotation: "grafana_folder",
2182
+ provider: { foldersFromFilesStructure: true },
2183
+ },
2184
+ }
2185
+ : {}),
2186
+ },
811
2187
  },
812
2188
  prometheus: {
813
- enabled: config.features.monitoring.enabled,
2189
+ enabled: true,
814
2190
  serviceAccount: generatePrometheusServiceAccount(config),
815
2191
  prometheusSpec: {
816
2192
  retention: "30d",
2193
+ image: {
2194
+ registry: reg,
2195
+ repository: IMAGE_REPOSITORIES.prometheus,
2196
+ },
817
2197
  podMetadata: generatePrometheusPodMetadata(config),
2198
+ serviceMonitorSelectorNilUsesHelmValues: false,
2199
+ serviceMonitorSelector: {},
2200
+ podMonitorSelectorNilUsesHelmValues: false,
2201
+ podMonitorSelector: {},
818
2202
  storageSpec: {
819
2203
  volumeClaimTemplate: {
820
2204
  spec: {
@@ -828,7 +2212,9 @@ export async function generateHelmValues(config, options = {}) {
828
2212
  },
829
2213
  },
830
2214
  },
831
- remoteWrite: generateRemoteWriteSpec(config),
2215
+ remoteWrite: [
2216
+ ...(clickStackEnabled ? [] : generateRemoteWriteSpec(config)),
2217
+ ],
832
2218
  },
833
2219
  },
834
2220
  },
@@ -836,20 +2222,21 @@ export async function generateHelmValues(config, options = {}) {
836
2222
  // STORAGE CLASS
837
2223
  // =============================================================================
838
2224
  storageClass: {
839
- create: true,
2225
+ create: false,
840
2226
  name: storageClass,
841
- provisioner: config.infrastructure.provider === "aws"
842
- ? "ebs.csi.aws.com"
843
- : config.infrastructure.provider === "gcp"
844
- ? "pd.csi.storage.gke.io"
845
- : config.infrastructure.provider === "azure"
846
- ? "disk.csi.azure.com"
847
- : "ebs.csi.aws.com",
2227
+ provisioner: config.infrastructure.storageProvisioner ||
2228
+ (config.infrastructure.provider === "aws"
2229
+ ? "ebs.csi.aws.com"
2230
+ : config.infrastructure.provider === "gcp"
2231
+ ? "pd.csi.storage.gke.io"
2232
+ : config.infrastructure.provider === "azure"
2233
+ ? "disk.csi.azure.com"
2234
+ : "ebs.csi.aws.com"),
848
2235
  // Parameters for the StorageClass - must include type for disk provisioning
849
2236
  parameters: config.infrastructure.provider === "aws"
850
2237
  ? { type: "gp3" }
851
2238
  : config.infrastructure.provider === "gcp"
852
- ? { type: "hyperdisk-balanced" }
2239
+ ? { type: gcpDiskType }
853
2240
  : config.infrastructure.provider === "azure"
854
2241
  ? { skuName: "Premium_LRS" }
855
2242
  : { type: "gp3" },
@@ -864,7 +2251,13 @@ export async function generateHelmValues(config, options = {}) {
864
2251
  "external-dns": externalDnsEnabled
865
2252
  ? {
866
2253
  enabled: true,
867
- provider: getExternalDnsProvider(config.dns.provider),
2254
+ // external-dns has NO image.registry field: image.repository is the
2255
+ // FULL path including host (reg prefix + rulebricks/external-dns).
2256
+ image: {
2257
+ repository: `${reg}/${IMAGE_REPOSITORIES.externalDns}`,
2258
+ },
2259
+ // external-dns 1.21+ idiom: provider is an object ({name: ...}).
2260
+ provider: { name: getExternalDnsProvider(config.dns.provider) },
868
2261
  domainFilters: [config.domain],
869
2262
  sources: ["ingress", "service"],
870
2263
  policy: "upsert-only",
@@ -873,6 +2266,149 @@ export async function generateHelmValues(config, options = {}) {
873
2266
  enabled: false,
874
2267
  },
875
2268
  };
2269
+ // The managed-Postgres migration hook (templates/migration-job.yaml) reads the
2270
+ // DB host/port from .Values.migrations.externalDb — a SEPARATE seam from
2271
+ // supabase.externalDatabase.* — and its `pg_isready -h $DB_HOST` loop hangs
2272
+ // forever (empty host) if it is unset. Wire it for external Postgres. We only
2273
+ // set host/port: DB_PASSWORD falls back to the <release>-supabase-db secret and
2274
+ // DB_USER/DB_NAME default to "postgres", which match deploymentSecretNames()
2275
+ // and the bootstrap app role.
2276
+ const migrationsPgExt = config.database.type === "self-hosted" &&
2277
+ config.externalServices?.postgres?.mode === "external"
2278
+ ? config.externalServices.postgres.external
2279
+ : undefined;
2280
+ if (migrationsPgExt) {
2281
+ values.migrations = {
2282
+ externalDb: {
2283
+ host: migrationsPgExt.host ?? "",
2284
+ // Chart schema requires a string here (the template quotes it).
2285
+ port: String(migrationsPgExt.port ?? 5432),
2286
+ // Run migrations as the master/app_role. The bootstrap hook creates the
2287
+ // service login roles (authenticator, supabase_auth_admin, …) with the
2288
+ // service password but deliberately does NOT change the master's
2289
+ // password (bootstrap.sql runs "as the master user (named postgres)").
2290
+ // So the migrate hook must authenticate with the MASTER credential, not
2291
+ // the service password in <release>-supabase-db (that would 401). Point
2292
+ // DB_PASSWORD at the bootstrap Secret's master-password.
2293
+ existingSecret: deploymentSecretNames(config).dbBootstrap,
2294
+ existingSecretKey: "master-password",
2295
+ },
2296
+ };
2297
+ }
2298
+ // In k8s secret mode, the CLI creates Kubernetes Secrets and the chart reads
2299
+ // them by reference. Point the chart's secretRef seams at those Secrets and
2300
+ // strip every plaintext secret out of the generated values.
2301
+ if (secretMode === "k8s") {
2302
+ return redactSecretsToRefs(values, config);
2303
+ }
2304
+ return values;
2305
+ }
2306
/**
 * Converts generated Helm values to Kubernetes-secret ("k8s") mode.
 *
 * Every chart `secretRef` seam touched here is pointed at the Secret name
 * reported by `deploymentSecretNames(config)`, and the inline plaintext
 * credentials are removed so they are not persisted to values.yaml or the
 * Helm release.
 *
 * The `values` object (and its nested `global` / `supabase` objects) is
 * modified in place and also returned.
 *
 * @param {object} values - Generated Helm values to rewrite.
 * @param {object} config - Deployment configuration; `database.type`,
 *   `externalServices.postgres` and `smtp` are read here.
 * @returns {object} The same `values` object, after redaction.
 */
export function redactSecretsToRefs(values, config) {
  const secretNames = deploymentSecretNames(config);
  const globalValues = values.global ?? {};
  const supabaseValues = values.supabase ?? {};

  // External (managed) Postgres settings, only for a self-hosted database that
  // is configured to use an external Postgres instance.
  const wantsExternalPostgres =
    config.database.type === "self-hosted" &&
    config.externalServices?.postgres?.mode === "external";
  const externalPg = wantsExternalPostgres
    ? config.externalServices.postgres.external
    : undefined;

  // Returns a NEW key map on every call: the two places that use it must not
  // share one object reference.
  const dbSecretKeyMap = () => ({
    host: "host",
    port: "port",
    username: "username",
    password: "password",
    database: "database",
  });

  // Deletes the listed keys from `target` when `target` is set.
  const dropKeys = (target, keys) => {
    if (!target) {
      return;
    }
    for (const key of keys) {
      delete target[key];
    }
  };

  // App-level consolidated secret: a single secretRef supplies every app cred.
  globalValues.secrets = {
    ...(globalValues.secrets ?? {}),
    secretRef: secretNames.app,
  };

  // Remove inline app/global secrets; non-secret settings (host/from/url) stay.
  dropKeys(globalValues.smtp, ["user", "pass"]);
  // anonKey is deliberately kept: it is the public Supabase key that
  // app-configmap.yaml embeds into the Next.js client bundle at template time,
  // and the chart has no secretRef seam for it, so removing it would leave the
  // browser client with an empty key.
  dropKeys(globalValues.supabase, ["jwtSecret", "serviceKey", "accessToken"]);
  dropKeys(globalValues.ai, ["openaiApiKey"]);
  dropKeys(globalValues.sso, ["clientId", "clientSecret"]);
  // global.licenseKey is deliberately kept as well: the standard chart builds
  // the <release>-regcred image-pull secret from it at template time
  // (templates/registry-secret.yaml) and an imagePullSecret cannot come from a
  // secretRef. Without it the chart falls back to its "evaluation" placeholder
  // and pulls of the private rulebricks/* images are rejected. The key is a
  // read-only Docker Hub PAT that already lives in the deployment config.yaml.

  // Supabase subchart: every inline secret block becomes a secretRef.
  if (supabaseValues.secret) {
    const secretRefs = {
      db: externalPg
        ? { secretRef: secretNames.db, secretRefKey: dbSecretKeyMap() }
        : { secretRef: secretNames.db },
      jwt: { secretRef: secretNames.jwt },
      dashboard: { secretRef: secretNames.dashboard },
      realtime: { secretRef: secretNames.realtime },
    };
    // Supabase auth (GoTrue) SMTP: referenced only when SMTP credentials are
    // configured, since the inline global.smtp credentials were just removed.
    if (config.smtp?.user || config.smtp?.pass) {
      secretRefs.smtp = { secretRef: secretNames.smtp };
    }
    supabaseValues.secret = secretRefs;
  }

  if (externalPg && supabaseValues.externalDatabase) {
    const currentExternalDb = supabaseValues.externalDatabase;
    supabaseValues.externalDatabase = {
      // Existing host/port entries are retained for older charts that do not
      // yet read host/port from the Secret; newer charts read
      // host/port/user/pass/db from the single Secret referenced below.
      ...currentExternalDb,
      secretRef: secretNames.db,
      secretRefKey: dbSecretKeyMap(),
      bootstrap: {
        ...(currentExternalDb.bootstrap ?? {}),
        secretRef: secretNames.dbBootstrap,
        // Master credentials move into the hook Secret in k8s mode.
        masterUsername: undefined,
        masterPassword: undefined,
      },
    };
  }

  values.global = globalValues;
  values.supabase = supabaseValues;
  return values;
}
2405
/**
 * Generates the Helm values for a deployment and saves them.
 *
 * Steps, in order: build the values from the deployment configuration, run
 * them through `assertValidHelmValues`, then hand them to `saveHelmValues`
 * under the deployment name.
 *
 * @param {object} config - Deployment configuration; `config.name` identifies
 *   the deployment the values are saved for.
 * @param {object} [options={}] - Passed through unchanged to `buildHelmValues`.
 * @returns {Promise<void>} Settles once `saveHelmValues` has completed.
 */
export async function generateHelmValues(config, options = {}) {
  const helmValues = buildHelmValues(config, options);
  // Final guardrail before anything is written or deployed: values the chart
  // would reject must not get past this point.
  assertValidHelmValues(helmValues);
  await saveHelmValues(config.name, helmValues);
}
878
2414
  /**
@@ -908,6 +2444,22 @@ export async function updateHelmValuesForTLS(deploymentName, tlsEnabled) {
908
2444
  }
909
2445
  }
910
2446
  }
2447
+ // Keep the supabase kong ingress on the right Traefik entrypoint. The
2448
+ // subchart doesn't emit router.entrypoints/tls itself, so on the TLS-toggle
2449
+ // path (not a full regen) HTTPS to supabase.<domain> would 404 without this.
2450
+ // Mirrors what buildHelmValues sets on the kong ingress annotations.
2451
+ const supabase = values.supabase;
2452
+ const kongIngress = supabase?.kong
2453
+ ?.ingress;
2454
+ if (kongIngress && typeof kongIngress === "object") {
2455
+ kongIngress.annotations = {
2456
+ ...kongIngress.annotations,
2457
+ "traefik.ingress.kubernetes.io/router.entrypoints": tlsEnabled
2458
+ ? "websecure"
2459
+ : "web",
2460
+ "traefik.ingress.kubernetes.io/router.tls": tlsEnabled ? "true" : "false",
2461
+ };
2462
+ }
911
2463
  // Save updated values
912
2464
  await fs.writeFile(valuesPath, YAML.stringify(values), "utf8");
913
2465
  }