@rulebricks/cli 2.1.7 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/README.md +51 -16
  2. package/cluster-setup/aws/README.md +96 -47
  3. package/cluster-setup/aws/check-aws-access.sh +216 -52
  4. package/cluster-setup/aws/parameters.json +13 -0
  5. package/cluster-setup/aws/rulebricks-cluster.cfn.yaml +355 -0
  6. package/cluster-setup/azure/README.md +103 -55
  7. package/cluster-setup/azure/check-aks-prereqs.sh +236 -56
  8. package/cluster-setup/azure/parameters.json +30 -0
  9. package/cluster-setup/azure/rulebricks-cluster.bicep +546 -0
  10. package/cluster-setup/gcp/README.md +51 -34
  11. package/cluster-setup/gcp/check-gke-prereqs.sh +222 -60
  12. package/dist/commands/backup.d.ts +5 -0
  13. package/dist/commands/backup.js +104 -0
  14. package/dist/commands/deploy.d.ts +3 -1
  15. package/dist/commands/deploy.js +226 -326
  16. package/dist/commands/destroy.d.ts +1 -1
  17. package/dist/commands/destroy.js +73 -123
  18. package/dist/commands/init.d.ts +5 -1
  19. package/dist/commands/init.js +78 -54
  20. package/dist/commands/list.d.ts +1 -0
  21. package/dist/commands/list.js +74 -0
  22. package/dist/commands/open.d.ts +1 -1
  23. package/dist/commands/open.js +4 -12
  24. package/dist/commands/redeploy.d.ts +6 -0
  25. package/dist/commands/redeploy.js +310 -0
  26. package/dist/commands/restore.d.ts +5 -0
  27. package/dist/commands/restore.js +338 -0
  28. package/dist/commands/status.js +62 -49
  29. package/dist/commands/upgrade.js +74 -51
  30. package/dist/components/DNSWaitScreen.d.ts +5 -1
  31. package/dist/components/DNSWaitScreen.js +47 -41
  32. package/dist/components/Wizard/WizardContext.d.ts +157 -36
  33. package/dist/components/Wizard/WizardContext.js +872 -160
  34. package/dist/components/Wizard/steps/CloudProviderStep.js +192 -107
  35. package/dist/components/Wizard/steps/DomainStep.js +5 -24
  36. package/dist/components/Wizard/steps/ExternalServicesStep.d.ts +6 -0
  37. package/dist/components/Wizard/steps/ExternalServicesStep.js +645 -0
  38. package/dist/components/Wizard/steps/FeatureConfigStep.d.ts +2 -1
  39. package/dist/components/Wizard/steps/FeatureConfigStep.js +739 -425
  40. package/dist/components/Wizard/steps/FeaturesStep.js +31 -35
  41. package/dist/components/Wizard/steps/ObservabilityStep.d.ts +6 -0
  42. package/dist/components/Wizard/steps/ObservabilityStep.js +137 -0
  43. package/dist/components/Wizard/steps/ReviewStep.d.ts +2 -1
  44. package/dist/components/Wizard/steps/ReviewStep.js +56 -12
  45. package/dist/components/Wizard/steps/StorageStep.d.ts +9 -0
  46. package/dist/components/Wizard/steps/StorageStep.js +592 -0
  47. package/dist/components/Wizard/steps/SupabaseCredentialsStep.js +20 -21
  48. package/dist/components/Wizard/steps/VersionStep.js +45 -23
  49. package/dist/components/Wizard/steps/index.d.ts +3 -3
  50. package/dist/components/Wizard/steps/index.js +3 -3
  51. package/dist/components/common/CommandApproval.d.ts +12 -0
  52. package/dist/components/common/CommandApproval.js +91 -0
  53. package/dist/components/common/DeploymentPicker.d.ts +14 -0
  54. package/dist/components/common/DeploymentPicker.js +16 -0
  55. package/dist/components/common/index.d.ts +2 -0
  56. package/dist/components/common/index.js +2 -0
  57. package/dist/index.js +94 -62
  58. package/dist/lib/cloudCli.d.ts +134 -63
  59. package/dist/lib/cloudCli.js +512 -220
  60. package/dist/lib/clusterSetupDefaults.d.ts +30 -0
  61. package/dist/lib/clusterSetupDefaults.js +64 -0
  62. package/dist/lib/commandApproval.d.ts +26 -0
  63. package/dist/lib/commandApproval.js +114 -0
  64. package/dist/lib/config.d.ts +12 -10
  65. package/dist/lib/config.js +91 -33
  66. package/dist/lib/configFixtures.d.ts +5 -0
  67. package/dist/lib/configFixtures.js +513 -0
  68. package/dist/lib/deploymentHealth.d.ts +32 -0
  69. package/dist/lib/deploymentHealth.js +157 -0
  70. package/dist/lib/dns.d.ts +1 -1
  71. package/dist/lib/dns.js +19 -1
  72. package/dist/lib/dns.test.d.ts +1 -0
  73. package/dist/lib/dns.test.js +27 -0
  74. package/dist/lib/dockerHub.d.ts +12 -1
  75. package/dist/lib/dockerHub.js +18 -8
  76. package/dist/lib/helm.d.ts +4 -0
  77. package/dist/lib/helm.js +16 -0
  78. package/dist/lib/helmValues.d.ts +25 -0
  79. package/dist/lib/helmValues.js +1762 -289
  80. package/dist/lib/helmValues.test.d.ts +1 -0
  81. package/dist/lib/helmValues.test.js +966 -0
  82. package/dist/lib/htpasswd.d.ts +1 -0
  83. package/dist/lib/htpasswd.js +15 -0
  84. package/dist/lib/kubernetes.d.ts +124 -17
  85. package/dist/lib/kubernetes.js +576 -145
  86. package/dist/lib/secrets.d.ts +23 -0
  87. package/dist/lib/secrets.js +158 -0
  88. package/dist/lib/validateValues.d.ts +31 -0
  89. package/dist/lib/validateValues.js +253 -0
  90. package/dist/lib/versions.d.ts +82 -11
  91. package/dist/lib/versions.js +131 -31
  92. package/dist/lib/versions.test.d.ts +1 -0
  93. package/dist/lib/versions.test.js +81 -0
  94. package/dist/lib/wizardSteps.d.ts +14 -0
  95. package/dist/lib/wizardSteps.js +23 -0
  96. package/dist/lib/workloadIdentity.d.ts +26 -0
  97. package/dist/lib/workloadIdentity.js +323 -0
  98. package/dist/lib/workloadIdentity.test.d.ts +1 -0
  99. package/dist/lib/workloadIdentity.test.js +57 -0
  100. package/dist/types/index.d.ts +1860 -164
  101. package/dist/types/index.js +518 -295
  102. package/package.json +9 -4
  103. package/schema/values.schema.json +1934 -0
  104. package/cluster-setup/aws/cluster.yaml +0 -33
  105. package/cluster-setup/azure/main.bicep +0 -282
  106. package/cluster-setup/azure/main.parameters.json +0 -21
  107. package/dist/components/Wizard/steps/CredentialsStep.d.ts +0 -6
  108. package/dist/components/Wizard/steps/CredentialsStep.js +0 -22
  109. package/dist/components/Wizard/steps/DeploymentModeStep.d.ts +0 -5
  110. package/dist/components/Wizard/steps/DeploymentModeStep.js +0 -26
  111. package/dist/components/Wizard/steps/TierStep.d.ts +0 -6
  112. package/dist/components/Wizard/steps/TierStep.js +0 -29
  113. package/dist/lib/terraform.d.ts +0 -66
  114. package/dist/lib/terraform.js +0 -754
  115. package/terraform/aws/main.tf +0 -355
  116. package/terraform/azure/main.tf +0 -371
  117. package/terraform/gcp/main.tf +0 -407
@@ -1,7 +1,96 @@
1
- import { TIER_CONFIGS, isSupportedDnsProvider, getLoggingDestinationLabel, } from "../types/index.js";
1
+ import { getReleaseName, isSupportedDnsProvider, validateRemoteWriteConfig, } from "../types/index.js";
2
2
  import { saveHelmValues, getHelmValuesPath } from "./config.js";
3
+ import { assertValidHelmValues } from "./validateValues.js";
4
+ import { SUPABASE_POSTGRES_IMAGE_REPOSITORY, SUPABASE_POSTGRES_IMAGE_TAG, DEFAULT_IMAGE_REGISTRY, IMAGE_REPOSITORIES, IMAGE_DIGESTS, KAFKA_PROXY_IMAGE, } from "./versions.js";
5
+ import { createHmac } from "crypto";
3
6
  import fs from "fs/promises";
4
7
  import YAML from "yaml";
8
+ // Names of the Kubernetes Secrets the CLI creates in k8s secret mode. Shared by
9
+ // the value generator (which sets the secretRef fields) and src/lib/secrets.ts
10
+ // (which creates the Secrets) so they always agree.
11
+ export function deploymentSecretNames(config) {
12
+ const base = config.name;
13
+ return {
14
+ app: `${base}-app-secrets`,
15
+ db: `${base}-supabase-db`,
16
+ dbBootstrap: `${base}-supabase-db-bootstrap`,
17
+ jwt: `${base}-supabase-jwt`,
18
+ dashboard: `${base}-supabase-dashboard`,
19
+ realtime: `${base}-supabase-realtime`,
20
+ smtp: `${base}-supabase-smtp`,
21
+ };
22
+ }
23
+ // Baseline Kafka topic partitioning. These are NOT user-tunable sizing knobs
24
+ // (tiers were removed); they are a structural contract that must stay
25
+ // consistent across three places at once: the kafka.provisioning topic
26
+ // partitions, rulebricks.hps.workers.solutionPartitions (the worker-fleet
27
+ // concurrency ceiling the chart cross-checks), and the worker KEDA
28
+ // maxReplicaCount (validated to be <= solutionPartitions). They mirror the Helm
29
+ // chart's own defaults, so operators who need a different size tune the chart
30
+ // values directly. Partitions can never be decreased, so solution is sized with
31
+ // generous headroom up front; idle partitions are effectively free.
32
+ const SOLUTION_TOPIC_PARTITIONS = 128;
33
+ const LOGS_TOPIC_PARTITIONS = 24;
34
+ // RPC + log topics: replication factor 1. RPC traffic is transient and
35
+ // latency-sensitive (the HPS producer's acks=-1 would otherwise wait on full
36
+ // ISR replication); the in-cluster broker is single-replica by default.
37
+ const TOPIC_REPLICATION_FACTOR = 1;
38
+ // global.version must be empty or a semantic version per the chart schema. The
39
+ // CLI normally pins a real version, but migrated/legacy configs can carry
40
+ // "latest"; emitting that would fail chart validation, so we omit it instead
41
+ // and let the chart fall back to its default.
42
+ const SEMVER_PATTERN = /^\d+\.\d+\.\d+(-[0-9A-Za-z.-]+)?$/;
43
+ // Healthy defaults for the decision-log archive that ClickHouse reads:
44
+ // flush a gzipped NDJSON file at ~64 MiB (uncompressed) or after 5 minutes,
45
+ // whichever comes first. Users can override these in their Helm values.
46
+ //
47
+ // max_bytes MUST stay well below the Vector pod's memory limit
48
+ // (vector.resources.limits.memory in the chart): the object-storage sink buffers
49
+ // the whole uncompressed batch in memory before it flushes, so a batch sized at
50
+ // or above the pod limit gets OOMKilled before it can ever write a blob - which
51
+ // silently disables decision-log export entirely. 64 MiB leaves comfortable
52
+ // headroom under the chart's 1 GiB Vector limit while still producing large,
53
+ // scan-efficient files for ClickHouse.
54
+ const DECISION_LOG_BATCH = { max_bytes: 67108864, timeout_secs: 300 };
55
+ const SUPABASE_JWT_ISSUED_AT = 1641769200;
56
+ const SUPABASE_JWT_EXPIRES_AT = 4102444800;
57
+ // VRL that normalizes the Kafka decision-log envelope into the ClickHouse column
58
+ // types. Inlined as a real multi-line string (not a chart `{{ include }}`) so
59
+ // that YAML.stringify / Helm's toYaml emit it as a block scalar. A templated
60
+ // single-line include gets rendered into a single-quoted YAML scalar, whose
61
+ // newlines YAML folds into spaces - collapsing the statements onto one line and
62
+ // breaking VRL parsing. Keep in sync with rulebricks.vector.normalizeLogs.
63
+ const VECTOR_NORMALIZE_LOGS_VRL = [
64
+ "parsed, err = parse_json(string!(.message))",
65
+ "if err == null {",
66
+ " . = parsed",
67
+ "}",
68
+ '.timestamp = parse_timestamp!(to_string(.timestamp) ?? to_string(now()), format: "%+")',
69
+ '.api_key = to_string(.api_key) ?? ""',
70
+ ".user_id = to_string(.user_id) ?? null",
71
+ ".environment = to_string(.environment) ?? null",
72
+ ".ip = to_string(.ip) ?? null",
73
+ ".method = to_string(.method) ?? null",
74
+ '.url = to_string(.url) ?? ""',
75
+ ".status = to_int(.status) ?? 0",
76
+ ".rule_name = to_string(.rule_name) ?? null",
77
+ ".rule_id = to_string(.rule_id) ?? null",
78
+ ".rule_slug = to_string(.rule_slug) ?? null",
79
+ ".rule_version = to_string(.rule_version) ?? null",
80
+ ".operation = to_string(.operation) ?? null",
81
+ '.level = to_string(.level) ?? "info"',
82
+ ".error = to_string(.error) ?? null",
83
+ ".trace_id = to_string(.trace_id) ?? null",
84
+ ".span_id = to_string(.span_id) ?? null",
85
+ '.request = to_string(.request) ?? "null"',
86
+ '.response = to_string(.response) ?? "null"',
87
+ '.decision = to_string(.decision) ?? "{}"',
88
+ '.params = to_string(.params) ?? "{}"',
89
+ ].join("\n");
90
+ function decisionLogPathPrefix(config) {
91
+ const path = config.storage?.paths?.decisionLogs || "decision-logs";
92
+ return `${path.replace(/^\/+|\/+$/g, "")}/year=%Y/month=%m/day=%d/hour=%H/`;
93
+ }
5
94
  /**
6
95
  * Generates Vector sink configuration based on logging settings
7
96
  */
@@ -10,84 +99,85 @@ function generateVectorSinks(config) {
10
99
  // Console sink is always enabled
11
100
  console: {
12
101
  type: "console",
13
- inputs: ["kafka"],
102
+ inputs: ["normalize_logs"],
14
103
  encoding: {
15
104
  codec: "json",
16
105
  },
17
106
  },
18
107
  };
19
- // Add external sink if configured
20
- if (config.features.logging.sink !== "console" &&
21
- config.features.logging.sink !== "pending") {
22
- const { sink, bucket, region } = config.features.logging;
23
- switch (sink) {
24
- // Cloud Storage sinks
108
+ if (config.storage) {
109
+ const storage = config.storage;
110
+ switch (config.storage.provider) {
25
111
  case "s3":
26
- sinks.s3 = {
112
+ sinks.decision_logs = {
27
113
  type: "aws_s3",
28
- inputs: ["kafka"],
29
- bucket: bucket,
30
- region: region,
31
- key_prefix: "rulebricks/logs/%Y/%m/%d/",
114
+ inputs: ["normalize_logs"],
115
+ bucket: storage.bucket,
116
+ region: storage.region,
117
+ key_prefix: decisionLogPathPrefix(config),
118
+ filename_extension: "ndjson",
32
119
  compression: "gzip",
33
- encoding: {
34
- codec: "json",
35
- },
120
+ encoding: { codec: "json" },
121
+ framing: { method: "newline_delimited" },
122
+ batch: { ...DECISION_LOG_BATCH },
36
123
  };
37
124
  break;
38
- case "azure-blob":
39
- if (!bucket) {
40
- throw new Error("Azure Blob logging requires a storage account.");
41
- }
42
- const azureBlobSink = {
125
+ case "azure-blob": {
126
+ const sink = {
43
127
  type: "azure_blob",
44
- inputs: ["kafka"],
45
- account_name: bucket,
46
- container_name: config.features.logging.azureBlobContainer || "rulebricks-logs",
47
- blob_prefix: "rulebricks/logs/%Y/%m/%d/",
128
+ inputs: ["normalize_logs"],
129
+ account_name: storage.bucket,
130
+ container_name: storage.azureBlobContainer || "rulebricks",
131
+ blob_prefix: decisionLogPathPrefix(config),
132
+ // azure_blob has no filename_extension (unlike aws_s3/gcs); it always
133
+ // writes ".log" (".log.gz" when compressed). ClickHouse globs on *.gz.
48
134
  compression: "gzip",
49
- encoding: {
50
- codec: "json",
51
- },
135
+ encoding: { codec: "json" },
136
+ framing: { method: "newline_delimited" },
137
+ batch: { ...DECISION_LOG_BATCH },
52
138
  };
53
- if (config.features.logging.cloudAuthMode === "secret") {
54
- if (!config.features.logging.azureBlobConnectionStringSecretRef) {
55
- throw new Error("Azure Blob connection string auth requires a secret ref.");
56
- }
57
- azureBlobSink.connection_string = "${AZURE_STORAGE_CONNECTION_STRING}";
139
+ if (config.storage.cloudAuthMode === "secret") {
140
+ sink.connection_string = "${AZURE_STORAGE_CONNECTION_STRING}";
58
141
  }
59
142
  else {
60
- if (!config.features.logging.azureBlobClientId ||
61
- !config.features.logging.azureBlobTenantId) {
62
- throw new Error("Azure Blob workload identity requires client ID and tenant ID.");
63
- }
64
- azureBlobSink.auth = {
143
+ sink.auth = {
65
144
  azure_credential_kind: "workload_identity",
66
- client_id: config.features.logging.azureBlobClientId,
67
- tenant_id: config.features.logging.azureBlobTenantId,
145
+ client_id: config.storage.azureBlobClientId,
146
+ tenant_id: config.storage.azureBlobTenantId,
68
147
  token_file_path: "/var/run/secrets/azure/tokens/azure-identity-token",
69
148
  };
70
149
  }
71
- sinks.azure_blob = azureBlobSink;
150
+ sinks.decision_logs = sink;
72
151
  break;
152
+ }
73
153
  case "gcs":
74
- sinks.gcs = {
154
+ sinks.decision_logs = {
75
155
  type: "gcp_cloud_storage",
76
- inputs: ["kafka"],
77
- bucket: bucket,
78
- key_prefix: "rulebricks/logs/%Y/%m/%d/",
156
+ inputs: ["normalize_logs"],
157
+ bucket: storage.bucket,
158
+ key_prefix: decisionLogPathPrefix(config),
159
+ filename_extension: "ndjson",
79
160
  compression: "gzip",
80
- encoding: {
81
- codec: "json",
82
- },
161
+ encoding: { codec: "json" },
162
+ framing: { method: "newline_delimited" },
163
+ batch: { ...DECISION_LOG_BATCH },
83
164
  };
84
165
  break;
166
+ }
167
+ }
168
+ // Add external logging-platform sink if configured. Decision logs always go
169
+ // to object storage via the decision_logs sink above; this is an additional
170
+ // platform destination (Datadog, Splunk, etc.).
171
+ if (config.features.logging.sink !== "console" &&
172
+ config.features.logging.sink !== "pending") {
173
+ const { sink, bucket, region } = config.features.logging;
174
+ switch (sink) {
85
175
  // Logging platform sinks
86
176
  // For platforms, bucket is repurposed for API key/token, region for site/URL
87
177
  case "datadog":
88
178
  sinks.datadog = {
89
179
  type: "datadog_logs",
90
- inputs: ["kafka"],
180
+ inputs: ["normalize_logs"],
91
181
  default_api_key: bucket, // API key stored in bucket field
92
182
  site: region || "datadoghq.com", // Site stored in region field
93
183
  compression: "gzip",
@@ -99,7 +189,7 @@ function generateVectorSinks(config) {
99
189
  case "splunk":
100
190
  sinks.splunk = {
101
191
  type: "splunk_hec_logs",
102
- inputs: ["kafka"],
192
+ inputs: ["normalize_logs"],
103
193
  endpoint: region, // URL stored in region field
104
194
  default_token: bucket, // HEC token stored in bucket field
105
195
  compression: "gzip",
@@ -114,7 +204,7 @@ function generateVectorSinks(config) {
114
204
  const esConfig = JSON.parse(bucket || "{}");
115
205
  sinks.elasticsearch = {
116
206
  type: "elasticsearch",
117
- inputs: ["kafka"],
207
+ inputs: ["normalize_logs"],
118
208
  endpoints: [esConfig.url],
119
209
  bulk: {
120
210
  index: esConfig.index || "rulebricks-logs",
@@ -134,7 +224,7 @@ function generateVectorSinks(config) {
134
224
  // Fallback if JSON parsing fails
135
225
  sinks.elasticsearch = {
136
226
  type: "elasticsearch",
137
- inputs: ["kafka"],
227
+ inputs: ["normalize_logs"],
138
228
  endpoints: [bucket],
139
229
  bulk: {
140
230
  index: region || "rulebricks-logs",
@@ -145,7 +235,7 @@ function generateVectorSinks(config) {
145
235
  case "loki":
146
236
  sinks.loki = {
147
237
  type: "loki",
148
- inputs: ["kafka"],
238
+ inputs: ["normalize_logs"],
149
239
  endpoint: bucket, // Loki URL stored in bucket field
150
240
  labels: {
151
241
  app: "rulebricks",
@@ -159,7 +249,7 @@ function generateVectorSinks(config) {
159
249
  case "newrelic":
160
250
  sinks.newrelic = {
161
251
  type: "new_relic",
162
- inputs: ["kafka"],
252
+ inputs: ["normalize_logs"],
163
253
  license_key: bucket, // License key stored in bucket field
164
254
  account_id: region, // Account ID stored in region field
165
255
  api: "logs",
@@ -172,7 +262,7 @@ function generateVectorSinks(config) {
172
262
  case "axiom":
173
263
  sinks.axiom = {
174
264
  type: "axiom",
175
- inputs: ["kafka"],
265
+ inputs: ["normalize_logs"],
176
266
  token: bucket, // API token stored in bucket field
177
267
  dataset: region || "rulebricks", // Dataset stored in region field
178
268
  compression: "gzip",
@@ -186,20 +276,32 @@ function generateVectorSinks(config) {
186
276
  return sinks;
187
277
  }
188
278
  function generateVectorEnv(config) {
189
- const env = [
190
- {
191
- name: "KAFKA_BOOTSTRAP_SERVERS",
279
+ // Kafka connection settings come from the templated vector-kafka-env ConfigMap
280
+ // so the in-cluster vs external (and bridge) decision lives in one place.
281
+ const configMapKeys = [
282
+ "KAFKA_BOOTSTRAP_SERVERS",
283
+ "KAFKA_TLS_ENABLED",
284
+ "KAFKA_SASL_ENABLED",
285
+ "KAFKA_SASL_MECHANISM",
286
+ "KAFKA_LOG_TOPIC",
287
+ ];
288
+ const env = configMapKeys.map((key) => ({
289
+ name: key,
290
+ valueFrom: { configMapKeyRef: { name: "vector-kafka-env", key } },
291
+ }));
292
+ // SASL credentials (inline PLAIN/SCRAM). Optional so in-cluster/token-auth
293
+ // deploys work without the secret existing.
294
+ for (const key of ["KAFKA_SASL_USERNAME", "KAFKA_SASL_PASSWORD"]) {
295
+ env.push({
296
+ name: key,
192
297
  valueFrom: {
193
- configMapKeyRef: {
194
- name: "vector-kafka-env",
195
- key: "KAFKA_BOOTSTRAP_SERVERS",
196
- },
298
+ secretKeyRef: { name: "vector-kafka-credentials", key, optional: true },
197
299
  },
198
- },
199
- ];
200
- const azureBlobSecretRef = config.features.logging.azureBlobConnectionStringSecretRef;
201
- if (config.features.logging.sink === "azure-blob" &&
202
- config.features.logging.cloudAuthMode === "secret" &&
300
+ });
301
+ }
302
+ const azureBlobSecretRef = config.storage?.azureBlobConnectionStringSecretRef;
303
+ if (config.storage?.provider === "azure-blob" &&
304
+ config.storage.cloudAuthMode === "secret" &&
203
305
  azureBlobSecretRef) {
204
306
  env.push({
205
307
  name: "AZURE_STORAGE_CONNECTION_STRING",
@@ -211,20 +313,20 @@ function generateVectorEnv(config) {
211
313
  return env;
212
314
  }
213
315
  function generateVectorServiceAccount(config) {
316
+ // AWS uses EKS Pod Identity: NO eks.amazonaws.com/role-arn annotation - the
317
+ // CLI's workload-identity step creates a namespace-scoped association for this
318
+ // SA (to a role granting both the object-storage and MSK access Vector needs).
319
+ // Azure/GCP still annotate the SA, which is how their workload identity binds.
214
320
  const annotations = {};
215
- if (config.features.logging.sink === "s3" && config.features.logging.awsIamRoleArn) {
216
- annotations["eks.amazonaws.com/role-arn"] =
217
- config.features.logging.awsIamRoleArn;
218
- }
219
- if (config.features.logging.sink === "azure-blob" &&
220
- config.features.logging.cloudAuthMode !== "secret" &&
221
- config.features.logging.azureBlobClientId) {
321
+ if (config.storage?.provider === "azure-blob" &&
322
+ config.storage.cloudAuthMode !== "secret" &&
323
+ config.storage.azureBlobClientId) {
222
324
  annotations["azure.workload.identity/client-id"] =
223
- config.features.logging.azureBlobClientId;
325
+ config.storage.azureBlobClientId;
224
326
  }
225
- if (config.features.logging.sink === "gcs" && config.features.logging.gcpServiceAccountEmail) {
327
+ if (config.storage?.provider === "gcs" && config.storage.gcpServiceAccountEmail) {
226
328
  annotations["iam.gke.io/gcp-service-account"] =
227
- config.features.logging.gcpServiceAccountEmail;
329
+ config.storage.gcpServiceAccountEmail;
228
330
  }
229
331
  return {
230
332
  create: true,
@@ -233,9 +335,11 @@ function generateVectorServiceAccount(config) {
233
335
  };
234
336
  }
235
337
  function generateVectorPodLabels(config) {
236
- const labels = {};
237
- if (config.features.logging.sink === "azure-blob" &&
238
- config.features.logging.cloudAuthMode !== "secret") {
338
+ const labels = {
339
+ "rulebricks.com/workload-group": "infrastructure",
340
+ };
341
+ if (config.storage?.provider === "azure-blob" &&
342
+ config.storage.cloudAuthMode !== "secret") {
239
343
  labels["azure.workload.identity/use"] = "true";
240
344
  }
241
345
  return labels;
@@ -258,6 +362,48 @@ function secretKeySelector(ref) {
258
362
  key: ref.key,
259
363
  };
260
364
  }
365
+ function base64UrlJson(value) {
366
+ return Buffer.from(JSON.stringify(value)).toString("base64url");
367
+ }
368
+ // Self-hosted Supabase derives the anon and service_role API keys from the JWT
369
+ // secret: each is an HS256 JWT (role: anon / service_role) signed with the secret.
370
+ // https://supabase.com/docs/guides/self-hosting/self-hosted-auth-keys
371
+ export function signSupabaseJwt(role, secret) {
372
+ const header = base64UrlJson({ alg: "HS256", typ: "JWT" });
373
+ const payload = base64UrlJson({
374
+ role,
375
+ iss: "supabase",
376
+ iat: SUPABASE_JWT_ISSUED_AT,
377
+ exp: SUPABASE_JWT_EXPIRES_AT,
378
+ });
379
+ const body = `${header}.${payload}`;
380
+ const signature = createHmac("sha256", secret).update(body).digest("base64url");
381
+ return `${body}.${signature}`;
382
+ }
383
+ // Realtime needs SECRET_KEY_BASE (signs/encrypts its tokens) and a 16-byte
384
+ // DB_ENC_KEY (encrypts tenant DB creds). Derive both deterministically from the
385
+ // JWT secret so they are stable across redeploys with no extra state to persist,
386
+ // and anchored to the one root secret the operator already manages.
387
+ export function deriveRealtimeSecrets(jwtSecret) {
388
+ const secretKeyBase = createHmac("sha256", jwtSecret)
389
+ .update("supabase-realtime-secret-key-base")
390
+ .digest("hex"); // 64 chars
391
+ const dbEncKey = createHmac("sha256", jwtSecret)
392
+ .update("supabase-realtime-db-enc-key")
393
+ .digest("hex")
394
+ .slice(0, 16); // Realtime requires exactly 16 bytes
395
+ return { secretKeyBase, dbEncKey };
396
+ }
397
+ /**
398
+ * Strips surrounding whitespace and embedded control characters (notably the
399
+ * trailing carriage return that sneaks in when a remote_write URL is pasted from
400
+ * a CRLF file or captured from command output). A stray "\r" corrupts the URL
401
+ * the Prometheus operator hands to remote_write, so normalize it at the source.
402
+ */
403
+ function sanitizeRemoteWriteUrl(url) {
404
+ // eslint-disable-next-line no-control-regex
405
+ return url.replace(/[\u0000-\u001F\u007F]/g, "").trim();
406
+ }
261
407
  function generateRemoteWriteSpec(config) {
262
408
  if (config.features.monitoring.destination === "local-grafana") {
263
409
  return [];
@@ -265,11 +411,18 @@ function generateRemoteWriteSpec(config) {
265
411
  const remoteWrite = config.features.monitoring.remoteWrite;
266
412
  if (!remoteWrite) {
267
413
  return config.features.monitoring.remoteWriteUrl
268
- ? [{ url: config.features.monitoring.remoteWriteUrl }]
414
+ ? [{ url: sanitizeRemoteWriteUrl(config.features.monitoring.remoteWriteUrl) }]
269
415
  : [];
270
416
  }
417
+ // Enforce the same per-destination/auth requirements the wizard and Zod
418
+ // schema do. This is unreachable for CLI-generated configs (they are gated
419
+ // earlier) but guards hand-edited values and keeps one source of truth.
420
+ const remoteWriteErrors = validateRemoteWriteConfig(remoteWrite);
421
+ if (remoteWriteErrors.length > 0) {
422
+ throw new Error(remoteWriteErrors.join(" "));
423
+ }
271
424
  const base = {
272
- url: remoteWrite.url,
425
+ url: sanitizeRemoteWriteUrl(remoteWrite.url),
273
426
  };
274
427
  switch (remoteWrite.destination) {
275
428
  case "aws-amp":
@@ -294,12 +447,125 @@ function generateRemoteWriteSpec(config) {
294
447
  return [base];
295
448
  }
296
449
  }
450
+ function isClickStackEnabled(config) {
451
+ return config.features.observability?.clickstack?.enabled ?? true;
452
+ }
453
+ function generateClickStackValues(enabled, config, storageClass, infrastructurePodLabels, operationalDaemonSetTolerations) {
454
+ const clickstack = config.features.observability?.clickstack;
455
+ const telemetryRetentionDays = clickstack?.telemetryRetentionDays ?? 7;
456
+ const clickHouseStorageSize = clickstack?.clickHouseStorageSize ?? "100Gi";
457
+ // Registry host for the clickstack images. The clickstack subchart routes
458
+ // these through its own image helper, so the split { registry, repository }
459
+ // shape lets global.imageRegistry + digest pinning flow through.
460
+ const reg = config.imageRegistry || DEFAULT_IMAGE_REGISTRY;
461
+ return {
462
+ enabled,
463
+ clickhouse: {
464
+ database: "otel",
465
+ username: "rulebricks",
466
+ existingSecret: "",
467
+ existingSecretKey: "admin-password",
468
+ retentionDays: telemetryRetentionDays,
469
+ ttl: "",
470
+ },
471
+ hyperdx: {
472
+ enabled,
473
+ image: {
474
+ registry: reg,
475
+ repository: IMAGE_REPOSITORIES.hyperdx.repository,
476
+ tag: IMAGE_REPOSITORIES.hyperdx.tag,
477
+ pullPolicy: "IfNotPresent",
478
+ },
479
+ resources: {
480
+ requests: { cpu: "250m", memory: "512Mi" },
481
+ limits: { cpu: "1000m", memory: "1Gi" },
482
+ },
483
+ ingress: {
484
+ enabled,
485
+ className: "traefik",
486
+ hostname: "",
487
+ allowedIPs: [],
488
+ },
489
+ podLabels: infrastructurePodLabels,
490
+ },
491
+ collector: {
492
+ image: {
493
+ registry: reg,
494
+ repository: IMAGE_REPOSITORIES.clickstackOtelCollector.repository,
495
+ tag: IMAGE_REPOSITORIES.clickstackOtelCollector.tag,
496
+ pullPolicy: "IfNotPresent",
497
+ },
498
+ memoryLimitMiB: 800,
499
+ agent: {
500
+ enabled,
501
+ securityContext: {
502
+ runAsUser: 0,
503
+ runAsGroup: 0,
504
+ },
505
+ resources: {
506
+ requests: { cpu: "100m", memory: "256Mi" },
507
+ limits: { cpu: "500m", memory: "512Mi" },
508
+ },
509
+ tolerations: operationalDaemonSetTolerations,
510
+ podLabels: infrastructurePodLabels,
511
+ },
512
+ gateway: {
513
+ replicas: 1,
514
+ resources: {
515
+ requests: { cpu: "250m", memory: "512Mi" },
516
+ limits: { cpu: "2000m", memory: "1Gi" },
517
+ },
518
+ podLabels: infrastructurePodLabels,
519
+ },
520
+ },
521
+ ferretdb: {
522
+ enabled,
523
+ image: {
524
+ registry: reg,
525
+ repository: IMAGE_REPOSITORIES.ferretdb.repository,
526
+ tag: IMAGE_REPOSITORIES.ferretdb.tag,
527
+ pullPolicy: "IfNotPresent",
528
+ },
529
+ postgresImage: {
530
+ registry: reg,
531
+ repository: IMAGE_REPOSITORIES.postgresDocumentdb.repository,
532
+ tag: IMAGE_REPOSITORIES.postgresDocumentdb.tag,
533
+ pullPolicy: "IfNotPresent",
534
+ },
535
+ auth: {
536
+ username: "hyperdx",
537
+ password: "",
538
+ existingSecret: "",
539
+ existingSecretKey: "password",
540
+ },
541
+ persistence: {
542
+ enabled,
543
+ size: "10Gi",
544
+ storageClassName: storageClass,
545
+ },
546
+ resources: {
547
+ ferretdb: {
548
+ requests: { cpu: "100m", memory: "256Mi" },
549
+ limits: { cpu: "500m", memory: "512Mi" },
550
+ },
551
+ postgres: {
552
+ requests: { cpu: "250m", memory: "512Mi" },
553
+ limits: { cpu: "1000m", memory: "1Gi" },
554
+ },
555
+ },
556
+ podLabels: infrastructurePodLabels,
557
+ podAnnotations: {
558
+ "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
559
+ },
560
+ },
561
+ };
562
+ }
297
563
  function generatePrometheusServiceAccount(config) {
564
+ // AWS (AMP remote write) uses EKS Pod Identity - the association is created by
565
+ // the CLI's workload-identity step, so no eks.amazonaws.com/role-arn annotation.
566
+ // Azure Monitor still annotates the SA for its workload identity.
298
567
  const annotations = {};
299
568
  const remoteWrite = config.features.monitoring.remoteWrite;
300
- if (remoteWrite?.destination === "aws-amp" && remoteWrite.awsRoleArn) {
301
- annotations["eks.amazonaws.com/role-arn"] = remoteWrite.awsRoleArn;
302
- }
303
569
  if (remoteWrite?.destination === "azure-monitor" &&
304
570
  remoteWrite.authType === "workload-identity" &&
305
571
  remoteWrite.clientId) {
@@ -343,8 +609,16 @@ function generateAzureMonitorRemoteWrite(remoteWrite, base) {
343
609
  if (!remoteWrite.clientId || !remoteWrite.tenantId) {
344
610
  throw new Error("Azure Monitor remote_write workload identity requires client ID and tenant ID.");
345
611
  }
346
- azureAd.workloadIdentity = {
347
- clientId: remoteWrite.clientId,
612
+ // The prometheus-operator AzureAD schema supports only managedIdentity,
613
+ // oauth, and sdk (there is no "workloadIdentity" field - emitting it makes
614
+ // the operator reject the whole remoteWrite with "must provide Azure Managed
615
+ // Identity or Azure OAuth or Azure SDK", which silently prevents the
616
+ // Prometheus StatefulSet from being created). For AKS workload identity we
617
+ // use the Azure SDK credential: it reads the projected token + AZURE_CLIENT_ID
618
+ // injected by the workload-identity webhook (driven by the prometheus
619
+ // ServiceAccount's azure.workload.identity/client-id annotation and the
620
+ // azure.workload.identity/use pod label), so only the tenant ID is needed here.
621
+ azureAd.sdk = {
348
622
  tenantId: remoteWrite.tenantId,
349
623
  };
350
624
  }
@@ -392,57 +666,690 @@ function generateGenericRemoteWrite(remoteWrite, base) {
392
666
  return base;
393
667
  }
394
668
/**
 * Broker-level Kafka settings, emitted as Kafka.spec.kafka.config for Strimzi.
 *
 * These replace the former KAFKA_CFG_* tuning environment variables and use the
 * native Kafka property names. Keep this map in lockstep with the chart's
 * kafka.config.
 *
 * @returns {Record<string, string>} Kafka property name -> string value.
 */
function generateKafkaConfig() {
  const topicDefaults = {
    "auto.create.topics.enable": "true",
    "log.retention.hours": "24",
    "num.partitions": "12",
  };
  const threading = {
    "num.network.threads": "8",
    "num.io.threads": "8",
  };
  const sockets = {
    "socket.send.buffer.bytes": "1048576",
    "socket.receive.buffer.bytes": "1048576",
    "socket.request.max.bytes": "209715200",
  };
  const recordSizing = {
    // Broker-wide ceiling on record size; it has to exceed every per-topic
    // max.message.bytes.
    "message.max.bytes": "2097152",
    "replica.fetch.max.bytes": "4194304",
  };
  const logSizing = {
    // Broker-wide retention default; application topics set tighter caps.
    "log.retention.bytes": "536870912",
    "log.segment.bytes": "1073741824",
  };
  const replicationAndQueueing = {
    "num.replica.fetchers": "4",
    "queued.max.requests": "10000",
    "replica.socket.receive.buffer.bytes": "1048576",
  };
  const logCleaner = {
    "log.cleaner.dedupe.buffer.size": "268435456",
    "log.cleaner.io.buffer.size": "1048576",
  };
  // Spread order is significant: it fixes the key order of the emitted map.
  return {
    ...topicDefaults,
    ...threading,
    ...sockets,
    ...recordSizing,
    ...logSizing,
    ...replicationAndQueueing,
    ...logCleaner,
  };
}
696
/**
 * Resolves the Kafka topic prefix that HPS/Vector/KEDA end up using.
 *
 * Mirrors generateAppLogging: the in-cluster broker is dedicated and runs
 * UNPREFIXED (a prefix would desync chart-side consumers from producers),
 * while an external broker uses the explicitly configured prefix and falls
 * back to the chart default when none is set.
 *
 * @param {object} config Deployment configuration.
 * @returns {string} The effective prefix ("" for in-cluster Kafka).
 */
function effectiveTopicPrefix(config) {
  if (isExternalKafka(config)) {
    // The destructuring default only applies to `undefined`, so an explicit ""
    // (prefixing disabled) is passed through untouched.
    const { topicPrefix = "com.rulebricks." } =
      config.externalServices?.kafka?.external ?? {};
    return topicPrefix;
  }
  return "";
}
709
/**
 * Explicit Kafka topic management.
 *
 * Produces the kafka.provisioning topic list consumed by BOTH the subchart
 * provisioning Job (creates topics) and the chart's kafka-topic-align Job
 * (idempotently converges pre-existing topics on upgrade). Topic names are
 * derived from the SAME prefix written to app.logging.kafkaTopicPrefix - the
 * chart fails the render if these ever diverge.
 *
 * Sizing policy (baseline constants, mirroring the chart defaults):
 *   - solution / solution-response: SOLUTION_TOPIC_PARTITIONS (the worker-fleet
 *     concurrency CEILING; partitions can never be decreased, workers are sized
 *     separately by the cluster autoscaler). RF stays 1: RPC traffic is
 *     transient and latency-sensitive, and the HPS producer's acks=-1 would
 *     otherwise wait on full ISR replication.
 *   - logs: LOGS_TOPIC_PARTITIONS (durable data feeding the Vector -> object
 *     storage pipeline).
 *
 * @param {object} config Deployment configuration.
 * @returns {Array<{name: string, partitions: number, replicas: number, config: Record<string, string>}>}
 *   Topic definitions, or an empty array when topics are customer-managed.
 */
function generateKafkaTopics(config) {
  // External MSK IAM: the chart's kafka-topic-provision Job creates these on the
  // managed broker (through the proxy bridge), so they must be populated here -
  // MSK Serverless won't auto-create them. Other external brokers (SCRAM / Event
  // Hubs / GCP, no bridge) that a plain client can reach stay customer-managed.
  if (isExternalKafka(config) && !kafkaUsesBridge(config)) {
    return [];
  }
  const prefix = effectiveTopicPrefix(config);
  // Factory rather than a shared constant: every RPC topic gets its OWN config
  // object, so mutating one entry cannot leak into the other and YAML dumpers
  // do not collapse the two into an anchor/alias pair.
  const rpcTopic = (suffix) => ({
    name: `${prefix}${suffix}`,
    partitions: SOLUTION_TOPIC_PARTITIONS,
    replicas: TOPIC_REPLICATION_FACTOR,
    config: {
      "retention.ms": "300000",
      "segment.ms": "300000",
      "segment.bytes": "67108864",
      "retention.bytes": "67108864",
      "max.message.bytes": "2097152",
    },
  });
  return [
    rpcTopic("solution"),
    rpcTopic("solution-response"),
    {
      name: `${prefix}logs`,
      partitions: LOGS_TOPIC_PARTITIONS,
      replicas: TOPIC_REPLICATION_FACTOR,
      config: {
        "retention.ms": "86400000",
        "retention.bytes": "268435456",
        "max.message.bytes": "2097152",
      },
    },
  ];
}
768
/**
 * Soft pod anti-affinity for workers: prefer (weight 50) not to share a node
 * (topology key kubernetes.io/hostname) with pods labelled
 * rulebricks.com/workload-group=infrastructure.
 *
 * @returns {{podAntiAffinity: object}} Affinity fragment for worker pods.
 */
function generateWorkerPodAntiAffinity() {
  const infrastructureSelector = {
    matchExpressions: [
      {
        key: "rulebricks.com/workload-group",
        operator: "In",
        values: ["infrastructure"],
      },
    ],
  };
  const avoidInfrastructureNodes = {
    weight: 50,
    podAffinityTerm: {
      labelSelector: infrastructureSelector,
      topologyKey: "kubernetes.io/hostname",
    },
  };
  return {
    podAntiAffinity: {
      preferredDuringSchedulingIgnoredDuringExecution: [avoidInfrastructureNodes],
    },
  };
}
791
/**
 * Assembles a scheduling fragment, including only the keys that were supplied.
 *
 * @param {Array<object>} [tolerations] Pod tolerations; omitted when falsy.
 * @param {object} [affinity] Pod affinity; omitted when falsy.
 * @returns {{affinity?: object, tolerations?: Array<object>}} Fragment meant to
 *   be spread into a workload's values.
 */
function generateScheduling(tolerations, affinity) {
  const scheduling = {};
  // affinity is assigned first so the emitted key order stays affinity, tolerations.
  if (affinity) {
    scheduling.affinity = affinity;
  }
  if (tolerations) {
    scheduling.tolerations = tolerations;
  }
  return scheduling;
}
797
/**
 * Burst-pool scheduling primitives, applied unconditionally.
 *
 * Cluster-setup provisions a dedicated worker pool that is both labeled and
 * tainted rulebricks.com/pool=burst (one big Deallocate-parked node on Azure or
 * an on-demand nodegroup on AWS). Workers tolerate the taint and SOFTLY prefer
 * the label. On clusters without such a pool both are inert, so BYO clusters
 * schedule exactly as before - zero configuration required either way.
 */
const BURST_POOL_TOLERATION = {
  key: "rulebricks.com/pool",
  operator: "Equal",
  value: "burst",
  effect: "NoSchedule",
};
// The node label uses the same key/value as the taint, so derive it from the
// toleration instead of repeating the literals.
const BURST_POOL_NODE_PREFERENCE = {
  weight: 100,
  preference: {
    matchExpressions: [
      {
        key: BURST_POOL_TOLERATION.key,
        operator: "In",
        values: [BURST_POOL_TOLERATION.value],
      },
    ],
  },
};
819
/**
 * Builds the backup values block.
 *
 * Backups are only switched on when the deployment runs its own in-cluster
 * Postgres (self-hosted database that is not pointed at an external Postgres)
 * AND the user explicitly enabled them.
 *
 * The backup CronJob streams pg_dump from the running DB (using
 * supabase.db.image) and uploads it with rclone, so no backup-specific image is
 * set here; the chart's default rclone image applies unless overridden.
 *
 * @param {object} config Deployment configuration.
 * @returns {{enabled: boolean, schedule: string, retentionDays: number}}
 */
function generateBackupValues(config) {
  const { database, externalServices, backup } = config;
  const isSelfHosted = database.type === "self-hosted";
  const postgresIsExternal = externalServices?.postgres?.mode === "external";
  const explicitlyEnabled = backup?.enabled === true;
  return {
    enabled: isSelfHosted && !postgresIsExternal && explicitlyEnabled,
    schedule: backup?.schedule || "0 2 * * *",
    retentionDays: backup?.retentionDays || 7,
  };
}
832
/**
 * @param {object} config Deployment configuration.
 * @returns {boolean} True when externalServices.redis.mode is "external".
 */
function isExternalRedis(config) {
  const redisMode = config.externalServices?.redis?.mode;
  return redisMode === "external";
}
835
/**
 * @param {object} config Deployment configuration.
 * @returns {boolean} True when externalServices.kafka.mode is "external".
 */
function isExternalKafka(config) {
  const kafkaMode = config.externalServices?.kafka?.mode;
  return kafkaMode === "external";
}
838
/**
 * Reports whether the Vector kafka-proxy bridge sidecar is required.
 *
 * Only AWS MSK IAM needs it: Vector's kafka source can't speak token
 * mechanisms, while Azure Event Hubs and GCP both use SASL PLAIN/SCRAM that
 * Vector handles directly.
 *
 * @param {object} config Deployment configuration.
 * @returns {boolean} True for external Kafka with the "aws-msk-iam" preset or
 *   the "aws-iam" SASL mechanism.
 */
function kafkaUsesBridge(config) {
  if (!isExternalKafka(config)) {
    return false;
  }
  const external = config.externalServices?.kafka?.external;
  const isMskIamPreset = external?.preset === "aws-msk-iam";
  const isIamMechanism = external?.sasl?.mechanism === "aws-iam";
  return isMskIamPreset || isIamMechanism;
}
849
/**
 * Reports whether Vector's kafka source connects with a direct PLAIN/SCRAM
 * credential and therefore needs username/password.
 *
 * This mirrors the vector-kafka-env ConfigMap, which only sets
 * KAFKA_SASL_ENABLED=true for external, non-token, non-bridge mechanisms (and
 * where vector-kafka-credentials is populated). For in-cluster, bridge, and
 * token-auth paths SASL is disabled, so username and password MUST be omitted:
 * an empty env default (${VAR:-}) renders unquoted via Helm's toYaml and Vector
 * reads the value as YAML null, which it rejects at startup ("invalid type:
 * unit value, expected any valid TOML value").
 *
 * @param {object} config Deployment configuration.
 * @returns {boolean} True only for external, non-bridge Kafka whose SASL
 *   mechanism is set and is neither "aws-iam" nor "oauthbearer".
 */
function kafkaUsesDirectSasl(config) {
  const directlyReachable = isExternalKafka(config) && !kafkaUsesBridge(config);
  if (!directlyReachable) {
    return false;
  }
  const mechanism = config.externalServices?.kafka?.external?.sasl?.mechanism;
  const tokenMechanisms = ["aws-iam", "oauthbearer"];
  return Boolean(mechanism) && !tokenMechanisms.includes(mechanism);
}
869
/**
 * Builds the rulebricks.redis block.
 *
 * Embedded Redis gets pod labels, scheduling and a persistence storage class;
 * an external (managed) Redis disables the in-cluster instance and emits the
 * connection settings instead.
 *
 * @param {object} config Deployment configuration.
 * @param {string} storageClass StorageClass for the in-cluster volume.
 * @param {object} infrastructurePodLabels Labels applied to the in-cluster pods.
 * @param {object} coreScheduling Scheduling fragment spread into the block.
 * @returns {object} Values for rulebricks.redis.
 */
function generateRedisBlock(config, storageClass, infrastructurePodLabels, coreScheduling) {
  if (isExternalRedis(config)) {
    const ext = config.externalServices?.redis?.external ?? {};
    // Optional pieces are spread in conditionally so absent settings produce no
    // key at all (rather than an empty or undefined value).
    const credentials = ext.password ? { password: ext.password } : {};
    const secretRef = ext.existingSecret
      ? {
          existingSecret: ext.existingSecret,
          existingSecretKey: ext.existingSecretKey || "redis-password",
        }
      : {};
    const httpApi = ext.httpApi?.enabled
      ? {
          httpApi: {
            enabled: true,
            url: ext.httpApi.url ?? "",
            token: ext.httpApi.token ?? "",
          },
        }
      : {};
    return {
      enabled: false,
      external: {
        host: ext.host ?? "",
        port: ext.port ?? 6379,
        tls: { enabled: ext.tls ?? false },
        ...credentials,
        ...secretRef,
        ...httpApi,
      },
    };
  }
  // Sizing (resources, persistence size) is left to the chart defaults; only
  // the deployment-specific storage class is pinned here.
  return {
    podLabels: infrastructurePodLabels,
    ...coreScheduling,
    persistence: {
      enabled: true,
      storageClass,
    },
  };
}
911
/**
 * Builds the cache observability values: the Valkey admin UI (optionally
 * exposed through an ingress) and the Redis exporter.
 *
 * @param {object} config Deployment configuration (reads features.cache and,
 *   for the default ingress hostname, domain).
 * @param {object} infrastructurePodLabels Labels applied to both components.
 * @returns {{valkeyAdmin: object, redisExporter: object}}
 */
function generateCacheObservabilityBlock(config, infrastructurePodLabels) {
  const { valkeyAdmin, redisExporter } = config.features.cache ?? {};
  const exposeViaIngress = valkeyAdmin?.exposure === "ingress";
  // A hostname is only meaningful when the ingress is on; default it to
  // valkey.<domain> when the user did not pick one.
  let ingressHostname = "";
  if (exposeViaIngress) {
    ingressHostname = valkeyAdmin?.hostname || `valkey.${config.domain}`;
  }
  const ingress = {
    enabled: exposeViaIngress,
    hostname: ingressHostname,
    basicAuth: {
      users: valkeyAdmin?.basicAuthUsers ?? [],
      existingSecret: valkeyAdmin?.basicAuthExistingSecret ?? "",
    },
    allowedIPs: valkeyAdmin?.allowedIPs ?? [],
  };
  return {
    valkeyAdmin: {
      enabled: valkeyAdmin?.enabled ?? false,
      exposure: valkeyAdmin?.exposure ?? "internal",
      podLabels: infrastructurePodLabels,
      ingress,
    },
    redisExporter: {
      // The exporter is on unless explicitly disabled.
      enabled: redisExporter?.enabled ?? true,
      podLabels: infrastructurePodLabels,
    },
  };
}
939
/**
 * Builds the Kafka exporter values.
 *
 * An explicit features.cache.kafkaExporter.enabled wins; otherwise the exporter
 * defaults to on for in-cluster Kafka and off for external Kafka. Brokers are
 * only filled in for external Kafka.
 *
 * @param {object} config Deployment configuration.
 * @param {object} infrastructurePodLabels Labels applied to the exporter pods.
 * @returns {{enabled: boolean, podLabels: object, brokers: string}}
 */
function generateKafkaExporterBlock(config, infrastructurePodLabels) {
  const explicitChoice = config.features.cache?.kafkaExporter?.enabled;
  const kafkaIsExternal = isExternalKafka(config);
  let brokers = "";
  if (kafkaIsExternal) {
    brokers = config.externalServices?.kafka?.external?.brokers ?? "";
  }
  return {
    enabled: explicitChoice ?? !kafkaIsExternal,
    podLabels: infrastructurePodLabels,
    brokers,
  };
}
950
/**
 * Builds the rulebricks.app.logging block.
 *
 * Decision logging is always enabled. External Kafka contributes brokers plus
 * SSL/SASL settings; the embedded broker is auto-discovered and runs with
 * unprefixed topics.
 *
 * @param {object} config Deployment configuration.
 * @returns {object} Values for rulebricks.app.logging.
 */
function generateAppLogging(config) {
  if (isExternalKafka(config)) {
    const ext = config.externalServices?.kafka?.external ?? {};
    const logging = {
      enabled: true,
      kafkaBrokers: ext.brokers ?? "",
      kafkaTopic: ext.topic || "logs",
      kafkaSsl: ext.ssl ?? false,
      // Topic prefix: emitted only when explicitly provided (including "" to
      // disable). When omitted, the chart default (com.rulebricks.) applies via
      // value merge.
      ...(ext.topicPrefix !== undefined
        ? { kafkaTopicPrefix: ext.topicPrefix }
        : {}),
    };
    const saslInput = ext.sasl;
    if (saslInput?.mechanism) {
      const sasl = { mechanism: saslInput.mechanism };
      // Copy only the optional SASL fields that carry a value.
      for (const field of ["region", "username", "password", "existingSecret"]) {
        if (saslInput[field]) {
          sasl[field] = saslInput[field];
        }
      }
      logging.kafkaSasl = sasl;
    }
    return logging;
  }
  return {
    enabled: true,
    kafkaBrokers: "", // Auto-discover from Kafka subchart
    kafkaTopic: "logs",
    // The in-cluster app/HPS produce to unprefixed topics (logs, solution,
    // solution-response). The chart default prefix ("com.rulebricks.") is meant
    // for shared/managed Kafka collision avoidance, but applied here it would
    // make the chart-side consumers diverge from the producers: Vector would
    // subscribe to "com.rulebricks.logs" (no data) and the KEDA worker trigger
    // would watch "com.rulebricks.solution" (no lag signal). Prefixing is
    // therefore disabled for the dedicated in-cluster broker.
    kafkaTopicPrefix: "",
  };
}
997
/**
 * HPS service account values.
 *
 * When external Kafka uses MSK IAM, HPS authenticates to the broker with its
 * pod's cloud identity. Under EKS Pod Identity that identity comes from a
 * namespace-scoped association (created by the CLI's workload-identity step for
 * the `<release>-hps` SA), NOT from an eks.amazonaws.com/role-arn annotation.
 * The SA is therefore only CREATED here, without annotations, so the
 * association has a subject to bind.
 *
 * @param {object} config Deployment configuration.
 * @returns {{create: boolean, annotations: object}}
 */
function generateHpsServiceAccount(config) {
  const needsOwnServiceAccount = Boolean(kafkaUsesBridge(config));
  return { create: needsOwnServiceAccount, annotations: {} };
}
1010
/**
 * Top-level kafkaBridge block consumed by the Vector env ConfigMap.
 *
 * Only enabled for AWS MSK IAM, where a kafka-proxy sidecar fronts the brokers
 * for Vector; every other setup gets `{ enabled: false }`.
 *
 * @param {object} config Deployment configuration.
 * @returns {object} kafkaBridge values.
 */
function generateKafkaBridge(config) {
  if (kafkaUsesBridge(config)) {
    const { sasl, brokers, identity } =
      config.externalServices?.kafka?.external ?? {};
    return {
      enabled: true,
      provider: "aws",
      region: sasl?.region ?? "",
      brokers: brokers ?? "",
      localPort: 19092,
      image: KAFKA_PROXY_IMAGE,
      awsRoleArn: identity?.awsRoleArn ?? "",
    };
  }
  return { enabled: false };
}
1029
/**
 * kafka-proxy sidecar for the Vector pod (AWS MSK IAM only).
 *
 * Each upstream broker from the comma-separated broker list is mapped to its
 * own sequential local port starting at 19092, and the proxy is started with
 * TLS and the AWS_MSK_IAM SASL method for the configured region.
 *
 * @param {object} config Deployment configuration.
 * @returns {Array<object>|undefined} A single-element container list, or
 *   undefined when no bridge is needed or no broker is configured.
 */
function generateVectorExtraContainers(config) {
  if (!kafkaUsesBridge(config)) {
    return undefined;
  }
  const external = config.externalServices?.kafka?.external ?? {};
  // Tolerate stray whitespace and empty entries ("a, ,b,") in the broker list.
  const upstreams = [];
  for (const entry of (external.brokers ?? "").split(",")) {
    const broker = entry.trim();
    if (broker) {
      upstreams.push(broker);
    }
  }
  if (upstreams.length === 0) {
    return undefined;
  }
  const firstLocalPort = 19092;
  const mappingArgs = [];
  const ports = [];
  upstreams.forEach((broker, offset) => {
    const localPort = firstLocalPort + offset;
    mappingArgs.push(`--bootstrap-server-mapping=${broker},127.0.0.1:${localPort}`);
    ports.push({ containerPort: localPort });
  });
  const sidecar = {
    name: "kafka-proxy",
    image: KAFKA_PROXY_IMAGE,
    args: [
      "server",
      ...mappingArgs,
      "--tls-enable",
      "--sasl-enable",
      "--sasl-method=AWS_MSK_IAM",
      `--sasl-aws-region=${external.sasl?.region ?? ""}`,
    ],
    ports,
  };
  return [sidecar];
}
1061
// VRL program for the Vector agent's remap transform. It parses JSON app/HPS
// log lines, lifts trace_id/span_id to the top level for logs<->traces
// correlation, and flattens useful Kubernetes metadata. Keep in sync with
// charts/.../values.yaml vector-agent.customConfig.transforms.
//
// Written as a single template literal: line breaks and leading spaces inside
// it are part of the emitted program text.
const VECTOR_APP_LOGS_VRL = `parsed, err = parse_json(to_string(.message) ?? "")
if err == null && is_object(parsed) {
 .log = parsed
 .trace_id = parsed.trace_id
 .span_id = parsed.span_id
 if exists(parsed.level) { .level = to_string(parsed.level) ?? "info" }
}
.pod = .kubernetes.pod_name
.namespace = .kubernetes.pod_namespace
.container = .kubernetes.container_name
.node = .kubernetes.pod_node_name`;
418
1077
/**
 * global.tracing block (in-cluster OTel Collector -> pluggable trace backend).
 *
 * Emits the shared settings plus exactly one destination-specific sub-block
 * (elastic | otlp | azureMonitor). Any destination other than "elastic" or
 * "otlp" is treated as azure-monitor.
 *
 * @param {object} config Deployment configuration.
 * @returns {object|undefined} The tracing values, or undefined when tracing is
 *   disabled so the key is omitted entirely.
 */
function generateTracingGlobal(config) {
  const tracing = config.features.tracing;
  if (!tracing?.enabled) {
    return undefined;
  }
  const destination = tracing.destination ?? "elastic";
  const collectorImage = IMAGE_REPOSITORIES.opentelemetryCollector;
  const shared = {
    enabled: true,
    destination,
    samplingRatio: tracing.samplingRatio ?? 1,
    // RB image dict for the parent chart's otel-collector deployment. The
    // rulebricks.image helper requires image.repository and applies
    // global.imageRegistry to the host.
    collector: {
      image: {
        registry: config.imageRegistry || DEFAULT_IMAGE_REGISTRY,
        repository: collectorImage.repository,
        tag: collectorImage.tag,
      },
    },
  };
  switch (destination) {
    case "elastic": {
      const settings = tracing.elastic ?? {};
      const authMode = settings.authMode ?? "secret-token";
      // Only the credential matching the selected auth mode is emitted.
      const credential =
        authMode === "secret-token" && settings.secretToken
          ? { secretToken: settings.secretToken }
          : authMode === "api-key" && settings.apiKey
            ? { apiKey: settings.apiKey }
            : {};
      return {
        ...shared,
        elastic: {
          endpoint: settings.endpoint ?? "",
          authMode,
          tlsInsecureSkipVerify: false,
          ...credential,
        },
      };
    }
    case "otlp": {
      const settings = tracing.otlp ?? {};
      const authMode = settings.authMode ?? "none";
      const hasExtraHeaders =
        Boolean(settings.headers) && Object.keys(settings.headers).length > 0;
      return {
        ...shared,
        otlp: {
          endpoint: settings.endpoint ?? "",
          authMode,
          tlsInsecureSkipVerify: settings.tlsInsecureSkipVerify ?? false,
          ...(authMode === "bearer" && settings.token
            ? { token: settings.token }
            : {}),
          ...(authMode === "api-key" && settings.apiKey
            ? { apiKey: settings.apiKey }
            : {}),
          ...(authMode === "header"
            ? {
                headerName: settings.headerName ?? "Authorization",
                ...(settings.headerValue
                  ? { headerValue: settings.headerValue }
                  : {}),
              }
            : {}),
          ...(hasExtraHeaders ? { headers: settings.headers } : {}),
        },
      };
    }
    default: {
      // azure-monitor
      const settings = tracing.azureMonitor ?? {};
      return {
        ...shared,
        azureMonitor: { connectionString: settings.connectionString ?? "" },
      };
    }
  }
}
1148
/**
 * traefik.tracing block: makes Traefik the root span and propagates the W3C
 * traceparent to backends by exporting OTLP/HTTP traces to the release's
 * otel-collector service.
 *
 * @param {object} config Deployment configuration.
 * @param {string} releaseName Helm release name used to address the collector.
 * @returns {object} The tracing values, or an empty object when neither
 *   ClickStack nor tracing is enabled.
 */
function generateTraefikTracing(config, releaseName) {
  const collectorAvailable =
    isClickStackEnabled(config) || config.features.tracing?.enabled;
  if (!collectorAvailable) {
    return {};
  }
  const endpoint = `http://${releaseName}-otel-collector:4318/v1/traces`;
  return {
    otlp: {
      enabled: true,
      http: { enabled: true, endpoint },
    },
  };
}
1165
/**
 * vector-agent block: a second Vector deployment (role Agent / DaemonSet) that
 * tails all pod logs and ships them to a customer-managed log backend -
 * Elasticsearch (default), Loki, or a generic HTTP endpoint. Decision logs are
 * unaffected (they stay in ClickHouse via the `vector` aggregator).
 *
 * @param {object} config Deployment configuration (reads
 *   features.logging.appLogs).
 * @param {object} podLabels Labels applied to the agent pods.
 * @param {Array<object>} tolerations Tolerations applied to the agent pods.
 * @returns {object} vector-agent values; `{ enabled: false }` when app log
 *   shipping is disabled.
 */
function generateVectorAgent(config, podLabels, tolerations) {
  const appLogs = config.features.logging.appLogs;
  if (!appLogs?.enabled) {
    return { enabled: false };
  }
  const destination = appLogs.destination ?? "elasticsearch";
  let sinkName = "elasticsearch";
  let sink;
  if (destination === "loki") {
    const loki = appLogs.loki ?? {};
    sinkName = "loki";
    sink = {
      type: "loki",
      inputs: ["app_logs"],
      endpoint: loki.endpoint,
      // The "{{ ... }}" values are emitted verbatim for Vector to resolve from
      // the fields set by the app_logs transform.
      labels: loki.labels ?? {
        app: "rulebricks",
        namespace: "{{ namespace }}",
        pod: "{{ pod }}",
        container: "{{ container }}",
      },
      encoding: { codec: "json" },
    };
  } else if (destination === "generic") {
    const generic = appLogs.generic ?? {};
    sinkName = "generic_http";
    sink = {
      type: "http",
      inputs: ["app_logs"],
      uri: generic.endpoint,
      method: "post",
      encoding: { codec: "json" },
    };
    if (generic.authHeader) {
      sink.request = { headers: { Authorization: generic.authHeader } };
    }
  } else {
    const es = appLogs.elasticsearch ?? {};
    const authMode = es.authMode ?? "basic";
    sink = {
      type: "elasticsearch",
      inputs: ["app_logs"],
      endpoints: [es.endpoint],
      mode: "bulk",
      bulk: { index: es.index || "rulebricks-app-logs" },
      tls: { verify_certificate: es.verifyCertificate ?? true },
    };
    if (authMode === "basic") {
      sink.auth = { strategy: "basic", user: es.username, password: es.password };
    } else if (authMode === "api-key" && es.apiKey) {
      // Guarded like the generic sink's authHeader: without a key the header
      // would be rendered as the literal string "ApiKey undefined".
      sink.request = { headers: { Authorization: `ApiKey ${es.apiKey}` } };
    }
  }
  return {
    enabled: true,
    role: "Agent",
    podLabels,
    // Follow active worker pools without tolerating shutdown, out-of-service,
    // or unreachable node taints.
    tolerations,
    resources: {
      requests: { cpu: "100m", memory: "256Mi" },
      limits: { cpu: "500m", memory: "512Mi" },
    },
    customConfig: {
      data_dir: "/vector-data-dir",
      sources: {
        kubernetes_logs: {
          type: "kubernetes_logs",
          // Skip both Vector deployments: the aggregator
          // (app.kubernetes.io/name=vector) re-emits decision logs on stdout
          // (those belong in ClickHouse, not the app-log backend) and the agent
          // itself (vector-agent) to avoid a self-scrape loop.
          extra_label_selector: "app.kubernetes.io/name notin (vector,vector-agent)",
        },
      },
      transforms: {
        app_logs: {
          type: "remap",
          inputs: ["kubernetes_logs"],
          source: VECTOR_APP_LOGS_VRL,
        },
      },
      // Exactly one sink, keyed by the destination-specific name.
      sinks: { [sinkName]: sink },
    },
  };
}
1260
+ /**
1261
+ * Builds Helm values from the deployment configuration.
1262
+ */
1263
+ export function buildHelmValues(config, options = {}) {
1264
+ if (config.database.type === "self-hosted" &&
1265
+ !config.database.supabaseJwtSecret) {
1266
+ throw new Error("Self-hosted Supabase is missing a JWT secret. Run `rulebricks redeploy <name>` to regenerate deployment credentials, or set database.supabaseJwtSecret in config.yaml.");
1267
+ }
1268
+ if (config.features.ai.enabled && !config.features.ai.openaiApiKey) {
1269
+ throw new Error("AI features are enabled but the OpenAI API key is missing. Run `rulebricks redeploy <name>` and enter your OpenAI API key, or disable AI features in config.yaml.");
1270
+ }
1271
+ const { tlsEnabled = true, secretMode = "inline" } = options;
424
1272
  const useLocalGrafana = config.features.monitoring.destination === "local-grafana";
425
1273
  // Determine if external-dns should be enabled
426
1274
  const externalDnsEnabled = config.dns.autoManage && isSupportedDnsProvider(config.dns.provider);
427
- // Determine storage class based on provider
428
- // Note: GCP uses "hyperdisk-balanced" because C4A instances only support Hyperdisk (not Persistent Disk)
429
- const storageClass = config.infrastructure.provider === "aws"
430
- ? "gp3"
431
- : config.infrastructure.provider === "gcp"
432
- ? "hyperdisk-balanced"
433
- : config.infrastructure.provider === "azure"
434
- ? "managed-premium"
435
- : "gp3";
436
- // ARM64 tolerations for GKE C4A nodes (and other ARM64 providers)
437
- // GKE automatically taints ARM64 nodes with kubernetes.io/arch=arm64:NoSchedule
438
- const arm64Tolerations = [
439
- {
440
- key: "kubernetes.io/arch",
441
- operator: "Equal",
442
- value: "arm64",
443
- effect: "NoSchedule",
444
- },
1275
+ const gcpDiskType = config.infrastructure.nodeArchitecture === "amd64"
1276
+ ? "pd-balanced"
1277
+ : "hyperdisk-balanced";
1278
+ // Prefer the live cluster's StorageClass. Provider defaults are only a
1279
+ // fallback for legacy configs that predate capability scanning.
1280
+ const storageClass = config.infrastructure.storageClass ||
1281
+ (config.infrastructure.provider === "aws"
1282
+ ? "gp3"
1283
+ : config.infrastructure.provider === "gcp"
1284
+ ? gcpDiskType
1285
+ : config.infrastructure.provider === "azure"
1286
+ ? "managed-premium"
1287
+ : "gp3");
1288
+ const shouldApplyArm64Toleration = config.infrastructure.arm64TolerationRequired ?? false;
1289
+ const architectureTolerations = shouldApplyArm64Toleration
1290
+ ? [
1291
+ {
1292
+ key: "kubernetes.io/arch",
1293
+ operator: "Equal",
1294
+ value: "arm64",
1295
+ effect: "NoSchedule",
1296
+ },
1297
+ ]
1298
+ : undefined;
1299
+ const coreScheduling = generateScheduling(architectureTolerations);
1300
+ // Workers always tolerate + softly prefer the optional burst pool
1301
+ // (rulebricks.com/pool=burst). The preference is soft, so clusters without a
1302
+ // burst pool schedule workers on ordinary capacity exactly as before.
1303
+ const workerTolerations = [
1304
+ ...(architectureTolerations ?? []),
1305
+ BURST_POOL_TOLERATION,
445
1306
  ];
1307
+ const operationalDaemonSetTolerations = workerTolerations;
1308
+ const workerScheduling = generateScheduling(workerTolerations, {
1309
+ ...generateWorkerPodAntiAffinity(),
1310
+ nodeAffinity: {
1311
+ preferredDuringSchedulingIgnoredDuringExecution: [
1312
+ BURST_POOL_NODE_PREFERENCE,
1313
+ ],
1314
+ },
1315
+ });
1316
+ const infrastructurePodLabels = {
1317
+ "rulebricks.com/workload-group": "infrastructure",
1318
+ };
1319
+ const applicationPodLabels = {
1320
+ "rulebricks.com/workload-group": "application",
1321
+ };
1322
+ const productVersion = config.version;
1323
+ // Scheduling priority tiers. The chart creates release-scoped
1324
+ // PriorityClasses (<release>-critical / <release>-burst); stateful
1325
+ // infrastructure references the critical class so it can always preempt
1326
+ // burst workers to reschedule, and workers reference the burst class so
1327
+ // they are strictly the first preemption victims. Subchart values cannot
1328
+ // template release names, so the CLI emits them as literals.
1329
+ const releaseName = getReleaseName(config.name);
1330
+ const criticalPriorityClass = `${releaseName}-critical`;
1331
+ const burstPriorityClass = `${releaseName}-burst`;
1332
+ // Subcharts that don't honor global.imagePullSecrets (keda, strimzi, traefik,
1333
+ // vector) need the pull secret on their own key so their pods can pull the
1334
+ // private docker.io/rulebricks/* images from index.docker.io.
1335
+ const rulebricksPullSecret = [{ name: `${releaseName}-regcred` }];
1336
+ // Registry host for every image. Empty config.imageRegistry => docker.io. When
1337
+ // set, the host is rewritten into global.imageRegistry (which kube-prometheus-stack
1338
+ // and our subcharts honor) and into each of the six Tier-2 charts' own image
1339
+ // keys below, always keeping the rulebricks/<name> path.
1340
+ const reg = config.imageRegistry || DEFAULT_IMAGE_REGISTRY;
1341
+ const clickStackEnabled = isClickStackEnabled(config);
1342
+ const clickStackConfig = config.features.observability?.clickstack;
1343
+ const clickHouseStorageSize = clickStackConfig?.clickHouseStorageSize ?? "100Gi";
1344
+ // Distributed tracing (self-hosted only). Lives under global so the
1345
+ // rulebricks subchart deployments can read it; the collector + traefik are
1346
+ // wired below from the same source.
1347
+ const tracingGlobal = clickStackEnabled ? undefined : generateTracingGlobal(config);
1348
+ // Never let the cluster-autoscaler evict single-replica stateful pods
1349
+ // during node scale-down; an evicted broker/db stalls the whole pipeline.
1350
+ const safeToEvictAnnotations = {
1351
+ "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
1352
+ };
446
1353
  // Build global.supabase configuration
447
1354
  const supabaseGlobalConfig = config.database.type === "supabase-cloud"
448
1355
  ? {
@@ -452,27 +1359,51 @@ export async function generateHelmValues(config, options = {}) {
452
1359
  accessToken: config.database.supabaseAccessToken || undefined,
453
1360
  projectRef: config.database.supabaseProjectRef || undefined,
454
1361
  }
455
- : {
456
- jwtSecret: config.database.supabaseJwtSecret || undefined,
457
- anonKey: undefined,
458
- serviceKey: undefined,
1362
+ : (() => {
1363
+ const jwtSecret = config.database.supabaseJwtSecret || "";
1364
+ return {
1365
+ jwtSecret: jwtSecret || undefined,
1366
+ anonKey: jwtSecret ? signSupabaseJwt("anon", jwtSecret) : undefined,
1367
+ serviceKey: jwtSecret
1368
+ ? signSupabaseJwt("service_role", jwtSecret)
1369
+ : undefined,
1370
+ };
1371
+ })();
1372
+ // Always emit email configuration so auth pods receive template/subject env
1373
+ // vars regardless of Helm merge order. Custom values take precedence over
1374
+ // built-in defaults when explicitly enabled.
1375
+ const customEmails = config.features.customEmails;
1376
+ if (customEmails?.enabled &&
1377
+ customEmails.subjects &&
1378
+ customEmails.templates) {
1379
+ supabaseGlobalConfig.emails = {
1380
+ subjects: {
1381
+ invite: customEmails.subjects.invite,
1382
+ confirmation: customEmails.subjects.confirmation,
1383
+ recovery: customEmails.subjects.recovery,
1384
+ emailChange: customEmails.subjects.emailChange,
1385
+ },
1386
+ templates: {
1387
+ invite: customEmails.templates.invite,
1388
+ confirmation: customEmails.templates.confirmation,
1389
+ recovery: customEmails.templates.recovery,
1390
+ emailChange: customEmails.templates.emailChange,
1391
+ },
459
1392
  };
460
- // Add custom email templates if enabled
461
- if (config.features.customEmails?.enabled &&
462
- config.features.customEmails.subjects &&
463
- config.features.customEmails.templates) {
1393
+ }
1394
+ else {
464
1395
  supabaseGlobalConfig.emails = {
465
1396
  subjects: {
466
- invite: config.features.customEmails.subjects.invite,
467
- confirmation: config.features.customEmails.subjects.confirmation,
468
- recovery: config.features.customEmails.subjects.recovery,
469
- emailChange: config.features.customEmails.subjects.emailChange,
1397
+ invite: "Join your team on Rulebricks",
1398
+ confirmation: "Confirm Your Email",
1399
+ recovery: "Reset Your Password",
1400
+ emailChange: "Confirm Email Change",
470
1401
  },
471
1402
  templates: {
472
- invite: config.features.customEmails.templates.invite,
473
- confirmation: config.features.customEmails.templates.confirmation,
474
- recovery: config.features.customEmails.templates.recovery,
475
- emailChange: config.features.customEmails.templates.emailChange,
1403
+ invite: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/invite.html",
1404
+ confirmation: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/verify.html",
1405
+ recovery: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/password_change.html",
1406
+ emailChange: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/email_change.html",
476
1407
  },
477
1408
  };
478
1409
  }
@@ -485,7 +1416,30 @@ export async function generateHelmValues(config, options = {}) {
485
1416
  email: config.adminEmail,
486
1417
  tlsEnabled,
487
1418
  licenseKey: config.licenseKey,
1419
+ // Pull secret for the private docker.io/rulebricks/* images. References the
1420
+ // license registry secret <release>-regcred (index.docker.io, authed by the
1421
+ // license PAT). kube-prometheus-stack + cert-manager honor this global value;
1422
+ // keda, traefik, vector and the strimzi operator each get the same secret on
1423
+ // their own key below.
1424
+ imagePullSecrets: [{ name: `${releaseName}-regcred` }],
1425
+ // Single registry-host override (empty => docker.io/rulebricks/*). Honored by
1426
+ // kube-prometheus-stack and our subcharts; the CLI also rewrites the host into
1427
+ // the other Tier-2 charts' native image keys below.
1428
+ ...(config.imageRegistry ? { imageRegistry: config.imageRegistry } : {}),
1429
+ // Generated name->sha256 digest map (empty until the helm repo's mirror
1430
+ // pipeline populates IMAGE_DIGESTS). When a name is present the chart image
1431
+ // helper pins @sha256 instead of :tag.
1432
+ imageDigests: IMAGE_DIGESTS,
1433
+ ...(productVersion && SEMVER_PATTERN.test(productVersion)
1434
+ ? { version: productVersion }
1435
+ : {}),
488
1436
  externalDnsEnabled,
1437
+ // Scheduling priority tiers (the chart renders release-scoped
1438
+ // <release>-critical and <release>-burst PriorityClasses).
1439
+ priorityClasses: { enabled: true },
1440
+ clickstack: {
1441
+ enabled: clickStackEnabled,
1442
+ },
489
1443
  // SMTP Configuration
490
1444
  smtp: {
491
1445
  host: config.smtp.host,
@@ -516,62 +1470,164 @@ export async function generateHelmValues(config, options = {}) {
516
1470
  : {
517
1471
  enabled: false,
518
1472
  },
1473
+ storage: config.storage
1474
+ ? {
1475
+ // One provider, one identity, one bucket/container. decision-logs and
1476
+ // db-backups are key prefixes under paths.* within it.
1477
+ provider: config.storage.provider,
1478
+ bucket: config.storage.bucket,
1479
+ region: config.storage.region,
1480
+ s3: {
1481
+ iamRoleArn: config.storage.awsIamRoleArn || "",
1482
+ existingSecret: { name: "" },
1483
+ },
1484
+ azure: {
1485
+ authMode: config.storage.cloudAuthMode === "secret"
1486
+ ? "connection-string"
1487
+ : "workload-identity",
1488
+ clientId: config.storage.azureBlobClientId || "",
1489
+ tenantId: config.storage.azureBlobTenantId || "",
1490
+ container: config.storage.azureBlobContainer || "",
1491
+ connectionStringSecretRef: config.storage.azureBlobConnectionStringSecretRef || {
1492
+ name: "",
1493
+ key: "",
1494
+ },
1495
+ },
1496
+ gcp: {
1497
+ serviceAccountEmail: config.storage.gcpServiceAccountEmail || "",
1498
+ },
1499
+ paths: {
1500
+ decisionLogs: config.storage.paths?.decisionLogs || "decision-logs",
1501
+ dbBackups: config.storage.paths?.dbBackups || "db-backups",
1502
+ },
1503
+ }
1504
+ : undefined,
1505
+ // Distributed tracing (omitted entirely when disabled).
1506
+ ...(tracingGlobal ? { tracing: tracingGlobal } : {}),
519
1507
  },
1508
+ clickstack: generateClickStackValues(clickStackEnabled, config, storageClass, infrastructurePodLabels, operationalDaemonSetTolerations),
1509
+ backup: generateBackupValues(config),
520
1510
  // =============================================================================
521
1511
  // RULEBRICKS APPLICATION STACK
522
1512
  // =============================================================================
523
1513
  rulebricks: {
524
- app: {
525
- ...(config.appVersion
526
- ? {
527
- image: {
528
- repository: "index.docker.io/rulebricks/app",
529
- tag: config.appVersion,
530
- pullPolicy: "IfNotPresent",
531
- },
532
- }
533
- : {}),
534
- replicaCount: tierConfig.appReplicas,
535
- resources: tierConfig.appResources,
536
- tolerations: arm64Tolerations,
537
- // Logging configuration
538
- logging: {
1514
+ metrics: {
1515
+ enabled: true,
1516
+ serviceMonitor: {
539
1517
  enabled: true,
540
- kafkaBrokers: "", // Auto-discover from Kafka subchart
541
- kafkaTopic: "logs",
542
- loggingDestination: getLoggingDestinationLabel(config.features.logging.sink),
1518
+ interval: "30s",
1519
+ scrapeTimeout: "10s",
1520
+ },
1521
+ app: {
1522
+ path: "/api/metrics",
1523
+ },
1524
+ hps: {
1525
+ path: "/metrics",
1526
+ },
1527
+ worker: {
1528
+ path: "/metrics",
1529
+ port: 3000,
543
1530
  },
544
1531
  },
1532
+ app: {
1533
+ image: {
1534
+ // Split shape: the rulebricks-chart.image helper applies
1535
+ // global.imageRegistry to the host + digest pinning. The host NEVER
1536
+ // goes in repository.
1537
+ registry: reg,
1538
+ repository: IMAGE_REPOSITORIES.app,
1539
+ pullPolicy: "IfNotPresent",
1540
+ },
1541
+ // Replica count and resources fall back to the chart defaults.
1542
+ podLabels: infrastructurePodLabels,
1543
+ ...coreScheduling,
1544
+ // Logging configuration (in-cluster auto-discovery or external Kafka)
1545
+ logging: generateAppLogging(config),
1546
+ },
545
1547
  // HPS (High Performance Server)
546
1548
  hps: {
547
1549
  enabled: true,
548
- ...(config.hpsVersion
549
- ? {
550
- image: {
551
- repository: "index.docker.io/rulebricks/hps",
552
- tag: config.hpsVersion,
553
- pullPolicy: "Always",
554
- },
555
- }
556
- : {}),
557
- replicas: tierConfig.hpsReplicas,
558
- resources: tierConfig.hpsResources,
559
- tolerations: arm64Tolerations,
1550
+ image: {
1551
+ // Split shape (see app.image): host comes from global.imageRegistry via
1552
+ // the rulebricks-chart.image helper, never baked into repository.
1553
+ registry: reg,
1554
+ repository: IMAGE_REPOSITORIES.hps,
1555
+ pullPolicy: "Always",
1556
+ },
1557
+ // Replica count and resources fall back to the chart defaults.
1558
+ podLabels: applicationPodLabels,
1559
+ ...coreScheduling,
1560
+ // Gather-plane autoscaling: HPS parses every chunk response, so its
1561
+ // capacity scales with request rate (load testing showed a fixed
1562
+ // gather plane plateaus throughput while workers idle). Conservative
1563
+ // one-pod-at-a-time scaling - each scale event rebalances the
1564
+ // response consumer group and can time out in-flight requests. Only the
1565
+ // enable flag is set here; min/max and thresholds use the chart
1566
+ // defaults.
1567
+ keda: {
1568
+ enabled: true,
1569
+ },
1570
+ // Warm the hps/worker images onto active worker-capable nodes so burst
1571
+ // scale-outs skip the image pull without targeting shutdown nodes.
1572
+ imagePrepull: {
1573
+ enabled: true,
1574
+ tolerations: operationalDaemonSetTolerations,
1575
+ },
1576
+ extraEnv: [
1577
+ // FLOW_CHUNK_MAX_ITEMS is the #1 throughput dial. Each chunk is one
1578
+ // Kafka round-trip (gather -> solution -> worker -> solution-response
1579
+ // -> gather), so throughput ~= (broker messages/sec) x (payloads per
1580
+ // message). Bigger chunks = fewer messages per solution = less broker
1581
+ // and coordination overhead. Benchmarks: 10 -> 50 gave +27%, and on
1582
+ // small payloads 100 -> 1000 gave another ~1.6x (22k -> 35k sol/s),
1583
+ // until the bottleneck moved off the broker onto worker CPU.
1584
+ // 500 keeps typical bulk requests to 1-2 messages. The byte bound
1585
+ // (CHUNK_MAX_BYTES, default 256 KiB in HPS) caps message size
1586
+ // regardless, so large payloads stay under Kafka's 2 MiB
1587
+ // max.message.bytes. High-throughput, small-payload deployments can
1588
+ // raise this much higher (and CHUNK_MAX_BYTES with it); the only costs
1589
+ // are per-request latency (one worker processes a whole chunk) and the
1590
+ // 2 MiB cap on the larger response message (avg output x chunk size
1591
+ // must stay < 2 MiB, so lower this for output-heavy flows).
1592
+ { name: "FLOW_CHUNK_MAX_ITEMS", value: "500" },
1593
+ ],
1594
+ // Service account (annotated with the MSK IAM role for external Kafka)
1595
+ serviceAccount: generateHpsServiceAccount(config),
560
1596
  // HPS Workers with KEDA autoscaling
561
1597
  workers: {
562
1598
  enabled: true,
563
- replicas: tierConfig.hpsWorkerReplicas.min,
1599
+ // Workers consume the solution topic directly, so under external MSK
1600
+ // IAM they need their own cloud identity - not the shared/default SA.
1601
+ // Same rule as HPS: a dedicated `<release>-hps-worker` SA (no role-arn
1602
+ // annotation) that the CLI's workload-identity step binds to the Kafka
1603
+ // role via Pod Identity.
1604
+ serviceAccount: generateHpsServiceAccount(config),
1605
+ // Partition count of the solution request topic (also exported to
1606
+ // HPS as MAX_WORKERS). Must match kafka.provisioning above; it is
1607
+ // the fleet-concurrency ceiling, NOT a worker count. Replica count
1608
+ // and resources fall back to the chart defaults.
1609
+ solutionPartitions: SOLUTION_TOPIC_PARTITIONS,
564
1610
  keda: {
565
1611
  enabled: true,
566
- minReplicaCount: tierConfig.hpsWorkerReplicas.min,
567
- maxReplicaCount: tierConfig.hpsWorkerReplicas.max,
568
- pollingInterval: 10,
1612
+ // Poll fast so bursts are detected within seconds; the chart's
1613
+ // ScaledObject defaults add exponential scale-up (double every
1614
+ // 15s) and smooth scale-down (5-min window, -25%/min) behavior.
1615
+ // min/max replica counts fall back to the chart defaults.
1616
+ pollingInterval: 5,
569
1617
  cooldownPeriod: 300,
1618
+ // Lag is measured in MESSAGES; with chunked bulk dispatch each
1619
+ // message is a bounded unit of work (~50-150ms), so 50 messages
1620
+ // approximates 2.5-7.5s of backlog for a single worker - one replica
1621
+ // is added per ~5s of fleet backlog, biasing toward early
1622
+ // scale-out for bursty traffic.
570
1623
  lagThreshold: 50,
571
1624
  cpuThreshold: 25,
572
1625
  },
573
- resources: tierConfig.hpsWorkerResources,
574
- tolerations: arm64Tolerations,
1626
+ podLabels: applicationPodLabels,
1627
+ // Burst tier: first preemption victims, so critical infrastructure
1628
+ // can always reschedule during an aggressive scale-out.
1629
+ priorityClassName: burstPriorityClass,
1630
+ ...workerScheduling,
575
1631
  },
576
1632
  },
577
1633
  // Ingress configuration
@@ -580,74 +1636,138 @@ export async function generateHelmValues(config, options = {}) {
580
1636
  className: "traefik",
581
1637
  paths: [{ path: "/", pathType: "Prefix" }],
582
1638
  },
583
- // Redis configuration
584
- redis: {
585
- resources: tierConfig.redisResources,
586
- tolerations: arm64Tolerations,
587
- persistence: {
588
- enabled: true,
589
- size: tierConfig.redisPersistenceSize,
590
- storageClass: storageClass,
591
- },
592
- },
1639
+ // Redis configuration (in-cluster sizing or external connection settings)
1640
+ redis: generateRedisBlock(config, storageClass, infrastructurePodLabels, coreScheduling),
1641
+ cache: generateCacheObservabilityBlock(config, infrastructurePodLabels),
1642
+ kafkaExporter: generateKafkaExporterBlock(config, infrastructurePodLabels),
593
1643
  },
594
1644
  // =============================================================================
595
1645
  // KAFKA (Message Queue)
596
1646
  // =============================================================================
597
1647
  kafka: {
598
- enabled: true,
599
- // KRaft mode (no Zookeeper)
600
- kraft: {
1648
+ enabled: !isExternalKafka(config),
1649
+ // Apache Kafka version (must be one the bundled DHI Strimzi operator
1650
+ // supports; DHI strimzi 1.0.1 ships Kafka 4.2.0).
1651
+ version: "4.2.0",
1652
+ // Single combined controller+broker node (KRaft, no ZooKeeper).
1653
+ replicas: TOPIC_REPLICATION_FACTOR,
1654
+ storage: {
1655
+ size: "20Gi",
1656
+ class: storageClass,
1657
+ },
1658
+ // Critical tier: the broker must always be able to preempt burst workers.
1659
+ priorityClassName: criticalPriorityClass,
1660
+ config: generateKafkaConfig(),
1661
+ jvm: {
1662
+ xms: "1g",
1663
+ xmx: "1g",
1664
+ extraOpts: {
1665
+ UseZGC: "true",
1666
+ AlwaysPreTouch: "true",
1667
+ MaxDirectMemorySize: "256M",
1668
+ },
1669
+ },
1670
+ metrics: {
601
1671
  enabled: true,
1672
+ serviceMonitor: { enabled: true },
602
1673
  },
603
- zookeeper: {
604
- enabled: false,
1674
+ // Topics, reconciled by the Strimzi Topic Operator (KafkaTopic CRs) for the
1675
+ // in-cluster broker, or created by the kafka-topic-provision Job for an
1676
+ // external MSK IAM broker.
1677
+ topics: generateKafkaTopics(config),
1678
+ // When false, the chart never creates topics on an external broker - the
1679
+ // operator manages them (and the workload role needs no CreateTopic).
1680
+ provisioning: {
1681
+ enabled: config.externalServices?.kafka?.external?.provisionTopics ?? true,
1682
+ },
1683
+ },
1684
+ // Strimzi operator: pull secret so the operator pod pulls the private
1685
+ // rulebricks/* image from index.docker.io.
1686
+ "strimzi-kafka-operator": {
1687
+ image: { imagePullSecrets: rulebricksPullSecret },
1688
+ },
1689
+ // =============================================================================
1690
+ // VECTOR KAFKA BRIDGE (AWS MSK IAM token auth)
1691
+ // =============================================================================
1692
+ kafkaBridge: generateKafkaBridge(config),
1693
+ clickhouse: {
1694
+ enabled: true,
1695
+ // Critical tier: single replica must preempt burst workers to
1696
+ // reschedule; never autoscaler-evicted on scale-down.
1697
+ priorityClassName: criticalPriorityClass,
1698
+ podAnnotations: safeToEvictAnnotations,
1699
+ auth: {
1700
+ username: "rulebricks",
1701
+ password: "",
1702
+ existingSecret: '{{ printf "%s-clickhouse-credentials" .Release.Name }}',
1703
+ existingSecretKey: "admin-password",
605
1704
  },
606
- // Kafka broker configuration
607
- overrideConfiguration: {
608
- "auto.create.topics.enable": "true",
609
- "log.retention.hours": "24",
610
- "default.replication.factor": String(tierConfig.kafkaReplication),
611
- "offsets.topic.replication.factor": String(tierConfig.kafkaReplication),
612
- "num.partitions": String(tierConfig.hpsWorkerReplicas.max), // Match max workers for parallel consumption
613
- },
614
- controller: {
615
- replicaCount: tierConfig.kafkaReplication,
616
- resources: tierConfig.kafkaResources,
617
- tolerations: arm64Tolerations,
618
- persistence: {
1705
+ persistence: clickStackEnabled
1706
+ ? {
619
1707
  enabled: true,
620
- size: tierConfig.kafkaStorage,
621
1708
  storageClass: storageClass,
1709
+ size: clickHouseStorageSize,
1710
+ }
1711
+ : { enabled: false },
1712
+ resources: clickStackEnabled
1713
+ ? {
1714
+ requests: { cpu: "1000m", memory: "4Gi" },
1715
+ limits: { cpu: "4", memory: "12Gi" },
1716
+ }
1717
+ : {
1718
+ requests: { cpu: "500m", memory: "2Gi" },
1719
+ limits: { cpu: "2", memory: "6Gi" },
622
1720
  },
623
- heapOpts: tierConfig.kafkaHeapOpts,
624
- extraEnvVars: generateKafkaExtraEnvVars(),
1721
+ serviceAccount: {
1722
+ create: true,
1723
+ annotations: {},
625
1724
  },
626
- listeners: {
627
- client: {
628
- protocol: "PLAINTEXT",
629
- },
630
- controller: {
631
- protocol: "PLAINTEXT",
632
- },
633
- interbroker: {
634
- protocol: "PLAINTEXT",
1725
+ metrics: {
1726
+ enabled: true,
1727
+ serviceMonitor: {
1728
+ enabled: true,
635
1729
  },
636
1730
  },
1731
+ queryLimits: {
1732
+ maxMemoryUsage: 4294967296,
1733
+ maxThreads: 4,
1734
+ maxExecutionTime: 120,
1735
+ maxRowsToRead: 50000000,
1736
+ readOverflowMode: "break",
1737
+ },
1738
+ otelQueryLimits: {
1739
+ maxMemoryUsage: 4294967296,
1740
+ maxThreads: 8,
1741
+ maxExecutionTime: 120,
1742
+ },
1743
+ otelDatabase: "otel",
1744
+ // config.d / users.d / the decision-log view are rendered by the parent
1745
+ // chart's clickhouse templates (no longer passed as Bitnami subchart values).
637
1746
  },
638
1747
  // =============================================================================
639
1748
  // TRAEFIK (Ingress Controller)
640
1749
  // =============================================================================
641
1750
  traefik: {
642
1751
  enabled: true,
1752
+ // traefik has no global.imageRegistry path: set registry + repository
1753
+ // directly (host = reg, rulebricks/* path).
1754
+ image: {
1755
+ registry: reg,
1756
+ repository: IMAGE_REPOSITORIES.traefik,
1757
+ },
1758
+ deployment: {
1759
+ imagePullSecrets: rulebricksPullSecret,
1760
+ },
643
1761
  ingressClass: {
644
1762
  name: "traefik",
645
1763
  },
646
- tolerations: arm64Tolerations,
1764
+ ...coreScheduling,
647
1765
  autoscaling: {
648
1766
  enabled: true,
649
1767
  minReplicas: 1,
650
- maxReplicas: 2,
1768
+ // Headroom for colocated clients pushing multi-hundred-RPS bulk
1769
+ // traffic through the ingress.
1770
+ maxReplicas: 4,
651
1771
  },
652
1772
  resources: {
653
1773
  requests: {
@@ -670,11 +1790,26 @@ export async function generateHelmValues(config, options = {}) {
670
1790
  websecure: {
671
1791
  port: 8443,
672
1792
  exposedPort: 443,
673
- tls: {
674
- enabled: tlsEnabled,
1793
+ // traefik 41.x moved per-entrypoint TLS under ports.<name>.http.tls
1794
+ // (the old ports.<name>.tls location is rejected by the chart schema).
1795
+ http: {
1796
+ tls: {
1797
+ enabled: tlsEnabled,
1798
+ },
675
1799
  },
676
1800
  },
677
1801
  },
1802
+ metrics: {
1803
+ prometheus: {
1804
+ enabled: true,
1805
+ serviceMonitor: {
1806
+ enabled: false,
1807
+ },
1808
+ },
1809
+ },
1810
+ // OTLP tracing: ingress becomes the root span and propagates traceparent
1811
+ // to backends. Empty object when tracing is disabled.
1812
+ tracing: generateTraefikTracing(config, releaseName),
678
1813
  persistence: {
679
1814
  enabled: false,
680
1815
  },
@@ -684,7 +1819,29 @@ export async function generateHelmValues(config, options = {}) {
684
1819
  // =============================================================================
685
1820
  keda: {
686
1821
  enabled: true,
687
- tolerations: arm64Tolerations,
1822
+ imagePullSecrets: rulebricksPullSecret,
1823
+ // keda reads global.image.registry (NOT global.imageRegistry) for the host;
1824
+ // set it plus the rulebricks/* repositories for all three sub-images.
1825
+ global: {
1826
+ image: {
1827
+ registry: reg,
1828
+ },
1829
+ },
1830
+ image: {
1831
+ keda: {
1832
+ registry: reg,
1833
+ repository: IMAGE_REPOSITORIES.keda,
1834
+ },
1835
+ metricsApiServer: {
1836
+ registry: reg,
1837
+ repository: IMAGE_REPOSITORIES.kedaMetricsApiServer,
1838
+ },
1839
+ webhooks: {
1840
+ registry: reg,
1841
+ repository: IMAGE_REPOSITORIES.kedaAdmissionWebhooks,
1842
+ },
1843
+ },
1844
+ ...coreScheduling,
688
1845
  crds: {
689
1846
  install: false, // CRDs managed in parent chart
690
1847
  },
@@ -694,13 +1851,41 @@ export async function generateHelmValues(config, options = {}) {
694
1851
  // =============================================================================
695
1852
  "cert-manager": {
696
1853
  enabled: tlsEnabled,
697
- installCRDs: false, // CRDs managed in parent chart
698
- tolerations: arm64Tolerations,
1854
+ // CRDs managed in parent chart (cert-manager v1.15+ uses crds.enabled,
1855
+ // not the deprecated installCRDs flag).
1856
+ crds: { enabled: false },
1857
+ // cert-manager prepends image.registry to image.repository, so set both per
1858
+ // component (host = reg, rulebricks/cert-manager-* path).
1859
+ image: {
1860
+ registry: reg,
1861
+ repository: IMAGE_REPOSITORIES.certManagerController,
1862
+ },
1863
+ ...coreScheduling,
699
1864
  webhook: {
700
- tolerations: arm64Tolerations,
1865
+ image: {
1866
+ registry: reg,
1867
+ repository: IMAGE_REPOSITORIES.certManagerWebhook,
1868
+ },
1869
+ ...coreScheduling,
701
1870
  },
702
1871
  cainjector: {
703
- tolerations: arm64Tolerations,
1872
+ image: {
1873
+ registry: reg,
1874
+ repository: IMAGE_REPOSITORIES.certManagerCainjector,
1875
+ },
1876
+ ...coreScheduling,
1877
+ },
1878
+ startupapicheck: {
1879
+ image: {
1880
+ registry: reg,
1881
+ repository: IMAGE_REPOSITORIES.certManagerStartupapicheck,
1882
+ },
1883
+ },
1884
+ acmesolver: {
1885
+ image: {
1886
+ registry: reg,
1887
+ repository: IMAGE_REPOSITORIES.certManagerAcmesolver,
1888
+ },
704
1889
  },
705
1890
  },
706
1891
  // Cluster Issuer for Let's Encrypt
@@ -714,12 +1899,20 @@ export async function generateHelmValues(config, options = {}) {
714
1899
  // =============================================================================
715
1900
  vector: {
716
1901
  enabled: true,
1902
+ // vector's image.repository is the FULL path including host (no separate
1903
+ // registry field), so the reg host is prefixed here.
1904
+ image: {
1905
+ repository: `${reg}/${IMAGE_REPOSITORIES.vector}`,
1906
+ pullSecrets: rulebricksPullSecret,
1907
+ },
717
1908
  role: "Stateless-Aggregator",
718
- replicas: tierConfig.vectorReplicas,
719
- resources: tierConfig.vectorResources,
720
- tolerations: arm64Tolerations,
1909
+ // Replica count and resources fall back to the chart defaults.
1910
+ ...coreScheduling,
721
1911
  serviceAccount: generateVectorServiceAccount(config),
722
1912
  podLabels: generateVectorPodLabels(config),
1913
+ ...(generateVectorExtraContainers(config)
1914
+ ? { extraContainers: generateVectorExtraContainers(config) }
1915
+ : {}),
723
1916
  service: {
724
1917
  enabled: true,
725
1918
  ports: [{ name: "api", port: 8686, protocol: "TCP", targetPort: 8686 }],
@@ -731,90 +1924,262 @@ export async function generateHelmValues(config, options = {}) {
731
1924
  kafka: {
732
1925
  type: "kafka",
733
1926
  bootstrap_servers: "${KAFKA_BOOTSTRAP_SERVERS:-rulebricks-kafka:9092}",
734
- topics: ["logs"],
1927
+ // KAFKA_LOG_TOPIC carries the namespace prefix (e.g. com.rulebricks.logs).
1928
+ topics: ["${KAFKA_LOG_TOPIC:-logs}"],
735
1929
  group_id: "vector-consumers",
736
1930
  auto_offset_reset: "latest",
1931
+ // TLS + SASL driven by env from vector-kafka-env (disabled for
1932
+ // in-cluster Kafka and the kafka-proxy bridge path).
1933
+ tls: { enabled: "${KAFKA_TLS_ENABLED:-false}" },
1934
+ sasl: {
1935
+ enabled: "${KAFKA_SASL_ENABLED:-false}",
1936
+ mechanism: "${KAFKA_SASL_MECHANISM:-PLAIN}",
1937
+ // username/password are only emitted for external Kafka using a
1938
+ // direct PLAIN/SCRAM credential (where vector-kafka-credentials is
1939
+ // populated). Emitting them with an empty default would render as
1940
+ // YAML null and crash Vector at config load; omitting the keys
1941
+ // leaves them unset (valid) whenever SASL is disabled.
1942
+ ...(kafkaUsesDirectSasl(config)
1943
+ ? {
1944
+ username: "${KAFKA_SASL_USERNAME}",
1945
+ password: "${KAFKA_SASL_PASSWORD}",
1946
+ }
1947
+ : {}),
1948
+ },
1949
+ },
1950
+ },
1951
+ transforms: {
1952
+ normalize_logs: {
1953
+ type: "remap",
1954
+ inputs: ["kafka"],
1955
+ source: VECTOR_NORMALIZE_LOGS_VRL,
737
1956
  },
738
1957
  },
739
1958
  sinks: generateVectorSinks(config),
740
1959
  },
741
1960
  },
742
1961
  // =============================================================================
1962
+ // VECTOR AGENT (Application / container logs -> Elasticsearch)
1963
+ // =============================================================================
1964
+ "vector-agent": clickStackEnabled
1965
+ ? { enabled: false }
1966
+ : {
1967
+ ...generateVectorAgent(config, infrastructurePodLabels, operationalDaemonSetTolerations),
1968
+ // Full-path repository (see vector above) + pull secret.
1969
+ image: {
1970
+ repository: `${reg}/${IMAGE_REPOSITORIES.vector}`,
1971
+ pullSecrets: rulebricksPullSecret,
1972
+ },
1973
+ },
1974
+ // =============================================================================
743
1975
  // SUPABASE (Self-hosted Database)
744
1976
  // =============================================================================
745
1977
  supabase: {
746
1978
  enabled: config.database.type === "self-hosted",
747
1979
  ...(config.database.type === "self-hosted"
748
- ? {
749
- secret: {
750
- db: {
751
- username: "postgres",
752
- password: config.database.supabaseDbPassword,
753
- database: "postgres",
1980
+ ? (() => {
1981
+ // External managed Postgres (AWS RDS / Azure Flexible Server): the
1982
+ // self-hosted Supabase services run against it instead of the
1983
+ // bundled in-cluster database.
1984
+ const pgExt = config.externalServices?.postgres?.mode === "external"
1985
+ ? config.externalServices?.postgres?.external
1986
+ : undefined;
1987
+ return {
1988
+ secret: {
1989
+ db: {
1990
+ username: "postgres",
1991
+ // Shared service-role password (authenticator / auth_admin /
1992
+ // replication_admin). With an external DB the bootstrap hook
1993
+ // sets the roles to this same value.
1994
+ password: config.database.supabaseDbPassword,
1995
+ database: pgExt?.database || "postgres",
1996
+ },
1997
+ dashboard: {
1998
+ username: config.database.supabaseDashboardUser || "supabase",
1999
+ password: config.database.supabaseDashboardPass,
2000
+ },
2001
+ jwt: {
2002
+ secret: config.database.supabaseJwtSecret,
2003
+ },
2004
+ // SECRET_KEY_BASE / DB_ENC_KEY, derived from the JWT secret
2005
+ // (stable across redeploys). The chart no longer ships defaults.
2006
+ realtime: deriveRealtimeSecrets(config.database.supabaseJwtSecret || ""),
754
2007
  },
755
- dashboard: {
756
- username: config.database.supabaseDashboardUser || "supabase",
757
- password: config.database.supabaseDashboardPass,
2008
+ ...(pgExt
2009
+ ? {
2010
+ // One switch: enabling externalDatabase disables the bundled
2011
+ // Postgres and runs the bootstrap hook to initialize the
2012
+ // managed instance. db.enabled=false is explicit so chart
2013
+ // schema rules keyed off it hold.
2014
+ db: { enabled: false },
2015
+ externalDatabase: {
2016
+ enabled: true,
2017
+ host: pgExt.host ?? "",
2018
+ port: pgExt.port ?? 5432,
2019
+ bootstrap: {
2020
+ enabled: pgExt.bootstrap?.enabled ?? true,
2021
+ masterUsername: pgExt.bootstrap?.masterUsername ?? "postgres",
2022
+ masterPassword: pgExt.bootstrap?.masterPassword ?? "",
2023
+ appRole: pgExt.bootstrap?.appRole ?? "postgres",
2024
+ },
2025
+ },
2026
+ }
2027
+ : {
2028
+ db: {
2029
+ // Explicit so chart schema rules that key off
2030
+ // supabase.db.enabled (e.g. Database Backup Storage
2031
+ // Validation) hold without relying on subchart-default
2032
+ // coalescing.
2033
+ enabled: true,
2034
+ image: {
2035
+ // Split shape: the supabase.image helper applies
2036
+ // global.imageRegistry to the host. Host never in repository.
2037
+ registry: reg,
2038
+ repository: SUPABASE_POSTGRES_IMAGE_REPOSITORY,
2039
+ tag: SUPABASE_POSTGRES_IMAGE_TAG,
2040
+ pullPolicy: "IfNotPresent",
2041
+ },
2042
+ podLabels: infrastructurePodLabels,
2043
+ // Critical tier: the primary datastore must preempt burst
2044
+ // workers to reschedule; never autoscaler-evicted.
2045
+ // Resources and persistence size fall back to chart
2046
+ // defaults.
2047
+ priorityClassName: criticalPriorityClass,
2048
+ podAnnotations: safeToEvictAnnotations,
2049
+ ...coreScheduling,
2050
+ persistence: {
2051
+ enabled: true,
2052
+ storageClassName: storageClass,
2053
+ },
2054
+ },
2055
+ }),
2056
+ auth: {
2057
+ // Explicit public URLs so GoTrue never falls back to the
2058
+ // in-cluster Kong service name when global.domain propagation
2059
+ // is lost (e.g. after manual patching or partial upgrades).
2060
+ siteUrl: `https://${config.domain}`,
2061
+ externalUrl: `https://supabase.${config.domain}`,
2062
+ ...coreScheduling,
758
2063
  },
759
- jwt: {
760
- secret: config.database.supabaseJwtSecret,
2064
+ rest: {
2065
+ ...coreScheduling,
761
2066
  },
762
- },
763
- db: {
764
- resources: tierConfig.dbResources,
765
- tolerations: arm64Tolerations,
766
- persistence: {
767
- enabled: true,
768
- size: tierConfig.dbPersistenceSize,
769
- storageClassName: storageClass,
2067
+ realtime: {
2068
+ ...coreScheduling,
770
2069
  },
771
- },
772
- auth: {
773
- tolerations: arm64Tolerations,
774
- },
775
- rest: {
776
- tolerations: arm64Tolerations,
777
- },
778
- realtime: {
779
- tolerations: arm64Tolerations,
780
- },
781
- meta: {
782
- tolerations: arm64Tolerations,
783
- },
784
- kong: {
785
- tolerations: arm64Tolerations,
786
- ingress: {
787
- enabled: true,
788
- className: "traefik",
789
- annotations: {},
2070
+ meta: {
2071
+ ...coreScheduling,
790
2072
  },
791
- },
792
- studio: {
793
- tolerations: arm64Tolerations,
794
- },
795
- }
2073
+ kong: {
2074
+ ...coreScheduling,
2075
+ ingress: {
2076
+ enabled: true,
2077
+ className: "traefik",
2078
+ annotations: {},
2079
+ },
2080
+ },
2081
+ studio: {
2082
+ ...coreScheduling,
2083
+ },
2084
+ };
2085
+ })()
796
2086
  : {}),
797
2087
  },
798
2088
  // =============================================================================
799
2089
  // MONITORING
800
2090
  // =============================================================================
801
2091
  monitoring: {
802
- enabled: config.features.monitoring.enabled,
2092
+ enabled: true,
803
2093
  },
804
2094
  "kube-prometheus-stack": {
805
- enabled: config.features.monitoring.enabled,
2095
+ enabled: true,
2096
+ // kube-prometheus-stack honors the parent global.imageRegistry for the host
2097
+ // automatically; the CLI sets the rulebricks/* repository defaults (and the
2098
+ // reg host explicitly) for every sub-image so a bare helm install also pulls
2099
+ // rulebricks/*.
806
2100
  alertmanager: {
807
2101
  enabled: false,
2102
+ alertmanagerSpec: {
2103
+ image: {
2104
+ registry: reg,
2105
+ repository: IMAGE_REPOSITORIES.alertmanager,
2106
+ },
2107
+ },
2108
+ },
2109
+ prometheusOperator: {
2110
+ image: {
2111
+ registry: reg,
2112
+ repository: IMAGE_REPOSITORIES.prometheusOperator,
2113
+ },
2114
+ prometheusConfigReloader: {
2115
+ image: {
2116
+ registry: reg,
2117
+ repository: IMAGE_REPOSITORIES.prometheusConfigReloader,
2118
+ },
2119
+ },
2120
+ admissionWebhooks: {
2121
+ patch: {
2122
+ image: {
2123
+ registry: reg,
2124
+ repository: IMAGE_REPOSITORIES.kubeWebhookCertgen,
2125
+ },
2126
+ },
2127
+ },
2128
+ },
2129
+ "kube-state-metrics": {
2130
+ image: {
2131
+ registry: reg,
2132
+ repository: IMAGE_REPOSITORIES.kubeStateMetrics,
2133
+ },
2134
+ },
2135
+ "prometheus-node-exporter": {
2136
+ image: {
2137
+ registry: reg,
2138
+ repository: IMAGE_REPOSITORIES.nodeExporter,
2139
+ },
808
2140
  },
809
2141
  grafana: {
810
2142
  enabled: useLocalGrafana,
2143
+ image: {
2144
+ registry: reg,
2145
+ repository: IMAGE_REPOSITORIES.grafana,
2146
+ },
2147
+ // Dashboard sidecar imports the provisioned Rulebricks dashboards
2148
+ // (ConfigMaps labeled grafana_dashboard="1") when in-cluster Grafana
2149
+ // is enabled.
2150
+ sidecar: {
2151
+ image: {
2152
+ registry: reg,
2153
+ repository: IMAGE_REPOSITORIES.k8sSidecar,
2154
+ },
2155
+ ...(useLocalGrafana
2156
+ ? {
2157
+ dashboards: {
2158
+ enabled: true,
2159
+ label: "grafana_dashboard",
2160
+ labelValue: "1",
2161
+ searchNamespace: "ALL",
2162
+ folderAnnotation: "grafana_folder",
2163
+ provider: { foldersFromFilesStructure: true },
2164
+ },
2165
+ }
2166
+ : {}),
2167
+ },
811
2168
  },
812
2169
  prometheus: {
813
- enabled: config.features.monitoring.enabled,
2170
+ enabled: true,
814
2171
  serviceAccount: generatePrometheusServiceAccount(config),
815
2172
  prometheusSpec: {
816
2173
  retention: "30d",
2174
+ image: {
2175
+ registry: reg,
2176
+ repository: IMAGE_REPOSITORIES.prometheus,
2177
+ },
817
2178
  podMetadata: generatePrometheusPodMetadata(config),
2179
+ serviceMonitorSelectorNilUsesHelmValues: false,
2180
+ serviceMonitorSelector: {},
2181
+ podMonitorSelectorNilUsesHelmValues: false,
2182
+ podMonitorSelector: {},
818
2183
  storageSpec: {
819
2184
  volumeClaimTemplate: {
820
2185
  spec: {
@@ -828,7 +2193,9 @@ export async function generateHelmValues(config, options = {}) {
828
2193
  },
829
2194
  },
830
2195
  },
831
- remoteWrite: generateRemoteWriteSpec(config),
2196
+ remoteWrite: [
2197
+ ...(clickStackEnabled ? [] : generateRemoteWriteSpec(config)),
2198
+ ],
832
2199
  },
833
2200
  },
834
2201
  },
@@ -836,20 +2203,21 @@ export async function generateHelmValues(config, options = {}) {
836
2203
  // STORAGE CLASS
837
2204
  // =============================================================================
838
2205
  storageClass: {
839
- create: true,
2206
+ create: false,
840
2207
  name: storageClass,
841
- provisioner: config.infrastructure.provider === "aws"
842
- ? "ebs.csi.aws.com"
843
- : config.infrastructure.provider === "gcp"
844
- ? "pd.csi.storage.gke.io"
845
- : config.infrastructure.provider === "azure"
846
- ? "disk.csi.azure.com"
847
- : "ebs.csi.aws.com",
2208
+ provisioner: config.infrastructure.storageProvisioner ||
2209
+ (config.infrastructure.provider === "aws"
2210
+ ? "ebs.csi.aws.com"
2211
+ : config.infrastructure.provider === "gcp"
2212
+ ? "pd.csi.storage.gke.io"
2213
+ : config.infrastructure.provider === "azure"
2214
+ ? "disk.csi.azure.com"
2215
+ : "ebs.csi.aws.com"),
848
2216
  // Parameters for the StorageClass - must include type for disk provisioning
849
2217
  parameters: config.infrastructure.provider === "aws"
850
2218
  ? { type: "gp3" }
851
2219
  : config.infrastructure.provider === "gcp"
852
- ? { type: "hyperdisk-balanced" }
2220
+ ? { type: gcpDiskType }
853
2221
  : config.infrastructure.provider === "azure"
854
2222
  ? { skuName: "Premium_LRS" }
855
2223
  : { type: "gp3" },
@@ -864,7 +2232,13 @@ export async function generateHelmValues(config, options = {}) {
864
2232
  "external-dns": externalDnsEnabled
865
2233
  ? {
866
2234
  enabled: true,
867
- provider: getExternalDnsProvider(config.dns.provider),
2235
+ // external-dns has NO image.registry field: image.repository is the
2236
+ // FULL path including host (reg prefix + rulebricks/external-dns).
2237
+ image: {
2238
+ repository: `${reg}/${IMAGE_REPOSITORIES.externalDns}`,
2239
+ },
2240
+ // external-dns 1.21+ idiom: provider is an object ({name: ...}).
2241
+ provider: { name: getExternalDnsProvider(config.dns.provider) },
868
2242
  domainFilters: [config.domain],
869
2243
  sources: ["ingress", "service"],
870
2244
  policy: "upsert-only",
@@ -873,6 +2247,105 @@ export async function generateHelmValues(config, options = {}) {
873
2247
  enabled: false,
874
2248
  },
875
2249
  };
2250
+ // In k8s secret mode, the CLI creates Kubernetes Secrets and the chart reads
2251
+ // them by reference. Point the chart's secretRef seams at those Secrets and
2252
+ // strip every plaintext secret out of the generated values.
2253
+ if (secretMode === "k8s") {
2254
+ return redactSecretsToRefs(values, config);
2255
+ }
2256
+ return values;
2257
+ }
2258
/**
 * Rewrites generated Helm values for k8s secret mode.
 *
 * Points the chart's `secretRef` fields at the Secret names returned by
 * `deploymentSecretNames(config)` and removes the inline secret fields handled
 * below (SMTP credentials, Supabase keys, OpenAI API key, SSO client
 * credentials, license key, Supabase subchart secret blocks and the external
 * database bootstrap master credentials). Non-secret settings in the same
 * sections are left untouched.
 *
 * @param {object} values - Generated Helm values. Mutated in place.
 * @param {object} config - Deployment configuration used to resolve Secret
 *   names and to detect a self-hosted deployment with an external Postgres.
 * @returns {object} The same `values` object, with `global` and `supabase`
 *   always assigned (as empty objects when they were absent).
 */
export function redactSecretsToRefs(values, config) {
    const names = deploymentSecretNames(config);
    // Not named `global`, to avoid shadowing Node's global object.
    const globalValues = values.global ?? {};
    const supabaseValues = values.supabase ?? {};
    // Truthy only for a self-hosted database backed by an external Postgres
    // whose `external` settings are present.
    const externalPg = config.database.type === "self-hosted" &&
        config.externalServices?.postgres?.mode === "external"
        ? config.externalServices.postgres.external
        : undefined;
    // Returns a fresh object on every call so the two places that use this
    // mapping never share a reference (shared references can be emitted as
    // anchors/aliases by YAML serializers).
    const externalDbSecretKeys = () => ({
        host: "host",
        port: "port",
        username: "username",
        password: "password",
        database: "database",
    });

    // App-level consolidated secret: one secretRef supplies every app cred.
    globalValues.secrets = { ...(globalValues.secrets ?? {}), secretRef: names.app };

    // Strip inline app/global secrets (non-secret config like host/from/url stays).
    if (globalValues.smtp) {
        delete globalValues.smtp.user;
        delete globalValues.smtp.pass;
    }
    if (globalValues.supabase) {
        delete globalValues.supabase.jwtSecret;
        delete globalValues.supabase.anonKey;
        delete globalValues.supabase.serviceKey;
        delete globalValues.supabase.accessToken;
    }
    if (globalValues.ai) {
        delete globalValues.ai.openaiApiKey;
    }
    if (globalValues.sso) {
        delete globalValues.sso.clientId;
        delete globalValues.sso.clientSecret;
    }
    delete globalValues.licenseKey;

    // Supabase subchart: replace each inline secret block with a secretRef.
    if (supabaseValues.secret) {
        const dbSecret = { secretRef: names.db };
        if (externalPg) {
            dbSecret.secretRefKey = externalDbSecretKeys();
        }
        supabaseValues.secret = {
            db: dbSecret,
            jwt: { secretRef: names.jwt },
            dashboard: { secretRef: names.dashboard },
            realtime: { secretRef: names.realtime },
            // Supabase auth (GoTrue) SMTP — only when SMTP creds are configured;
            // otherwise the global.smtp we just stripped would leave it empty.
            ...(config.smtp?.user || config.smtp?.pass
                ? { smtp: { secretRef: names.smtp } }
                : {}),
        };
    }

    if (externalPg && supabaseValues.externalDatabase) {
        // Master credentials move into the hook Secret in k8s mode. Delete the
        // keys outright (like every other secret above) instead of leaving
        // them behind as own properties whose value is `undefined`.
        const bootstrap = {
            ...(supabaseValues.externalDatabase.bootstrap ?? {}),
            secretRef: names.dbBootstrap,
        };
        delete bootstrap.masterUsername;
        delete bootstrap.masterPassword;

        supabaseValues.externalDatabase = {
            ...supabaseValues.externalDatabase,
            // New charts read host/port/user/pass/db from this single Secret. The
            // existing externalDatabase.host/port are kept (via the spread above)
            // for older charts that do not yet support host/port secret keys.
            secretRef: names.db,
            secretRefKey: externalDbSecretKeys(),
            bootstrap,
        };
    }

    values.global = globalValues;
    values.supabase = supabaseValues;
    return values;
}
2342
/**
 * Generates Helm values from the deployment configuration and saves them.
 *
 * @param {object} config - Deployment configuration; `config.name` is passed
 *   to `saveHelmValues` together with the generated values.
 * @param {object} [options={}] - Forwarded unchanged to `buildHelmValues`.
 * @returns {Promise<void>} Settles after `saveHelmValues` has been awaited.
 */
export async function generateHelmValues(config, options = {}) {
    const helmValues = buildHelmValues(config, options);

    // Last-line guardrail: validation runs before anything is saved, so values
    // the chart would reject are never written or deployed.
    assertValidHelmValues(helmValues);

    await saveHelmValues(config.name, helmValues);
}
878
2351
  /**