@rulebricks/cli 2.1.6 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. package/README.md +75 -14
  2. package/cluster-setup/aws/README.md +123 -0
  3. package/cluster-setup/aws/check-aws-access.sh +242 -0
  4. package/cluster-setup/aws/parameters.json +13 -0
  5. package/cluster-setup/aws/rulebricks-cluster.cfn.yaml +355 -0
  6. package/cluster-setup/azure/README.md +141 -0
  7. package/cluster-setup/azure/check-aks-prereqs.sh +276 -0
  8. package/cluster-setup/azure/parameters.json +30 -0
  9. package/cluster-setup/azure/rulebricks-cluster.bicep +546 -0
  10. package/cluster-setup/gcp/README.md +189 -0
  11. package/cluster-setup/gcp/check-gke-prereqs.sh +260 -0
  12. package/dist/commands/backup.d.ts +5 -0
  13. package/dist/commands/backup.js +104 -0
  14. package/dist/commands/deploy.d.ts +3 -1
  15. package/dist/commands/deploy.js +226 -326
  16. package/dist/commands/destroy.d.ts +1 -1
  17. package/dist/commands/destroy.js +73 -123
  18. package/dist/commands/init.d.ts +5 -1
  19. package/dist/commands/init.js +78 -47
  20. package/dist/commands/list.d.ts +1 -0
  21. package/dist/commands/list.js +74 -0
  22. package/dist/commands/open.d.ts +1 -1
  23. package/dist/commands/open.js +4 -12
  24. package/dist/commands/redeploy.d.ts +6 -0
  25. package/dist/commands/redeploy.js +310 -0
  26. package/dist/commands/restore.d.ts +5 -0
  27. package/dist/commands/restore.js +338 -0
  28. package/dist/commands/status.js +62 -49
  29. package/dist/commands/upgrade.js +74 -51
  30. package/dist/components/DNSWaitScreen.d.ts +5 -1
  31. package/dist/components/DNSWaitScreen.js +47 -41
  32. package/dist/components/Wizard/WizardContext.d.ts +174 -29
  33. package/dist/components/Wizard/WizardContext.js +896 -91
  34. package/dist/components/Wizard/steps/CloudProviderStep.js +192 -102
  35. package/dist/components/Wizard/steps/DomainStep.js +5 -24
  36. package/dist/components/Wizard/steps/ExternalServicesStep.d.ts +6 -0
  37. package/dist/components/Wizard/steps/ExternalServicesStep.js +645 -0
  38. package/dist/components/Wizard/steps/FeatureConfigStep.d.ts +2 -1
  39. package/dist/components/Wizard/steps/FeatureConfigStep.js +959 -248
  40. package/dist/components/Wizard/steps/FeaturesStep.js +31 -35
  41. package/dist/components/Wizard/steps/ObservabilityStep.d.ts +6 -0
  42. package/dist/components/Wizard/steps/ObservabilityStep.js +137 -0
  43. package/dist/components/Wizard/steps/ReviewStep.d.ts +2 -1
  44. package/dist/components/Wizard/steps/ReviewStep.js +56 -7
  45. package/dist/components/Wizard/steps/StorageStep.d.ts +9 -0
  46. package/dist/components/Wizard/steps/StorageStep.js +592 -0
  47. package/dist/components/Wizard/steps/SupabaseCredentialsStep.js +20 -21
  48. package/dist/components/Wizard/steps/VersionStep.js +45 -23
  49. package/dist/components/Wizard/steps/index.d.ts +3 -3
  50. package/dist/components/Wizard/steps/index.js +3 -3
  51. package/dist/components/common/CommandApproval.d.ts +12 -0
  52. package/dist/components/common/CommandApproval.js +91 -0
  53. package/dist/components/common/DeploymentPicker.d.ts +14 -0
  54. package/dist/components/common/DeploymentPicker.js +16 -0
  55. package/dist/components/common/index.d.ts +2 -0
  56. package/dist/components/common/index.js +2 -0
  57. package/dist/index.js +94 -62
  58. package/dist/lib/cloudCli.d.ts +134 -63
  59. package/dist/lib/cloudCli.js +512 -220
  60. package/dist/lib/clusterSetupDefaults.d.ts +30 -0
  61. package/dist/lib/clusterSetupDefaults.js +64 -0
  62. package/dist/lib/commandApproval.d.ts +26 -0
  63. package/dist/lib/commandApproval.js +114 -0
  64. package/dist/lib/config.d.ts +12 -10
  65. package/dist/lib/config.js +91 -33
  66. package/dist/lib/configFixtures.d.ts +5 -0
  67. package/dist/lib/configFixtures.js +513 -0
  68. package/dist/lib/deploymentHealth.d.ts +32 -0
  69. package/dist/lib/deploymentHealth.js +157 -0
  70. package/dist/lib/dns.d.ts +1 -1
  71. package/dist/lib/dns.js +19 -1
  72. package/dist/lib/dns.test.d.ts +1 -0
  73. package/dist/lib/dns.test.js +27 -0
  74. package/dist/lib/dockerHub.d.ts +12 -1
  75. package/dist/lib/dockerHub.js +18 -8
  76. package/dist/lib/helm.d.ts +4 -0
  77. package/dist/lib/helm.js +16 -0
  78. package/dist/lib/helmValues.d.ts +25 -0
  79. package/dist/lib/helmValues.js +1937 -259
  80. package/dist/lib/helmValues.test.d.ts +1 -0
  81. package/dist/lib/helmValues.test.js +966 -0
  82. package/dist/lib/htpasswd.d.ts +1 -0
  83. package/dist/lib/htpasswd.js +15 -0
  84. package/dist/lib/kubernetes.d.ts +126 -13
  85. package/dist/lib/kubernetes.js +624 -134
  86. package/dist/lib/secrets.d.ts +23 -0
  87. package/dist/lib/secrets.js +158 -0
  88. package/dist/lib/validateValues.d.ts +31 -0
  89. package/dist/lib/validateValues.js +253 -0
  90. package/dist/lib/versions.d.ts +82 -11
  91. package/dist/lib/versions.js +131 -31
  92. package/dist/lib/versions.test.d.ts +1 -0
  93. package/dist/lib/versions.test.js +81 -0
  94. package/dist/lib/wizardSteps.d.ts +14 -0
  95. package/dist/lib/wizardSteps.js +23 -0
  96. package/dist/lib/workloadIdentity.d.ts +26 -0
  97. package/dist/lib/workloadIdentity.js +323 -0
  98. package/dist/lib/workloadIdentity.test.d.ts +1 -0
  99. package/dist/lib/workloadIdentity.test.js +57 -0
  100. package/dist/types/index.d.ts +2152 -95
  101. package/dist/types/index.js +554 -286
  102. package/package.json +10 -4
  103. package/schema/values.schema.json +1934 -0
  104. package/dist/components/Wizard/steps/CredentialsStep.d.ts +0 -6
  105. package/dist/components/Wizard/steps/CredentialsStep.js +0 -22
  106. package/dist/components/Wizard/steps/DeploymentModeStep.d.ts +0 -5
  107. package/dist/components/Wizard/steps/DeploymentModeStep.js +0 -26
  108. package/dist/components/Wizard/steps/TierStep.d.ts +0 -6
  109. package/dist/components/Wizard/steps/TierStep.js +0 -29
  110. package/dist/lib/terraform.d.ts +0 -66
  111. package/dist/lib/terraform.js +0 -754
  112. package/terraform/aws/main.tf +0 -355
  113. package/terraform/azure/main.tf +0 -371
  114. package/terraform/gcp/main.tf +0 -407
@@ -1,7 +1,96 @@
1
- import { TIER_CONFIGS, isSupportedDnsProvider, getLoggingDestinationLabel, } from "../types/index.js";
1
+ import { getReleaseName, isSupportedDnsProvider, validateRemoteWriteConfig, } from "../types/index.js";
2
2
  import { saveHelmValues, getHelmValuesPath } from "./config.js";
3
+ import { assertValidHelmValues } from "./validateValues.js";
4
+ import { SUPABASE_POSTGRES_IMAGE_REPOSITORY, SUPABASE_POSTGRES_IMAGE_TAG, DEFAULT_IMAGE_REGISTRY, IMAGE_REPOSITORIES, IMAGE_DIGESTS, KAFKA_PROXY_IMAGE, } from "./versions.js";
5
+ import { createHmac } from "crypto";
3
6
  import fs from "fs/promises";
4
7
  import YAML from "yaml";
8
/**
 * Names of the Kubernetes Secrets the CLI creates in k8s secret mode.
 *
 * Shared by the value generator (which sets the secretRef fields) and
 * src/lib/secrets.ts (which creates the Secrets) so they always agree.
 *
 * @param {{ name: string }} config - Deployment config; `name` prefixes every Secret name.
 * @returns {{ app: string, db: string, dbBootstrap: string, jwt: string,
 *   dashboard: string, realtime: string, smtp: string }} Secret names keyed by purpose.
 */
export function deploymentSecretNames(config) {
    const base = config.name;
    return {
        app: `${base}-app-secrets`,
        db: `${base}-supabase-db`,
        dbBootstrap: `${base}-supabase-db-bootstrap`,
        jwt: `${base}-supabase-jwt`,
        dashboard: `${base}-supabase-dashboard`,
        realtime: `${base}-supabase-realtime`,
        smtp: `${base}-supabase-smtp`,
    };
}
23
// Baseline Kafka topic partitioning. These are NOT user-tunable sizing knobs
// (tiers were removed); they are a structural contract that must stay
// consistent across three places at once: the kafka.provisioning topic
// partitions, rulebricks.hps.workers.solutionPartitions (the worker-fleet
// concurrency ceiling the chart cross-checks), and the worker KEDA
// maxReplicaCount (validated to be <= solutionPartitions). They mirror the Helm
// chart's own defaults, so operators who need a different size tune the chart
// values directly. Partitions can never be decreased, so solution is sized with
// generous headroom up front; idle partitions are effectively free.
const SOLUTION_TOPIC_PARTITIONS = 128;
const LOGS_TOPIC_PARTITIONS = 24;

// RPC + log topics: replication factor 1. RPC traffic is transient and
// latency-sensitive (the HPS producer's acks=-1 would otherwise wait on full
// ISR replication); the in-cluster broker is single-replica by default.
const TOPIC_REPLICATION_FACTOR = 1;

// global.version must be empty or a semantic version per the chart schema. The
// CLI normally pins a real version, but migrated/legacy configs can carry
// "latest"; emitting that would fail chart validation, so we omit it instead
// and let the chart fall back to its default.
const SEMVER_PATTERN = /^\d+\.\d+\.\d+(-[0-9A-Za-z.-]+)?$/;

// Healthy defaults for the decision-log archive that ClickHouse reads:
// flush a gzipped NDJSON file at ~64 MiB (uncompressed) or after 5 minutes,
// whichever comes first. Users can override these in their Helm values.
//
// max_bytes MUST stay well below the Vector pod's memory limit
// (vector.resources.limits.memory in the chart): the object-storage sink buffers
// the whole uncompressed batch in memory before it flushes, so a batch sized at
// or above the pod limit gets OOMKilled before it can ever write a blob - which
// silently disables decision-log export entirely. 64 MiB leaves comfortable
// headroom under the chart's 1 GiB Vector limit while still producing large,
// scan-efficient files for ClickHouse.
const DECISION_LOG_BATCH = { max_bytes: 67108864, timeout_secs: 300 };

// Fixed `iat` / `exp` claims (Unix seconds) for the derived Supabase API keys:
// 1641769200 is 2022-01-09T23:00:00Z and 4102444800 is 2100-01-01T00:00:00Z.
// Using constants rather than the current time keeps the signed tokens
// byte-identical for a given secret.
const SUPABASE_JWT_ISSUED_AT = 1641769200;
const SUPABASE_JWT_EXPIRES_AT = 4102444800;

// VRL that normalizes the Kafka decision-log envelope into the ClickHouse column
// types. Inlined as a real multi-line string (not a chart `{{ include }}`) so
// that YAML.stringify / Helm's toYaml emit it as a block scalar. A templated
// single-line include gets rendered into a single-quoted YAML scalar, whose
// newlines YAML folds into spaces - collapsing the statements onto one line and
// breaking VRL parsing. Keep in sync with rulebricks.vector.normalizeLogs.
const VECTOR_NORMALIZE_LOGS_VRL = [
    "parsed, err = parse_json(string!(.message))",
    "if err == null {",
    " . = parsed",
    "}",
    '.timestamp = parse_timestamp!(to_string(.timestamp) ?? to_string(now()), format: "%+")',
    '.api_key = to_string(.api_key) ?? ""',
    ".user_id = to_string(.user_id) ?? null",
    ".environment = to_string(.environment) ?? null",
    ".ip = to_string(.ip) ?? null",
    ".method = to_string(.method) ?? null",
    '.url = to_string(.url) ?? ""',
    ".status = to_int(.status) ?? 0",
    ".rule_name = to_string(.rule_name) ?? null",
    ".rule_id = to_string(.rule_id) ?? null",
    ".rule_slug = to_string(.rule_slug) ?? null",
    ".rule_version = to_string(.rule_version) ?? null",
    ".operation = to_string(.operation) ?? null",
    '.level = to_string(.level) ?? "info"',
    ".error = to_string(.error) ?? null",
    ".trace_id = to_string(.trace_id) ?? null",
    ".span_id = to_string(.span_id) ?? null",
    '.request = to_string(.request) ?? "null"',
    '.response = to_string(.response) ?? "null"',
    '.decision = to_string(.decision) ?? "{}"',
    '.params = to_string(.params) ?? "{}"',
].join("\n");
90
/**
 * Builds the object-storage key prefix under which decision logs are written.
 *
 * The configured path (`config.storage.paths.decisionLogs`) has leading and
 * trailing slashes stripped, then a `year=/month=/day=/hour=` suffix made of
 * strftime-style placeholders is appended.
 *
 * A missing or empty path falls back to "decision-logs". A path made only of
 * slashes (e.g. "/") also falls back: after stripping it would be empty and
 * the prefix would otherwise begin with "/".
 *
 * @param {object} config - Deployment config; only `storage.paths.decisionLogs` is read.
 * @returns {string} Key prefix ending in "/".
 */
function decisionLogPathPrefix(config) {
    const fallback = "decision-logs";
    const configured = config.storage?.paths?.decisionLogs || fallback;
    const trimmed = configured.replace(/^\/+|\/+$/g, "") || fallback;
    return `${trimmed}/year=%Y/month=%m/day=%d/hour=%H/`;
}
5
94
  /**
6
95
  * Generates Vector sink configuration based on logging settings
7
96
  */
@@ -10,62 +99,85 @@ function generateVectorSinks(config) {
10
99
  // Console sink is always enabled
11
100
  console: {
12
101
  type: "console",
13
- inputs: ["kafka"],
102
+ inputs: ["normalize_logs"],
14
103
  encoding: {
15
104
  codec: "json",
16
105
  },
17
106
  },
18
107
  };
19
- // Add external sink if configured
20
- if (config.features.logging.sink !== "console" &&
21
- config.features.logging.sink !== "pending") {
22
- const { sink, bucket, region } = config.features.logging;
23
- switch (sink) {
24
- // Cloud Storage sinks
108
+ if (config.storage) {
109
+ const storage = config.storage;
110
+ switch (config.storage.provider) {
25
111
  case "s3":
26
- sinks.s3 = {
112
+ sinks.decision_logs = {
27
113
  type: "aws_s3",
28
- inputs: ["kafka"],
29
- bucket: bucket,
30
- region: region,
31
- key_prefix: "rulebricks/logs/%Y/%m/%d/",
114
+ inputs: ["normalize_logs"],
115
+ bucket: storage.bucket,
116
+ region: storage.region,
117
+ key_prefix: decisionLogPathPrefix(config),
118
+ filename_extension: "ndjson",
32
119
  compression: "gzip",
33
- encoding: {
34
- codec: "json",
35
- },
120
+ encoding: { codec: "json" },
121
+ framing: { method: "newline_delimited" },
122
+ batch: { ...DECISION_LOG_BATCH },
36
123
  };
37
124
  break;
38
- case "azure-blob":
39
- sinks.azure_blob = {
125
+ case "azure-blob": {
126
+ const sink = {
40
127
  type: "azure_blob",
41
- inputs: ["kafka"],
42
- container_name: bucket,
43
- storage_account: "rulebrickslogs", // Will be configured via env var
44
- blob_prefix: "rulebricks/logs/%Y/%m/%d/",
128
+ inputs: ["normalize_logs"],
129
+ account_name: storage.bucket,
130
+ container_name: storage.azureBlobContainer || "rulebricks",
131
+ blob_prefix: decisionLogPathPrefix(config),
132
+ // azure_blob has no filename_extension (unlike aws_s3/gcs); it always
133
+ // writes ".log" (".log.gz" when compressed). ClickHouse globs on *.gz.
45
134
  compression: "gzip",
46
- encoding: {
47
- codec: "json",
48
- },
135
+ encoding: { codec: "json" },
136
+ framing: { method: "newline_delimited" },
137
+ batch: { ...DECISION_LOG_BATCH },
49
138
  };
139
+ if (config.storage.cloudAuthMode === "secret") {
140
+ sink.connection_string = "${AZURE_STORAGE_CONNECTION_STRING}";
141
+ }
142
+ else {
143
+ sink.auth = {
144
+ azure_credential_kind: "workload_identity",
145
+ client_id: config.storage.azureBlobClientId,
146
+ tenant_id: config.storage.azureBlobTenantId,
147
+ token_file_path: "/var/run/secrets/azure/tokens/azure-identity-token",
148
+ };
149
+ }
150
+ sinks.decision_logs = sink;
50
151
  break;
152
+ }
51
153
  case "gcs":
52
- sinks.gcs = {
154
+ sinks.decision_logs = {
53
155
  type: "gcp_cloud_storage",
54
- inputs: ["kafka"],
55
- bucket: bucket,
56
- key_prefix: "rulebricks/logs/%Y/%m/%d/",
156
+ inputs: ["normalize_logs"],
157
+ bucket: storage.bucket,
158
+ key_prefix: decisionLogPathPrefix(config),
159
+ filename_extension: "ndjson",
57
160
  compression: "gzip",
58
- encoding: {
59
- codec: "json",
60
- },
161
+ encoding: { codec: "json" },
162
+ framing: { method: "newline_delimited" },
163
+ batch: { ...DECISION_LOG_BATCH },
61
164
  };
62
165
  break;
166
+ }
167
+ }
168
+ // Add external logging-platform sink if configured. Decision logs always go
169
+ // to object storage via the decision_logs sink above; this is an additional
170
+ // platform destination (Datadog, Splunk, etc.).
171
+ if (config.features.logging.sink !== "console" &&
172
+ config.features.logging.sink !== "pending") {
173
+ const { sink, bucket, region } = config.features.logging;
174
+ switch (sink) {
63
175
  // Logging platform sinks
64
176
  // For platforms, bucket is repurposed for API key/token, region for site/URL
65
177
  case "datadog":
66
178
  sinks.datadog = {
67
179
  type: "datadog_logs",
68
- inputs: ["kafka"],
180
+ inputs: ["normalize_logs"],
69
181
  default_api_key: bucket, // API key stored in bucket field
70
182
  site: region || "datadoghq.com", // Site stored in region field
71
183
  compression: "gzip",
@@ -77,7 +189,7 @@ function generateVectorSinks(config) {
77
189
  case "splunk":
78
190
  sinks.splunk = {
79
191
  type: "splunk_hec_logs",
80
- inputs: ["kafka"],
192
+ inputs: ["normalize_logs"],
81
193
  endpoint: region, // URL stored in region field
82
194
  default_token: bucket, // HEC token stored in bucket field
83
195
  compression: "gzip",
@@ -92,7 +204,7 @@ function generateVectorSinks(config) {
92
204
  const esConfig = JSON.parse(bucket || "{}");
93
205
  sinks.elasticsearch = {
94
206
  type: "elasticsearch",
95
- inputs: ["kafka"],
207
+ inputs: ["normalize_logs"],
96
208
  endpoints: [esConfig.url],
97
209
  bulk: {
98
210
  index: esConfig.index || "rulebricks-logs",
@@ -112,7 +224,7 @@ function generateVectorSinks(config) {
112
224
  // Fallback if JSON parsing fails
113
225
  sinks.elasticsearch = {
114
226
  type: "elasticsearch",
115
- inputs: ["kafka"],
227
+ inputs: ["normalize_logs"],
116
228
  endpoints: [bucket],
117
229
  bulk: {
118
230
  index: region || "rulebricks-logs",
@@ -123,7 +235,7 @@ function generateVectorSinks(config) {
123
235
  case "loki":
124
236
  sinks.loki = {
125
237
  type: "loki",
126
- inputs: ["kafka"],
238
+ inputs: ["normalize_logs"],
127
239
  endpoint: bucket, // Loki URL stored in bucket field
128
240
  labels: {
129
241
  app: "rulebricks",
@@ -137,7 +249,7 @@ function generateVectorSinks(config) {
137
249
  case "newrelic":
138
250
  sinks.newrelic = {
139
251
  type: "new_relic",
140
- inputs: ["kafka"],
252
+ inputs: ["normalize_logs"],
141
253
  license_key: bucket, // License key stored in bucket field
142
254
  account_id: region, // Account ID stored in region field
143
255
  api: "logs",
@@ -150,7 +262,7 @@ function generateVectorSinks(config) {
150
262
  case "axiom":
151
263
  sinks.axiom = {
152
264
  type: "axiom",
153
- inputs: ["kafka"],
265
+ inputs: ["normalize_logs"],
154
266
  token: bucket, // API token stored in bucket field
155
267
  dataset: region || "rulebricks", // Dataset stored in region field
156
268
  compression: "gzip",
@@ -163,6 +275,75 @@ function generateVectorSinks(config) {
163
275
  }
164
276
  return sinks;
165
277
  }
278
/**
 * Builds the container `env` list for Vector.
 *
 * The list always contains the Kafka connection settings (read from the
 * "vector-kafka-env" ConfigMap) followed by the optional SASL credentials
 * (read from the "vector-kafka-credentials" Secret). When storage is Azure
 * Blob in "secret" auth mode and a connection-string secret ref is configured,
 * AZURE_STORAGE_CONNECTION_STRING is appended as well.
 *
 * @param {object} config - Deployment config; only `storage` is read.
 * @returns {Array<{ name: string, valueFrom: object }>} Env var entries.
 */
function generateVectorEnv(config) {
    // Kafka connection settings come from the templated vector-kafka-env ConfigMap
    // so the in-cluster vs external (and bridge) decision lives in one place.
    const configMapKeys = [
        "KAFKA_BOOTSTRAP_SERVERS",
        "KAFKA_TLS_ENABLED",
        "KAFKA_SASL_ENABLED",
        "KAFKA_SASL_MECHANISM",
        "KAFKA_LOG_TOPIC",
    ];
    const env = configMapKeys.map((key) => ({
        name: key,
        valueFrom: { configMapKeyRef: { name: "vector-kafka-env", key } },
    }));

    // SASL credentials (inline PLAIN/SCRAM). Optional so in-cluster/token-auth
    // deploys work without the secret existing.
    for (const key of ["KAFKA_SASL_USERNAME", "KAFKA_SASL_PASSWORD"]) {
        env.push({
            name: key,
            valueFrom: {
                secretKeyRef: { name: "vector-kafka-credentials", key, optional: true },
            },
        });
    }

    const storage = config.storage;
    const azureBlobSecretRef = storage?.azureBlobConnectionStringSecretRef;
    const usesAzureConnectionString =
        storage?.provider === "azure-blob" &&
        storage.cloudAuthMode === "secret" &&
        azureBlobSecretRef;
    if (usesAzureConnectionString) {
        env.push({
            name: "AZURE_STORAGE_CONNECTION_STRING",
            valueFrom: {
                secretKeyRef: secretKeySelector(azureBlobSecretRef),
            },
        });
    }
    return env;
}
315
/**
 * Builds the ServiceAccount values for Vector.
 *
 * AWS uses EKS Pod Identity: NO eks.amazonaws.com/role-arn annotation - the
 * CLI's workload-identity step creates a namespace-scoped association for this
 * SA (to a role granting both the object-storage and MSK access Vector needs).
 * Azure/GCP still annotate the SA, which is how their workload identity binds.
 *
 * @param {object} config - Deployment config; only `storage` is read.
 * @returns {{ create: boolean, name: string, annotations: Object<string, string> }}
 */
function generateVectorServiceAccount(config) {
    const storage = config.storage;
    const annotations = {};

    // Azure Blob: annotate only when NOT using a connection-string secret.
    if (storage?.provider === "azure-blob" &&
        storage.cloudAuthMode !== "secret" &&
        storage.azureBlobClientId) {
        annotations["azure.workload.identity/client-id"] = storage.azureBlobClientId;
    }

    // GCS: bind to the configured GCP service account when one is set.
    if (storage?.provider === "gcs" && storage.gcpServiceAccountEmail) {
        annotations["iam.gke.io/gcp-service-account"] = storage.gcpServiceAccountEmail;
    }

    return {
        create: true,
        name: "vector",
        annotations,
    };
}
337
/**
 * Builds the pod labels for Vector.
 *
 * Every pod gets the "infrastructure" workload-group label. When storage is
 * Azure Blob and the auth mode is anything other than "secret", the
 * `azure.workload.identity/use` label is added as well.
 *
 * @param {object} config - Deployment config; only `storage` is read.
 * @returns {Object<string, string>} Pod labels.
 */
function generateVectorPodLabels(config) {
    const labels = {
        "rulebricks.com/workload-group": "infrastructure",
    };
    const storage = config.storage;
    if (storage?.provider === "azure-blob" && storage.cloudAuthMode !== "secret") {
        labels["azure.workload.identity/use"] = "true";
    }
    return labels;
}
166
347
  /**
167
348
  * Maps DNS provider to external-dns provider name
168
349
  */
@@ -175,57 +356,1000 @@ function getExternalDnsProvider(dnsProvider) {
175
356
  };
176
357
  return mapping[dnsProvider] || "aws";
177
358
  }
359
/**
 * Reduces a secret reference to the `{ name, key }` pair.
 *
 * Any other properties on `ref` are dropped.
 *
 * @param {{ name: string, key: string }} ref - Secret reference.
 * @returns {{ name: string, key: string }} Selector with only `name` and `key`.
 */
function secretKeySelector(ref) {
    const { name, key } = ref;
    return { name, key };
}
365
/**
 * Serializes a value to JSON and encodes it as unpadded base64url.
 *
 * @param {*} value - Any JSON-serializable value.
 * @returns {string} base64url encoding of `JSON.stringify(value)`.
 */
function base64UrlJson(value) {
    const json = JSON.stringify(value);
    return Buffer.from(json).toString("base64url");
}
368
/**
 * Signs a Supabase API key for the given role.
 *
 * Self-hosted Supabase derives the anon and service_role API keys from the JWT
 * secret: each is an HS256 JWT (role: anon / service_role) signed with the secret.
 * https://supabase.com/docs/guides/self-hosting/self-hosted-auth-keys
 *
 * The `iat` and `exp` claims come from fixed module constants, so the same
 * role and secret always produce the same token.
 *
 * @param {string} role - Value of the `role` claim (e.g. "anon", "service_role").
 * @param {string} secret - JWT secret used as the HMAC-SHA256 key.
 * @returns {string} Compact JWT: `header.payload.signature`, all base64url.
 */
export function signSupabaseJwt(role, secret) {
    const header = base64UrlJson({ alg: "HS256", typ: "JWT" });
    const payload = base64UrlJson({
        role,
        iss: "supabase",
        iat: SUPABASE_JWT_ISSUED_AT,
        exp: SUPABASE_JWT_EXPIRES_AT,
    });
    // The signature covers the exact "header.payload" string.
    const body = `${header}.${payload}`;
    const signature = createHmac("sha256", secret).update(body).digest("base64url");
    return `${body}.${signature}`;
}
383
/**
 * Derives the two Realtime secrets from the JWT secret.
 *
 * Realtime needs SECRET_KEY_BASE (signs/encrypts its tokens) and a 16-byte
 * DB_ENC_KEY (encrypts tenant DB creds). Derive both deterministically from the
 * JWT secret so they are stable across redeploys with no extra state to persist,
 * and anchored to the one root secret the operator already manages.
 *
 * Each value is an HMAC-SHA256 of a fixed label keyed by `jwtSecret`, hex
 * encoded; the two labels differ so the two outputs are independent.
 *
 * @param {string} jwtSecret - Root secret used as the HMAC key.
 * @returns {{ secretKeyBase: string, dbEncKey: string }} `secretKeyBase` is the
 *   full 64-character hex digest; `dbEncKey` is the first 16 characters of its digest.
 */
export function deriveRealtimeSecrets(jwtSecret) {
    const hmacHex = (label) => createHmac("sha256", jwtSecret).update(label).digest("hex");
    const secretKeyBase = hmacHex("supabase-realtime-secret-key-base"); // 64 chars
    // Realtime requires exactly 16 bytes
    const dbEncKey = hmacHex("supabase-realtime-db-enc-key").slice(0, 16);
    return { secretKeyBase, dbEncKey };
}
397
/**
 * Strips surrounding whitespace and embedded control characters (notably the
 * trailing carriage return that sneaks in when a remote_write URL is pasted from
 * a CRLF file or captured from command output). A stray "\r" corrupts the URL
 * the Prometheus operator hands to remote_write, so normalize it at the source.
 *
 * Control characters (U+0000-U+001F and U+007F) are removed wherever they
 * appear; ordinary spaces are only trimmed from the two ends.
 *
 * @param {string} url - Raw remote_write URL.
 * @returns {string} Cleaned URL.
 */
function sanitizeRemoteWriteUrl(url) {
    // eslint-disable-next-line no-control-regex
    const withoutControlChars = url.replace(/[\u0000-\u001F\u007F]/g, "");
    return withoutControlChars.trim();
}
407
/**
 * Builds the Prometheus `remoteWrite` list from the monitoring settings.
 *
 * - Destination "local-grafana" yields an empty list.
 * - With no structured `remoteWrite` object, a bare `remoteWriteUrl` (if set)
 *   yields a single URL-only entry; otherwise the list is empty.
 * - With a structured `remoteWrite`, the entry is built per destination
 *   ("aws-amp", "azure-monitor", "grafana-cloud", "generic"); any other
 *   destination yields a URL-only entry.
 *
 * @param {object} config - Deployment config; only `features.monitoring` is read.
 * @returns {object[]} Zero or one remoteWrite entries.
 * @throws {Error} When `validateRemoteWriteConfig` reports errors (messages
 *   joined with a space), when "aws-amp" has no region, or when a
 *   destination-specific builder rejects the auth settings.
 */
function generateRemoteWriteSpec(config) {
    const monitoring = config.features.monitoring;
    if (monitoring.destination === "local-grafana") {
        return [];
    }

    const remoteWrite = monitoring.remoteWrite;
    if (!remoteWrite) {
        // Legacy shape: a bare URL with no auth settings.
        if (!monitoring.remoteWriteUrl) {
            return [];
        }
        return [{ url: sanitizeRemoteWriteUrl(monitoring.remoteWriteUrl) }];
    }

    // Enforce the same per-destination/auth requirements the wizard and Zod
    // schema do. This is unreachable for CLI-generated configs (they are gated
    // earlier) but guards hand-edited values and keeps one source of truth.
    const remoteWriteErrors = validateRemoteWriteConfig(remoteWrite);
    if (remoteWriteErrors.length > 0) {
        throw new Error(remoteWriteErrors.join(" "));
    }

    const base = {
        url: sanitizeRemoteWriteUrl(remoteWrite.url),
    };
    switch (remoteWrite.destination) {
        case "aws-amp": {
            if (!remoteWrite.awsRegion) {
                throw new Error("AWS Managed Prometheus remote_write requires a region.");
            }
            return [
                {
                    ...base,
                    sigv4: {
                        region: remoteWrite.awsRegion,
                    },
                },
            ];
        }
        case "azure-monitor":
            return [generateAzureMonitorRemoteWrite(remoteWrite, base)];
        case "grafana-cloud":
            return [generateBasicAuthRemoteWrite(remoteWrite, base)];
        case "generic":
            return [generateGenericRemoteWrite(remoteWrite, base)];
        default:
            return [base];
    }
}
450
/**
 * Reports whether ClickStack is enabled.
 *
 * Reads `config.features.observability.clickstack.enabled`. ClickStack is
 * treated as enabled when that flag is absent (null or undefined); an explicit
 * `false` disables it.
 *
 * @param {object} config - Deployment config.
 * @returns {boolean} The configured flag, or `true` when it is not set.
 */
function isClickStackEnabled(config) {
    const configured = config.features.observability?.clickstack?.enabled;
    return configured ?? true;
}
453
/**
 * Builds the Helm values for the ClickStack subchart (ClickHouse settings,
 * HyperDX, the OTel collector agent/gateway, and FerretDB).
 *
 * @param {boolean} enabled - Written to the top-level `enabled` flag and to the
 *   HyperDX, HyperDX ingress, collector agent, FerretDB and FerretDB
 *   persistence `enabled` flags.
 * @param {object} config - Deployment config. Reads
 *   `features.observability.clickstack.telemetryRetentionDays` (default 7)
 *   and `imageRegistry` (default DEFAULT_IMAGE_REGISTRY).
 * @param {string} storageClass - Storage class name for FerretDB persistence.
 * @param {object} infrastructurePodLabels - Pod labels applied to HyperDX, the
 *   collector agent and gateway, and FerretDB.
 * @param {object[]} operationalDaemonSetTolerations - Tolerations for the
 *   collector agent.
 * @returns {object} ClickStack values tree.
 */
function generateClickStackValues(enabled, config, storageClass, infrastructurePodLabels, operationalDaemonSetTolerations) {
    const clickstack = config.features.observability?.clickstack;
    const telemetryRetentionDays = clickstack?.telemetryRetentionDays ?? 7;
    // NOTE(review): resolved from config but not referenced in the values
    // returned below - confirm whether it should be wired into the output.
    const clickHouseStorageSize = clickstack?.clickHouseStorageSize ?? "100Gi";
    // Registry host for the clickstack images. The clickstack subchart routes
    // these through its own image helper, so the split { registry, repository }
    // shape lets global.imageRegistry + digest pinning flow through.
    const reg = config.imageRegistry || DEFAULT_IMAGE_REGISTRY;
    return {
        enabled,
        clickhouse: {
            database: "otel",
            username: "rulebricks",
            existingSecret: "",
            existingSecretKey: "admin-password",
            retentionDays: telemetryRetentionDays,
            ttl: "",
        },
        hyperdx: {
            enabled,
            image: {
                registry: reg,
                repository: IMAGE_REPOSITORIES.hyperdx.repository,
                tag: IMAGE_REPOSITORIES.hyperdx.tag,
                pullPolicy: "IfNotPresent",
            },
            resources: {
                requests: { cpu: "250m", memory: "512Mi" },
                limits: { cpu: "1000m", memory: "1Gi" },
            },
            ingress: {
                enabled,
                className: "traefik",
                hostname: "",
                allowedIPs: [],
            },
            podLabels: infrastructurePodLabels,
        },
        collector: {
            image: {
                registry: reg,
                repository: IMAGE_REPOSITORIES.clickstackOtelCollector.repository,
                tag: IMAGE_REPOSITORIES.clickstackOtelCollector.tag,
                pullPolicy: "IfNotPresent",
            },
            memoryLimitMiB: 800,
            agent: {
                enabled,
                securityContext: {
                    runAsUser: 0,
                    runAsGroup: 0,
                },
                resources: {
                    requests: { cpu: "100m", memory: "256Mi" },
                    limits: { cpu: "500m", memory: "512Mi" },
                },
                tolerations: operationalDaemonSetTolerations,
                podLabels: infrastructurePodLabels,
            },
            gateway: {
                replicas: 1,
                resources: {
                    requests: { cpu: "250m", memory: "512Mi" },
                    limits: { cpu: "2000m", memory: "1Gi" },
                },
                podLabels: infrastructurePodLabels,
            },
        },
        ferretdb: {
            enabled,
            image: {
                registry: reg,
                repository: IMAGE_REPOSITORIES.ferretdb.repository,
                tag: IMAGE_REPOSITORIES.ferretdb.tag,
                pullPolicy: "IfNotPresent",
            },
            postgresImage: {
                registry: reg,
                repository: IMAGE_REPOSITORIES.postgresDocumentdb.repository,
                tag: IMAGE_REPOSITORIES.postgresDocumentdb.tag,
                pullPolicy: "IfNotPresent",
            },
            auth: {
                username: "hyperdx",
                password: "",
                existingSecret: "",
                existingSecretKey: "password",
            },
            persistence: {
                enabled,
                size: "10Gi",
                storageClassName: storageClass,
            },
            resources: {
                ferretdb: {
                    requests: { cpu: "100m", memory: "256Mi" },
                    limits: { cpu: "500m", memory: "512Mi" },
                },
                postgres: {
                    requests: { cpu: "250m", memory: "512Mi" },
                    limits: { cpu: "1000m", memory: "1Gi" },
                },
            },
            podLabels: infrastructurePodLabels,
            podAnnotations: {
                "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
            },
        },
    };
}
563
/**
 * Builds the ServiceAccount values for Prometheus.
 *
 * AWS (AMP remote write) uses EKS Pod Identity - the association is created by
 * the CLI's workload-identity step, so no eks.amazonaws.com/role-arn annotation.
 * Azure Monitor still annotates the SA for its workload identity.
 *
 * @param {object} config - Deployment config; only
 *   `features.monitoring.remoteWrite` is read.
 * @returns {{ create: boolean, name: string, annotations: Object<string, string> }}
 */
function generatePrometheusServiceAccount(config) {
    const remoteWrite = config.features.monitoring.remoteWrite;
    const usesAzureWorkloadIdentity =
        remoteWrite?.destination === "azure-monitor" &&
        remoteWrite.authType === "workload-identity" &&
        remoteWrite.clientId;

    const annotations = {};
    if (usesAzureWorkloadIdentity) {
        annotations["azure.workload.identity/client-id"] = remoteWrite.clientId;
    }
    return {
        create: true,
        name: "prometheus",
        annotations,
    };
}
580
/**
 * Builds the pod metadata for Prometheus.
 *
 * Only Azure Monitor remote write with "workload-identity" auth adds anything:
 * the `azure.workload.identity/use` pod label. Every other configuration
 * returns an empty object.
 *
 * @param {object} config - Deployment config; only
 *   `features.monitoring.remoteWrite` is read.
 * @returns {{ labels?: Object<string, string> }} Pod metadata.
 */
function generatePrometheusPodMetadata(config) {
    const remoteWrite = config.features.monitoring.remoteWrite;
    const usesAzureWorkloadIdentity =
        remoteWrite?.destination === "azure-monitor" &&
        remoteWrite.authType === "workload-identity";
    if (!usesAzureWorkloadIdentity) {
        return {};
    }
    return {
        labels: {
            "azure.workload.identity/use": "true",
        },
    };
}
592
/**
 * Builds an Azure Monitor remoteWrite entry: `base` plus an `azureAd` block.
 *
 * The auth block depends on `remoteWrite.authType`:
 * - "oauth": `azureAd.oauth` with client ID, tenant ID and client secret ref.
 * - "workload-identity": `azureAd.sdk` with the tenant ID.
 * - anything else: `azureAd.managedIdentity` with the client ID.
 *
 * `azureAd.cloud` is `remoteWrite.azureCloud`, defaulting to "AzurePublic".
 *
 * @param {object} remoteWrite - Structured remote-write settings.
 * @param {object} base - Fields common to every entry (e.g. `url`); copied,
 *   not mutated.
 * @returns {object} remoteWrite entry.
 * @throws {Error} When the fields required by the selected auth type are missing.
 */
function generateAzureMonitorRemoteWrite(remoteWrite, base) {
    const azureAd = {
        cloud: remoteWrite.azureCloud || "AzurePublic",
    };
    switch (remoteWrite.authType) {
        case "oauth": {
            if (!remoteWrite.clientId ||
                !remoteWrite.tenantId ||
                !remoteWrite.clientSecretRef) {
                throw new Error("Azure Monitor remote_write OAuth requires client ID, tenant ID, and client secret ref.");
            }
            azureAd.oauth = {
                clientId: remoteWrite.clientId,
                tenantId: remoteWrite.tenantId,
                clientSecret: secretKeySelector(remoteWrite.clientSecretRef),
            };
            break;
        }
        case "workload-identity": {
            if (!remoteWrite.clientId || !remoteWrite.tenantId) {
                throw new Error("Azure Monitor remote_write workload identity requires client ID and tenant ID.");
            }
            // The prometheus-operator AzureAD schema supports only managedIdentity,
            // oauth, and sdk (there is no "workloadIdentity" field - emitting it makes
            // the operator reject the whole remoteWrite with "must provide Azure Managed
            // Identity or Azure OAuth or Azure SDK", which silently prevents the
            // Prometheus StatefulSet from being created). For AKS workload identity we
            // use the Azure SDK credential: it reads the projected token + AZURE_CLIENT_ID
            // injected by the workload-identity webhook (driven by the prometheus
            // ServiceAccount's azure.workload.identity/client-id annotation and the
            // azure.workload.identity/use pod label), so only the tenant ID is needed here.
            azureAd.sdk = {
                tenantId: remoteWrite.tenantId,
            };
            break;
        }
        default: {
            if (!remoteWrite.clientId) {
                throw new Error("Azure Monitor remote_write managed identity requires client ID.");
            }
            azureAd.managedIdentity = {
                clientId: remoteWrite.clientId,
            };
        }
    }
    return {
        ...base,
        azureAd,
    };
}
638
/**
 * Builds a remoteWrite entry that authenticates with HTTP basic auth, taking
 * the username and password from Kubernetes Secret references.
 *
 * @param {object} remoteWrite - Structured remote-write settings; reads
 *   `usernameSecretRef` and `passwordSecretRef`.
 * @param {object} base - Fields common to every entry (e.g. `url`); copied,
 *   not mutated.
 * @returns {object} `base` plus a `basicAuth` block.
 * @throws {Error} When either secret ref is missing.
 */
function generateBasicAuthRemoteWrite(remoteWrite, base) {
    const { usernameSecretRef, passwordSecretRef } = remoteWrite;
    if (!usernameSecretRef || !passwordSecretRef) {
        throw new Error("Basic auth remote_write requires username and password secret refs.");
    }
    return {
        ...base,
        basicAuth: {
            username: secretKeySelector(usernameSecretRef),
            password: secretKeySelector(passwordSecretRef),
        },
    };
}
650
/**
 * Builds a remoteWrite entry for a generic endpoint.
 *
 * - authType "basic": delegates to generateBasicAuthRemoteWrite.
 * - authType "bearer": adds an `authorization` block of type "Bearer" whose
 *   credentials come from `bearerTokenSecretRef`.
 * - any other authType: returns `base` itself, unchanged.
 *
 * @param {object} remoteWrite - Structured remote-write settings.
 * @param {object} base - Fields common to every entry (e.g. `url`).
 * @returns {object} remoteWrite entry.
 * @throws {Error} When "bearer" is selected without a token secret ref, or
 *   when the basic-auth builder rejects the settings.
 */
function generateGenericRemoteWrite(remoteWrite, base) {
    switch (remoteWrite.authType) {
        case "basic":
            return generateBasicAuthRemoteWrite(remoteWrite, base);
        case "bearer": {
            if (!remoteWrite.bearerTokenSecretRef) {
                throw new Error("Bearer remote_write requires a token secret ref.");
            }
            return {
                ...base,
                authorization: {
                    type: "Bearer",
                    credentials: secretKeySelector(remoteWrite.bearerTokenSecretRef),
                },
            };
        }
        default:
            return base;
    }
}
668
/**
 * Generates the Kafka broker config map (Kafka.spec.kafka.config for Strimzi).
 * The entries are the former KAFKA_CFG_* tuning env vars expressed under their
 * Kafka property names, and are kept in lockstep with the chart's kafka.config.
 *
 * @returns {Object<string, string>} A fresh property-name -> value map on
 *   every call (all values are strings).
 */
function generateKafkaConfig() {
  const brokerSettings = [
    ["auto.create.topics.enable", "true"],
    ["log.retention.hours", "24"],
    ["num.partitions", "12"],
    ["num.network.threads", "8"],
    ["num.io.threads", "8"],
    ["socket.send.buffer.bytes", "1048576"],
    ["socket.receive.buffer.bytes", "1048576"],
    ["socket.request.max.bytes", "209715200"],
    // Broker-wide max record size; must exceed every per-topic max.message.bytes.
    ["message.max.bytes", "2097152"],
    ["replica.fetch.max.bytes", "4194304"],
    // Broker-wide default retention; the application topics carry tighter caps.
    ["log.retention.bytes", "536870912"],
    ["log.segment.bytes", "1073741824"],
    ["num.replica.fetchers", "4"],
    ["queued.max.requests", "10000"],
    ["replica.socket.receive.buffer.bytes", "1048576"],
    ["log.cleaner.dedupe.buffer.size", "268435456"],
    ["log.cleaner.io.buffer.size", "1048576"],
  ];
  return Object.fromEntries(brokerSettings);
}
696
/**
 * Effective Kafka topic prefix as HPS/Vector/KEDA will see it.
 *
 * In-cluster Kafka runs UNPREFIXED (dedicated broker, and prefixing would
 * desync chart-side consumers from producers). External Kafka uses the
 * explicitly configured prefix (an empty string is a valid "no prefix"
 * choice) and otherwise falls back to the chart default "com.rulebricks.".
 *
 * A `null` prefix (e.g. a YAML key left blank) is treated like an omitted
 * one: passing it through would make callers that interpolate the result
 * build topic names with a literal "null" prefix.
 *
 * @param {object} config - Deployment configuration.
 * @returns {string} The prefix to prepend to topic names (possibly "").
 */
function effectiveTopicPrefix(config) {
  if (!isExternalKafka(config)) {
    return "";
  }
  // `??` keeps "" (explicitly disabled) but replaces both undefined and null.
  return config.externalServices?.kafka?.external?.topicPrefix ?? "com.rulebricks.";
}
178
709
/**
 * Explicit topic management for in-cluster Kafka.
 *
 * Produces the kafka.provisioning topic list consumed by BOTH the subchart
 * provisioning Job (creates topics) and the chart's kafka-topic-align Job
 * (idempotently converges pre-existing topics on upgrade). Topic names use the
 * SAME prefix written to app.logging.kafkaTopicPrefix - the chart fails the
 * render if these ever diverge.
 *
 * Sizing policy (baseline constants, mirroring the chart defaults):
 * - solution / solution-response: SOLUTION_TOPIC_PARTITIONS (the worker-fleet
 *   concurrency CEILING; partitions can never be decreased, workers are sized
 *   separately by the cluster autoscaler). RF stays 1: RPC traffic is
 *   transient and latency-sensitive, and the HPS producer's acks=-1 would
 *   otherwise wait on full ISR replication.
 * - logs: LOGS_TOPIC_PARTITIONS (durable data feeding the Vector -> object
 *   storage pipeline).
 *
 * @param {object} config - Deployment configuration.
 * @returns {Array<object>} Topic definitions ({ name, partitions, replicas,
 *   config }); empty for external brokers that do not use the bridge.
 */
function generateKafkaTopics(config) {
  // External MSK IAM: the chart's kafka-topic-provision Job creates these on
  // the managed broker (through the proxy bridge), so they must be populated
  // here - MSK Serverless won't auto-create them. Other external brokers
  // (SCRAM / Event Hubs / GCP, no bridge) that a plain client can reach stay
  // customer-managed.
  const customerManaged = isExternalKafka(config) && !kafkaUsesBridge(config);
  if (customerManaged) {
    return [];
  }

  const prefix = effectiveTopicPrefix(config);

  // One config object, deliberately shared by both RPC topics.
  const rpcTopicConfig = {
    "retention.ms": "300000",
    "segment.ms": "300000",
    "segment.bytes": "67108864",
    "retention.bytes": "67108864",
    "max.message.bytes": "2097152",
  };
  const rpcTopic = (suffix) => ({
    name: `${prefix}${suffix}`,
    partitions: SOLUTION_TOPIC_PARTITIONS,
    replicas: TOPIC_REPLICATION_FACTOR,
    config: rpcTopicConfig,
  });

  const logsTopic = {
    name: `${prefix}logs`,
    partitions: LOGS_TOPIC_PARTITIONS,
    replicas: TOPIC_REPLICATION_FACTOR,
    config: {
      "retention.ms": "86400000",
      "retention.bytes": "268435456",
      "max.message.bytes": "2097152",
    },
  };

  return [rpcTopic("solution"), rpcTopic("solution-response"), logsTopic];
}
768
/**
 * Builds a soft (preferred, weight 50) pod anti-affinity that steers a pod
 * away from hosts already running pods labeled
 * `rulebricks.com/workload-group=infrastructure`.
 *
 * @returns {object} An object with a single `podAntiAffinity` key.
 */
function generateWorkerPodAntiAffinity() {
  const infrastructureSelector = {
    matchExpressions: [
      {
        key: "rulebricks.com/workload-group",
        operator: "In",
        values: ["infrastructure"],
      },
    ],
  };
  const avoidInfrastructureHosts = {
    weight: 50,
    podAffinityTerm: {
      labelSelector: infrastructureSelector,
      topologyKey: "kubernetes.io/hostname",
    },
  };
  return {
    podAntiAffinity: {
      preferredDuringSchedulingIgnoredDuringExecution: [avoidInfrastructureHosts],
    },
  };
}
791
/**
 * Assembles a scheduling fragment meant to be spread into a values block.
 * Only truthy arguments are included, so a missing affinity or tolerations
 * list leaves no key behind.
 *
 * @param {Array<object>} [tolerations] - Pod tolerations.
 * @param {object} [affinity] - Pod affinity definition.
 * @returns {object} An object with `affinity` and/or `tolerations` (in that
 *   key order), or `{}` when neither was supplied.
 */
function generateScheduling(tolerations, affinity) {
  const scheduling = {};
  if (affinity) {
    scheduling.affinity = affinity;
  }
  if (tolerations) {
    scheduling.tolerations = tolerations;
  }
  return scheduling;
}
202
797
/**
 * Burst-pool scheduling, always on. Cluster-setup provisions a dedicated
 * worker pool labeled and tainted rulebricks.com/pool=burst (one big
 * Deallocate-parked node on Azure or an on-demand nodegroup on AWS); workers
 * tolerate the taint and SOFTLY prefer the label. On clusters without such a
 * pool both are inert, so BYO clusters schedule exactly as before - zero
 * configuration required either way.
 */

/** Toleration for the rulebricks.com/pool=burst:NoSchedule taint. */
const BURST_POOL_TOLERATION = {
  key: "rulebricks.com/pool",
  operator: "Equal",
  value: "burst",
  effect: "NoSchedule",
};

/** Preferred (weight 100) node-affinity term matching rulebricks.com/pool=burst. */
const BURST_POOL_NODE_PREFERENCE = {
  weight: 100,
  preference: {
    matchExpressions: [
      {
        key: "rulebricks.com/pool",
        operator: "In",
        values: ["burst"],
      },
    ],
  },
};
819
/**
 * Builds the backup values block.
 *
 * Backups are switched on only when the deployment runs its own in-cluster
 * Postgres (self-hosted database whose postgres service is not "external")
 * AND `config.backup.enabled` is exactly `true`.
 *
 * The backup CronJob streams pg_dump from the running DB (using
 * supabase.db.image) and uploads it with rclone, so no backup-specific image
 * is set here; the chart default rclone image applies unless overridden.
 *
 * @param {object} config - Deployment configuration.
 * @returns {{enabled: boolean, schedule: string, retentionDays: number}}
 *   Schedule falls back to "0 2 * * *" and retention to 7 days when unset.
 */
function generateBackupValues(config) {
  const backup = config.backup;
  const isSelfHosted = config.database.type === "self-hosted";
  const postgresIsExternal = config.externalServices?.postgres?.mode === "external";
  const usesInClusterPostgres = isSelfHosted && !postgresIsExternal;
  const backupRequested = backup?.enabled === true;
  return {
    enabled: usesInClusterPostgres && backupRequested,
    schedule: backup?.schedule || "0 2 * * *",
    retentionDays: backup?.retentionDays || 7,
  };
}
832
/**
 * Whether the deployment points at an external Redis.
 *
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True only when externalServices.redis.mode is "external".
 */
function isExternalRedis(config) {
  const redisMode = config.externalServices?.redis?.mode;
  return redisMode === "external";
}
835
/**
 * Whether the deployment points at an external Kafka.
 *
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True only when externalServices.kafka.mode is "external".
 */
function isExternalKafka(config) {
  const kafkaMode = config.externalServices?.kafka?.mode;
  return kafkaMode === "external";
}
838
/**
 * Whether the Vector kafka-proxy bridge sidecar is required. Only AWS MSK IAM
 * needs it: Vector's kafka source can't speak token mechanisms, while Azure
 * Event Hubs and GCP both use SASL PLAIN/SCRAM that Vector handles directly.
 *
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True for external Kafka whose preset is "aws-msk-iam" or
 *   whose SASL mechanism is "aws-iam"; false otherwise.
 */
function kafkaUsesBridge(config) {
  if (isExternalKafka(config)) {
    const external = config.externalServices?.kafka?.external;
    if (external?.preset === "aws-msk-iam") {
      return true;
    }
    return external?.sasl?.mechanism === "aws-iam";
  }
  return false;
}
849
/**
 * Whether Vector's kafka source connects with a direct PLAIN/SCRAM credential
 * and therefore needs username/password. This mirrors the vector-kafka-env
 * ConfigMap, which only sets KAFKA_SASL_ENABLED=true for external, non-token,
 * non-bridge mechanisms (and where vector-kafka-credentials is populated). For
 * in-cluster, bridge, and token-auth paths SASL is disabled, so username and
 * password MUST be omitted: an empty env default (${VAR:-}) renders unquoted
 * via Helm's toYaml and Vector reads the value as YAML null, which it rejects
 * at startup ("invalid type: unit value, expected any valid TOML value").
 *
 * @param {object} config - Deployment configuration.
 * @returns {boolean} True only for external, non-bridge Kafka with a SASL
 *   mechanism other than "aws-iam" / "oauthbearer".
 */
function kafkaUsesDirectSasl(config) {
  const connectsDirectly = isExternalKafka(config) && !kafkaUsesBridge(config);
  if (!connectsDirectly) {
    return false;
  }
  const mechanism = config.externalServices?.kafka?.external?.sasl?.mechanism;
  const tokenMechanisms = ["aws-iam", "oauthbearer"];
  return Boolean(mechanism) && !tokenMechanisms.includes(mechanism);
}
869
/**
 * Builds the rulebricks.redis block: in-cluster sizing when embedded, or
 * external connection settings when the user points at managed Redis.
 *
 * @param {object} config - Deployment configuration.
 * @param {string} storageClass - StorageClass for the embedded Redis volume.
 * @param {object} infrastructurePodLabels - Labels for the embedded Redis pod.
 * @param {object} coreScheduling - Scheduling fragment spread into the
 *   embedded block.
 * @returns {object} Either the embedded block (podLabels, scheduling,
 *   persistence) or `{ enabled: false, external }`.
 */
function generateRedisBlock(config, storageClass, infrastructurePodLabels, coreScheduling) {
  if (isExternalRedis(config)) {
    const settings = config.externalServices?.redis?.external ?? {};
    const external = {
      host: settings.host ?? "",
      port: settings.port ?? 6379,
      tls: { enabled: settings.tls ?? false },
      // Optional credentials: an inline password and/or a secret reference.
      ...(settings.password ? { password: settings.password } : {}),
      ...(settings.existingSecret
        ? {
            existingSecret: settings.existingSecret,
            existingSecretKey: settings.existingSecretKey || "redis-password",
          }
        : {}),
      ...(settings.httpApi?.enabled
        ? {
            httpApi: {
              enabled: true,
              url: settings.httpApi.url ?? "",
              token: settings.httpApi.token ?? "",
            },
          }
        : {}),
    };
    return { enabled: false, external };
  }

  // Sizing (resources, persistence size) falls back to the chart defaults;
  // only the deployment-specific storage class is set here.
  return {
    podLabels: infrastructurePodLabels,
    ...coreScheduling,
    persistence: {
      enabled: true,
      storageClass,
    },
  };
}
911
/**
 * Builds the values for the cache observability add-ons configured under
 * `config.features.cache`: the Valkey admin UI and the Redis exporter.
 *
 * Defaults when unset: valkeyAdmin disabled with "internal" exposure;
 * redisExporter enabled. The admin ingress is enabled only for exposure
 * "ingress", in which case the hostname falls back to `valkey.<domain>`.
 *
 * @param {object} config - Deployment configuration.
 * @param {object} infrastructurePodLabels - Labels applied to both pods.
 * @returns {{valkeyAdmin: object, redisExporter: object}}
 */
function generateCacheObservabilityBlock(config, infrastructurePodLabels) {
  const { valkeyAdmin, redisExporter } = config.features.cache ?? {};
  const exposedViaIngress = valkeyAdmin?.exposure === "ingress";

  let ingressHostname = "";
  if (exposedViaIngress) {
    ingressHostname = valkeyAdmin?.hostname || `valkey.${config.domain}`;
  }

  const ingress = {
    enabled: exposedViaIngress,
    hostname: ingressHostname,
    basicAuth: {
      users: valkeyAdmin?.basicAuthUsers ?? [],
      existingSecret: valkeyAdmin?.basicAuthExistingSecret ?? "",
    },
    allowedIPs: valkeyAdmin?.allowedIPs ?? [],
  };

  return {
    valkeyAdmin: {
      enabled: valkeyAdmin?.enabled ?? false,
      exposure: valkeyAdmin?.exposure ?? "internal",
      podLabels: infrastructurePodLabels,
      ingress,
    },
    redisExporter: {
      enabled: redisExporter?.enabled ?? true,
      podLabels: infrastructurePodLabels,
    },
  };
}
939
/**
 * Builds the Kafka exporter values block.
 *
 * `enabled` honors an explicit `features.cache.kafkaExporter.enabled` setting;
 * when unset it defaults to on for in-cluster Kafka and off for external
 * Kafka. `brokers` carries the external broker list and is empty for
 * in-cluster Kafka.
 *
 * @param {object} config - Deployment configuration.
 * @param {object} infrastructurePodLabels - Labels for the exporter pod.
 * @returns {{enabled: boolean, podLabels: object, brokers: string}}
 */
function generateKafkaExporterBlock(config, infrastructurePodLabels) {
  const explicitSetting = config.features.cache?.kafkaExporter?.enabled;
  const external = isExternalKafka(config);
  const enabledByDefault = !external;

  let brokers = "";
  if (external) {
    brokers = config.externalServices?.kafka?.external?.brokers ?? "";
  }

  return {
    enabled: explicitSetting ?? enabledByDefault,
    podLabels: infrastructurePodLabels,
    brokers,
  };
}
950
/**
 * Builds the rulebricks.app.logging block. Decision logging is always enabled;
 * external Kafka adds brokers + SSL/SASL, while embedded auto-discovers the
 * in-cluster Kafka service.
 *
 * @param {object} config - Deployment configuration.
 * @returns {object} The logging values. For external Kafka,
 *   `kafkaTopicPrefix` is present only when a prefix (including "") was
 *   explicitly configured, and `kafkaSasl` only when a mechanism is set.
 */
function generateAppLogging(config) {
  if (!isExternalKafka(config)) {
    return {
      enabled: true,
      kafkaBrokers: "", // Auto-discover from Kafka subchart
      kafkaTopic: "logs",
      // The in-cluster app/HPS produce to unprefixed topics (logs, solution,
      // solution-response). The chart default prefix ("com.rulebricks.") is
      // meant for shared/managed Kafka collision avoidance, but when applied
      // here it makes the chart-side consumers diverge from the producers:
      // Vector would subscribe to "com.rulebricks.logs" (no data) and the KEDA
      // worker trigger would watch "com.rulebricks.solution" (no lag signal).
      // Disable prefixing for the dedicated in-cluster broker so everything
      // lines up.
      kafkaTopicPrefix: "",
    };
  }

  const ext = config.externalServices?.kafka?.external ?? {};
  const logging = {
    enabled: true,
    kafkaBrokers: ext.brokers ?? "",
    kafkaTopic: ext.topic || "logs",
    kafkaSsl: ext.ssl ?? false,
  };

  // Topic prefix: emit only when explicitly provided (incl. "" to disable).
  // When omitted, the chart default (com.rulebricks.) applies via value merge.
  // `!= null` also skips null (e.g. a YAML key left blank): a null override in
  // Helm values deletes the chart default instead of letting it apply.
  if (ext.topicPrefix != null) {
    logging.kafkaTopicPrefix = ext.topicPrefix;
  }

  const saslSettings = ext.sasl;
  if (saslSettings?.mechanism) {
    const sasl = { mechanism: saslSettings.mechanism };
    // Copy only the optional fields that are actually set, in a fixed order.
    for (const field of ["region", "username", "password", "existingSecret"]) {
      if (saslSettings[field]) {
        sasl[field] = saslSettings[field];
      }
    }
    logging.kafkaSasl = sasl;
  }
  return logging;
}
997
/**
 * HPS service account. When external Kafka uses MSK IAM, HPS authenticates to
 * the broker with its pod's cloud identity - under EKS Pod Identity that comes
 * from a namespace-scoped association (created by the CLI's workload-identity
 * step for the `<release>-hps` SA), NOT an eks.amazonaws.com/role-arn
 * annotation. The SA is only CREATED here so the association has a subject to
 * bind; no annotations are ever added.
 *
 * @param {object} config - Deployment configuration.
 * @returns {{create: boolean, annotations: object}} `create` is true exactly
 *   when the Kafka bridge is in use.
 */
function generateHpsServiceAccount(config) {
  const needsServiceAccount = Boolean(kafkaUsesBridge(config));
  return { create: needsServiceAccount, annotations: {} };
}
1010
/**
 * Top-level kafkaBridge block consumed by the Vector env ConfigMap. Only
 * enabled for AWS MSK IAM, where a kafka-proxy sidecar fronts the brokers for
 * Vector.
 *
 * @param {object} config - Deployment configuration.
 * @returns {object} `{ enabled: false }` when no bridge is needed; otherwise
 *   the enabled block with provider "aws", region, brokers, local port 19092,
 *   the proxy image and the AWS role ARN (missing strings become "").
 */
function generateKafkaBridge(config) {
  if (kafkaUsesBridge(config)) {
    const external = config.externalServices?.kafka?.external ?? {};
    return {
      enabled: true,
      provider: "aws",
      region: external.sasl?.region ?? "",
      brokers: external.brokers ?? "",
      localPort: 19092,
      image: KAFKA_PROXY_IMAGE,
      awsRoleArn: external.identity?.awsRoleArn ?? "",
    };
  }
  return { enabled: false };
}
1029
/**
 * kafka-proxy sidecar for the Vector pod (AWS MSK IAM). Maps each upstream
 * broker to a sequential local port (starting at 19092) and authenticates with
 * the pod's IRSA role.
 *
 * @param {object} config - Deployment configuration.
 * @returns {Array<object>|undefined} A one-element container list, or
 *   undefined when the bridge is not in use or no brokers are configured.
 */
function generateVectorExtraContainers(config) {
  if (!kafkaUsesBridge(config)) {
    return undefined;
  }
  const external = config.externalServices?.kafka?.external ?? {};

  // Comma-separated broker list -> trimmed, non-empty entries.
  const upstreams = [];
  for (const entry of (external.brokers ?? "").split(",")) {
    const broker = entry.trim();
    if (broker) {
      upstreams.push(broker);
    }
  }
  if (upstreams.length === 0) {
    return undefined;
  }

  const firstLocalPort = 19092;
  const localPortFor = (index) => firstLocalPort + index;
  const mappingArgs = upstreams.map(
    (broker, index) => `--bootstrap-server-mapping=${broker},127.0.0.1:${localPortFor(index)}`,
  );
  const ports = upstreams.map((_, index) => ({ containerPort: localPortFor(index) }));

  const proxyContainer = {
    name: "kafka-proxy",
    image: KAFKA_PROXY_IMAGE,
    args: [
      "server",
      ...mappingArgs,
      "--tls-enable",
      "--sasl-enable",
      "--sasl-method=AWS_MSK_IAM",
      `--sasl-aws-region=${external.sasl?.region ?? ""}`,
    ],
    ports,
  };
  return [proxyContainer];
}
1061
/**
 * VRL program for the Vector agent's remap transform. It parses JSON app/HPS
 * log lines, lifts trace_id/span_id for logs<->traces correlation, and
 * flattens useful Kubernetes metadata onto the event. Kept in sync with
 * charts/.../values.yaml vector-agent.customConfig.transforms.
 */
const VECTOR_APP_LOGS_VRL = [
  // Only structured (JSON object) messages get .log / trace fields / level.
  'parsed, err = parse_json(to_string(.message) ?? "")',
  "if err == null && is_object(parsed) {",
  " .log = parsed",
  " .trace_id = parsed.trace_id",
  " .span_id = parsed.span_id",
  ' if exists(parsed.level) { .level = to_string(parsed.level) ?? "info" }',
  "}",
  // Kubernetes metadata is flattened for every event.
  ".pod = .kubernetes.pod_name",
  ".namespace = .kubernetes.pod_namespace",
  ".container = .kubernetes.container_name",
  ".node = .kubernetes.pod_node_name",
].join("\n");
1077
/**
 * global.tracing block (in-cluster OTel Collector -> pluggable trace backend).
 * Emits the destination-specific sub-block (elastic | otlp | azure-monitor)
 * and returns undefined when tracing is disabled so it is omitted entirely.
 *
 * The destination defaults to "elastic"; any value other than "elastic" or
 * "otlp" produces the azureMonitor sub-block. Credentials are only emitted for
 * the selected auth mode and only when a value is present.
 *
 * @param {object} config - Deployment configuration.
 * @returns {object|undefined} The tracing values, or undefined when disabled.
 */
function generateTracingGlobal(config) {
  const tracing = config.features.tracing;
  if (!tracing?.enabled) {
    return undefined;
  }

  const destination = tracing.destination ?? "elastic";
  const collectorImage = IMAGE_REPOSITORIES.opentelemetryCollector;
  const shared = {
    enabled: true,
    destination,
    samplingRatio: tracing.samplingRatio ?? 1,
    // RB image dict for the parent chart's otel-collector deployment. The
    // rulebricks.image helper requires image.repository and applies
    // global.imageRegistry to the host.
    collector: {
      image: {
        registry: config.imageRegistry || DEFAULT_IMAGE_REGISTRY,
        repository: collectorImage.repository,
        tag: collectorImage.tag,
      },
    },
  };

  switch (destination) {
    case "elastic": {
      const settings = tracing.elastic ?? {};
      const authMode = settings.authMode ?? "secret-token";
      const useSecretToken = authMode === "secret-token" && settings.secretToken;
      const useApiKey = authMode === "api-key" && settings.apiKey;
      return {
        ...shared,
        elastic: {
          endpoint: settings.endpoint ?? "",
          authMode,
          tlsInsecureSkipVerify: false,
          ...(useSecretToken ? { secretToken: settings.secretToken } : {}),
          ...(useApiKey ? { apiKey: settings.apiKey } : {}),
        },
      };
    }
    case "otlp": {
      const settings = tracing.otlp ?? {};
      const authMode = settings.authMode ?? "none";
      const useToken = authMode === "bearer" && settings.token;
      const useApiKey = authMode === "api-key" && settings.apiKey;
      const useHeader = authMode === "header";
      const hasExtraHeaders = settings.headers && Object.keys(settings.headers).length > 0;
      return {
        ...shared,
        otlp: {
          endpoint: settings.endpoint ?? "",
          authMode,
          tlsInsecureSkipVerify: settings.tlsInsecureSkipVerify ?? false,
          ...(useToken ? { token: settings.token } : {}),
          ...(useApiKey ? { apiKey: settings.apiKey } : {}),
          ...(useHeader ? { headerName: settings.headerName ?? "Authorization" } : {}),
          ...(useHeader && settings.headerValue ? { headerValue: settings.headerValue } : {}),
          ...(hasExtraHeaders ? { headers: settings.headers } : {}),
        },
      };
    }
    default: {
      // azure-monitor
      const settings = tracing.azureMonitor ?? {};
      return {
        ...shared,
        azureMonitor: { connectionString: settings.connectionString ?? "" },
      };
    }
  }
}
1148
/**
 * traefik.tracing block: makes Traefik the root span and propagates the W3C
 * traceparent to backends. Points Traefik's OTLP/HTTP exporter at the
 * release's otel-collector service. Empty object when neither ClickStack nor
 * tracing is enabled.
 *
 * @param {object} config - Deployment configuration.
 * @param {string} releaseName - Helm release name used to build the collector
 *   service hostname.
 * @returns {object} The tracing values, or `{}` when disabled.
 */
function generateTraefikTracing(config, releaseName) {
  const tracingActive = isClickStackEnabled(config) || config.features.tracing?.enabled;
  if (tracingActive) {
    const collectorEndpoint = `http://${releaseName}-otel-collector:4318/v1/traces`;
    return {
      otlp: {
        enabled: true,
        http: {
          enabled: true,
          endpoint: collectorEndpoint,
        },
      },
    };
  }
  return {};
}
1165
/**
 * vector-agent block: a second Vector deployment (role Agent / DaemonSet) that
 * tails all pod logs and ships them to a customer-managed destination
 * (Elasticsearch by default, or Loki / a generic HTTP endpoint). Decision logs
 * are unaffected (they stay in ClickHouse via the `vector` aggregator).
 *
 * @param {object} config - Deployment configuration; reads
 *   `features.logging.appLogs`.
 * @param {object} podLabels - Labels for the agent pods.
 * @param {Array<object>} tolerations - Tolerations for the agent pods.
 * @returns {object} `{ enabled: false }` when app logs are off; otherwise the
 *   full agent values with one sink keyed "loki", "generic_http" or
 *   "elasticsearch".
 */
function generateVectorAgent(config, podLabels, tolerations) {
  const appLogs = config.features.logging.appLogs;
  if (!appLogs?.enabled) {
    return { enabled: false };
  }

  let sinkName;
  let sink;
  switch (appLogs.destination ?? "elasticsearch") {
    case "loki": {
      const settings = appLogs.loki ?? {};
      sinkName = "loki";
      sink = {
        type: "loki",
        inputs: ["app_logs"],
        endpoint: settings.endpoint,
        labels: settings.labels ?? {
          app: "rulebricks",
          namespace: "{{ namespace }}",
          pod: "{{ pod }}",
          container: "{{ container }}",
        },
        encoding: { codec: "json" },
      };
      break;
    }
    case "generic": {
      const settings = appLogs.generic ?? {};
      sinkName = "generic_http";
      sink = {
        type: "http",
        inputs: ["app_logs"],
        uri: settings.endpoint,
        method: "post",
        encoding: { codec: "json" },
        // Auth header is optional for generic HTTP endpoints.
        ...(settings.authHeader
          ? { request: { headers: { Authorization: settings.authHeader } } }
          : {}),
      };
      break;
    }
    default: {
      // Any other destination value is handled as Elasticsearch.
      const settings = appLogs.elasticsearch ?? {};
      const authMode = settings.authMode ?? "basic";
      sinkName = "elasticsearch";
      sink = {
        type: "elasticsearch",
        inputs: ["app_logs"],
        endpoints: [settings.endpoint],
        mode: "bulk",
        bulk: { index: settings.index || "rulebricks-app-logs" },
        tls: { verify_certificate: settings.verifyCertificate ?? true },
      };
      if (authMode === "basic") {
        sink.auth = { strategy: "basic", user: settings.username, password: settings.password };
      } else if (authMode === "api-key") {
        sink.request = { headers: { Authorization: `ApiKey ${settings.apiKey}` } };
      }
    }
  }

  const customConfig = {
    data_dir: "/vector-data-dir",
    sources: {
      kubernetes_logs: {
        type: "kubernetes_logs",
        // Skip both Vector deployments: the aggregator
        // (app.kubernetes.io/name=vector) re-emits decision logs on stdout
        // (those belong in ClickHouse, not Elasticsearch) and the agent
        // itself (vector-agent) to avoid a self-scrape loop.
        extra_label_selector: "app.kubernetes.io/name notin (vector,vector-agent)",
      },
    },
    transforms: {
      app_logs: {
        type: "remap",
        inputs: ["kubernetes_logs"],
        source: VECTOR_APP_LOGS_VRL,
      },
    },
    sinks: { [sinkName]: sink },
  };

  return {
    enabled: true,
    role: "Agent",
    podLabels,
    // Follow active worker pools without tolerating shutdown, out-of-service,
    // or unreachable node taints.
    tolerations,
    resources: {
      requests: { cpu: "100m", memory: "256Mi" },
      limits: { cpu: "500m", memory: "512Mi" },
    },
    customConfig,
  };
}
1260
+ /**
1261
+ * Builds Helm values from the deployment configuration.
1262
+ */
1263
+ export function buildHelmValues(config, options = {}) {
1264
+ if (config.database.type === "self-hosted" &&
1265
+ !config.database.supabaseJwtSecret) {
1266
+ throw new Error("Self-hosted Supabase is missing a JWT secret. Run `rulebricks redeploy <name>` to regenerate deployment credentials, or set database.supabaseJwtSecret in config.yaml.");
1267
+ }
1268
+ if (config.features.ai.enabled && !config.features.ai.openaiApiKey) {
1269
+ throw new Error("AI features are enabled but the OpenAI API key is missing. Run `rulebricks redeploy <name>` and enter your OpenAI API key, or disable AI features in config.yaml.");
1270
+ }
1271
+ const { tlsEnabled = true, secretMode = "inline" } = options;
1272
+ const useLocalGrafana = config.features.monitoring.destination === "local-grafana";
1273
+ // Determine if external-dns should be enabled
1274
+ const externalDnsEnabled = config.dns.autoManage && isSupportedDnsProvider(config.dns.provider);
1275
+ const gcpDiskType = config.infrastructure.nodeArchitecture === "amd64"
1276
+ ? "pd-balanced"
1277
+ : "hyperdisk-balanced";
1278
+ // Prefer the live cluster's StorageClass. Provider defaults are only a
1279
+ // fallback for legacy configs that predate capability scanning.
1280
+ const storageClass = config.infrastructure.storageClass ||
1281
+ (config.infrastructure.provider === "aws"
1282
+ ? "gp3"
1283
+ : config.infrastructure.provider === "gcp"
1284
+ ? gcpDiskType
1285
+ : config.infrastructure.provider === "azure"
1286
+ ? "managed-premium"
1287
+ : "gp3");
1288
+ const shouldApplyArm64Toleration = config.infrastructure.arm64TolerationRequired ?? false;
1289
+ const architectureTolerations = shouldApplyArm64Toleration
1290
+ ? [
1291
+ {
1292
+ key: "kubernetes.io/arch",
1293
+ operator: "Equal",
1294
+ value: "arm64",
1295
+ effect: "NoSchedule",
1296
+ },
1297
+ ]
1298
+ : undefined;
1299
+ const coreScheduling = generateScheduling(architectureTolerations);
1300
+ // Workers always tolerate + softly prefer the optional burst pool
1301
+ // (rulebricks.com/pool=burst). The preference is soft, so clusters without a
1302
+ // burst pool schedule workers on ordinary capacity exactly as before.
1303
+ const workerTolerations = [
1304
+ ...(architectureTolerations ?? []),
1305
+ BURST_POOL_TOLERATION,
1306
+ ];
1307
+ const operationalDaemonSetTolerations = workerTolerations;
1308
+ const workerScheduling = generateScheduling(workerTolerations, {
1309
+ ...generateWorkerPodAntiAffinity(),
1310
+ nodeAffinity: {
1311
+ preferredDuringSchedulingIgnoredDuringExecution: [
1312
+ BURST_POOL_NODE_PREFERENCE,
1313
+ ],
1314
+ },
1315
+ });
1316
+ const infrastructurePodLabels = {
1317
+ "rulebricks.com/workload-group": "infrastructure",
1318
+ };
1319
+ const applicationPodLabels = {
1320
+ "rulebricks.com/workload-group": "application",
1321
+ };
1322
+ const productVersion = config.version;
1323
+ // Scheduling priority tiers. The chart creates release-scoped
1324
+ // PriorityClasses (<release>-critical / <release>-burst); stateful
1325
+ // infrastructure references the critical class so it can always preempt
1326
+ // burst workers to reschedule, and workers reference the burst class so
1327
+ // they are strictly the first preemption victims. Subchart values cannot
1328
+ // template release names, so the CLI emits them as literals.
1329
+ const releaseName = getReleaseName(config.name);
1330
+ const criticalPriorityClass = `${releaseName}-critical`;
1331
+ const burstPriorityClass = `${releaseName}-burst`;
1332
+ // Subcharts that don't honor global.imagePullSecrets (keda, strimzi, traefik,
1333
+ // vector) need the pull secret on their own key so their pods can pull the
1334
+ // private docker.io/rulebricks/* images from index.docker.io.
1335
+ const rulebricksPullSecret = [{ name: `${releaseName}-regcred` }];
1336
+ // Registry host for every image. Empty config.imageRegistry => docker.io. When
1337
+ // set, the host is rewritten into global.imageRegistry (which kube-prometheus-stack
1338
+ // and our subcharts honor) and into each of the six Tier-2 charts' own image
1339
+ // keys below, always keeping the rulebricks/<name> path.
1340
+ const reg = config.imageRegistry || DEFAULT_IMAGE_REGISTRY;
1341
+ const clickStackEnabled = isClickStackEnabled(config);
1342
+ const clickStackConfig = config.features.observability?.clickstack;
1343
+ const clickHouseStorageSize = clickStackConfig?.clickHouseStorageSize ?? "100Gi";
1344
+ // Distributed tracing (self-hosted only). Lives under global so the
1345
+ // rulebricks subchart deployments can read it; the collector + traefik are
1346
+ // wired below from the same source.
1347
+ const tracingGlobal = clickStackEnabled ? undefined : generateTracingGlobal(config);
1348
+ // Never let the cluster-autoscaler evict single-replica stateful pods
1349
+ // during node scale-down; an evicted broker/db stalls the whole pipeline.
1350
+ const safeToEvictAnnotations = {
1351
+ "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
1352
+ };
229
1353
  // Build global.supabase configuration
230
1354
  const supabaseGlobalConfig = config.database.type === "supabase-cloud"
231
1355
  ? {
@@ -235,27 +1359,51 @@ export async function generateHelmValues(config, options = {}) {
235
1359
  accessToken: config.database.supabaseAccessToken || undefined,
236
1360
  projectRef: config.database.supabaseProjectRef || undefined,
237
1361
  }
238
- : {
239
- jwtSecret: config.database.supabaseJwtSecret || undefined,
240
- anonKey: undefined,
241
- serviceKey: undefined,
1362
+ : (() => {
1363
+ const jwtSecret = config.database.supabaseJwtSecret || "";
1364
+ return {
1365
+ jwtSecret: jwtSecret || undefined,
1366
+ anonKey: jwtSecret ? signSupabaseJwt("anon", jwtSecret) : undefined,
1367
+ serviceKey: jwtSecret
1368
+ ? signSupabaseJwt("service_role", jwtSecret)
1369
+ : undefined,
1370
+ };
1371
+ })();
1372
+ // Always emit email configuration so auth pods receive template/subject env
1373
+ // vars regardless of Helm merge order. Custom values take precedence over
1374
+ // built-in defaults when explicitly enabled.
1375
+ const customEmails = config.features.customEmails;
1376
+ if (customEmails?.enabled &&
1377
+ customEmails.subjects &&
1378
+ customEmails.templates) {
1379
+ supabaseGlobalConfig.emails = {
1380
+ subjects: {
1381
+ invite: customEmails.subjects.invite,
1382
+ confirmation: customEmails.subjects.confirmation,
1383
+ recovery: customEmails.subjects.recovery,
1384
+ emailChange: customEmails.subjects.emailChange,
1385
+ },
1386
+ templates: {
1387
+ invite: customEmails.templates.invite,
1388
+ confirmation: customEmails.templates.confirmation,
1389
+ recovery: customEmails.templates.recovery,
1390
+ emailChange: customEmails.templates.emailChange,
1391
+ },
242
1392
  };
243
- // Add custom email templates if enabled
244
- if (config.features.customEmails?.enabled &&
245
- config.features.customEmails.subjects &&
246
- config.features.customEmails.templates) {
1393
+ }
1394
+ else {
247
1395
  supabaseGlobalConfig.emails = {
248
1396
  subjects: {
249
- invite: config.features.customEmails.subjects.invite,
250
- confirmation: config.features.customEmails.subjects.confirmation,
251
- recovery: config.features.customEmails.subjects.recovery,
252
- emailChange: config.features.customEmails.subjects.emailChange,
1397
+ invite: "Join your team on Rulebricks",
1398
+ confirmation: "Confirm Your Email",
1399
+ recovery: "Reset Your Password",
1400
+ emailChange: "Confirm Email Change",
253
1401
  },
254
1402
  templates: {
255
- invite: config.features.customEmails.templates.invite,
256
- confirmation: config.features.customEmails.templates.confirmation,
257
- recovery: config.features.customEmails.templates.recovery,
258
- emailChange: config.features.customEmails.templates.emailChange,
1403
+ invite: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/invite.html",
1404
+ confirmation: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/verify.html",
1405
+ recovery: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/password_change.html",
1406
+ emailChange: "https://prefix-files.s3.us-west-2.amazonaws.com/templates/email_change.html",
259
1407
  },
260
1408
  };
261
1409
  }
@@ -268,7 +1416,30 @@ export async function generateHelmValues(config, options = {}) {
268
1416
  email: config.adminEmail,
269
1417
  tlsEnabled,
270
1418
  licenseKey: config.licenseKey,
1419
+ // Pull secret for the private docker.io/rulebricks/* images. References the
1420
+ // license registry secret <release>-regcred (index.docker.io, authed by the
1421
+ // license PAT). kube-prometheus-stack + cert-manager honor this global value;
1422
+ // keda, traefik, vector and the strimzi operator each get the same secret on
1423
+ // their own key below.
1424
+ imagePullSecrets: [{ name: `${releaseName}-regcred` }],
1425
+ // Single registry-host override (empty => docker.io/rulebricks/*). Honored by
1426
+ // kube-prometheus-stack and our subcharts; the CLI also rewrites the host into
1427
+ // the other Tier-2 charts' native image keys below.
1428
+ ...(config.imageRegistry ? { imageRegistry: config.imageRegistry } : {}),
1429
+ // Generated name->sha256 digest map (empty until the helm repo's mirror
1430
+ // pipeline populates IMAGE_DIGESTS). When a name is present the chart image
1431
+ // helper pins @sha256 instead of :tag.
1432
+ imageDigests: IMAGE_DIGESTS,
1433
+ ...(productVersion && SEMVER_PATTERN.test(productVersion)
1434
+ ? { version: productVersion }
1435
+ : {}),
271
1436
  externalDnsEnabled,
1437
+ // Scheduling priority tiers (the chart renders release-scoped
1438
+ // <release>-critical and <release>-burst PriorityClasses).
1439
+ priorityClasses: { enabled: true },
1440
+ clickstack: {
1441
+ enabled: clickStackEnabled,
1442
+ },
272
1443
  // SMTP Configuration
273
1444
  smtp: {
274
1445
  host: config.smtp.host,
@@ -299,62 +1470,164 @@ export async function generateHelmValues(config, options = {}) {
299
1470
  : {
300
1471
  enabled: false,
301
1472
  },
1473
+ storage: config.storage
1474
+ ? {
1475
+ // One provider, one identity, one bucket/container. decision-logs and
1476
+ // db-backups are key prefixes under paths.* within it.
1477
+ provider: config.storage.provider,
1478
+ bucket: config.storage.bucket,
1479
+ region: config.storage.region,
1480
+ s3: {
1481
+ iamRoleArn: config.storage.awsIamRoleArn || "",
1482
+ existingSecret: { name: "" },
1483
+ },
1484
+ azure: {
1485
+ authMode: config.storage.cloudAuthMode === "secret"
1486
+ ? "connection-string"
1487
+ : "workload-identity",
1488
+ clientId: config.storage.azureBlobClientId || "",
1489
+ tenantId: config.storage.azureBlobTenantId || "",
1490
+ container: config.storage.azureBlobContainer || "",
1491
+ connectionStringSecretRef: config.storage.azureBlobConnectionStringSecretRef || {
1492
+ name: "",
1493
+ key: "",
1494
+ },
1495
+ },
1496
+ gcp: {
1497
+ serviceAccountEmail: config.storage.gcpServiceAccountEmail || "",
1498
+ },
1499
+ paths: {
1500
+ decisionLogs: config.storage.paths?.decisionLogs || "decision-logs",
1501
+ dbBackups: config.storage.paths?.dbBackups || "db-backups",
1502
+ },
1503
+ }
1504
+ : undefined,
1505
+ // Distributed tracing (omitted entirely when disabled).
1506
+ ...(tracingGlobal ? { tracing: tracingGlobal } : {}),
302
1507
  },
1508
+ clickstack: generateClickStackValues(clickStackEnabled, config, storageClass, infrastructurePodLabels, operationalDaemonSetTolerations),
1509
+ backup: generateBackupValues(config),
303
1510
  // =============================================================================
304
1511
  // RULEBRICKS APPLICATION STACK
305
1512
  // =============================================================================
306
1513
  rulebricks: {
307
- app: {
308
- ...(config.appVersion
309
- ? {
310
- image: {
311
- repository: "index.docker.io/rulebricks/app",
312
- tag: config.appVersion,
313
- pullPolicy: "IfNotPresent",
314
- },
315
- }
316
- : {}),
317
- replicaCount: tierConfig.appReplicas,
318
- resources: tierConfig.appResources,
319
- tolerations: arm64Tolerations,
320
- // Logging configuration
321
- logging: {
1514
+ metrics: {
1515
+ enabled: true,
1516
+ serviceMonitor: {
322
1517
  enabled: true,
323
- kafkaBrokers: "", // Auto-discover from Kafka subchart
324
- kafkaTopic: "logs",
325
- loggingDestination: getLoggingDestinationLabel(config.features.logging.sink),
1518
+ interval: "30s",
1519
+ scrapeTimeout: "10s",
1520
+ },
1521
+ app: {
1522
+ path: "/api/metrics",
1523
+ },
1524
+ hps: {
1525
+ path: "/metrics",
1526
+ },
1527
+ worker: {
1528
+ path: "/metrics",
1529
+ port: 3000,
326
1530
  },
327
1531
  },
1532
+ app: {
1533
+ image: {
1534
+ // Split shape: the rulebricks-chart.image helper applies
1535
+ // global.imageRegistry to the host + digest pinning. The host NEVER
1536
+ // goes in repository.
1537
+ registry: reg,
1538
+ repository: IMAGE_REPOSITORIES.app,
1539
+ pullPolicy: "IfNotPresent",
1540
+ },
1541
+ // Replica count and resources fall back to the chart defaults.
1542
+ podLabels: infrastructurePodLabels,
1543
+ ...coreScheduling,
1544
+ // Logging configuration (in-cluster auto-discovery or external Kafka)
1545
+ logging: generateAppLogging(config),
1546
+ },
328
1547
  // HPS (High Performance Server)
329
1548
  hps: {
330
1549
  enabled: true,
331
- ...(config.hpsVersion
332
- ? {
333
- image: {
334
- repository: "index.docker.io/rulebricks/hps",
335
- tag: config.hpsVersion,
336
- pullPolicy: "Always",
337
- },
338
- }
339
- : {}),
340
- replicas: tierConfig.hpsReplicas,
341
- resources: tierConfig.hpsResources,
342
- tolerations: arm64Tolerations,
1550
+ image: {
1551
+ // Split shape (see app.image): host comes from global.imageRegistry via
1552
+ // the rulebricks-chart.image helper, never baked into repository.
1553
+ registry: reg,
1554
+ repository: IMAGE_REPOSITORIES.hps,
1555
+ pullPolicy: "Always",
1556
+ },
1557
+ // Replica count and resources fall back to the chart defaults.
1558
+ podLabels: applicationPodLabels,
1559
+ ...coreScheduling,
1560
+ // Gather-plane autoscaling: HPS parses every chunk response, so its
1561
+ // capacity scales with request rate (load testing showed a fixed
1562
+ // gather plane plateaus throughput while workers idle). Conservative
1563
+ // one-pod-at-a-time scaling - each scale event rebalances the
1564
+ // response consumer group and can time out in-flight requests. Only the
1565
+ // enable flag is set here; min/max and thresholds use the chart
1566
+ // defaults.
1567
+ keda: {
1568
+ enabled: true,
1569
+ },
1570
+ // Warm the hps/worker images onto active worker-capable nodes so burst
1571
+ // scale-outs skip the image pull without targeting shutdown nodes.
1572
+ imagePrepull: {
1573
+ enabled: true,
1574
+ tolerations: operationalDaemonSetTolerations,
1575
+ },
1576
+ extraEnv: [
1577
+ // FLOW_CHUNK_MAX_ITEMS is the #1 throughput dial. Each chunk is one
1578
+ // Kafka round-trip (gather -> solution -> worker -> solution-response
1579
+ // -> gather), so throughput ~= (broker messages/sec) x (payloads per
1580
+ // message). Bigger chunks = fewer messages per solution = less broker
1581
+ // and coordination overhead. Benchmarks: 10 -> 50 gave +27%, and on
1582
+ // small payloads 100 -> 1000 gave another ~1.6x (22k -> 35k sol/s),
1583
+ // until the bottleneck moved off the broker onto worker CPU.
1584
+ // 500 keeps typical bulk requests to 1-2 messages. The byte bound
1585
+ // (CHUNK_MAX_BYTES, default 256 KiB in HPS) caps message size
1586
+ // regardless, so large payloads stay under Kafka's 2 MiB
1587
+ // max.message.bytes. High-throughput, small-payload deployments can
1588
+ // raise this much higher (and CHUNK_MAX_BYTES with it); the only costs
1589
+ // are per-request latency (one worker processes a whole chunk) and the
1590
+ // 2 MiB cap on the larger response message (avg output x chunk size
1591
+ // must stay < 2 MiB, so lower this for output-heavy flows).
1592
+ { name: "FLOW_CHUNK_MAX_ITEMS", value: "500" },
1593
+ ],
1594
+ // Service account (annotated with the MSK IAM role for external Kafka)
1595
+ serviceAccount: generateHpsServiceAccount(config),
343
1596
  // HPS Workers with KEDA autoscaling
344
1597
  workers: {
345
1598
  enabled: true,
346
- replicas: tierConfig.hpsWorkerReplicas.min,
1599
+ // Workers consume the solution topic directly, so under external MSK
1600
+ // IAM they need their own cloud identity - not the shared/default SA.
1601
+ // Same rule as HPS: a dedicated `<release>-hps-worker` SA (no role-arn
1602
+ // annotation) that the CLI's workload-identity step binds to the Kafka
1603
+ // role via Pod Identity.
1604
+ serviceAccount: generateHpsServiceAccount(config),
1605
+ // Partition count of the solution request topic (also exported to
1606
+ // HPS as MAX_WORKERS). Must match kafka.provisioning above; it is
1607
+ // the fleet-concurrency ceiling, NOT a worker count. Replica count
1608
+ // and resources fall back to the chart defaults.
1609
+ solutionPartitions: SOLUTION_TOPIC_PARTITIONS,
347
1610
  keda: {
348
1611
  enabled: true,
349
- minReplicaCount: tierConfig.hpsWorkerReplicas.min,
350
- maxReplicaCount: tierConfig.hpsWorkerReplicas.max,
351
- pollingInterval: 10,
1612
+ // Poll fast so bursts are detected within seconds; the chart's
1613
+ // ScaledObject defaults add exponential scale-up (double every
1614
+ // 15s) and smooth scale-down (5-min window, -25%/min) behavior.
1615
+ // min/max replica counts fall back to the chart defaults.
1616
+ pollingInterval: 5,
352
1617
  cooldownPeriod: 300,
1618
+ // Lag is measured in MESSAGES; with chunked bulk dispatch each
1619
+ // message is a bounded unit of work (~50-150ms), so 50 messages
1620
+ // approximates 5-8s of backlog for a single worker - one replica
1621
+ // is added per ~5s of fleet backlog, biasing toward early
1622
+ // scale-out for bursty traffic.
353
1623
  lagThreshold: 50,
354
1624
  cpuThreshold: 25,
355
1625
  },
356
- resources: tierConfig.hpsWorkerResources,
357
- tolerations: arm64Tolerations,
1626
+ podLabels: applicationPodLabels,
1627
+ // Burst tier: first preemption victims, so critical infrastructure
1628
+ // can always reschedule during an aggressive scale-out.
1629
+ priorityClassName: burstPriorityClass,
1630
+ ...workerScheduling,
358
1631
  },
359
1632
  },
360
1633
  // Ingress configuration
@@ -363,74 +1636,138 @@ export async function generateHelmValues(config, options = {}) {
363
1636
  className: "traefik",
364
1637
  paths: [{ path: "/", pathType: "Prefix" }],
365
1638
  },
366
- // Redis configuration
367
- redis: {
368
- resources: tierConfig.redisResources,
369
- tolerations: arm64Tolerations,
370
- persistence: {
371
- enabled: true,
372
- size: tierConfig.redisPersistenceSize,
373
- storageClass: storageClass,
374
- },
375
- },
1639
+ // Redis configuration (in-cluster sizing or external connection settings)
1640
+ redis: generateRedisBlock(config, storageClass, infrastructurePodLabels, coreScheduling),
1641
+ cache: generateCacheObservabilityBlock(config, infrastructurePodLabels),
1642
+ kafkaExporter: generateKafkaExporterBlock(config, infrastructurePodLabels),
376
1643
  },
377
1644
  // =============================================================================
378
1645
  // KAFKA (Message Queue)
379
1646
  // =============================================================================
380
1647
  kafka: {
381
- enabled: true,
382
- // KRaft mode (no Zookeeper)
383
- kraft: {
1648
+ enabled: !isExternalKafka(config),
1649
+ // Apache Kafka version (must be one the bundled DHI Strimzi operator
1650
+ // supports; DHI strimzi 1.0.1 ships Kafka 4.2.0).
1651
+ version: "4.2.0",
1652
+ // Single combined controller+broker node (KRaft, no ZooKeeper).
1653
+ replicas: TOPIC_REPLICATION_FACTOR,
1654
+ storage: {
1655
+ size: "20Gi",
1656
+ class: storageClass,
1657
+ },
1658
+ // Critical tier: the broker must always be able to preempt burst workers.
1659
+ priorityClassName: criticalPriorityClass,
1660
+ config: generateKafkaConfig(),
1661
+ jvm: {
1662
+ xms: "1g",
1663
+ xmx: "1g",
1664
+ extraOpts: {
1665
+ UseZGC: "true",
1666
+ AlwaysPreTouch: "true",
1667
+ MaxDirectMemorySize: "256M",
1668
+ },
1669
+ },
1670
+ metrics: {
384
1671
  enabled: true,
1672
+ serviceMonitor: { enabled: true },
385
1673
  },
386
- zookeeper: {
387
- enabled: false,
1674
+ // Topics, reconciled by the Strimzi Topic Operator (KafkaTopic CRs) for the
1675
+ // in-cluster broker, or created by the kafka-topic-provision Job for an
1676
+ // external MSK IAM broker.
1677
+ topics: generateKafkaTopics(config),
1678
+ // When false, the chart never creates topics on an external broker - the
1679
+ // operator manages them (and the workload role needs no CreateTopic).
1680
+ provisioning: {
1681
+ enabled: config.externalServices?.kafka?.external?.provisionTopics ?? true,
388
1682
  },
389
- // Kafka broker configuration
390
- overrideConfiguration: {
391
- "auto.create.topics.enable": "true",
392
- "log.retention.hours": "24",
393
- "default.replication.factor": String(tierConfig.kafkaReplication),
394
- "offsets.topic.replication.factor": String(tierConfig.kafkaReplication),
395
- "num.partitions": String(tierConfig.hpsWorkerReplicas.max), // Match max workers for parallel consumption
396
- },
397
- controller: {
398
- replicaCount: tierConfig.kafkaReplication,
399
- resources: tierConfig.kafkaResources,
400
- tolerations: arm64Tolerations,
401
- persistence: {
1683
+ },
1684
+ // Strimzi operator: pull secret so the operator pod pulls the private
1685
+ // rulebricks/* image from index.docker.io.
1686
+ "strimzi-kafka-operator": {
1687
+ image: { imagePullSecrets: rulebricksPullSecret },
1688
+ },
1689
+ // =============================================================================
1690
+ // VECTOR KAFKA BRIDGE (AWS MSK IAM token auth)
1691
+ // =============================================================================
1692
+ kafkaBridge: generateKafkaBridge(config),
1693
+ clickhouse: {
1694
+ enabled: true,
1695
+ // Critical tier: single replica must preempt burst workers to
1696
+ // reschedule; never autoscaler-evicted on scale-down.
1697
+ priorityClassName: criticalPriorityClass,
1698
+ podAnnotations: safeToEvictAnnotations,
1699
+ auth: {
1700
+ username: "rulebricks",
1701
+ password: "",
1702
+ existingSecret: '{{ printf "%s-clickhouse-credentials" .Release.Name }}',
1703
+ existingSecretKey: "admin-password",
1704
+ },
1705
+ persistence: clickStackEnabled
1706
+ ? {
402
1707
  enabled: true,
403
- size: tierConfig.kafkaStorage,
404
1708
  storageClass: storageClass,
1709
+ size: clickHouseStorageSize,
1710
+ }
1711
+ : { enabled: false },
1712
+ resources: clickStackEnabled
1713
+ ? {
1714
+ requests: { cpu: "1000m", memory: "4Gi" },
1715
+ limits: { cpu: "4", memory: "12Gi" },
1716
+ }
1717
+ : {
1718
+ requests: { cpu: "500m", memory: "2Gi" },
1719
+ limits: { cpu: "2", memory: "6Gi" },
405
1720
  },
406
- heapOpts: tierConfig.kafkaHeapOpts,
407
- extraEnvVars: generateKafkaExtraEnvVars(),
1721
+ serviceAccount: {
1722
+ create: true,
1723
+ annotations: {},
408
1724
  },
409
- listeners: {
410
- client: {
411
- protocol: "PLAINTEXT",
412
- },
413
- controller: {
414
- protocol: "PLAINTEXT",
415
- },
416
- interbroker: {
417
- protocol: "PLAINTEXT",
1725
+ metrics: {
1726
+ enabled: true,
1727
+ serviceMonitor: {
1728
+ enabled: true,
418
1729
  },
419
1730
  },
1731
+ queryLimits: {
1732
+ maxMemoryUsage: 4294967296,
1733
+ maxThreads: 4,
1734
+ maxExecutionTime: 120,
1735
+ maxRowsToRead: 50000000,
1736
+ readOverflowMode: "break",
1737
+ },
1738
+ otelQueryLimits: {
1739
+ maxMemoryUsage: 4294967296,
1740
+ maxThreads: 8,
1741
+ maxExecutionTime: 120,
1742
+ },
1743
+ otelDatabase: "otel",
1744
+ // config.d / users.d / the decision-log view are rendered by the parent
1745
+ // chart's clickhouse templates (no longer passed as Bitnami subchart values).
420
1746
  },
421
1747
  // =============================================================================
422
1748
  // TRAEFIK (Ingress Controller)
423
1749
  // =============================================================================
424
1750
  traefik: {
425
1751
  enabled: true,
1752
+ // traefik has no global.imageRegistry path: set registry + repository
1753
+ // directly (host = reg, rulebricks/* path).
1754
+ image: {
1755
+ registry: reg,
1756
+ repository: IMAGE_REPOSITORIES.traefik,
1757
+ },
1758
+ deployment: {
1759
+ imagePullSecrets: rulebricksPullSecret,
1760
+ },
426
1761
  ingressClass: {
427
1762
  name: "traefik",
428
1763
  },
429
- tolerations: arm64Tolerations,
1764
+ ...coreScheduling,
430
1765
  autoscaling: {
431
1766
  enabled: true,
432
1767
  minReplicas: 1,
433
- maxReplicas: 2,
1768
+ // Headroom for colocated clients pushing multi-hundred-RPS bulk
1769
+ // traffic through the ingress.
1770
+ maxReplicas: 4,
434
1771
  },
435
1772
  resources: {
436
1773
  requests: {
@@ -453,11 +1790,26 @@ export async function generateHelmValues(config, options = {}) {
453
1790
  websecure: {
454
1791
  port: 8443,
455
1792
  exposedPort: 443,
456
- tls: {
457
- enabled: tlsEnabled,
1793
+ // traefik 41.x moved per-entrypoint TLS under ports.<name>.http.tls
1794
+ // (the old ports.<name>.tls location is rejected by the chart schema).
1795
+ http: {
1796
+ tls: {
1797
+ enabled: tlsEnabled,
1798
+ },
458
1799
  },
459
1800
  },
460
1801
  },
1802
+ metrics: {
1803
+ prometheus: {
1804
+ enabled: true,
1805
+ serviceMonitor: {
1806
+ enabled: false,
1807
+ },
1808
+ },
1809
+ },
1810
+ // OTLP tracing: ingress becomes the root span and propagates traceparent
1811
+ // to backends. Empty object when tracing is disabled.
1812
+ tracing: generateTraefikTracing(config, releaseName),
461
1813
  persistence: {
462
1814
  enabled: false,
463
1815
  },
@@ -467,7 +1819,29 @@ export async function generateHelmValues(config, options = {}) {
467
1819
  // =============================================================================
468
1820
  keda: {
469
1821
  enabled: true,
470
- tolerations: arm64Tolerations,
1822
+ imagePullSecrets: rulebricksPullSecret,
1823
+ // keda reads global.image.registry (NOT global.imageRegistry) for the host;
1824
+ // set it plus the rulebricks/* repositories for all three sub-images.
1825
+ global: {
1826
+ image: {
1827
+ registry: reg,
1828
+ },
1829
+ },
1830
+ image: {
1831
+ keda: {
1832
+ registry: reg,
1833
+ repository: IMAGE_REPOSITORIES.keda,
1834
+ },
1835
+ metricsApiServer: {
1836
+ registry: reg,
1837
+ repository: IMAGE_REPOSITORIES.kedaMetricsApiServer,
1838
+ },
1839
+ webhooks: {
1840
+ registry: reg,
1841
+ repository: IMAGE_REPOSITORIES.kedaAdmissionWebhooks,
1842
+ },
1843
+ },
1844
+ ...coreScheduling,
471
1845
  crds: {
472
1846
  install: false, // CRDs managed in parent chart
473
1847
  },
@@ -477,13 +1851,41 @@ export async function generateHelmValues(config, options = {}) {
477
1851
  // =============================================================================
478
1852
  "cert-manager": {
479
1853
  enabled: tlsEnabled,
480
- installCRDs: false, // CRDs managed in parent chart
481
- tolerations: arm64Tolerations,
1854
+ // CRDs managed in parent chart (cert-manager v1.15+ uses crds.enabled,
1855
+ // not the deprecated installCRDs flag).
1856
+ crds: { enabled: false },
1857
+ // cert-manager prepends image.registry to image.repository, so set both per
1858
+ // component (host = reg, rulebricks/cert-manager-* path).
1859
+ image: {
1860
+ registry: reg,
1861
+ repository: IMAGE_REPOSITORIES.certManagerController,
1862
+ },
1863
+ ...coreScheduling,
482
1864
  webhook: {
483
- tolerations: arm64Tolerations,
1865
+ image: {
1866
+ registry: reg,
1867
+ repository: IMAGE_REPOSITORIES.certManagerWebhook,
1868
+ },
1869
+ ...coreScheduling,
484
1870
  },
485
1871
  cainjector: {
486
- tolerations: arm64Tolerations,
1872
+ image: {
1873
+ registry: reg,
1874
+ repository: IMAGE_REPOSITORIES.certManagerCainjector,
1875
+ },
1876
+ ...coreScheduling,
1877
+ },
1878
+ startupapicheck: {
1879
+ image: {
1880
+ registry: reg,
1881
+ repository: IMAGE_REPOSITORIES.certManagerStartupapicheck,
1882
+ },
1883
+ },
1884
+ acmesolver: {
1885
+ image: {
1886
+ registry: reg,
1887
+ repository: IMAGE_REPOSITORIES.certManagerAcmesolver,
1888
+ },
487
1889
  },
488
1890
  },
489
1891
  // Cluster Issuer for Let's Encrypt
@@ -497,113 +1899,287 @@ export async function generateHelmValues(config, options = {}) {
497
1899
  // =============================================================================
498
1900
  vector: {
499
1901
  enabled: true,
1902
+ // vector's image.repository is the FULL path including host (no separate
1903
+ // registry field), so the reg host is prefixed here.
1904
+ image: {
1905
+ repository: `${reg}/${IMAGE_REPOSITORIES.vector}`,
1906
+ pullSecrets: rulebricksPullSecret,
1907
+ },
500
1908
  role: "Stateless-Aggregator",
501
- replicas: tierConfig.vectorReplicas,
502
- resources: tierConfig.vectorResources,
503
- tolerations: arm64Tolerations,
1909
+ // Replica count and resources fall back to the chart defaults.
1910
+ ...coreScheduling,
1911
+ serviceAccount: generateVectorServiceAccount(config),
1912
+ podLabels: generateVectorPodLabels(config),
1913
+ ...(generateVectorExtraContainers(config)
1914
+ ? { extraContainers: generateVectorExtraContainers(config) }
1915
+ : {}),
504
1916
  service: {
505
1917
  enabled: true,
506
1918
  ports: [{ name: "api", port: 8686, protocol: "TCP", targetPort: 8686 }],
507
1919
  },
508
1920
  // Load KAFKA_BOOTSTRAP_SERVERS from templated ConfigMap
509
- env: [
510
- {
511
- name: "KAFKA_BOOTSTRAP_SERVERS",
512
- valueFrom: {
513
- configMapKeyRef: {
514
- name: "vector-kafka-env",
515
- key: "KAFKA_BOOTSTRAP_SERVERS",
516
- },
517
- },
518
- },
519
- ],
1921
+ env: generateVectorEnv(config),
520
1922
  customConfig: {
521
1923
  sources: {
522
1924
  kafka: {
523
1925
  type: "kafka",
524
1926
  bootstrap_servers: "${KAFKA_BOOTSTRAP_SERVERS:-rulebricks-kafka:9092}",
525
- topics: ["logs"],
1927
+ // KAFKA_LOG_TOPIC carries the namespace prefix (e.g. com.rulebricks.logs).
1928
+ topics: ["${KAFKA_LOG_TOPIC:-logs}"],
526
1929
  group_id: "vector-consumers",
527
1930
  auto_offset_reset: "latest",
1931
+ // TLS + SASL driven by env from vector-kafka-env (disabled for
1932
+ // in-cluster Kafka and the kafka-proxy bridge path).
1933
+ tls: { enabled: "${KAFKA_TLS_ENABLED:-false}" },
1934
+ sasl: {
1935
+ enabled: "${KAFKA_SASL_ENABLED:-false}",
1936
+ mechanism: "${KAFKA_SASL_MECHANISM:-PLAIN}",
1937
+ // username/password are only emitted for external Kafka using a
1938
+ // direct PLAIN/SCRAM credential (where vector-kafka-credentials is
1939
+ // populated). Emitting them with an empty default would render as
1940
+ // YAML null and crash Vector at config load; omitting the keys
1941
+ // leaves them unset (valid) whenever SASL is disabled.
1942
+ ...(kafkaUsesDirectSasl(config)
1943
+ ? {
1944
+ username: "${KAFKA_SASL_USERNAME}",
1945
+ password: "${KAFKA_SASL_PASSWORD}",
1946
+ }
1947
+ : {}),
1948
+ },
1949
+ },
1950
+ },
1951
+ transforms: {
1952
+ normalize_logs: {
1953
+ type: "remap",
1954
+ inputs: ["kafka"],
1955
+ source: VECTOR_NORMALIZE_LOGS_VRL,
528
1956
  },
529
1957
  },
530
1958
  sinks: generateVectorSinks(config),
531
1959
  },
532
1960
  },
533
1961
  // =============================================================================
1962
+ // VECTOR AGENT (Application / container logs -> Elasticsearch)
1963
+ // =============================================================================
1964
+ "vector-agent": clickStackEnabled
1965
+ ? { enabled: false }
1966
+ : {
1967
+ ...generateVectorAgent(config, infrastructurePodLabels, operationalDaemonSetTolerations),
1968
+ // Full-path repository (see vector above) + pull secret.
1969
+ image: {
1970
+ repository: `${reg}/${IMAGE_REPOSITORIES.vector}`,
1971
+ pullSecrets: rulebricksPullSecret,
1972
+ },
1973
+ },
1974
+ // =============================================================================
534
1975
  // SUPABASE (Self-hosted Database)
535
1976
  // =============================================================================
536
1977
  supabase: {
537
1978
  enabled: config.database.type === "self-hosted",
538
1979
  ...(config.database.type === "self-hosted"
539
- ? {
540
- secret: {
541
- db: {
542
- username: "postgres",
543
- password: config.database.supabaseDbPassword,
544
- database: "postgres",
1980
+ ? (() => {
1981
+ // External managed Postgres (AWS RDS / Azure Flexible Server): the
1982
+ // self-hosted Supabase services run against it instead of the
1983
+ // bundled in-cluster database.
1984
+ const pgExt = config.externalServices?.postgres?.mode === "external"
1985
+ ? config.externalServices?.postgres?.external
1986
+ : undefined;
1987
+ return {
1988
+ secret: {
1989
+ db: {
1990
+ username: "postgres",
1991
+ // Shared service-role password (authenticator / auth_admin /
1992
+ // replication_admin). With an external DB the bootstrap hook
1993
+ // sets the roles to this same value.
1994
+ password: config.database.supabaseDbPassword,
1995
+ database: pgExt?.database || "postgres",
1996
+ },
1997
+ dashboard: {
1998
+ username: config.database.supabaseDashboardUser || "supabase",
1999
+ password: config.database.supabaseDashboardPass,
2000
+ },
2001
+ jwt: {
2002
+ secret: config.database.supabaseJwtSecret,
2003
+ },
2004
+ // SECRET_KEY_BASE / DB_ENC_KEY, derived from the JWT secret
2005
+ // (stable across redeploys). The chart no longer ships defaults.
2006
+ realtime: deriveRealtimeSecrets(config.database.supabaseJwtSecret || ""),
545
2007
  },
546
- dashboard: {
547
- username: config.database.supabaseDashboardUser || "supabase",
548
- password: config.database.supabaseDashboardPass,
2008
+ ...(pgExt
2009
+ ? {
2010
+ // One switch: enabling externalDatabase disables the bundled
2011
+ // Postgres and runs the bootstrap hook to initialize the
2012
+ // managed instance. db.enabled=false is explicit so chart
2013
+ // schema rules keyed off it hold.
2014
+ db: { enabled: false },
2015
+ externalDatabase: {
2016
+ enabled: true,
2017
+ host: pgExt.host ?? "",
2018
+ port: pgExt.port ?? 5432,
2019
+ bootstrap: {
2020
+ enabled: pgExt.bootstrap?.enabled ?? true,
2021
+ masterUsername: pgExt.bootstrap?.masterUsername ?? "postgres",
2022
+ masterPassword: pgExt.bootstrap?.masterPassword ?? "",
2023
+ appRole: pgExt.bootstrap?.appRole ?? "postgres",
2024
+ },
2025
+ },
2026
+ }
2027
+ : {
2028
+ db: {
2029
+ // Explicit so chart schema rules that key off
2030
+ // supabase.db.enabled (e.g. Database Backup Storage
2031
+ // Validation) hold without relying on subchart-default
2032
+ // coalescing.
2033
+ enabled: true,
2034
+ image: {
2035
+ // Split shape: the supabase.image helper applies
2036
+ // global.imageRegistry to the host. Host never in repository.
2037
+ registry: reg,
2038
+ repository: SUPABASE_POSTGRES_IMAGE_REPOSITORY,
2039
+ tag: SUPABASE_POSTGRES_IMAGE_TAG,
2040
+ pullPolicy: "IfNotPresent",
2041
+ },
2042
+ podLabels: infrastructurePodLabels,
2043
+ // Critical tier: the primary datastore must preempt burst
2044
+ // workers to reschedule; never autoscaler-evicted.
2045
+ // Resources and persistence size fall back to chart
2046
+ // defaults.
2047
+ priorityClassName: criticalPriorityClass,
2048
+ podAnnotations: safeToEvictAnnotations,
2049
+ ...coreScheduling,
2050
+ persistence: {
2051
+ enabled: true,
2052
+ storageClassName: storageClass,
2053
+ },
2054
+ },
2055
+ }),
2056
+ auth: {
2057
+ // Explicit public URLs so GoTrue never falls back to the
2058
+ // in-cluster Kong service name when global.domain propagation
2059
+ // is lost (e.g. after manual patching or partial upgrades).
2060
+ siteUrl: `https://${config.domain}`,
2061
+ externalUrl: `https://supabase.${config.domain}`,
2062
+ ...coreScheduling,
549
2063
  },
550
- jwt: {
551
- secret: config.database.supabaseJwtSecret,
2064
+ rest: {
2065
+ ...coreScheduling,
552
2066
  },
553
- },
554
- db: {
555
- resources: tierConfig.dbResources,
556
- tolerations: arm64Tolerations,
557
- persistence: {
558
- enabled: true,
559
- size: tierConfig.dbPersistenceSize,
560
- storageClassName: storageClass,
2067
+ realtime: {
2068
+ ...coreScheduling,
561
2069
  },
562
- },
563
- auth: {
564
- tolerations: arm64Tolerations,
565
- },
566
- rest: {
567
- tolerations: arm64Tolerations,
568
- },
569
- realtime: {
570
- tolerations: arm64Tolerations,
571
- },
572
- meta: {
573
- tolerations: arm64Tolerations,
574
- },
575
- kong: {
576
- tolerations: arm64Tolerations,
577
- ingress: {
578
- enabled: true,
579
- className: "traefik",
580
- annotations: {},
2070
+ meta: {
2071
+ ...coreScheduling,
581
2072
  },
582
- },
583
- studio: {
584
- tolerations: arm64Tolerations,
585
- },
586
- }
2073
+ kong: {
2074
+ ...coreScheduling,
2075
+ ingress: {
2076
+ enabled: true,
2077
+ className: "traefik",
2078
+ annotations: {},
2079
+ },
2080
+ },
2081
+ studio: {
2082
+ ...coreScheduling,
2083
+ },
2084
+ };
2085
+ })()
587
2086
  : {}),
588
2087
  },
589
2088
  // =============================================================================
590
2089
  // MONITORING
591
2090
  // =============================================================================
592
2091
  monitoring: {
593
- enabled: config.features.monitoring.enabled,
2092
+ enabled: true,
594
2093
  },
595
2094
  "kube-prometheus-stack": {
596
- enabled: config.features.monitoring.enabled,
2095
+ enabled: true,
2096
+ // kube-prometheus-stack honors the parent global.imageRegistry for the host
2097
+ // automatically; the CLI sets the rulebricks/* repository defaults (and the
2098
+ // reg host explicitly) for every sub-image so a bare helm install also pulls
2099
+ // rulebricks/*.
597
2100
  alertmanager: {
598
2101
  enabled: false,
2102
+ alertmanagerSpec: {
2103
+ image: {
2104
+ registry: reg,
2105
+ repository: IMAGE_REPOSITORIES.alertmanager,
2106
+ },
2107
+ },
2108
+ },
2109
+ prometheusOperator: {
2110
+ image: {
2111
+ registry: reg,
2112
+ repository: IMAGE_REPOSITORIES.prometheusOperator,
2113
+ },
2114
+ prometheusConfigReloader: {
2115
+ image: {
2116
+ registry: reg,
2117
+ repository: IMAGE_REPOSITORIES.prometheusConfigReloader,
2118
+ },
2119
+ },
2120
+ admissionWebhooks: {
2121
+ patch: {
2122
+ image: {
2123
+ registry: reg,
2124
+ repository: IMAGE_REPOSITORIES.kubeWebhookCertgen,
2125
+ },
2126
+ },
2127
+ },
2128
+ },
2129
+ "kube-state-metrics": {
2130
+ image: {
2131
+ registry: reg,
2132
+ repository: IMAGE_REPOSITORIES.kubeStateMetrics,
2133
+ },
2134
+ },
2135
+ "prometheus-node-exporter": {
2136
+ image: {
2137
+ registry: reg,
2138
+ repository: IMAGE_REPOSITORIES.nodeExporter,
2139
+ },
599
2140
  },
600
2141
  grafana: {
601
- enabled: false,
2142
+ enabled: useLocalGrafana,
2143
+ image: {
2144
+ registry: reg,
2145
+ repository: IMAGE_REPOSITORIES.grafana,
2146
+ },
2147
+ // Dashboard sidecar imports the provisioned Rulebricks dashboards
2148
+ // (ConfigMaps labeled grafana_dashboard="1") when in-cluster Grafana
2149
+ // is enabled.
2150
+ sidecar: {
2151
+ image: {
2152
+ registry: reg,
2153
+ repository: IMAGE_REPOSITORIES.k8sSidecar,
2154
+ },
2155
+ ...(useLocalGrafana
2156
+ ? {
2157
+ dashboards: {
2158
+ enabled: true,
2159
+ label: "grafana_dashboard",
2160
+ labelValue: "1",
2161
+ searchNamespace: "ALL",
2162
+ folderAnnotation: "grafana_folder",
2163
+ provider: { foldersFromFilesStructure: true },
2164
+ },
2165
+ }
2166
+ : {}),
2167
+ },
602
2168
  },
603
2169
  prometheus: {
604
- enabled: config.features.monitoring.enabled,
2170
+ enabled: true,
2171
+ serviceAccount: generatePrometheusServiceAccount(config),
605
2172
  prometheusSpec: {
606
2173
  retention: "30d",
2174
+ image: {
2175
+ registry: reg,
2176
+ repository: IMAGE_REPOSITORIES.prometheus,
2177
+ },
2178
+ podMetadata: generatePrometheusPodMetadata(config),
2179
+ serviceMonitorSelectorNilUsesHelmValues: false,
2180
+ serviceMonitorSelector: {},
2181
+ podMonitorSelectorNilUsesHelmValues: false,
2182
+ podMonitorSelector: {},
607
2183
  storageSpec: {
608
2184
  volumeClaimTemplate: {
609
2185
  spec: {
@@ -617,13 +2193,9 @@ export async function generateHelmValues(config, options = {}) {
617
2193
  },
618
2194
  },
619
2195
  },
620
- ...(config.features.monitoring.remoteWriteUrl
621
- ? {
622
- remoteWrite: [
623
- { url: config.features.monitoring.remoteWriteUrl },
624
- ],
625
- }
626
- : { remoteWrite: [] }),
2196
+ remoteWrite: [
2197
+ ...(clickStackEnabled ? [] : generateRemoteWriteSpec(config)),
2198
+ ],
627
2199
  },
628
2200
  },
629
2201
  },
@@ -631,20 +2203,21 @@ export async function generateHelmValues(config, options = {}) {
631
2203
  // STORAGE CLASS
632
2204
  // =============================================================================
633
2205
  storageClass: {
634
- create: true,
2206
+ create: false,
635
2207
  name: storageClass,
636
- provisioner: config.infrastructure.provider === "aws"
637
- ? "ebs.csi.aws.com"
638
- : config.infrastructure.provider === "gcp"
639
- ? "pd.csi.storage.gke.io"
640
- : config.infrastructure.provider === "azure"
641
- ? "disk.csi.azure.com"
642
- : "ebs.csi.aws.com",
2208
+ provisioner: config.infrastructure.storageProvisioner ||
2209
+ (config.infrastructure.provider === "aws"
2210
+ ? "ebs.csi.aws.com"
2211
+ : config.infrastructure.provider === "gcp"
2212
+ ? "pd.csi.storage.gke.io"
2213
+ : config.infrastructure.provider === "azure"
2214
+ ? "disk.csi.azure.com"
2215
+ : "ebs.csi.aws.com"),
643
2216
  // Parameters for the StorageClass - must include type for disk provisioning
644
2217
  parameters: config.infrastructure.provider === "aws"
645
2218
  ? { type: "gp3" }
646
2219
  : config.infrastructure.provider === "gcp"
647
- ? { type: "hyperdisk-balanced" }
2220
+ ? { type: gcpDiskType }
648
2221
  : config.infrastructure.provider === "azure"
649
2222
  ? { skuName: "Premium_LRS" }
650
2223
  : { type: "gp3" },
@@ -659,7 +2232,13 @@ export async function generateHelmValues(config, options = {}) {
659
2232
  "external-dns": externalDnsEnabled
660
2233
  ? {
661
2234
  enabled: true,
662
- provider: getExternalDnsProvider(config.dns.provider),
2235
+ // external-dns has NO image.registry field: image.repository is the
2236
+ // FULL path including host (reg prefix + rulebricks/external-dns).
2237
+ image: {
2238
+ repository: `${reg}/${IMAGE_REPOSITORIES.externalDns}`,
2239
+ },
2240
+ // external-dns 1.21+ idiom: provider is an object ({name: ...}).
2241
+ provider: { name: getExternalDnsProvider(config.dns.provider) },
663
2242
  domainFilters: [config.domain],
664
2243
  sources: ["ingress", "service"],
665
2244
  policy: "upsert-only",
@@ -668,6 +2247,105 @@ export async function generateHelmValues(config, options = {}) {
668
2247
  enabled: false,
669
2248
  },
670
2249
  };
2250
+ // In k8s secret mode, the CLI creates Kubernetes Secrets and the chart reads
2251
+ // them by reference. Point the chart's secretRef seams at those Secrets and
2252
+ // strip every plaintext secret out of the generated values.
2253
+ if (secretMode === "k8s") {
2254
+ return redactSecretsToRefs(values, config);
2255
+ }
2256
+ return values;
2257
+ }
2258
/**
 * Rewrites generated Helm values for k8s secret mode.
 *
 * Sets the chart's `*.secretRef` seams to the Secret names returned by
 * `deploymentSecretNames(config)` and removes inline plaintext secrets so none
 * are persisted to values.yaml or the Helm release.
 *
 * `values` is modified in place (its `global` and `supabase` sections are
 * edited and re-attached) and the same object is returned.
 *
 * @param {object} values - Generated Helm values; `global` and `supabase` may be absent.
 * @param {object} config - Deployment configuration. Reads `database.type`,
 *   `externalServices.postgres` and `smtp`.
 * @returns {object} The same `values` object with secrets replaced by references.
 */
export function redactSecretsToRefs(values, config) {
    const names = deploymentSecretNames(config);
    const global = values.global ?? {};
    const supabase = values.supabase ?? {};
    // External Postgres settings only apply to a self-hosted database whose
    // postgres service is configured in "external" mode.
    const pgExt = config.database.type === "self-hosted" &&
        config.externalServices?.postgres?.mode === "external"
        ? config.externalServices.postgres.external
        : undefined;
    // Key mapping for the single database Secret. A fresh object is returned on
    // every call so the two places that use it never share one reference.
    const dbSecretRefKey = () => ({
        host: "host",
        port: "port",
        username: "username",
        password: "password",
        database: "database",
    });
    // App-level consolidated secret: one secretRef supplies every app cred.
    global.secrets = { ...(global.secrets ?? {}), secretRef: names.app };
    // Strip inline app/global secrets (non-secret config like host/from/url stays).
    if (global.smtp) {
        delete global.smtp.user;
        delete global.smtp.pass;
    }
    if (global.supabase) {
        delete global.supabase.jwtSecret;
        delete global.supabase.anonKey;
        delete global.supabase.serviceKey;
        delete global.supabase.accessToken;
    }
    if (global.ai) {
        delete global.ai.openaiApiKey;
    }
    if (global.sso) {
        delete global.sso.clientId;
        delete global.sso.clientSecret;
    }
    delete global.licenseKey;
    // Supabase subchart: replace each inline secret block with a secretRef.
    if (supabase.secret) {
        const dbSecret = { secretRef: names.db };
        if (pgExt) {
            dbSecret.secretRefKey = dbSecretRefKey();
        }
        supabase.secret = {
            db: dbSecret,
            jwt: { secretRef: names.jwt },
            dashboard: { secretRef: names.dashboard },
            realtime: { secretRef: names.realtime },
            // Supabase auth (GoTrue) SMTP — only when SMTP creds are configured;
            // otherwise the global.smtp we just stripped would leave it empty.
            ...(config.smtp?.user || config.smtp?.pass
                ? { smtp: { secretRef: names.smtp } }
                : {}),
        };
    }
    if (pgExt && supabase.externalDatabase) {
        const bootstrap = {
            ...(supabase.externalDatabase.bootstrap ?? {}),
            secretRef: names.dbBootstrap,
        };
        // Master credentials move into the hook Secret in k8s mode. Delete the
        // keys outright instead of assigning `undefined`, so the redaction does
        // not depend on how the downstream serializer treats undefined values.
        delete bootstrap.masterUsername;
        delete bootstrap.masterPassword;
        // NOTE(review): every other externalDatabase field is carried over
        // unchanged by the spread below — confirm the generator never places
        // inline credentials there in k8s mode.
        supabase.externalDatabase = {
            ...supabase.externalDatabase,
            // New charts read host/port/user/pass/db from this single Secret. Keep
            // externalDatabase.host/port above for older charts that do not yet support
            // host/port secret keys.
            secretRef: names.db,
            secretRefKey: dbSecretRefKey(),
            bootstrap,
        };
    }
    values.global = global;
    values.supabase = supabase;
    return values;
}
2342
/**
 * Builds the Helm values for a deployment, validates them, and saves them.
 *
 * @param {object} config - Deployment configuration; `config.name` is handed
 *   to `saveHelmValues` together with the built values.
 * @param {object} [options={}] - Passed through unchanged to `buildHelmValues`.
 * @returns {Promise<void>} Settles after `saveHelmValues` has been awaited.
 */
export async function generateHelmValues(config, options = {}) {
    const helmValues = buildHelmValues(config, options);
    // Final safety net before anything is saved: values the chart would
    // reject must never be written or deployed.
    assertValidHelmValues(helmValues);
    await saveHelmValues(config.name, helmValues);
}
673
2351
  /**