npm - @rulebricks/cli - Versions diffs - 2.1.7 → 2.3.1 - Mend

@rulebricks/cli 2.1.7 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (117) hide show

package/README.md +51 -16
package/cluster-setup/aws/README.md +96 -47
package/cluster-setup/aws/check-aws-access.sh +216 -52
package/cluster-setup/aws/parameters.json +13 -0
package/cluster-setup/aws/rulebricks-cluster.cfn.yaml +355 -0
package/cluster-setup/azure/README.md +103 -55
package/cluster-setup/azure/check-aks-prereqs.sh +236 -56
package/cluster-setup/azure/parameters.json +30 -0
package/cluster-setup/azure/rulebricks-cluster.bicep +546 -0
package/cluster-setup/gcp/README.md +51 -34
package/cluster-setup/gcp/check-gke-prereqs.sh +222 -60
package/dist/commands/backup.d.ts +5 -0
package/dist/commands/backup.js +104 -0
package/dist/commands/deploy.d.ts +3 -1
package/dist/commands/deploy.js +226 -326
package/dist/commands/destroy.d.ts +1 -1
package/dist/commands/destroy.js +73 -123
package/dist/commands/init.d.ts +5 -1
package/dist/commands/init.js +78 -54
package/dist/commands/list.d.ts +1 -0
package/dist/commands/list.js +74 -0
package/dist/commands/open.d.ts +1 -1
package/dist/commands/open.js +4 -12
package/dist/commands/redeploy.d.ts +6 -0
package/dist/commands/redeploy.js +310 -0
package/dist/commands/restore.d.ts +5 -0
package/dist/commands/restore.js +338 -0
package/dist/commands/status.js +62 -49
package/dist/commands/upgrade.js +74 -51
package/dist/components/DNSWaitScreen.d.ts +5 -1
package/dist/components/DNSWaitScreen.js +47 -41
package/dist/components/Wizard/WizardContext.d.ts +157 -36
package/dist/components/Wizard/WizardContext.js +872 -160
package/dist/components/Wizard/steps/CloudProviderStep.js +192 -107
package/dist/components/Wizard/steps/DomainStep.js +5 -24
package/dist/components/Wizard/steps/ExternalServicesStep.d.ts +6 -0
package/dist/components/Wizard/steps/ExternalServicesStep.js +645 -0
package/dist/components/Wizard/steps/FeatureConfigStep.d.ts +2 -1
package/dist/components/Wizard/steps/FeatureConfigStep.js +739 -425
package/dist/components/Wizard/steps/FeaturesStep.js +31 -35
package/dist/components/Wizard/steps/ObservabilityStep.d.ts +6 -0
package/dist/components/Wizard/steps/ObservabilityStep.js +137 -0
package/dist/components/Wizard/steps/ReviewStep.d.ts +2 -1
package/dist/components/Wizard/steps/ReviewStep.js +56 -12
package/dist/components/Wizard/steps/StorageStep.d.ts +9 -0
package/dist/components/Wizard/steps/StorageStep.js +592 -0
package/dist/components/Wizard/steps/SupabaseCredentialsStep.js +20 -21
package/dist/components/Wizard/steps/VersionStep.js +45 -23
package/dist/components/Wizard/steps/index.d.ts +3 -3
package/dist/components/Wizard/steps/index.js +3 -3
package/dist/components/common/CommandApproval.d.ts +12 -0
package/dist/components/common/CommandApproval.js +91 -0
package/dist/components/common/DeploymentPicker.d.ts +14 -0
package/dist/components/common/DeploymentPicker.js +16 -0
package/dist/components/common/index.d.ts +2 -0
package/dist/components/common/index.js +2 -0
package/dist/index.js +94 -62
package/dist/lib/cloudCli.d.ts +134 -63
package/dist/lib/cloudCli.js +512 -220
package/dist/lib/clusterSetupDefaults.d.ts +30 -0
package/dist/lib/clusterSetupDefaults.js +64 -0
package/dist/lib/commandApproval.d.ts +26 -0
package/dist/lib/commandApproval.js +114 -0
package/dist/lib/config.d.ts +12 -10
package/dist/lib/config.js +91 -33
package/dist/lib/configFixtures.d.ts +5 -0
package/dist/lib/configFixtures.js +513 -0
package/dist/lib/deploymentHealth.d.ts +32 -0
package/dist/lib/deploymentHealth.js +157 -0
package/dist/lib/dns.d.ts +1 -1
package/dist/lib/dns.js +19 -1
package/dist/lib/dns.test.d.ts +1 -0
package/dist/lib/dns.test.js +27 -0
package/dist/lib/dockerHub.d.ts +12 -1
package/dist/lib/dockerHub.js +18 -8
package/dist/lib/helm.d.ts +4 -0
package/dist/lib/helm.js +16 -0
package/dist/lib/helmValues.d.ts +25 -0
package/dist/lib/helmValues.js +1762 -289
package/dist/lib/helmValues.test.d.ts +1 -0
package/dist/lib/helmValues.test.js +966 -0
package/dist/lib/htpasswd.d.ts +1 -0
package/dist/lib/htpasswd.js +15 -0
package/dist/lib/kubernetes.d.ts +124 -17
package/dist/lib/kubernetes.js +576 -145
package/dist/lib/secrets.d.ts +23 -0
package/dist/lib/secrets.js +158 -0
package/dist/lib/validateValues.d.ts +31 -0
package/dist/lib/validateValues.js +253 -0
package/dist/lib/versions.d.ts +82 -11
package/dist/lib/versions.js +131 -31
package/dist/lib/versions.test.d.ts +1 -0
package/dist/lib/versions.test.js +81 -0
package/dist/lib/wizardSteps.d.ts +14 -0
package/dist/lib/wizardSteps.js +23 -0
package/dist/lib/workloadIdentity.d.ts +26 -0
package/dist/lib/workloadIdentity.js +323 -0
package/dist/lib/workloadIdentity.test.d.ts +1 -0
package/dist/lib/workloadIdentity.test.js +57 -0
package/dist/types/index.d.ts +1860 -164
package/dist/types/index.js +518 -295
package/package.json +9 -4
package/schema/values.schema.json +1934 -0
package/cluster-setup/aws/cluster.yaml +0 -33
package/cluster-setup/azure/main.bicep +0 -282
package/cluster-setup/azure/main.parameters.json +0 -21
package/dist/components/Wizard/steps/CredentialsStep.d.ts +0 -6
package/dist/components/Wizard/steps/CredentialsStep.js +0 -22
package/dist/components/Wizard/steps/DeploymentModeStep.d.ts +0 -5
package/dist/components/Wizard/steps/DeploymentModeStep.js +0 -26
package/dist/components/Wizard/steps/TierStep.d.ts +0 -6
package/dist/components/Wizard/steps/TierStep.js +0 -29
package/dist/lib/terraform.d.ts +0 -66
package/dist/lib/terraform.js +0 -754
package/terraform/aws/main.tf +0 -355
package/terraform/azure/main.tf +0 -371
package/terraform/gcp/main.tf +0 -407

package/README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ![Banner](./banner.png)
-The Rulebricks CLI is a management utility that automates the creation and maintenance of private Rulebricks clusters, helping you deploy Rulebricks in customizable, high-throughput configurations on AWS, GCP, or Azure.
+The Rulebricks CLI is a management utility for configuring and deploying private Rulebricks instances onto Kubernetes clusters you already control.
-You can choose how much you would like the CLI to automate for you– use it to generate valid configuration values, automate infrastructure provisioning (via Terraform), software deployment (via Helm), or all of the above.
+It focuses on generating valid Rulebricks configuration values, sizing the application from the selected cluster's available resources, and deploying the Helm chart.
 ## Installation
@@ -17,6 +17,8 @@ to deploy using this CLI. You will be
 requested for this key during project
 configuration.
+You must also have an **available Kubernetes cluster** to deploy to. You can use the `cluster-setup` directory to easily create a standalone cluster for Rulebricks. These resources satisfy the minimum cluster requirements, role/identity resources, and object storage buckets required for a production deployment, and double as documentation for teams looking to deploy Rulebricks to an existing cluster.
 Rulebricks requires TLS. You will need either external-dns on your cluster to add DNS records automatically, or **access** to manually add **DNS records** for the subdomain(s) from which you would like to access your private deployment.
 Finally, you will need to have the following tools installed and ready on your machine:
@@ -24,12 +26,11 @@ Finally, you will need to have the following tools installed and ready on your m
 - **Node.js** >= 20
 - **kubectl** - Kubernetes CLI
 - **Helm** >= 3.0
-- **Terraform** >= 1.0 (for infrastructure provisioning)
-- Cloud CLI (`aws`, `gcloud`, or `az`) configured for your provider
+- Cloud CLI (`aws`, `gcloud`, or `az`) configured for your provider if you want the wizard to discover clusters or refresh kubeconfig
 ## Cluster Setup
-If you want to create the Kubernetes cluster yourself, use the resources in `cluster-setup/` before running the CLI wizard. These files provide minimum compatible AWS, Azure, and GCP cluster setup guidance plus optional access checks. Monitoring destinations are configured later by the CLI wizard and Helm values, not by these cluster setup files.
+Create or select a Kubernetes cluster before running the CLI wizard. If you need a starting point, use the resources in `cluster-setup/`; they provide minimum compatible AWS, Azure, and GCP setup guidance plus optional access checks. Monitoring destinations are configured later by the CLI wizard and Helm values, not by these cluster setup files.
 ```bash
 # AWS: optional access check, then create EKS with eksctl
@@ -51,7 +52,7 @@ GCP_REGION=us-central1 bash cluster-setup/gcp/check-gke-prereqs.sh
 # Follow cluster-setup/gcp/README.md for the gcloud create commands.
 ```
-After the cluster exists, update kubeconfig, then run `rulebricks init` and choose **Use existing Kubernetes cluster**. The existing Terraform provisioning path remains available, but native cloud setup is the clearest path when you want to own the cluster directly.
+After the cluster exists, update kubeconfig, then run `rulebricks init`. The wizard can also refresh kubeconfig for EKS, GKE, or AKS when provider details are available.
 ## Quick Start
@@ -59,24 +60,58 @@ After the cluster exists, update kubeconfig, then run `rulebricks init` and choo
 # Configuration wizard (generates values.yaml)
 rulebricks init
-# Provision and/or deploy to your cluster
+# Deploy to your cluster
 rulebricks deploy my-deployment
 ```
+The generated Helm values pin one Rulebricks product version under
+`global.version`. That single semantic version selects the app, HPS, and HPS
+worker images together.
 ## Main Commands
-| Command                     | Description                            |
-| --------------------------- | -------------------------------------- |
-| `rulebricks init`           | Interactive setup wizard               |
-| `rulebricks deploy [name]`  | Deploy to Kubernetes                   |
-| `rulebricks upgrade [name]` | Upgrade to a new version               |
-| `rulebricks destroy [name]` | Remove a deployment                    |
-| `rulebricks status [name]`  | Show deployment health                 |
-| `rulebricks logs [name]`    | Inspect services                       |
-| `rulebricks open [name]`    | Open the generated configuration files |
+| Command                     | Description                              |
+| --------------------------- | ---------------------------------------- |
+| `rulebricks init`           | Interactive setup wizard                 |
+| `rulebricks deploy [name]`  | Deploy to Kubernetes                     |
+| `rulebricks upgrade [name]` | Upgrade to a new version                 |
+| `rulebricks destroy [name]` | Remove a deployment                      |
+| `rulebricks status [name]`  | Show deployment health                   |
+| `rulebricks logs [name]`    | Inspect services                         |
+| `rulebricks open [name]`    | Open the generated configuration files   |
+| `rulebricks backup [name]`  | Run an on-demand database backup         |
+| `rulebricks restore [name]` | Restore the database from object storage |
 Use `rulebricks -h` to explore all commands, and add `-h` to any command to learn more about a particular command's options.
+## Monitoring
+Self-hosted deployments enable Prometheus monitoring by default. The wizard only asks whether you want to configure a Prometheus `remote_write` destination; you can skip that step if you do not yet have AWS Managed Prometheus, Azure Monitor managed Prometheus, Grafana Cloud, or another remote-write-compatible backend ready.
+By default, generated Helm values install `kube-prometheus-stack`, scrape Kubernetes and cluster metrics, and add Rulebricks scrape targets for:
+- App/admin API health: request counts, latency histograms, coarse rejection counts, and frontend error counts.
+- HPS rule-engine traffic: request counts, latency histograms, coarse rejection counts, Kafka worker wait time, bulk/parallel item volume, and memory cache stats.
+- Supporting infrastructure where available: Kafka JMX, ClickHouse metrics when ClickHouse is enabled, and Traefik's Prometheus endpoint. Traefik's ServiceMonitor remains an explicit opt-in after Prometheus Operator CRDs are installed.
+Metrics intentionally use low-cardinality labels such as route template, method, status class, operation, and rejection reason. They do not include API keys, users, organizations, IP addresses, raw URLs, rule slugs, flow slugs, or exception messages.
+Useful PromQL examples:
+```promql
+histogram_quantile(0.95, sum(rate(rulebricks_hps_http_request_duration_seconds_bucket[5m])) by (le, route))
+sum(rate(rulebricks_hps_rejections_total[5m])) by (route, reason)
+histogram_quantile(0.95, sum(rate(rulebricks_hps_kafka_request_duration_seconds_bucket[5m])) by (le, operation))
+sum(rate(rulebricks_hps_bulk_items_total[5m])) by (operation)
+sum(rate(rulebricks_app_frontend_errors_total[5m])) by (source)
+```
+## Object Storage and Backups
+The wizard now collects a shared object storage backend for every deployment. Rulebricks uses separate prefixes in that bucket for decision logs (`decision-logs/`) and self-hosted Supabase database backups (`db-backups/`).
+Database backups are optional for self-hosted Supabase deployments. When enabled, the Helm chart schedules Barman base backups according to the configured cron schedule and retention window. You can also run `rulebricks backup <name>` to trigger an on-demand backup, or `rulebricks restore <name>` to list backups in object storage and interactively restore one after confirmation.
 ## Notes
 This CLI makes a uniquely wide variety of customization options available (multi-cloud, hybrid vs. self-hosted database deployment, custom email templates, etc.), and not all combinations have been validated.

package/cluster-setup/aws/README.md CHANGED Viewed

@@ -1,74 +1,123 @@
 # AWS Cluster Setup
-Use these files to create a minimum EKS cluster that can run Rulebricks without using the Rulebricks CLI Terraform flow.
+A compact, turnkey EKS cluster for Rulebricks. One CloudFormation stack creates
+the cluster **and** the S3 bucket + Amazon Managed Prometheus workspace the
+platform needs, wired to workloads via **EKS Pod Identity** (AWS's recommended
+mechanism for new clusters — no OIDC provider to manage).
+`eksctl` is not used: it can create a cluster but not the bucket or AMP
+workspace, so the full picture lives in one stack instead.
 ## Files
-- `cluster.yaml` is the minimum compatible `eksctl` cluster config, using ARM64 managed nodes and EBS CSI support.
-- `check-aws-access.sh` verifies AWS identity, common EKS/EC2/IAM permissions, quota, `eksctl`, `kubectl`, and Helm.
+- `rulebricks-cluster.cfn.yaml` — VPC, EKS cluster + managed node group, EBS CSI + Pod Identity add-ons, one S3 data bucket, AMP workspace, and a single IAM role. (The CLI creates the namespace-scoped Pod Identity associations at deploy time.)
+- `parameters.json` — sample parameter overrides (omit any to use template defaults).
+- `check-aws-access.sh` — verifies identity, service access, IAM role-creation rights, quota, kubectl/helm.
-## Core Cluster Parameters
+## One role, one bucket
-- Cluster name: `rulebricks-cluster` (`cluster.yaml` -> `metadata.name`)
-- Region: `us-east-1` (`cluster.yaml` -> `metadata.region`)
-- Kubernetes version: `1.34` (`cluster.yaml` -> `metadata.version`)
-- Node count: `4` (`cluster.yaml` -> `managedNodeGroups[0].desiredCapacity`)
-- Instance type: `c8g.large` (`cluster.yaml` -> `managedNodeGroups[0].instanceType`)
-- Disk size (GB): `50` (`cluster.yaml` -> `managedNodeGroups[0].volumeSize`)
-- Disk type: `gp3` (`cluster.yaml` -> `managedNodeGroups[0].volumeType`)
+A single IAM role, `<cluster>-rulebricks`, is bound to the ServiceAccounts that
+need cloud access via `EKS::PodIdentityAssociation`. All data lives in one
+bucket, `<cluster>-data-<account-id>`, under per-purpose prefixes.
-## Check Access
+| Path                              | Service account                      | Permission / target                                       |
+| --------------------------------- | ------------------------------------ | --------------------------------------------------------- |
+| Decision logs (Vector → S3)       | `vector`                             | `s3:*Object`/`ListBucket` → `<cluster>-data-<account-id>/decision-logs/` |
+| DB backups (job → S3)             | `rulebricks-<deploymentName>-backup` | `s3:*Object`/`ListBucket` → `<cluster>-data-<account-id>/db-backups/`    |
+| Metrics (Prometheus remote write) | `prometheus`                         | `aps:RemoteWrite` → AMP workspace                          |
-```bash
-AWS_REGION=us-east-1 bash check-aws-access.sh
-```
+The bucket is encrypted and has public access blocked.
+> **This stack does not need a deployment name.** `EKS::PodIdentityAssociation` is
+> `namespace`-scoped, so the **Rulebricks CLI creates the associations** (vector / backup /
+> prometheus → this role) at `rulebricks deploy` time. The stack only provisions the
+> deployment-independent role, bucket, and AMP workspace, so one cluster can host many deployments.
+## Core cluster parameters
+`ClusterName` (`rulebricks-cluster`), `KubernetesVersion` (`1.34`),
+`NodeInstanceType` (`c7i.xlarge`), `NodeDesiredCapacity`/`NodeMinSize`/`NodeMaxSize`
+(`2`/`2`/`4`), `NodeVolumeSizeGiB` (`50`). The standard (core) nodegroup runs
+the always-on services on two to four 4-vCPU nodes; burst capacity lives in
+the dedicated burst nodegroup below.
+### Burst worker nodegroup (default on)
+`EnableBurstPool` (`"true"`), `BurstInstanceType` (`c7i.4xlarge`, 16 vCPU),
+`BurstNodeMaxSize` (`1`). One large on-demand node that scales 0 -> 1 on
+demand, labeled and tainted `rulebricks.com/pool=burst`: the Rulebricks chart
+makes workers tolerate the taint and softly prefer the label out of the box,
+so the scaled-out worker fleet lands here while core services stay on the
+standard nodegroup. Sizing math: 2 x 4 vCPU core floor + 16 vCPU burst =
+24 vCPU running steady-state at full burst, and exactly 32 vCPU even with
+the core nodegroup at its 4-node max. Note: EKS has no parked-VM equivalent of AKS
+Deallocate, so each burst cold-provisions the node (~2-3 min); the warm
+worker floor on the core nodes carries traffic during provisioning, and a
+Karpenter NodePool carrying the same label/taint is the planned fast path.
-## Create The Cluster
+> `NodeInstanceType` and the node AMI are coupled: `c7i` is x86, so the template
+> uses `AL2023_x86_64_STANDARD`. If you switch to a Graviton/ARM type (e.g.
+> `c8g`), change `AmiType` to `AL2023_ARM_64_STANDARD` or the nodes won't boot.
+## Region
+CloudFormation is regional — the stack deploys to whatever region your CLI call
+targets. Set it with `--region` (or `AWS_REGION` / your profile), not a
+parameter. Availability zones auto-resolve to that region.
+## Check access
 ```bash
-eksctl create cluster -f cluster.yaml
+AWS_REGION=us-east-1 bash check-aws-access.sh
 ```
-`eksctl` updates kubeconfig automatically. To refresh it manually:
+The stack creates named IAM roles, so the deploying principal must be able to
+create roles, and the deploy must pass `--capabilities CAPABILITY_NAMED_IAM`
+(below). The check script flags this.
+## Create the cluster
 ```bash
+aws cloudformation create-stack \
+  --stack-name rulebricks-cluster \
+  --region us-east-1 \
+  --template-body file://rulebricks-cluster.cfn.yaml \
+  --parameters file://parameters.json \
+  --capabilities CAPABILITY_NAMED_IAM
+aws cloudformation wait stack-create-complete \
+  --stack-name rulebricks-cluster --region us-east-1
 aws eks update-kubeconfig --name rulebricks-cluster --region us-east-1
 ```
-Use `rulebricks init` with **Use existing Kubernetes cluster** after kubeconfig works.
+`CAPABILITY_NAMED_IAM` is a single inline flag on the deploy call (no
+prerequisite step) and is required because the role has an explicit name. Run
+`rulebricks init` once kubeconfig works, then select this cluster. Stack outputs
+give `DataBucketName`, `RulebricksRoleArn`, and the AMP `remote_write` URL for
+the CLI.
-## Optional Identity Setup
+## Delete the cluster
-If you use S3 decision-log export or AWS Managed Prometheus remote write, create IAM roles for the Kubernetes service accounts rendered by the CLI:
+Run `rulebricks destroy <deployment-name>` first so Kubernetes removes
+LoadBalancer services and PVC-backed EBS volumes. CloudFormation **cannot delete
+non-empty S3 buckets**, so empty them before deleting the stack:
 ```bash
-NAMESPACE=rulebricks-demo
-CLUSTER=rulebricks-cluster
-REGION=us-east-1
 ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
+aws s3 rm "s3://rulebricks-cluster-data-${ACCOUNT_ID}" --recursive
-eksctl utils associate-iam-oidc-provider \
-  --cluster "$CLUSTER" \
-  --region "$REGION" \
-  --approve
-eksctl create iamserviceaccount \
-  --cluster "$CLUSTER" \
-  --region "$REGION" \
-  --namespace "$NAMESPACE" \
-  --name vector \
-  --attach-policy-arn arn:aws:iam::"$ACCOUNT_ID":policy/<vector-s3-policy> \
-  --role-name rulebricks-vector \
-  --approve
-eksctl create iamserviceaccount \
-  --cluster "$CLUSTER" \
-  --region "$REGION" \
-  --namespace "$NAMESPACE" \
-  --name prometheus \
-  --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
-  --role-name rulebricks-prometheus \
-  --approve
+aws cloudformation delete-stack --stack-name rulebricks-cluster --region us-east-1
+aws cloudformation wait stack-delete-complete \
+  --stack-name rulebricks-cluster --region us-east-1
 ```
-Enter the created role ARNs when prompted by the CLI.
+The stack is the teardown boundary (analogous to the Azure resource group):
+deleting it removes the cluster, node group, VPC, the IAM role, Pod Identity
+associations, AMP workspace, and the (emptied) bucket.
+## Notes
+- Rulebricks uses a Kubernetes LoadBalancer service; EKS provisions the load balancer and its `80`/`443` security-group rules. In a locked-down VPC, ensure public inbound `80`/`443` can reach it for DNS and cert-manager HTTP-01 validation.
+- Pod Identity requires the `eks-pod-identity-agent` add-on, which the stack installs.
+- To bring your own buckets or AMP workspace, replace the corresponding resources with parameters and references (not enabled by default to keep the stack compact).

package/cluster-setup/aws/check-aws-access.sh CHANGED Viewed

@@ -1,78 +1,242 @@
 #!/usr/bin/env bash
+# Rulebricks AWS / EKS prerequisite check.
+#
+# Prints a short pass/fail report and a final READY / NOT READY verdict
+# with the exact actions you need to take before deploying the CloudFormation
+# stack.
+#
+# Env vars:
+#   AWS_REGION / AWS_DEFAULT_REGION   Region to check (default: us-east-1)
+#   AWS_PROFILE                       Optional named profile to verify
+#   VERBOSE=1                         Print raw AWS error messages inline
 set -euo pipefail
+if [[ -z "${BASH_VERSION:-}" ]]; then
+  exec bash "$0" "$@"
+fi
+export AWS_PAGER=""
 REGION="${AWS_REGION:-${AWS_DEFAULT_REGION:-us-east-1}}"
 REQUIRED_VCPU=8
+VERBOSE="${VERBOSE:-0}"
+ACTIONS=()
+BLOCKERS=0
+# ---------- helpers ----------
 require_cmd() {
   command -v "$1" >/dev/null 2>&1 || {
-    echo "Missing required command: $1" >&2
+    printf "ERROR: required command not found: %s\n" "$1" >&2
     exit 1
   }
 }
-check_aws() {
-  echo "Checking AWS identity..."
-  aws sts get-caller-identity --output table
-  echo
+# Run an aws command. Sets AWS_STDOUT / AWS_STDERR / AWS_RC. Never aborts.
+aws_run() {
+  AWS_STDOUT=""; AWS_STDERR=""; AWS_RC=0
+  local _err
+  _err="$(mktemp)"
+  AWS_STDOUT="$(aws "$@" 2>"$_err")" || AWS_RC=$?
+  AWS_STDERR="$(cat "$_err")"
+  rm -f "$_err"
+  if [[ "$VERBOSE" == "1" && -n "$AWS_STDERR" ]]; then
+    printf "      debug: %s\n" "${AWS_STDERR%%$'\n'*}" >&2
+  fi
+  return "$AWS_RC"
 }
-check_permission() {
-  local label="$1"
-  shift
-  if "$@" >/dev/null 2>&1; then
-    echo "OK: $label"
-  else
-    echo "WARN: Could not verify $label"
-    echo "      Command failed: $*"
-  fi
+is_auth_error() {
+  [[ "$AWS_STDERR" == *"ExpiredToken"*           ]] && return 0
+  [[ "$AWS_STDERR" == *"InvalidClientTokenId"*   ]] && return 0
+  [[ "$AWS_STDERR" == *"UnrecognizedClientException"* ]] && return 0
+  [[ "$AWS_STDERR" == *"Unable to locate credentials"* ]] && return 0
+  [[ "$AWS_STDERR" == *"SignatureDoesNotMatch"*  ]] && return 0
+  [[ "$AWS_STDERR" == *"TokenRefreshRequired"*   ]] && return 0
+  [[ "$AWS_STDERR" == *"SSOTokenLoadError"*      ]] && return 0
+  [[ "$AWS_STDERR" == *"sso login"*              ]] && return 0
+  return 1
 }
-check_quota() {
-  echo "Checking regional on-demand vCPU quota in $REGION..."
-  local quota
-  quota="$(aws service-quotas get-service-quota \
-    --service-code ec2 \
-    --quota-code L-1216C47A \
-    --region "$REGION" \
-    --query 'Quota.Value' \
-    --output text 2>/dev/null || true)"
+row() {
+  printf "  %-50s %s\n" "$1" "$2"
+}
-  if [[ -z "$quota" || "$quota" == "None" ]]; then
-    echo "WARN: Could not read EC2 on-demand vCPU quota."
-    return
-  fi
+mark_blocker() { BLOCKERS=$((BLOCKERS + 1)); }
+add_action()   { ACTIONS+=("$1"); }
-  local quota_int="${quota%.*}"
-  if (( quota_int < REQUIRED_VCPU )); then
-    echo "WARN: Quota may be too low: ${quota} vCPU available, ${REQUIRED_VCPU}+ recommended for the included cluster config."
+login_hint() {
+  if [[ -n "${AWS_PROFILE:-}" ]]; then
+    printf "aws sso login --profile %s   (or refresh credentials for profile '%s')" "$AWS_PROFILE" "$AWS_PROFILE"
   else
-    echo "OK: EC2 on-demand vCPU quota is ${quota}."
+    printf "aws sso login   (or 'aws configure' to set up credentials)"
   fi
 }
+# ---------- pre-flight ----------
+# Note: eksctl is NOT required. The cluster is deployed via a single
+# CloudFormation stack, so only the AWS CLI plus kubectl/helm are needed.
 require_cmd aws
-require_cmd eksctl
 require_cmd kubectl
 require_cmd helm
-echo "Rulebricks AWS access checks"
-echo "Region: $REGION"
-echo
-check_aws
-check_permission "EKS access" aws eks list-clusters --region "$REGION"
-check_permission "EC2 VPC access" aws ec2 describe-vpcs --region "$REGION" --max-items 5
-check_permission "IAM access" aws iam get-user
-check_permission "ECR public image pull path" aws ecr-public describe-registries --region us-east-1
-check_quota
-echo
-echo "Checking local Kubernetes tools..."
-eksctl version >/dev/null
-kubectl version --client=true >/dev/null
-helm version >/dev/null
-echo "OK: eksctl, kubectl, and Helm are installed."
-echo
-echo "AWS access checks completed. Warnings may require cloud-admin review before cluster creation."
+printf "Rulebricks AWS prerequisite check\n"
+printf "  Region:  %s\n" "$REGION"
+[[ -n "${AWS_PROFILE:-}" ]] && printf "  Profile: %s\n" "$AWS_PROFILE"
+printf "\n"
+# ---------- 1. Authentication ----------
+AUTH_OK=0
+ACCOUNT_ID=""
+CALLER_ARN=""
+if aws_run sts get-caller-identity --query "Account" --output text; then
+  ACCOUNT_ID="$AWS_STDOUT"
+  if aws_run sts get-caller-identity --query "Arn" --output text; then
+    CALLER_ARN="$AWS_STDOUT"
+  fi
+  row "AWS credentials valid" "OK ($ACCOUNT_ID)"
+  [[ -n "$CALLER_ARN" ]] && row "Caller identity" "$CALLER_ARN"
+  AUTH_OK=1
+else
+  if is_auth_error; then
+    row "AWS credentials valid" "FAIL - credentials missing or expired"
+  else
+    row "AWS credentials valid" "FAIL - ${AWS_STDERR%%$'\n'*}"
+  fi
+  add_action "Refresh credentials: $(login_hint)"
+  mark_blocker
+fi
+if [[ $AUTH_OK -eq 0 ]]; then
+  printf "\nRemaining checks skipped - fix authentication first.\n"
+  printf "\n========================================\n"
+  printf "RESULT: NOT READY\n"
+  printf "========================================\n"
+  printf "Required actions:\n"
+  i=1
+  for a in "${ACTIONS[@]}"; do
+    printf "  %d. %s\n" "$i" "$a"
+    i=$((i + 1))
+  done
+  exit 1
+fi
+# ---------- 2. Service access ----------
+# These cover what the CloudFormation stack touches: EKS, EC2/VPC, IAM (roles +
+# Pod Identity associations), S3 (log/backup buckets), APS (managed Prometheus),
+# and CloudFormation itself.
+declare -a missing_access=()
+aws_run eks list-clusters --region "$REGION" --output text >/dev/null \
+  || missing_access+=("eks:ListClusters")
+aws_run ec2 describe-vpcs --region "$REGION" --max-items 5 --output text >/dev/null \
+  || missing_access+=("ec2:DescribeVpcs")
+aws_run iam list-roles --max-items 5 --output text >/dev/null \
+  || missing_access+=("iam:ListRoles")
+aws_run s3api list-buckets --output text >/dev/null \
+  || missing_access+=("s3:ListAllMyBuckets")
+aws_run aps list-workspaces --region "$REGION" --output text >/dev/null \
+  || missing_access+=("aps:ListWorkspaces")
+aws_run cloudformation list-stacks --region "$REGION" --output text >/dev/null \
+  || missing_access+=("cloudformation:ListStacks")
+if [[ ${#missing_access[@]} -eq 0 ]]; then
+  row "EKS/EC2/IAM/S3/APS/CFN access" "OK"
+else
+  row "EKS/EC2/IAM/S3/APS/CFN access" "WARN - missing: ${missing_access[*]}"
+  add_action "Ask your AWS admin to grant the missing IAM actions in $REGION: ${missing_access[*]}"
+fi
+# ---------- 3. IAM role-creation rights (CAPABILITY_NAMED_IAM) ----------
+# The stack creates named IAM roles, so the deploying principal must be allowed
+# to create roles and attach policies. We probe this by simulating
+# iam:CreateRole / iam:AttachRolePolicy / iam:PutRolePolicy for the caller.
+# The simulation call is itself often denied, so a failure here only warns.
+if aws_run iam simulate-principal-policy \
+     --policy-source-arn "$CALLER_ARN" \
+     --action-names iam:CreateRole iam:AttachRolePolicy iam:PutRolePolicy \
+     --query "EvaluationResults[?EvalDecision=='allowed'] | length(@)" \
+     --output text; then
+  allowed="$AWS_STDOUT"
+  if [[ "$allowed" == "3" ]]; then
+    row "IAM role-creation rights" "OK"
+  else
+    row "IAM role-creation rights" "WARN - some IAM create/attach actions denied"
+    add_action "The stack creates named IAM roles (deploy needs CAPABILITY_NAMED_IAM). Ensure your principal can iam:CreateRole / iam:AttachRolePolicy / iam:PutRolePolicy, or have an admin deploy."
+  fi
+else
+  # SimulatePrincipalPolicy itself is often denied for non-admins; don't block.
+  row "IAM role-creation rights" "WARN - could not simulate (needs iam:SimulatePrincipalPolicy)"
+  add_action "Could not verify IAM role-creation rights. The stack creates named IAM roles and must be deployed with --capabilities CAPABILITY_NAMED_IAM by a principal allowed to create roles."
+fi
+# ---------- 4. EC2 on-demand vCPU quota ----------
+quota_label="EC2 on-demand vCPU quota in $REGION (need ${REQUIRED_VCPU}+)"
+if aws_run service-quotas get-service-quota \
+     --service-code ec2 \
+     --quota-code L-1216C47A \
+     --region "$REGION" \
+     --query "Quota.Value" \
+     --output text; then
+  quota="$AWS_STDOUT"
+  if [[ -z "$quota" || "$quota" == "None" ]]; then
+    row "$quota_label" "WARN - empty response"
+    add_action "Check the EC2 'Running On-Demand Standard vCPUs' quota in the AWS console: Service Quotas → EC2."
+  else
+    quota_int="${quota%.*}"
+    if (( quota_int < REQUIRED_VCPU )); then
+      row "$quota_label" "WARN ($quota available)"
+      add_action "Request a quota increase: AWS console → Service Quotas → EC2 → 'Running On-Demand Standard vCPUs' in $REGION."
+    else
+      row "$quota_label" "OK ($quota available)"
+    fi
+  fi
+else
+  row "$quota_label" "WARN - could not read quota"
+  add_action "Manually verify EC2 vCPU quota in the AWS console (Service Quotas → EC2) for $REGION."
+fi
+# ---------- 5. Local tools ----------
+missing_tools=()
+kubectl version --client=true >/dev/null 2>&1 || missing_tools+=("kubectl")
+helm version   >/dev/null 2>&1 || missing_tools+=("helm")
+if [[ ${#missing_tools[@]} -gt 0 ]]; then
+  uniq_tools="$(printf '%s\n' "${missing_tools[@]}" | sort -u | tr '\n' ' ')"
+  row "Local tools (kubectl, helm)" "FAIL - missing/broken: ${uniq_tools% }"
+  add_action "Install/repair: ${uniq_tools% }"
+  mark_blocker
+else
+  row "Local tools (kubectl, helm)" "OK"
+fi
+# ---------- summary ----------
+printf "\n========================================\n"
+if [[ $BLOCKERS -eq 0 && ${#ACTIONS[@]} -eq 0 ]]; then
+  printf "RESULT: READY - you can deploy the CloudFormation stack.\n"
+  printf "========================================\n"
+  exit 0
+elif [[ $BLOCKERS -eq 0 ]]; then
+  printf "RESULT: READY WITH WARNINGS\n"
+  printf "========================================\n"
+  printf "The deploy should work, but address these first if possible:\n"
+else
+  printf "RESULT: NOT READY\n"
+  printf "========================================\n"
+  printf "Required actions:\n"
+fi
+i=1
+for a in "${ACTIONS[@]}"; do
+  printf "  %d. %s\n" "$i" "$a"
+  i=$((i + 1))
+done
+printf "\nRe-run this script after completing the actions above.\n"
+printf "(Set VERBOSE=1 to see raw AWS error messages.)\n"
+[[ $BLOCKERS -gt 0 ]] && exit 1 || exit 0

package/cluster-setup/aws/parameters.json ADDED Viewed

@@ -0,0 +1,13 @@
+[
+  { "ParameterKey": "ClusterName", "ParameterValue": "rulebricks-cluster" },
+  { "ParameterKey": "KubernetesVersion", "ParameterValue": "1.34" },
+  { "ParameterKey": "NodeInstanceType", "ParameterValue": "c7i.xlarge" },
+  { "ParameterKey": "NodeDesiredCapacity", "ParameterValue": "2" },
+  { "ParameterKey": "NodeMinSize", "ParameterValue": "2" },
+  { "ParameterKey": "NodeMaxSize", "ParameterValue": "4" },
+  { "ParameterKey": "EnableBurstPool", "ParameterValue": "true" },
+  { "ParameterKey": "BurstInstanceType", "ParameterValue": "c7i.4xlarge" },
+  { "ParameterKey": "BurstNodeMaxSize", "ParameterValue": "1" },
+  { "ParameterKey": "NodeVolumeSizeGiB", "ParameterValue": "50" },
+  { "ParameterKey": "VpcCidr", "ParameterValue": "10.0.0.0/16" }
+]