cluster-builder 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cluster-builder might be problematic.

@@ -2,10 +2,14 @@
 Swarmchestrate - Main orchestration class for K3s cluster management.
 """
 
+import json
 import os
 import logging
+from pathlib import Path
 import shutil
+import subprocess
 from typing import Optional
+import psycopg2
 
 from dotenv import load_dotenv
 
@@ -43,6 +47,7 @@ class Swarmchestrate:
         load_dotenv()
 
         try:
+            logger.debug("Loading PostgreSQL configuration from environment...")
             self.pg_config = PostgresConfig.from_env()
         except ValueError as e:
             logger.error(f"Invalid PostgreSQL configuration: {e}")
@@ -52,7 +57,7 @@ class Swarmchestrate:
         self.template_manager = TemplateManager()
         self.cluster_config = ClusterConfig(self.template_manager, output_dir)
 
-        logger.info(
+        logger.debug(
             f"Initialised with template_dir={template_dir}, output_dir={output_dir}"
         )
 
@@ -68,15 +73,6 @@ class Swarmchestrate:
         """
         return self.cluster_config.get_cluster_output_dir(cluster_name)
 
-    def generate_random_name(self) -> str:
-        """
-        Generate a readable random string using names-generator.
-
-        Returns:
-            A randomly generated name
-        """
-        return self.cluster_config.generate_random_name()
-
     def validate_configuration(self, cloud: str, config: dict) -> list:
         """
         Validate a configuration against the required variables for a cloud provider.
@@ -88,18 +84,21 @@ class Swarmchestrate:
         Returns:
             List of missing required variables (empty if all required variables are present)
         """
+        logger.debug(f"Validating configuration for cloud={cloud}, role={config.get('k3s_role')}")
         # Master IP validation
         has_master_ip = "master_ip" in config and config["master_ip"]
         role = config["k3s_role"]
 
         # Cannot add a master node to an existing cluster
         if has_master_ip and role == "master":
+            logger.error("Invalid configuration: master_ip specified with master role")
             raise ValueError(
                 "Cannot add master to existing cluster (master_ip specified with master role)"
             )
 
         # Worker/HA nodes require a master IP
         if not has_master_ip and role in ["worker", "ha"]:
+            logger.error(f"Invalid configuration: Role '{role}' requires master_ip to be specified")
             raise ValueError(f"Role '{role}' requires master_ip to be specified")
 
         required_vars = self.template_manager.get_required_variables(cloud)
@@ -111,6 +110,11 @@ class Swarmchestrate:
             if "default" not in var_config and var_name not in config:
                 missing_vars.append(var_name)
 
+        if missing_vars:
+            logger.warning(f"⚠️ Missing required variables for {cloud}: {missing_vars}")
+        else:
+            logger.debug(f"All required variables provided for {cloud}")
+
         return missing_vars
 
     def prepare_infrastructure(
@@ -134,9 +138,11 @@ class Swarmchestrate:
             RuntimeError: If file operations fail
         """
         try:
+            logger.debug("Preparing infrastructure configuration...")
             # Prepare the configuration
             cluster_dir, prepared_config = self.cluster_config.prepare(config)
-
+            logger.debug(f"Cluster directory prepared at: {cluster_dir}")
+
             # Validate the configuration
             cloud = prepared_config["cloud"]
             missing_vars = self.validate_configuration(cloud, prepared_config)
@@ -144,10 +150,14 @@ class Swarmchestrate:
                 raise ValueError(
                     f"Missing required variables for cloud provider '{cloud}': {', '.join(missing_vars)}"
                 )
+            logger.debug(f"Configuration validated for cloud: {cloud}")
 
             # Create provider configuration
-            self.template_manager.create_provider_config(cluster_dir, cloud)
-            logger.info(f"Created provider configuration for {cloud}")
+            if cloud != "edge":
+                self.template_manager.create_provider_config(cluster_dir, cloud)
+                logger.debug(f"Created provider configuration for {cloud}")
+            else:
+                logger.debug("Skipping provider configuration for edge.")
 
             # Create Terraform files
             main_tf_path = os.path.join(cluster_dir, "main.tf")
@@ -162,21 +172,22 @@ class Swarmchestrate:
                 conn_str,
                 prepared_config["cluster_name"],
             )
-            logger.info(f"Added backend configuration to {backend_tf_path}")
+            logger.debug(f"Added backend configuration to {backend_tf_path}")
 
             # Add module block
             target = prepared_config["resource_name"]
             hcl.add_module_block(main_tf_path, target, prepared_config)
-            logger.info(f"Added module block to {main_tf_path}")
+            logger.debug(f"Added module block to {main_tf_path}")
+            logger.debug("Infrastructure preparation complete.")
 
             return cluster_dir, prepared_config
 
         except Exception as e:
-            error_msg = f"Failed to prepare infrastructure: {e}"
+            error_msg = f"Failed to prepare infrastructure: {e}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
-    def add_node(self, config: dict[str, any], dryrun: bool = False) -> str:
+    def add_node(self, config: dict[str, any], dryrun: bool = False) -> dict:
         """
         Add a node to an existing cluster or create a new cluster based on configuration.
 
@@ -189,14 +200,33 @@ class Swarmchestrate:
             dryrun: If True, only validate the configuration without deploying
 
         Returns:
-            The cluster name
+            The cluster name and other output values.
 
         Raises:
             ValueError: If required configuration is missing or invalid
             RuntimeError: If preparation or deployment fails
         """
         # Prepare the infrastructure configuration
+
         cluster_dir, prepared_config = self.prepare_infrastructure(config)
+        role = prepared_config["k3s_role"]
+
+        # Add output blocks for the module you just added
+        module_name = prepared_config["resource_name"]
+        logger.info(f"---------- Starting deployment of {module_name} ({role}) ----------")
+        outputs_file = os.path.join(cluster_dir, "outputs.tf")
+
+        # Define common output names
+        output_names = ["cluster_name", "master_ip", "worker_ip", "ha_ip", "k3s_token", "resource_name"]
+
+        # Include additional outputs based on the cloud type
+        if "aws" in cluster_dir:
+            output_names.append("instance_status")
+        elif "openstack" in cluster_dir:
+            output_names.append("instance_power_state")
+
+        # Add output blocks
+        hcl.add_output_blocks(outputs_file, module_name, output_names)
 
         logger.info(f"Adding node for cluster '{prepared_config['cluster_name']}'")
 
@@ -204,18 +234,54 @@ class Swarmchestrate:
         try:
             self.deploy(cluster_dir, dryrun)
             cluster_name = prepared_config["cluster_name"]
-            node_name = prepared_config["resource_name"]
+            resource_name = prepared_config["resource_name"]
             logger.info(
-                f"Successfully added '{node_name}' for cluster '{cluster_name}'"
+                f"Successfully added '{resource_name}' for cluster '{cluster_name}'"
             )
-            return cluster_name
+            # Run 'tofu output -json' to get outputs
+            result = subprocess.run(
+                ["tofu", "output", "-json"],
+                cwd=cluster_dir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            outputs = json.loads(result.stdout)
+
+            # Extract output values for all required fields
+            result_outputs = {
+                "cluster_name": outputs.get("cluster_name", {}).get("value"),
+                "master_ip": outputs.get("master_ip", {}).get("value"),
+                "k3s_token": outputs.get("k3s_token", {}).get("value"),
+                "worker_ip": outputs.get("worker_ip", {}).get("value"),
+                "ha_ip": outputs.get("ha_ip", {}).get("value"),
+                "resource_name": outputs.get("resource_name", {}).get("value")
+            }
+            # Add cloud-specific output
+            if "aws" in cluster_dir:
+                result_outputs["instance_status"] = outputs.get("instance_status", {}).get("value")
+            elif "openstack" in cluster_dir:
+                result_outputs["instance_power_state"] = outputs.get("instance_power_state", {}).get("value")
+
+            logger.info(f"----------- Deployment of {role} node successful -----------")
+            logger.debug(f"Deployment outputs: {result_outputs}")
+
+            return result_outputs
+
+        except subprocess.CalledProcessError as e:
+            error_msg = f"❌ Failed to get outputs: {e.stderr.strip()}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
         except Exception as e:
-            error_msg = f"Failed to add node: {e}"
+            error_msg = f"Failed to add node: {e}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
+
     def remove_node(
-        self, cluster_name: str, resource_name: str, dryrun: bool = False
+        self, cluster_name: str, resource_name: str, is_edge: bool = False, dryrun: bool = False
     ) -> None:
         """
         Remove a specific node from a cluster.
@@ -225,14 +291,15 @@ class Swarmchestrate:
         reapplying the configuration.
 
         Args:
-            cluster_name: Name of the cluster containing the node
-            resource_name: Resource name of the node to remove
-            dryrun: If True, only validate the changes without applying
+            cluster_name: Name of the cluster
+            resource_name: Node name in K3s and module name in main.tf / OpenTofu
+            is_edge: True if the node is pre-provisioned (edge node)
+            dryrun: If True, only simulate actions without executing
 
         Raises:
             RuntimeError: If node removal fails
         """
-        logger.info(f"Removing node '{resource_name}' from cluster '{cluster_name}'...")
+        logger.info(f"------------ Removing node '{resource_name}' from cluster '{cluster_name}' ------------")
 
         # Get the directory for the specified cluster
         cluster_dir = self.get_cluster_output_dir(cluster_name)
@@ -251,21 +318,42 @@ class Swarmchestrate:
             raise RuntimeError(error_msg)
 
         try:
-            # Remove the module block for the specified resource
+            # Destroy VM only if cloud node (optional)
+            if not is_edge:
+                tofu_resource = f"opentofu_aws_instance.{resource_name}"
+                if not dryrun:
+                    CommandExecutor.run_command(
+                        ["tofu", "destroy", "-target", tofu_resource, "-auto-approve"],
+                        cwd=cluster_dir,
+                        description=f"Destroying VM for node {resource_name}",
+                    )
+                else:
+                    logger.info(f"Dryrun: would destroy VM for node '{resource_name}' (cloud node)")
+
+            # Remove module block from main.tf
             hcl.remove_module_block(main_tf_path, resource_name)
-            logger.info(
-                f"Removed module block for '{resource_name}' from {main_tf_path}"
-            )
+            logger.info(f"Removed module block for '{resource_name}' from {main_tf_path}")
 
-            self.deploy(cluster_dir, dryrun)
+            # Delete outputs.tf entirely (optional, safer for decentralized setup)
+            outputs_tf_path = os.path.join(cluster_dir, "outputs.tf")
+            if os.path.exists(outputs_tf_path):
+                os.remove(outputs_tf_path)
+                logger.info(f"Deleted outputs.tf before applying changes to remove '{resource_name}'")
 
+            # Apply OpenTofu configuration to update state
             if not dryrun:
-                logger.info(
-                    f"Successfully removed node '{resource_name}' from cluster '{cluster_name}'"
+                CommandExecutor.run_command(
+                    ["tofu", "apply", "-auto-approve"],
+                    cwd=cluster_dir,
+                    description=f"Applying OpenTofu configuration after removing node {resource_name}",
                 )
+            else:
+                logger.info(f"Dryrun: would apply OpenTofu configuration after removing node '{resource_name}'")
+
+            logger.info(f"✅ Node '{resource_name}' removed successfully from cluster '{cluster_name}'")
 
         except Exception as e:
-            error_msg = f"Failed to remove node '{resource_name}' from cluster '{cluster_name}': {str(e)}"
+            error_msg = f"Failed to remove node '{resource_name}' from cluster '{cluster_name}': {str(e)}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
@@ -280,27 +368,41 @@ class Swarmchestrate:
         Raises:
             RuntimeError: If OpenTofu commands fail
         """
-        logger.info(f"Updating infrastructure in {cluster_dir}")
+        logger.debug(f"Updating infrastructure in {cluster_dir}")
 
         if not os.path.exists(cluster_dir):
-            error_msg = f"Cluster directory '{cluster_dir}' not found"
+            error_msg = f"Cluster directory '{cluster_dir}' not found"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
+        # Retrieve the environment variables for tofu logs
+        tf_log = os.getenv("TF_LOG", "INFO")
+        tf_log_path = os.getenv("TF_LOG_PATH", "/tmp/opentofu.log")
+
+        # Check if the environment variables are set
+        if not tf_log or not tf_log_path:
+            print("❌ Error: Missing required environment variables.")
+            exit(1)
+
+        # Prepare environment variables for subprocess
+        env_vars = os.environ.copy()
+        env_vars["TF_LOG"] = tf_log
+        env_vars["TF_LOG_PATH"] = tf_log_path
+
         try:
             # Initialise OpenTofu
             init_command = ["tofu", "init"]
             if dryrun:
                 logger.info("Dryrun: will init without backend and validate only")
                 init_command.append("-backend=false")
-            CommandExecutor.run_command(init_command, cluster_dir, "OpenTofu init")
+            CommandExecutor.run_command(init_command, cluster_dir, "OpenTofu init", env=env_vars)
 
             # Validate the deployment
             if dryrun:
                 CommandExecutor.run_command(
-                    ["tofu", "validate"], cluster_dir, "OpenTofu validate"
+                    ["tofu", "validate"], cluster_dir, "OpenTofu validate", env=env_vars
                 )
-                logger.info("Infrastructure successfully validated")
+                logger.info("Infrastructure successfully validated")
                 return
 
             # Plan the deployment
@@ -309,16 +411,17 @@ class Swarmchestrate:
                 cluster_dir,
                 "OpenTofu plan",
                 timeout=30,
+                env=env_vars,
             )
 
             # Apply the deployment
             CommandExecutor.run_command(
-                ["tofu", "apply", "-auto-approve"], cluster_dir, "OpenTofu apply"
+                ["tofu", "apply", "-auto-approve"], cluster_dir, "OpenTofu apply", env=env_vars
             )
             logger.info("Infrastructure successfully updated")
 
         except RuntimeError as e:
-            error_msg = f"Failed to deploy infrastructure: {str(e)}"
+            error_msg = f"Failed to deploy infrastructure: {str(e)}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
@@ -332,28 +435,29 @@ class Swarmchestrate:
         Raises:
             RuntimeError: If destruction fails
         """
-        logger.info(f"Destroying the K3s cluster '{cluster_name}'...")
+        logger.info(f"---------- Destroying the cluster '{cluster_name}' -----------")
 
         # Get the directory for the specified cluster
         cluster_dir = self.get_cluster_output_dir(cluster_name)
 
         if not os.path.exists(cluster_dir):
-            error_msg = f"Cluster directory '{cluster_dir}' not found"
+            error_msg = f"Cluster directory '{cluster_dir}' not found"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
         if dryrun:
-            logger.info("Dryrun: will only delete")
+            logger.info("Dryrun: will only delete cluster")
             shutil.rmtree(cluster_dir, ignore_errors=True)
             return
 
         try:
+
             # Plan destruction
             CommandExecutor.run_command(
                 ["tofu", "plan", "-destroy", "-input=false"],
                 cluster_dir,
                 "OpenTofu plan destruction",
-                timeout=30,
+                timeout=40,
             )
 
             # Execute destruction
@@ -365,9 +469,125 @@ class Swarmchestrate:
 
             # Remove the cluster directory
             shutil.rmtree(cluster_dir, ignore_errors=True)
-            logger.info(f"Removed cluster directory: {cluster_dir}")
+            logger.info(f"Removed cluster directory: {cluster_dir}")
+
+            # Remove schema and database entry from PostgreSQL
+            self.remove_cluster_schema_from_db(cluster_name)
 
         except RuntimeError as e:
-            error_msg = f"Failed to destroy cluster '{cluster_name}': {str(e)}"
+            error_msg = f"Failed to destroy cluster '{cluster_name}': {str(e)}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
+
+    def remove_cluster_schema_from_db(self, cluster_name: str) -> None:
+        """
+        Removes the schema and the entry for the cluster from the PostgreSQL database.
+
+        Args:
+            cluster_name: The name of the cluster to remove from the database
+
+        Raises:
+            RuntimeError: If the database operation fails
+        """
+        logger.info(f"Removing schema for cluster '{cluster_name}' from the PostgreSQL database...")
+
+        # Create a PostgreSQL connection string using the config
+        connection_string = self.pg_config.get_connection_string()
+
+        try:
+            # Connect to the PostgreSQL database
+            connection = psycopg2.connect(connection_string)
+            cursor = connection.cursor()
+
+            # Define the SQL query to delete the cluster schema
+            drop_schema_query = f'DROP SCHEMA IF EXISTS "{cluster_name}" CASCADE'
+            cursor.execute(drop_schema_query)
+
+            # Commit the transaction
+            connection.commit()
+
+            logger.info(f"Schema for cluster '{cluster_name}' removed from the database")
+            logger.info(f"----------- Destruction of cluster '{cluster_name}' successful -----------")
+
+        except psycopg2.Error as e:
+            logger.error(f"❌ Failed to remove schema for cluster '{cluster_name}' from the database: {e}")
+            raise RuntimeError(f" ❌Failed to remove schema for cluster '{cluster_name}' from the database")
+
+        finally:
+            # Close the database connection
+            if cursor:
+                cursor.close()
+            if connection:
+                connection.close()
+
+    def deploy_manifests(
+        self,
+        manifest_folder: str,
+        master_ip: str,
+        ssh_key_path: str,
+        ssh_user: str,
+    ):
+        """
+        Copy and apply manifests to a cluster using copy_manifest.tf in a temporary folder.
+
+        Args:
+            manifest_folder: Path to local manifest folder
+            master_ip: IP address of K3s master
+            ssh_key_path: Path to SSH private key
+            ssh_user: SSH username to connect to the master node
+        """
+        # Dedicated folder for copy-manifest operations
+        copy_dir = Path(self.output_dir) / "copy-manifest"
+        copy_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Using copy-manifest folder: {copy_dir}")
+
+        try:
+            # Copy copy_manifest.tf from templates
+            tf_source_file = Path(self.template_manager.templates_dir) / "deploy_manifest.tf"
+            if not tf_source_file.exists():
+                logger.debug(f"deploy_manifest.tf not found at: {tf_source_file}")
+                raise RuntimeError(f"deploy_manifest.tf not found at: {tf_source_file}")
+            shutil.copy(tf_source_file, copy_dir)
+            logger.debug(f"Copied copy_manifest.tf to {copy_dir}")
+
+            # Prepare environment for OpenTofu
+            env_vars = os.environ.copy()
+            env_vars["TF_LOG"] = os.getenv("TF_LOG", "INFO")
+            env_vars["TF_LOG_PATH"] = os.getenv("TF_LOG_PATH", "/tmp/opentofu.log")
+
+            logger.info(f"------------ Applying manifest on node: {master_ip} -------------------")
+
+            # Run tofu init with spinner
+            CommandExecutor.run_command(
+                ["tofu", "init"],
+                cwd=str(copy_dir),
+                description="OpenTofu init",
+                env=env_vars,
+            )
+
+            # Run tofu apply with spinner
+            CommandExecutor.run_command(
+                [
+                    "tofu",
+                    "apply",
+                    "-auto-approve",
+                    f"-var=manifest_folder={manifest_folder}",
+                    f"-var=master_ip={master_ip}",
+                    f"-var=ssh_private_key_path={ssh_key_path}",
+                    f"-var=ssh_user={ssh_user}"
+                ],
+                cwd=str(copy_dir),
+                description="OpenTofu apply",
+                env=env_vars,
+            )
+
+            logger.info("------------ Successfully applied manifests -------------------")
+
+        except RuntimeError as e:
+            print(f"\n---------- ERROR ----------\n{e}\n")
+            raise
+
+        finally:
+            if copy_dir.exists():
+                shutil.rmtree(copy_dir)