cluster-builder 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cluster-builder might be problematic.

@@ -2,10 +2,14 @@
 Swarmchestrate - Main orchestration class for K3s cluster management.
 """
 
+import json
 import os
 import logging
+from pathlib import Path
 import shutil
+import subprocess
 from typing import Optional
+import psycopg2
 
 from dotenv import load_dotenv
 
@@ -43,6 +47,7 @@ class Swarmchestrate:
         load_dotenv()
 
         try:
+            logger.debug("Loading PostgreSQL configuration from environment...")
             self.pg_config = PostgresConfig.from_env()
         except ValueError as e:
             logger.error(f"Invalid PostgreSQL configuration: {e}")
@@ -52,7 +57,7 @@ class Swarmchestrate:
         self.template_manager = TemplateManager()
         self.cluster_config = ClusterConfig(self.template_manager, output_dir)
 
-        logger.info(
+        logger.debug(
             f"Initialised with template_dir={template_dir}, output_dir={output_dir}"
         )
 
@@ -68,15 +73,6 @@ class Swarmchestrate:
         """
         return self.cluster_config.get_cluster_output_dir(cluster_name)
 
-    def generate_random_name(self) -> str:
-        """
-        Generate a readable random string using names-generator.
-
-        Returns:
-            A randomly generated name
-        """
-        return self.cluster_config.generate_random_name()
-
     def validate_configuration(self, cloud: str, config: dict) -> list:
         """
         Validate a configuration against the required variables for a cloud provider.
@@ -88,18 +84,21 @@ class Swarmchestrate:
         Returns:
             List of missing required variables (empty if all required variables are present)
         """
+        logger.debug(f"Validating configuration for cloud={cloud}, role={config.get('k3s_role')}")
         # Master IP validation
         has_master_ip = "master_ip" in config and config["master_ip"]
         role = config["k3s_role"]
 
         # Cannot add a master node to an existing cluster
         if has_master_ip and role == "master":
+            logger.error("Invalid configuration: master_ip specified with master role")
             raise ValueError(
                 "Cannot add master to existing cluster (master_ip specified with master role)"
             )
 
         # Worker/HA nodes require a master IP
         if not has_master_ip and role in ["worker", "ha"]:
+            logger.error(f"Invalid configuration: Role '{role}' requires master_ip to be specified")
             raise ValueError(f"Role '{role}' requires master_ip to be specified")
 
         required_vars = self.template_manager.get_required_variables(cloud)
@@ -111,6 +110,11 @@ class Swarmchestrate:
             if "default" not in var_config and var_name not in config:
                 missing_vars.append(var_name)
 
+        if missing_vars:
+            logger.warning(f"⚠️ Missing required variables for {cloud}: {missing_vars}")
+        else:
+            logger.debug(f"All required variables provided for {cloud}")
+
         return missing_vars
 
     def prepare_infrastructure(
@@ -134,9 +138,11 @@ class Swarmchestrate:
             RuntimeError: If file operations fail
         """
         try:
+            logger.debug("Preparing infrastructure configuration...")
             # Prepare the configuration
             cluster_dir, prepared_config = self.cluster_config.prepare(config)
-
+            logger.debug(f"Cluster directory prepared at: {cluster_dir}")
+
             # Validate the configuration
             cloud = prepared_config["cloud"]
             missing_vars = self.validate_configuration(cloud, prepared_config)
@@ -144,10 +150,14 @@ class Swarmchestrate:
                 raise ValueError(
                     f"Missing required variables for cloud provider '{cloud}': {', '.join(missing_vars)}"
                 )
+            logger.debug(f"Configuration validated for cloud: {cloud}")
 
             # Create provider configuration
-            self.template_manager.create_provider_config(cluster_dir, cloud)
-            logger.info(f"Created provider configuration for {cloud}")
+            if cloud != "edge":
+                self.template_manager.create_provider_config(cluster_dir, cloud)
+                logger.debug(f"Created provider configuration for {cloud}")
+            else:
+                logger.debug("Skipping provider configuration for edge.")
 
             # Create Terraform files
             main_tf_path = os.path.join(cluster_dir, "main.tf")
@@ -162,21 +172,22 @@ class Swarmchestrate:
                 conn_str,
                 prepared_config["cluster_name"],
             )
-            logger.info(f"Added backend configuration to {backend_tf_path}")
+            logger.debug(f"Added backend configuration to {backend_tf_path}")
 
             # Add module block
             target = prepared_config["resource_name"]
             hcl.add_module_block(main_tf_path, target, prepared_config)
-            logger.info(f"Added module block to {main_tf_path}")
+            logger.debug(f"Added module block to {main_tf_path}")
+            logger.debug("Infrastructure preparation complete.")
 
             return cluster_dir, prepared_config
 
         except Exception as e:
-            error_msg = f"Failed to prepare infrastructure: {e}"
+            error_msg = f"Failed to prepare infrastructure: {e}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
-    def add_node(self, config: dict[str, any], dryrun: bool = False) -> str:
+    def add_node(self, config: dict[str, any], dryrun: bool = False) -> dict:
         """
         Add a node to an existing cluster or create a new cluster based on configuration.
 
@@ -189,14 +200,33 @@ class Swarmchestrate:
             dryrun: If True, only validate the configuration without deploying
 
         Returns:
-            The cluster name
+            The cluster name and other output values.
 
         Raises:
             ValueError: If required configuration is missing or invalid
             RuntimeError: If preparation or deployment fails
         """
         # Prepare the infrastructure configuration
+
         cluster_dir, prepared_config = self.prepare_infrastructure(config)
+        role = prepared_config["k3s_role"]
+
+        # Add output blocks for the module you just added
+        module_name = prepared_config["resource_name"]
+        logger.info(f"---------- Starting deployment of {module_name} ({role}) ----------")
+        outputs_file = os.path.join(cluster_dir, "outputs.tf")
+
+        # Define common output names
+        output_names = ["cluster_name", "master_ip", "worker_ip", "ha_ip", "k3s_token", "resource_name"]
+
+        # Include additional outputs based on the cloud type
+        if "aws" in cluster_dir:
+            output_names.append("instance_status")
+        elif "openstack" in cluster_dir:
+            output_names.append("instance_power_state")
+
+        # Add output blocks
+        hcl.add_output_blocks(outputs_file, module_name, output_names)
 
         logger.info(f"Adding node for cluster '{prepared_config['cluster_name']}'")
 
@@ -204,18 +234,54 @@ class Swarmchestrate:
         try:
             self.deploy(cluster_dir, dryrun)
             cluster_name = prepared_config["cluster_name"]
-            node_name = prepared_config["resource_name"]
+            resource_name = prepared_config["resource_name"]
             logger.info(
-                f"Successfully added '{node_name}' for cluster '{cluster_name}'"
+                f"Successfully added '{resource_name}' for cluster '{cluster_name}'"
             )
-            return cluster_name
+            # Run 'tofu output -json' to get outputs
+            result = subprocess.run(
+                ["tofu", "output", "-json"],
+                cwd=cluster_dir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            outputs = json.loads(result.stdout)
+
+            # Extract output values for all required fields
+            result_outputs = {
+                "cluster_name": outputs.get("cluster_name", {}).get("value"),
+                "master_ip": outputs.get("master_ip", {}).get("value"),
+                "k3s_token": outputs.get("k3s_token", {}).get("value"),
+                "worker_ip": outputs.get("worker_ip", {}).get("value"),
+                "ha_ip": outputs.get("ha_ip", {}).get("value"),
+                "resource_name": outputs.get("resource_name", {}).get("value")
+            }
+            # Add cloud-specific output
+            if "aws" in cluster_dir:
+                result_outputs["instance_status"] = outputs.get("instance_status", {}).get("value")
+            elif "openstack" in cluster_dir:
+                result_outputs["instance_power_state"] = outputs.get("instance_power_state", {}).get("value")
+
+            logger.info(f"----------- Deployment of {role} node successful -----------")
+            logger.debug(f"Deployment outputs: {result_outputs}")
+
+            return result_outputs
+
+        except subprocess.CalledProcessError as e:
+            error_msg = f"❌ Failed to get outputs: {e.stderr.strip()}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
         except Exception as e:
-            error_msg = f"Failed to add node: {e}"
+            error_msg = f"Failed to add node: {e}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
+
     def remove_node(
-        self, cluster_name: str, resource_name: str, dryrun: bool = False
+        self, cluster_name: str, resource_name: str, is_edge: bool = False, dryrun: bool = False
     ) -> None:
         """
         Remove a specific node from a cluster.
@@ -225,14 +291,15 @@ class Swarmchestrate:
         reapplying the configuration.
 
         Args:
-            cluster_name: Name of the cluster containing the node
-            resource_name: Resource name of the node to remove
-            dryrun: If True, only validate the changes without applying
+            cluster_name: Name of the cluster
+            resource_name: Node name in K3s and module name in main.tf / OpenTofu
+            is_edge: True if the node is pre-provisioned (edge node)
+            dryrun: If True, only simulate actions without executing
 
         Raises:
             RuntimeError: If node removal fails
         """
-        logger.info(f"Removing node '{resource_name}' from cluster '{cluster_name}'...")
+        logger.info(f"------------ Removing node '{resource_name}' from cluster '{cluster_name}' ------------")
 
         # Get the directory for the specified cluster
         cluster_dir = self.get_cluster_output_dir(cluster_name)
@@ -251,21 +318,42 @@ class Swarmchestrate:
             raise RuntimeError(error_msg)
 
         try:
-            # Remove the module block for the specified resource
+            # Destroy VM only if cloud node (optional)
+            if not is_edge:
+                tofu_resource = f"opentofu_aws_instance.{resource_name}"
+                if not dryrun:
+                    CommandExecutor.run_command(
+                        ["tofu", "destroy", "-target", tofu_resource, "-auto-approve"],
+                        cwd=cluster_dir,
+                        description=f"Destroying VM for node {resource_name}",
+                    )
+                else:
+                    logger.info(f"Dryrun: would destroy VM for node '{resource_name}' (cloud node)")
+
+            # Remove module block from main.tf
             hcl.remove_module_block(main_tf_path, resource_name)
-            logger.info(
-                f"Removed module block for '{resource_name}' from {main_tf_path}"
-            )
+            logger.info(f"Removed module block for '{resource_name}' from {main_tf_path}")
 
-            self.deploy(cluster_dir, dryrun)
+            # Delete outputs.tf entirely (optional, safer for decentralized setup)
+            outputs_tf_path = os.path.join(cluster_dir, "outputs.tf")
+            if os.path.exists(outputs_tf_path):
+                os.remove(outputs_tf_path)
+                logger.info(f"Deleted outputs.tf before applying changes to remove '{resource_name}'")
 
+            # Apply OpenTofu configuration to update state
             if not dryrun:
-                logger.info(
-                    f"Successfully removed node '{resource_name}' from cluster '{cluster_name}'"
+                CommandExecutor.run_command(
+                    ["tofu", "apply", "-auto-approve"],
+                    cwd=cluster_dir,
+                    description=f"Applying OpenTofu configuration after removing node {resource_name}",
                 )
+            else:
+                logger.info(f"Dryrun: would apply OpenTofu configuration after removing node '{resource_name}'")
+
+            logger.info(f"✅ Node '{resource_name}' removed successfully from cluster '{cluster_name}'")
 
         except Exception as e:
-            error_msg = f"Failed to remove node '{resource_name}' from cluster '{cluster_name}': {str(e)}"
+            error_msg = f"Failed to remove node '{resource_name}' from cluster '{cluster_name}': {str(e)}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
@@ -280,27 +368,41 @@ class Swarmchestrate:
         Raises:
             RuntimeError: If OpenTofu commands fail
         """
-        logger.info(f"Updating infrastructure in {cluster_dir}")
+        logger.debug(f"Updating infrastructure in {cluster_dir}")
 
         if not os.path.exists(cluster_dir):
-            error_msg = f"Cluster directory '{cluster_dir}' not found"
+            error_msg = f"Cluster directory '{cluster_dir}' not found"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
+        # Retrieve the environment variables for tofu logs
+        tf_log = os.getenv("TF_LOG", "INFO")
+        tf_log_path = os.getenv("TF_LOG_PATH", "/tmp/opentofu.log")
+
+        # Check if the environment variables are set
+        if not tf_log or not tf_log_path:
+            print("❌ Error: Missing required environment variables.")
+            exit(1)
+
+        # Prepare environment variables for subprocess
+        env_vars = os.environ.copy()
+        env_vars["TF_LOG"] = tf_log
+        env_vars["TF_LOG_PATH"] = tf_log_path
+
         try:
             # Initialise OpenTofu
             init_command = ["tofu", "init"]
             if dryrun:
                 logger.info("Dryrun: will init without backend and validate only")
                 init_command.append("-backend=false")
-            CommandExecutor.run_command(init_command, cluster_dir, "OpenTofu init")
+            CommandExecutor.run_command(init_command, cluster_dir, "OpenTofu init", env=env_vars)
 
             # Validate the deployment
             if dryrun:
                 CommandExecutor.run_command(
-                    ["tofu", "validate"], cluster_dir, "OpenTofu validate"
+                    ["tofu", "validate"], cluster_dir, "OpenTofu validate", env=env_vars
                 )
-                logger.info("Infrastructure successfully validated")
+                logger.info("Infrastructure successfully validated")
                 return
 
             # Plan the deployment
@@ -309,16 +411,17 @@ class Swarmchestrate:
                 cluster_dir,
                 "OpenTofu plan",
                 timeout=30,
+                env=env_vars,
             )
 
             # Apply the deployment
             CommandExecutor.run_command(
-                ["tofu", "apply", "-auto-approve"], cluster_dir, "OpenTofu apply"
+                ["tofu", "apply", "-auto-approve"], cluster_dir, "OpenTofu apply", env=env_vars
             )
             logger.info("Infrastructure successfully updated")
 
         except RuntimeError as e:
-            error_msg = f"Failed to deploy infrastructure: {str(e)}"
+            error_msg = f"Failed to deploy infrastructure: {str(e)}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
@@ -332,28 +435,29 @@ class Swarmchestrate:
         Raises:
             RuntimeError: If destruction fails
         """
-        logger.info(f"Destroying the K3s cluster '{cluster_name}'...")
+        logger.info(f"---------- Destroying the cluster '{cluster_name}' -----------")
 
         # Get the directory for the specified cluster
         cluster_dir = self.get_cluster_output_dir(cluster_name)
 
         if not os.path.exists(cluster_dir):
-            error_msg = f"Cluster directory '{cluster_dir}' not found"
+            error_msg = f"Cluster directory '{cluster_dir}' not found"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
 
         if dryrun:
-            logger.info("Dryrun: will only delete")
+            logger.info("Dryrun: will only delete cluster")
             shutil.rmtree(cluster_dir, ignore_errors=True)
             return
 
         try:
+
             # Plan destruction
             CommandExecutor.run_command(
                 ["tofu", "plan", "-destroy", "-input=false"],
                 cluster_dir,
                 "OpenTofu plan destruction",
-                timeout=30,
+                timeout=40,
             )
 
             # Execute destruction
@@ -365,9 +469,125 @@ class Swarmchestrate:
 
             # Remove the cluster directory
             shutil.rmtree(cluster_dir, ignore_errors=True)
-            logger.info(f"Removed cluster directory: {cluster_dir}")
+            logger.info(f"Removed cluster directory: {cluster_dir}")
+
+            # Remove schema and database entry from PostgreSQL
+            self.remove_cluster_schema_from_db(cluster_name)
 
         except RuntimeError as e:
-            error_msg = f"Failed to destroy cluster '{cluster_name}': {str(e)}"
+            error_msg = f"Failed to destroy cluster '{cluster_name}': {str(e)}"
             logger.error(error_msg)
             raise RuntimeError(error_msg)
+
+    def remove_cluster_schema_from_db(self, cluster_name: str) -> None:
+        """
+        Removes the schema and the entry for the cluster from the PostgreSQL database.
+
+        Args:
+            cluster_name: The name of the cluster to remove from the database
+
+        Raises:
+            RuntimeError: If the database operation fails
+        """
+        logger.info(f"Removing schema for cluster '{cluster_name}' from the PostgreSQL database...")
+
+        # Create a PostgreSQL connection string using the config
+        connection_string = self.pg_config.get_connection_string()
+
+        try:
+            # Connect to the PostgreSQL database
+            connection = psycopg2.connect(connection_string)
+            cursor = connection.cursor()
+
+            # Define the SQL query to delete the cluster schema
+            drop_schema_query = f'DROP SCHEMA IF EXISTS "{cluster_name}" CASCADE'
+            cursor.execute(drop_schema_query)
+
+            # Commit the transaction
+            connection.commit()
+
+            logger.info(f"Schema for cluster '{cluster_name}' removed from the database")
+            logger.info(f"----------- Destruction of cluster '{cluster_name}' successful -----------")
+
+        except psycopg2.Error as e:
+            logger.error(f"❌ Failed to remove schema for cluster '{cluster_name}' from the database: {e}")
+            raise RuntimeError(f" ❌Failed to remove schema for cluster '{cluster_name}' from the database")
+
+        finally:
+            # Close the database connection
+            if cursor:
+                cursor.close()
+            if connection:
+                connection.close()
+
+    def deploy_manifests(
+        self,
+        manifest_folder: str,
+        master_ip: str,
+        ssh_key_path: str,
+        ssh_user: str,
+    ):
+        """
+        Copy and apply manifests to a cluster using copy_manifest.tf in a temporary folder.
+
+        Args:
+            manifest_folder: Path to local manifest folder
+            master_ip: IP address of K3s master
+            ssh_key_path: Path to SSH private key
+            ssh_user: SSH username to connect to the master node
+        """
+        # Dedicated folder for copy-manifest operations
+        copy_dir = Path(self.output_dir) / "copy-manifest"
+        copy_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Using copy-manifest folder: {copy_dir}")
+
+        try:
+            # Copy copy_manifest.tf from templates
+            tf_source_file = Path(self.template_manager.templates_dir) / "deploy_manifest.tf"
+            if not tf_source_file.exists():
+                logger.debug(f"deploy_manifest.tf not found at: {tf_source_file}")
+                raise RuntimeError(f"deploy_manifest.tf not found at: {tf_source_file}")
+            shutil.copy(tf_source_file, copy_dir)
+            logger.debug(f"Copied copy_manifest.tf to {copy_dir}")
+
+            # Prepare environment for OpenTofu
+            env_vars = os.environ.copy()
+            env_vars["TF_LOG"] = os.getenv("TF_LOG", "INFO")
+            env_vars["TF_LOG_PATH"] = os.getenv("TF_LOG_PATH", "/tmp/opentofu.log")
+
+            logger.info(f"------------ Applying manifest on node: {master_ip} -------------------")
+
+            # Run tofu init with spinner
+            CommandExecutor.run_command(
+                ["tofu", "init"],
+                cwd=str(copy_dir),
+                description="OpenTofu init",
+                env=env_vars,
+            )
+
+            # Run tofu apply with spinner
+            CommandExecutor.run_command(
+                [
+                    "tofu",
+                    "apply",
+                    "-auto-approve",
+                    f"-var=manifest_folder={manifest_folder}",
+                    f"-var=master_ip={master_ip}",
+                    f"-var=ssh_private_key_path={ssh_key_path}",
+                    f"-var=ssh_user={ssh_user}"
+                ],
+                cwd=str(copy_dir),
+                description="OpenTofu apply",
+                env=env_vars,
+            )
+
+            logger.info("------------ Successfully applied manifests -------------------")
+
+        except RuntimeError as e:
+            print(f"\n---------- ERROR ----------\n{e}\n")
+            raise
+
+        finally:
+            if copy_dir.exists():
+                shutil.rmtree(copy_dir)