nvoi 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. checksums.yaml +4 -4
  2. data/.claude/todo/refactor/00-overview.md +171 -0
  3. data/.claude/todo/refactor/01-objects.md +96 -0
  4. data/.claude/todo/refactor/02-utils.md +143 -0
  5. data/.claude/todo/refactor/03-external-cloud.md +164 -0
  6. data/.claude/todo/refactor/04-external-dns.md +104 -0
  7. data/.claude/todo/refactor/05-external.md +133 -0
  8. data/.claude/todo/refactor/06-cli.md +123 -0
  9. data/.claude/todo/refactor/07-cli-deploy-command.md +177 -0
  10. data/.claude/todo/refactor/08-cli-deploy-steps.md +201 -0
  11. data/.claude/todo/refactor/09-cli-delete-command.md +169 -0
  12. data/.claude/todo/refactor/10-cli-exec-command.md +157 -0
  13. data/.claude/todo/refactor/11-cli-credentials-command.md +190 -0
  14. data/.claude/todo/refactor/_target.md +79 -0
  15. data/.claude/todo/scaleway.impl.md +644 -0
  16. data/.claude/todo/scaleway.reference.md +520 -0
  17. data/Gemfile +1 -0
  18. data/Gemfile.lock +12 -2
  19. data/doc/config-schema.yaml +44 -11
  20. data/examples/golang/deploy.enc +0 -0
  21. data/examples/golang/main.go +18 -0
  22. data/exe/nvoi +3 -1
  23. data/lib/nvoi/cli/credentials/edit/command.rb +384 -0
  24. data/lib/nvoi/cli/credentials/show/command.rb +35 -0
  25. data/lib/nvoi/cli/db/command.rb +308 -0
  26. data/lib/nvoi/cli/delete/command.rb +75 -0
  27. data/lib/nvoi/cli/delete/steps/detach_volumes.rb +98 -0
  28. data/lib/nvoi/cli/delete/steps/teardown_dns.rb +49 -0
  29. data/lib/nvoi/cli/delete/steps/teardown_firewall.rb +46 -0
  30. data/lib/nvoi/cli/delete/steps/teardown_network.rb +30 -0
  31. data/lib/nvoi/cli/delete/steps/teardown_server.rb +50 -0
  32. data/lib/nvoi/cli/delete/steps/teardown_tunnel.rb +44 -0
  33. data/lib/nvoi/cli/delete/steps/teardown_volume.rb +61 -0
  34. data/lib/nvoi/cli/deploy/command.rb +184 -0
  35. data/lib/nvoi/cli/deploy/steps/build_image.rb +27 -0
  36. data/lib/nvoi/cli/deploy/steps/cleanup_images.rb +42 -0
  37. data/lib/nvoi/cli/deploy/steps/configure_tunnel.rb +100 -0
  38. data/lib/nvoi/cli/deploy/steps/deploy_service.rb +396 -0
  39. data/lib/nvoi/cli/deploy/steps/provision_network.rb +44 -0
  40. data/lib/nvoi/cli/deploy/steps/provision_server.rb +143 -0
  41. data/lib/nvoi/cli/deploy/steps/provision_volume.rb +171 -0
  42. data/lib/nvoi/cli/deploy/steps/setup_k3s.rb +481 -0
  43. data/lib/nvoi/cli/exec/command.rb +173 -0
  44. data/lib/nvoi/cli.rb +83 -142
  45. data/lib/nvoi/config_api/actions/app.rb +53 -0
  46. data/lib/nvoi/config_api/actions/compute_provider.rb +55 -0
  47. data/lib/nvoi/config_api/actions/database.rb +70 -0
  48. data/lib/nvoi/config_api/actions/env.rb +32 -0
  49. data/lib/nvoi/config_api/actions/secret.rb +32 -0
  50. data/lib/nvoi/config_api/actions/server.rb +66 -0
  51. data/lib/nvoi/config_api/actions/volume.rb +40 -0
  52. data/lib/nvoi/config_api/base.rb +44 -0
  53. data/lib/nvoi/config_api/result.rb +26 -0
  54. data/lib/nvoi/config_api.rb +70 -0
  55. data/lib/nvoi/errors.rb +68 -50
  56. data/lib/nvoi/external/cloud/aws.rb +425 -0
  57. data/lib/nvoi/external/cloud/base.rb +99 -0
  58. data/lib/nvoi/external/cloud/factory.rb +48 -0
  59. data/lib/nvoi/external/cloud/hetzner.rb +376 -0
  60. data/lib/nvoi/external/cloud/scaleway.rb +533 -0
  61. data/lib/nvoi/external/cloud.rb +15 -0
  62. data/lib/nvoi/external/containerd.rb +82 -0
  63. data/lib/nvoi/external/database/mysql.rb +84 -0
  64. data/lib/nvoi/external/database/postgres.rb +82 -0
  65. data/lib/nvoi/external/database/provider.rb +65 -0
  66. data/lib/nvoi/external/database/sqlite.rb +72 -0
  67. data/lib/nvoi/external/database.rb +22 -0
  68. data/lib/nvoi/external/dns/cloudflare.rb +292 -0
  69. data/lib/nvoi/external/kubectl.rb +65 -0
  70. data/lib/nvoi/external/ssh.rb +106 -0
  71. data/lib/nvoi/objects/config_override.rb +60 -0
  72. data/lib/nvoi/objects/configuration.rb +463 -0
  73. data/lib/nvoi/objects/database.rb +56 -0
  74. data/lib/nvoi/objects/dns.rb +14 -0
  75. data/lib/nvoi/objects/firewall.rb +11 -0
  76. data/lib/nvoi/objects/network.rb +11 -0
  77. data/lib/nvoi/objects/server.rb +14 -0
  78. data/lib/nvoi/objects/service_spec.rb +26 -0
  79. data/lib/nvoi/objects/tunnel.rb +14 -0
  80. data/lib/nvoi/objects/volume.rb +17 -0
  81. data/lib/nvoi/utils/config_loader.rb +172 -0
  82. data/lib/nvoi/utils/constants.rb +61 -0
  83. data/lib/nvoi/{credentials/manager.rb → utils/credential_store.rb} +16 -16
  84. data/lib/nvoi/{credentials → utils}/crypto.rb +8 -5
  85. data/lib/nvoi/{config → utils}/env_resolver.rb +10 -2
  86. data/lib/nvoi/utils/logger.rb +84 -0
  87. data/lib/nvoi/{config/naming.rb → utils/namer.rb} +28 -25
  88. data/lib/nvoi/{deployer → utils}/retry.rb +23 -3
  89. data/lib/nvoi/utils/templates.rb +62 -0
  90. data/lib/nvoi/version.rb +1 -1
  91. data/lib/nvoi.rb +10 -54
  92. data/templates/error-backend.yaml.erb +134 -0
  93. metadata +97 -44
  94. data/examples/golang/deploy.yml +0 -54
  95. data/lib/nvoi/cloudflare/client.rb +0 -287
  96. data/lib/nvoi/config/config.rb +0 -248
  97. data/lib/nvoi/config/loader.rb +0 -102
  98. data/lib/nvoi/config/ssh_keys.rb +0 -82
  99. data/lib/nvoi/config/types.rb +0 -274
  100. data/lib/nvoi/constants.rb +0 -59
  101. data/lib/nvoi/credentials/editor.rb +0 -272
  102. data/lib/nvoi/deployer/cleaner.rb +0 -36
  103. data/lib/nvoi/deployer/image_builder.rb +0 -23
  104. data/lib/nvoi/deployer/infrastructure.rb +0 -126
  105. data/lib/nvoi/deployer/orchestrator.rb +0 -146
  106. data/lib/nvoi/deployer/service_deployer.rb +0 -311
  107. data/lib/nvoi/deployer/tunnel_manager.rb +0 -57
  108. data/lib/nvoi/deployer/types.rb +0 -8
  109. data/lib/nvoi/k8s/renderer.rb +0 -44
  110. data/lib/nvoi/k8s/templates.rb +0 -29
  111. data/lib/nvoi/logger.rb +0 -72
  112. data/lib/nvoi/providers/aws.rb +0 -403
  113. data/lib/nvoi/providers/base.rb +0 -111
  114. data/lib/nvoi/providers/hetzner.rb +0 -288
  115. data/lib/nvoi/providers/hetzner_client.rb +0 -170
  116. data/lib/nvoi/remote/docker_manager.rb +0 -203
  117. data/lib/nvoi/remote/ssh_executor.rb +0 -72
  118. data/lib/nvoi/remote/volume_manager.rb +0 -103
  119. data/lib/nvoi/service/delete.rb +0 -234
  120. data/lib/nvoi/service/deploy.rb +0 -80
  121. data/lib/nvoi/service/exec.rb +0 -144
  122. data/lib/nvoi/service/provider.rb +0 -36
  123. data/lib/nvoi/steps/application_deployer.rb +0 -26
  124. data/lib/nvoi/steps/database_provisioner.rb +0 -60
  125. data/lib/nvoi/steps/k3s_cluster_setup.rb +0 -105
  126. data/lib/nvoi/steps/k3s_provisioner.rb +0 -351
  127. data/lib/nvoi/steps/server_provisioner.rb +0 -43
  128. data/lib/nvoi/steps/services_provisioner.rb +0 -29
  129. data/lib/nvoi/steps/tunnel_configurator.rb +0 -66
  130. data/lib/nvoi/steps/volume_provisioner.rb +0 -154
data/lib/nvoi/cli/deploy/steps/provision_volume.rb
@@ -0,0 +1,171 @@
+ # frozen_string_literal: true
+
+ module Nvoi
+   class Cli
+     module Deploy
+       module Steps
+         # ProvisionVolume handles block storage volume provisioning
+         class ProvisionVolume
+           def initialize(config, provider, log)
+             @config = config
+             @provider = provider
+             @log = log
+             @namer = config.namer
+           end
+
+           def run
+             volumes = collect_volumes
+             return if volumes.empty?
+
+             @log.info "Provisioning %d volume(s)", volumes.size
+
+             volumes.each do |vol_config|
+               provision_volume(vol_config)
+             end
+
+             @log.success "All volumes provisioned"
+           end
+
+           private
+
+           def collect_volumes
+             volumes = []
+
+             @config.deploy.application.servers.each do |server_group, server_config|
+               next unless server_config.volumes && !server_config.volumes.empty?
+
+               resolved_server = @namer.server_name(server_group, 1)
+
+               server_config.volumes.each do |vol_name, vol_config|
+                 full_name = @namer.server_volume_name(server_group, vol_name)
+                 volumes << {
+                   name: full_name,
+                   server_name: resolved_server,
+                   mount_path: @namer.server_volume_host_path(server_group, vol_name),
+                   size: vol_config.size
+                 }
+               end
+             end
+
+             volumes
+           end
+
+           def provision_volume(vol_config)
+             @log.info "Provisioning volume: %s", vol_config[:name]
+
+             # Check if volume already exists
+             existing = @provider.get_volume_by_name(vol_config[:name])
+             if existing
+               @log.info "Volume already exists: %s", vol_config[:name]
+               ensure_attached_and_mounted(existing, vol_config)
+               return
+             end
+
+             # Find server to attach to
+             server = @provider.find_server(vol_config[:server_name])
+             raise Errors::VolumeError, "server not found: #{vol_config[:server_name]}" unless server
+
+             # Create volume
+             opts = Objects::Volume::CreateOptions.new(
+               name: vol_config[:name],
+               size: vol_config[:size],
+               server_id: server.id
+             )
+             volume = @provider.create_volume(opts)
+
+             # Attach volume
+             @log.info "Attaching volume to server..."
+             @provider.attach_volume(volume.id, server.id)
+
+             # Mount volume on server
+             mount_volume(server.public_ipv4, volume, vol_config[:mount_path])
+
+             @log.success "Volume provisioned and mounted: %s", vol_config[:name]
+           end
+
+           def ensure_attached_and_mounted(volume, vol_config)
+             server = @provider.find_server(vol_config[:server_name])
+             return unless server
+
+             # Attach if not attached
+             if volume.server_id.nil? || volume.server_id.empty?
+               @log.info "Attaching existing volume to server..."
+               @provider.attach_volume(volume.id, server.id)
+               volume = @provider.get_volume(volume.id)
+             else
+               @log.info "Volume already attached to server"
+             end
+
+             # Mount if not mounted
+             mount_volume(server.public_ipv4, volume, vol_config[:mount_path])
+           end
+
+           def mount_volume(server_ip, volume, mount_path)
+             ssh = External::Ssh.new(server_ip, @config.ssh_key_path)
+
+             # Get device path from provider
+             @log.info "Waiting for device path..."
+             device_path = @provider.wait_for_device_path(volume.id, ssh)
+             raise Errors::VolumeError, "volume #{volume.id} has no device path after attachment" unless device_path
+
+             @log.info "Device path: %s", device_path
+             @log.info "Waiting for device to be available on server..."
+
+             # Wait for device to be available
+             wait_for_device(ssh, device_path)
+
+             @log.info "Mounting volume at %s", mount_path
+
+             # Check if already mounted at target path
+             mount_check = ssh.execute("mountpoint -q #{mount_path} && echo 'mounted' || echo 'not'").strip
+             if mount_check == "mounted"
+               @log.info "Volume already mounted at %s", mount_path
+               return
+             end
+
+             # Create mount point
+             ssh.execute("sudo mkdir -p #{mount_path}")
+
+             # Check if device has filesystem
+             fs_check = ssh.execute("sudo blkid #{device_path} || true")
+             if fs_check.empty? || !fs_check.include?("TYPE=")
+               # Format with XFS
+               @log.info "Formatting volume with XFS"
+               ssh.execute("sudo mkfs.xfs #{device_path}")
+             end
+
+             # Mount
+             ssh.execute("sudo mount #{device_path} #{mount_path}")
+
+             # Add to fstab using UUID (more reliable than device path)
+             fstab_check = ssh.execute("grep '#{mount_path}' /etc/fstab || true")
+             if fstab_check.empty?
+               cmd = "UUID=$(sudo blkid -s UUID -o value #{device_path}) && " \
+                     "echo \"UUID=$UUID #{mount_path} xfs defaults,nofail 0 2\" | sudo tee -a /etc/fstab"
+               ssh.execute(cmd)
+             end
+
+             # Verify mount succeeded
+             verify_mount(ssh, mount_path)
+
+             @log.success "Volume mounted at %s", mount_path
+           end
+
+           def wait_for_device(ssh, device_path)
+             ready = Utils::Retry.poll(max_attempts: 30, interval: 2) do
+               check = ssh.execute("test -b #{device_path} && echo 'ready' || true")
+               check.strip == "ready"
+             end
+
+             raise Errors::VolumeError, "device not available: #{device_path}" unless ready
+           end
+
+           def verify_mount(ssh, mount_path)
+             check = ssh.execute("mountpoint -q #{mount_path} && echo 'mounted' || echo 'not mounted'")
+             raise Errors::VolumeError, "volume not mounted at #{mount_path}" unless check.strip == "mounted"
+           end
+         end
+       end
+     end
+   end
+ end
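
Both step classes in this diff gate their remote checks on Utils::Retry.poll (moved from deployer/retry.rb to utils/retry.rb in this release, per entry 88 in the file list). That file's diff is not shown here; what follows is a minimal sketch of a compatible helper, inferred purely from the call sites above, so the gem's actual implementation may differ:

    # Hypothetical reconstruction of Utils::Retry.poll based only on how it
    # is called in this diff; the real data/lib/nvoi/utils/retry.rb may differ.
    module Nvoi
      module Utils
        module Retry
          # Re-runs the block up to max_attempts times, sleeping `interval`
          # seconds between attempts. Returns true as soon as the block yields
          # a truthy value, and false if every attempt fails, which lets
          # callers raise their own domain-specific errors on timeout.
          def self.poll(max_attempts:, interval:)
            max_attempts.times do
              return true if yield

              sleep interval
            end
            false
          end
        end
      end
    end
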
data/lib/nvoi/cli/deploy/steps/setup_k3s.rb
@@ -0,0 +1,481 @@
+ # frozen_string_literal: true
+
+ module Nvoi
+   class Cli
+     module Deploy
+       module Steps
+         # SetupK3s handles K3s cluster installation and configuration
+         class SetupK3s
+           def initialize(config, provider, log, main_server_ip)
+             @config = config
+             @provider = provider
+             @log = log
+             @main_server_ip = main_server_ip
+           end
+
+           def run
+             @log.info "Setting up K3s cluster"
+
+             # Find master server group
+             master_group, master_config = find_master_group
+             raise Errors::K8sError, "no master server group found" unless master_group
+
+             # Setup K3s on master
+             master_name = @config.namer.server_name(master_group, 1)
+             master = @provider.find_server(master_name)
+             raise Errors::K8sError, "master server not found: #{master_name}" unless master
+
+             master_ssh = External::Ssh.new(master.public_ipv4, @config.ssh_key_path)
+
+             # Provision master
+             cluster_token, master_private_ip = provision_master(master_ssh, master_group, master_name, master.private_ipv4)
+
+             # Setup workers
+             @config.deploy.application.servers.each do |group_name, group_config|
+               next if group_name == master_group
+               next unless group_config
+
+               count = group_config.count.positive? ? group_config.count : 1
+
+               (1..count).each do |i|
+                 worker_name = @config.namer.server_name(group_name, i)
+                 setup_worker(worker_name, group_name, cluster_token, master_private_ip, master_ssh)
+               end
+             end
+
+             @log.success "K3s cluster setup complete"
+           end
+
+           private
+
+           def find_master_group
+             @config.deploy.application.servers.each do |name, cfg|
+               return [name, cfg] if cfg&.master
+             end
+
+             # If only one group, use it as master
+             if @config.deploy.application.servers.size == 1
+               return @config.deploy.application.servers.first
+             end
+
+             nil
+           end
+
+           def provision_master(ssh, server_role, server_name, private_ip)
+             wait_for_cloud_init(ssh)
+
+             # Discover private IP via SSH if not provided by provider
+             private_ip ||= discover_private_ip(ssh)
+             raise Errors::K8sError, "server has no private IP - ensure network is attached" unless private_ip
+
+             # Check if K3s is already running
+             begin
+               ssh.execute("systemctl is-active k3s")
+               @log.info "K3s already running, skipping installation"
+               setup_kubeconfig(ssh)
+               token = get_cluster_token(ssh)
+               return [token, private_ip]
+             rescue Errors::SshCommandError
+               # Not running, continue installation
+             end
+
+             @log.info "Installing K3s server"
+
+             private_iface = get_interface_for_ip(ssh, private_ip)
+
+             @log.info "Installing k3s on private IP: %s, interface: %s", private_ip, private_iface
+
+             # Install Docker for image building
+             install_docker(ssh, private_ip)
+
+             # Configure k3s registries
+             configure_registries(ssh)
+
+             # Install K3s
+             install_cmd = <<~CMD
+               curl -sfL https://get.k3s.io | sudo sh -s - server \
+                 --bind-address=#{private_ip} \
+                 --advertise-address=#{private_ip} \
+                 --node-ip=#{private_ip} \
+                 --tls-san=#{private_ip} \
+                 --flannel-iface=#{private_iface} \
+                 --flannel-backend=wireguard-native \
+                 --disable=traefik \
+                 --write-kubeconfig-mode=644 \
+                 --cluster-cidr=10.42.0.0/16 \
+                 --service-cidr=10.43.0.0/16
+             CMD
+
+             ssh.execute(install_cmd, stream: true)
+             @log.success "K3s server installed"
+
+             setup_kubeconfig(ssh, private_ip)
+             wait_for_k3s_ready(ssh)
+
+             # Label master node
+             label_node(ssh, server_name, { "nvoi.io/server-name" => server_role })
+
+             # Setup registry and ingress
+             setup_registry(ssh)
+             setup_ingress_controller(ssh)
+
+             token = get_cluster_token(ssh)
+             [token, private_ip]
+           end
+
+           def setup_worker(worker_name, group_name, cluster_token, master_private_ip, master_ssh)
+             @log.info "Setting up K3s worker: %s", worker_name
+
+             worker = @provider.find_server(worker_name)
+             unless worker
+               @log.warning "Worker server not found: %s", worker_name
+               return
+             end
+
+             worker_ssh = External::Ssh.new(worker.public_ipv4, @config.ssh_key_path)
+             wait_for_cloud_init(worker_ssh)
+
+             # Discover private IP via SSH if not provided by provider
+             private_ip = worker.private_ipv4 || discover_private_ip(worker_ssh)
+             unless private_ip
+               @log.warning "Worker %s has no private IP, skipping", worker_name
+               return
+             end
+
+             # Check if K3s agent is already running
+             begin
+               worker_ssh.execute("systemctl is-active k3s-agent")
+               @log.info "K3s agent already running on %s", worker_name
+               return
+             rescue Errors::SshCommandError
+               # Not running, continue
+             end
+
+             @log.info "Installing K3s agent on %s", worker_name
+
+             private_iface = get_interface_for_ip(worker_ssh, private_ip)
+
+             cmd = <<~CMD
+               curl -sfL https://get.k3s.io | K3S_URL="https://#{master_private_ip}:6443" K3S_TOKEN="#{cluster_token}" sh -s - agent \
+                 --node-ip=#{private_ip} \
+                 --flannel-iface=#{private_iface} \
+                 --node-name=#{worker_name}
+             CMD
+
+             worker_ssh.execute(cmd, stream: true)
+             @log.success "K3s agent installed on %s", worker_name
+
+             # Label worker node from master
+             label_worker_from_master(master_ssh, worker_name, group_name)
+           end
+
+           def wait_for_cloud_init(ssh)
+             @log.info "Waiting for cloud-init to complete"
+
+             ready = Utils::Retry.poll(max_attempts: 60, interval: 5) do
+               begin
+                 output = ssh.execute("test -f /var/lib/cloud/instance/boot-finished && echo 'ready'")
+                 output.include?("ready")
+               rescue Errors::SshCommandError
+                 false
+               end
+             end
+
+             raise Errors::K8sError, "cloud-init timeout" unless ready
+
+             @log.success "Cloud-init complete"
+           end
+
+           def get_cluster_token(ssh)
+             @log.info "Retrieving K3s cluster token"
+             output = ssh.execute("sudo cat /var/lib/rancher/k3s/server/node-token")
+             token = output.strip
+             raise Errors::K8sError, "cluster token is empty" if token.empty?
+
+             @log.success "Cluster token retrieved"
+             token
+           end
+
+           def discover_private_ip(ssh)
+             # Match RFC1918 private ranges, exclude docker/bridge interfaces
+             output = ssh.execute("ip addr show | grep -v 'docker\\|br-\\|veth' | grep -E 'inet (10\\.|172\\.(1[6-9]|2[0-9]|3[01])\\.|192\\.168\\.)' | awk '{print $2}' | cut -d/ -f1 | head -1")
+             ip = output.strip
+             ip.empty? ? nil : ip
+           end
+
+           def get_interface_for_ip(ssh, ip)
+             # Find the interface that has this IP
+             output = ssh.execute("ip addr show | grep 'inet #{ip}/' | awk '{print $NF}'").strip
+             return output unless output.empty?
+
+             # Fallback: find any interface with the IP prefix
+             prefix = ip.split(".")[0..2].join(".")
+             output = ssh.execute("ip addr show | grep -v 'docker\\|br-\\|veth' | grep 'inet #{prefix}\\.' | awk '{print $NF}' | head -1").strip
+             output.empty? ? nil : output
+           end
+
+           def install_docker(ssh, private_ip)
+             begin
+               ssh.execute("systemctl is-active docker")
+               @log.info "Docker already running, skipping installation"
+             rescue Errors::SshCommandError
+               docker_install = <<~CMD
+                 sudo apt-get update && sudo apt-get install -y docker.io
+                 sudo systemctl start docker
+                 sudo systemctl enable docker
+                 sudo usermod -aG docker deploy
+               CMD
+
+               ssh.execute(docker_install, stream: true)
+             end
+
+             # Configure Docker for insecure registry
+             docker_config = <<~CMD
+               sudo mkdir -p /etc/docker
+               sudo tee /etc/docker/daemon.json > /dev/null <<EOF
+               {"insecure-registries": ["#{private_ip}:5001", "localhost:30500"]}
+               EOF
+               sudo systemctl restart docker
+             CMD
+
+             ssh.execute(docker_config)
+
+             # Add registry domain to /etc/hosts
+             ssh.execute('grep -q "nvoi-registry.default.svc.cluster.local" /etc/hosts || echo "127.0.0.1 nvoi-registry.default.svc.cluster.local" | sudo tee -a /etc/hosts')
+           end
+
+           def configure_registries(ssh)
+             config = <<~CMD
+               sudo mkdir -p /etc/rancher/k3s
+               sudo tee /etc/rancher/k3s/registries.yaml > /dev/null <<'REGEOF'
+               mirrors:
+                 "nvoi-registry.default.svc.cluster.local:5000":
+                   endpoint:
+                     - "http://localhost:30500"
+                 "localhost:30500":
+                   endpoint:
+                     - "http://localhost:30500"
+               configs:
+                 "nvoi-registry.default.svc.cluster.local:5000":
+                   tls:
+                     insecure_skip_verify: true
+                 "localhost:30500":
+                   tls:
+                     insecure_skip_verify: true
+               REGEOF
+             CMD
+
+             ssh.execute(config)
+           end
+
+           def setup_kubeconfig(ssh, private_ip = nil)
+             private_ip ||= discover_private_ip(ssh)
+
+             cmd = <<~CMD
+               sudo mkdir -p /home/deploy/.kube
+               sudo cp /etc/rancher/k3s/k3s.yaml /home/deploy/.kube/config
+               sudo sed -i "s/127.0.0.1/#{private_ip}/g" /home/deploy/.kube/config
+               sudo chown -R deploy:deploy /home/deploy/.kube
+             CMD
+
+             ssh.execute(cmd)
+           end
+
+           def wait_for_k3s_ready(ssh)
+             @log.info "Waiting for K3s to be ready"
+
+             ready = Utils::Retry.poll(max_attempts: 60, interval: 5) do
+               begin
+                 output = ssh.execute("kubectl get nodes")
+                 output.include?("Ready")
+               rescue Errors::SshCommandError
+                 false
+               end
+             end
+
+             raise Errors::K8sError, "K3s failed to become ready" unless ready
+
+             @log.success "K3s is ready"
+           end
+
+           def label_node(ssh, node_name, labels)
+             actual_node = ssh.execute("kubectl get nodes -o jsonpath='{.items[0].metadata.name}'").strip
+
+             labels.each do |key, value|
+               ssh.execute("kubectl label node #{actual_node} #{key}=#{value} --overwrite")
+             end
+           end
+
+           def label_worker_from_master(master_ssh, worker_name, group_name)
+             @log.info "Labeling worker node: %s", worker_name
+
+             joined = Utils::Retry.poll(max_attempts: 30, interval: 5) do
+               begin
+                 output = master_ssh.execute("kubectl get nodes -o name")
+                 output.include?(worker_name)
+               rescue Errors::SshCommandError
+                 false
+               end
+             end
+
+             unless joined
+               @log.warning "Worker node did not join cluster in time: %s", worker_name
+               return
+             end
+
+             master_ssh.execute("kubectl label node #{worker_name} nvoi.io/server-name=#{group_name} --overwrite")
+             @log.success "Worker labeled: %s", worker_name
+           end
+
+           def setup_registry(ssh)
+             @log.info "Setting up in-cluster registry"
+
+             manifest = <<~YAML
+               apiVersion: v1
+               kind: Namespace
+               metadata:
+                 name: nvoi-system
+               ---
+               apiVersion: apps/v1
+               kind: Deployment
+               metadata:
+                 name: nvoi-registry
+                 namespace: default
+               spec:
+                 replicas: 1
+                 selector:
+                   matchLabels:
+                     app: nvoi-registry
+                 template:
+                   metadata:
+                     labels:
+                       app: nvoi-registry
+                   spec:
+                     containers:
+                       - name: registry
+                         image: registry:2
+                         ports:
+                           - containerPort: 5000
+                             protocol: TCP
+                         env:
+                           - name: REGISTRY_HTTP_ADDR
+                             value: "0.0.0.0:5000"
+                         volumeMounts:
+                           - name: registry-storage
+                             mountPath: /var/lib/registry
+                     volumes:
+                       - name: registry-storage
+                         emptyDir: {}
+               ---
+               apiVersion: v1
+               kind: Service
+               metadata:
+                 name: nvoi-registry
+                 namespace: default
+               spec:
+                 type: NodePort
+                 ports:
+                   - port: 5000
+                     targetPort: 5000
+                     nodePort: 30500
+                 selector:
+                   app: nvoi-registry
+             YAML
+
+             ssh.execute("cat <<'EOF' | kubectl apply -f -\n#{manifest}\nEOF")
+
+             # Wait for registry to be ready
+             @log.info "Waiting for registry to be ready"
+
+             ready = Utils::Retry.poll(max_attempts: 24, interval: 5) do
+               begin
+                 output = ssh.execute("kubectl get deployment nvoi-registry -n default -o jsonpath='{.status.readyReplicas}'")
+                 output.strip == "1"
+               rescue Errors::SshCommandError
+                 false
+               end
+             end
+
+             raise Errors::K8sError, "registry failed to become ready" unless ready
+
+             @log.success "In-cluster registry running on :30500"
+           end
+
+           def setup_ingress_controller(ssh)
+             @log.info "Setting up NGINX Ingress Controller"
+
+             ssh.execute("kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.10.0/deploy/static/provider/baremetal/deploy.yaml", stream: true)
+
+             @log.info "Waiting for NGINX Ingress Controller to be ready"
+
+             ready = Utils::Retry.poll(max_attempts: 60, interval: 10) do
+               begin
+                 ready_replicas = ssh.execute("kubectl get deployment ingress-nginx-controller -n ingress-nginx -o jsonpath='{.status.readyReplicas}'").strip
+                 desired_replicas = ssh.execute("kubectl get deployment ingress-nginx-controller -n ingress-nginx -o jsonpath='{.spec.replicas}'").strip
+
+                 !ready_replicas.empty? && !desired_replicas.empty? && ready_replicas == desired_replicas
+               rescue Errors::SshCommandError
+                 false
+               end
+             end
+
+             raise Errors::K8sError, "NGINX Ingress Controller failed to become ready" unless ready
+
+             @log.success "NGINX Ingress Controller is ready"
+             deploy_error_backend(ssh)
+             configure_custom_error_pages(ssh)
+           end
+
+           def deploy_error_backend(ssh)
+             @log.info "Deploying custom error backend"
+
+             Utils::Templates.apply_manifest(ssh, "error-backend.yaml", {})
+
+             ready = Utils::Retry.poll(max_attempts: 30, interval: 2) do
+               begin
+                 replicas = ssh.execute("kubectl get deployment nvoi-error-backend -n ingress-nginx -o jsonpath='{.status.readyReplicas}'").strip
+                 replicas == "1"
+               rescue Errors::SshCommandError
+                 false
+               end
+             end
+
+             raise Errors::K8sError, "Error backend failed to become ready" unless ready
+
+             @log.success "Error backend is ready"
+           end
+
+           def configure_custom_error_pages(ssh)
+             @log.info "Configuring custom error pages for 502, 503, 504"
+
+             patch_cmd = <<~CMD
+               kubectl patch configmap ingress-nginx-controller -n ingress-nginx --type merge -p '{"data":{"custom-http-errors":"502,503,504"}}'
+             CMD
+
+             ssh.execute(patch_cmd)
+
+             check_cmd = "kubectl get deployment ingress-nginx-controller -n ingress-nginx -o jsonpath='{.spec.template.spec.containers[0].args}'"
+             current_args = ssh.execute(check_cmd)
+
+             unless current_args.include?("--default-backend-service")
+               patch_deployment = <<~CMD
+                 kubectl patch deployment ingress-nginx-controller -n ingress-nginx --type=json -p='[
+                   {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--default-backend-service=ingress-nginx/nvoi-error-backend"}
+                 ]'
+               CMD
+
+               ssh.execute(patch_deployment)
+
+               @log.info "Waiting for ingress controller to restart..."
+               ssh.execute("kubectl rollout status deployment/ingress-nginx-controller -n ingress-nginx --timeout=120s")
+             else
+               @log.info "Custom error backend already configured"
+             end
+
+             @log.success "Custom error pages configured"
+           end
+         end
+       end
+     end
+   end
+ end
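
Both new step classes share the same shape: they are constructed with their collaborators, then expose a single #run. The sequencing itself lives in data/lib/nvoi/cli/deploy/command.rb (entry 34 in the file list), which is not part of this excerpt; the following is only a hypothetical sketch of how steps of this shape could compose, with config, provider, log, and main_server_ip assumed to come from the CLI context:

    # Hypothetical driver loop; the actual wiring in cli/deploy/command.rb is
    # not shown in this diff. config, provider, log, and main_server_ip are
    # assumed to be built earlier by the deploy command.
    steps = [
      Nvoi::Cli::Deploy::Steps::ProvisionVolume.new(config, provider, log),
      Nvoi::Cli::Deploy::Steps::SetupK3s.new(config, provider, log, main_server_ip)
    ]
    steps.each(&:run)
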