agentworks-cli 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentworks/__init__.py +1 -0
- agentworks/agents/__init__.py +0 -0
- agentworks/agents/manager.py +1095 -0
- agentworks/agents/templates.py +145 -0
- agentworks/catalog.py +264 -0
- agentworks/catalog.toml +131 -0
- agentworks/cli.py +1462 -0
- agentworks/completions/__init__.py +33 -0
- agentworks/completions/bash.py +179 -0
- agentworks/completions/install.py +122 -0
- agentworks/completions/powershell.py +270 -0
- agentworks/completions/spec.py +216 -0
- agentworks/completions/zsh.py +256 -0
- agentworks/config.py +894 -0
- agentworks/db.py +1083 -0
- agentworks/doctor.py +430 -0
- agentworks/git_credentials/__init__.py +0 -0
- agentworks/git_credentials/azdo.py +29 -0
- agentworks/git_credentials/base.py +71 -0
- agentworks/git_credentials/github.py +22 -0
- agentworks/nerf-config.yaml +16 -0
- agentworks/output.py +296 -0
- agentworks/remote_exec.py +286 -0
- agentworks/sample-config.toml +289 -0
- agentworks/sessions/__init__.py +0 -0
- agentworks/sessions/console.py +164 -0
- agentworks/sessions/manager.py +1297 -0
- agentworks/sessions/templates.py +101 -0
- agentworks/sessions/tmux.py +503 -0
- agentworks/sources.py +303 -0
- agentworks/ssh.py +759 -0
- agentworks/ssh_config.py +255 -0
- agentworks/vm_hosts/__init__.py +0 -0
- agentworks/vm_hosts/manager.py +86 -0
- agentworks/vms/__init__.py +0 -0
- agentworks/vms/backup.py +409 -0
- agentworks/vms/base.py +56 -0
- agentworks/vms/bootstrap_script.py +185 -0
- agentworks/vms/cloud_init.py +55 -0
- agentworks/vms/initializer.py +1523 -0
- agentworks/vms/manager.py +1122 -0
- agentworks/vms/provisioners/__init__.py +0 -0
- agentworks/vms/provisioners/azure.py +602 -0
- agentworks/vms/provisioners/lima.py +295 -0
- agentworks/vms/provisioners/proxmox.py +279 -0
- agentworks/vms/provisioners/proxmox_api.py +261 -0
- agentworks/vms/provisioners/wsl2.py +340 -0
- agentworks/vms/templates.py +152 -0
- agentworks/workspaces/__init__.py +0 -0
- agentworks/workspaces/backends/__init__.py +0 -0
- agentworks/workspaces/backends/local.py +119 -0
- agentworks/workspaces/backends/vm.py +175 -0
- agentworks/workspaces/manager.py +1080 -0
- agentworks/workspaces/templates.py +76 -0
- agentworks/workspaces/tmuxinator.py +80 -0
- agentworks_cli-0.2.1.dist-info/METADATA +635 -0
- agentworks_cli-0.2.1.dist-info/RECORD +59 -0
- agentworks_cli-0.2.1.dist-info/WHEEL +4 -0
- agentworks_cli-0.2.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,1122 @@
|
|
|
1
|
+
"""VM lifecycle management -- create, list, start, stop, delete."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from agentworks import output
|
|
8
|
+
from agentworks.config import VALID_PLATFORMS, validate_admin_username, validate_name
|
|
9
|
+
from agentworks.db import InitStatus, ProvisioningStatus, VMStatus
|
|
10
|
+
from agentworks.output import VMError
|
|
11
|
+
from agentworks.vms.initializer import (
|
|
12
|
+
initialize_vm,
|
|
13
|
+
rejoin_tailscale,
|
|
14
|
+
resolve_git_credential_providers,
|
|
15
|
+
run_initialization,
|
|
16
|
+
verify_git_credential_auth,
|
|
17
|
+
verify_tailscale_available,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from agentworks.config import Config
|
|
22
|
+
from agentworks.db import Database, VMRow
|
|
23
|
+
from agentworks.git_credentials.base import GitCredentialProvider
|
|
24
|
+
from agentworks.vms.base import VMProvisioner
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_provisioner(platform: str, vm_host_ssh: str | None = None) -> VMProvisioner:
    """Return a provisioner instance for the named platform.

    Provisioner classes are imported lazily, so only the module for the
    requested platform is loaded.

    Args:
        platform: Platform identifier ("lima", "azure" or "wsl2").
        vm_host_ssh: SSH host of a remote VM host; forwarded to the Lima
            provisioner only.

    Raises:
        ValueError: For "proxmox" (its provisioner requires config, so the
            caller must use the create_vm flow) and for any other
            unrecognised platform name.
    """
    if platform == "proxmox":
        # ProxmoxProvisioner requires config; caller must use create_vm flow
        raise ValueError("Use create_vm for proxmox provisioning")

    if platform == "lima":
        from agentworks.vms.provisioners.lima import LimaProvisioner

        return LimaProvisioner(vm_host_ssh=vm_host_ssh)

    if platform == "azure":
        from agentworks.vms.provisioners.azure import AzureProvisioner

        return AzureProvisioner()

    if platform == "wsl2":
        from agentworks.vms.provisioners.wsl2 import WSL2Provisioner

        return WSL2Provisioner()

    raise ValueError(f"Unknown platform: {platform}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def create_vm(
    db: Database,
    config: Config,
    *,
    name: str,
    template: str | None = None,
    platform: str | None = None,
    vm_host: str | None = None,
    cpus: int | None = None,
    memory: int | None = None,
    disk: int | None = None,
    azure_vm_size: str | None = None,
    admin_username: str | None = None,
) -> None:
    """Create a VM end to end: validate, provision, initialize, sync SSH config.

    Explicit keyword arguments win over the resolved template (cpus, memory,
    disk, azure_vm_size) or over ``config`` (platform, VM host, admin
    username). The platform falls back to "lima" when neither the argument
    nor ``config.defaults.platform`` is set.

    Failure handling differs by phase:

    * Provisioning failure: the freshly inserted DB record is deleted and a
      ``VMError`` is raised.
    * Initialization failure: the DB record is kept and a ``VMError`` is
      raised whose text points at 'vm delete' or 'vm reinit', depending on
      the provisioning status stored in the DB.
    * SSH config sync failure: reported as a warning only.

    Raises:
        VMError: On an invalid platform, a duplicate VM name, an unknown VM
            host, a missing [azure]/[proxmox] config section, or a
            provisioning/initialization failure.
    """
    from dataclasses import replace as _replace

    from agentworks.vms.templates import resolve_template

    tmpl = resolve_template(config, template)

    # When a template was named explicitly, swap it into config.vm so that
    # downstream code (initializer, provisioners) uses the template's values.
    if template is not None:
        config = _replace(config, vm=tmpl)

    # Platform: argument > config default > "lima"
    platform = platform or config.defaults.platform or "lima"
    if platform not in VALID_PLATFORMS:
        raise VMError(f"invalid platform '{platform}'")

    vm_name = name
    validate_name(vm_name)

    if db.get_vm(vm_name) is not None:
        raise VMError(f"VM '{vm_name}' already exists")

    # A VM host is only resolved for Lima
    vm_host_ssh: str | None = None
    vm_host_name: str | None = None
    if platform == "lima":
        vm_host_name = vm_host or config.defaults.vm_host
        if vm_host_name:
            host_row = db.get_vm_host(vm_host_name)
            if host_row is None:
                raise VMError(f"VM host '{vm_host_name}' not found")
            vm_host_ssh = host_row.ssh_host

    # Azure / Proxmox need their own config section to be present
    for section in ("azure", "proxmox"):
        if platform == section and getattr(config, section) is None:
            raise VMError(f"[{section}] config section required for {section} platform")

    # Resource settings: CLI flag > template value
    eff_cpus = tmpl.cpus if cpus is None else cpus
    eff_memory = tmpl.memory if memory is None else memory
    eff_disk = tmpl.disk if disk is None else disk
    eff_azure_size = azure_vm_size or tmpl.azure_vm_size
    eff_admin = admin_username or config.admin.username
    validate_admin_username(eff_admin)

    # Pre-flight checks
    verify_tailscale_available()
    providers = resolve_git_credential_providers(config, config.admin.git_credentials)
    verify_git_credential_auth(providers)

    # Collect secrets upfront so the user isn't interrupted mid-provisioning
    tailscale_auth_key, git_tokens = _collect_secrets(providers, vm_name)

    # Record the VM with its as-provisioned resource values
    db.insert_vm(
        vm_name,
        platform=platform,
        vm_host_name=vm_host_name,
        template=tmpl.name,
        cpus=eff_cpus,
        memory_gib=eff_memory,
        disk_gib=eff_disk,
        swap_gib=tmpl.swap,
        admin_username=eff_admin,
    )

    # -- Provisioning --
    # If this fails, nothing was created on the remote host (or the remote
    # couldn't be reached), so we clean up the DB record. The provisioner
    # imports stay inside the try so an import failure is handled the same way.
    try:
        if platform == "lima":
            from agentworks.vms.provisioners.lima import LimaProvisioner

            result = LimaProvisioner(vm_host_ssh=vm_host_ssh).create(
                vm_name,
                config,
                cpus=eff_cpus,
                memory=eff_memory,
                disk=eff_disk,
                tailscale_auth_key=tailscale_auth_key,
            )
        elif platform == "azure":
            from agentworks.vms.provisioners.azure import AzureProvisioner

            result = AzureProvisioner().create(
                vm_name,
                config,
                azure_vm_size=eff_azure_size,
                disk=eff_disk,
                admin_username=eff_admin,
                tailscale_auth_key=tailscale_auth_key,
            )
        elif platform == "wsl2":
            from agentworks.vms.provisioners.wsl2 import WSL2Provisioner

            result = WSL2Provisioner().create(
                vm_name,
                config,
                admin_username=eff_admin,
            )
        elif platform == "proxmox":
            from agentworks.vms.provisioners.proxmox import ProxmoxProvisioner

            result = ProxmoxProvisioner(config.proxmox).create(  # type: ignore[arg-type]
                vm_name,
                config,
                cpus=eff_cpus,
                memory=eff_memory,
                disk=eff_disk,
                admin_username=eff_admin,
                tailscale_auth_key=tailscale_auth_key,
            )
        else:
            raise ValueError(f"Unknown platform: {platform}")
    except Exception as exc:
        db.delete_vm(vm_name)
        raise VMError(f"provisioning failed: {exc}") from exc

    # Persist whichever platform-specific identifiers the provisioner returned
    if result.azure_resource_id:
        db.update_vm_azure_resource_id(vm_name, result.azure_resource_id)
    if result.wsl_distro_name:
        db.update_vm_wsl_distro_name(vm_name, result.wsl_distro_name)
    if result.proxmox_vmid:
        db.update_vm_proxmox_vmid(vm_name, result.proxmox_vmid)

    # -- Initialization --
    # If this fails, the VM exists on the remote host and may be debuggable.
    # Keep the DB record so the user can reinit or delete.
    # The callback detaches the Azure public IP once Tailscale is up
    # (before Phase B starts). This minimizes the window where the VM has
    # a public IP exposed to the internet.
    def _detach_azure_public_ip() -> None:
        if platform == "azure":
            from agentworks.vms.provisioners.azure import AzureProvisioner as _AP

            created = db.get_vm(vm_name)
            assert created is not None
            _AP().detach_public_ip(created)

    try:
        initialize_vm(
            db,
            config,
            vm_name,
            exec_target=result.admin_exec_target,
            providers=providers,
            is_wsl2=(platform == "wsl2"),
            admin_username=eff_admin,
            tailscale_auth_key=tailscale_auth_key,
            git_tokens=git_tokens,
            bootstrap_complete=result.bootstrap_complete,
            tailscale_ip=result.tailscale_ip,
            on_tailscale_ready=_detach_azure_public_ip,
        )
    except Exception as exc:
        from agentworks.ssh import LOG_DIR

        # Point the user at the most recent create log, if one exists
        logs = sorted(LOG_DIR.glob(f"{vm_name}-*-vm-create.log"), reverse=True)
        log_hint = f"\nDetails: {logs[0]}" if logs else ""

        row = db.get_vm(vm_name)
        provisioning_failed = (
            row is not None and row.provisioning_status == ProvisioningStatus.FAILED.value
        )
        if provisioning_failed:
            raise VMError(
                f"provisioning failed: {exc}{log_hint}\n"
                f"VM '{vm_name}' is in a failed state. Use 'vm delete {vm_name}' to clean up."
            ) from exc
        raise VMError(
            f"initialization failed: {exc}{log_hint}\n"
            f"VM '{vm_name}' may still be usable. Use 'vm reinit {vm_name}' to retry."
        ) from exc

    # -- Post-init: SSH config (best effort) --
    try:
        from agentworks.ssh_config import sync_ssh_config

        sync_ssh_config(config, db)
    except Exception as exc:
        output.warn(f"SSH config sync failed: {exc}")
        output.detail("VM is likely still usable.")

    # Final status is set by initialize_vm (COMPLETE or PARTIAL)
    vm = db.get_vm(vm_name)
    assert vm is not None
    output.info(
        f"VM '{vm_name}' is ready (with warnings -- see above)"
        if vm.init_status == InitStatus.PARTIAL.value
        else f"VM '{vm_name}' is ready!"
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def list_vms(db: Database) -> None:
    """Print a table with one row per registered VM.

    Each row shows name, platform, template, VM host, provisioning status,
    init status, workspace/agent/session counts, Tailscale host and creation
    time. Prints "No VMs registered." when the database holds no VMs.
    """
    rows = db.list_vms()
    if not rows:
        output.info("No VMs registered.")
        return

    header = (
        f"{'NAME':<20} {'PLATFORM':<10} {'TEMPLATE':<12} {'HOST':<15} {'PROV':<12} {'INIT':<12} "
        f"{'WS/AG/TS':<10} {'TAILSCALE':<20} {'CREATED'}"
    )
    output.info(header)
    output.info("-" * len(header))

    for vm in rows:
        # Three count queries per VM: workspaces / agents / sessions
        n_workspaces = db.count_workspaces_on_vm(vm.name)
        n_agents = db.count_agents_on_vm(vm.name)
        n_sessions = db.count_sessions_on_vm(vm.name)
        counts = f"{n_workspaces}/{n_agents}/{n_sessions}"
        output.info(
            f"{vm.name:<20} {vm.platform:<10} {vm.template or '-':<12} {vm.vm_host_name or '-':<15} "
            f"{vm.provisioning_status:<12} {vm.init_status:<12} "
            f"{counts:<10} {vm.tailscale_host or '-':<20} {vm.created_at}"
        )
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def describe_vm(db: Database, config: Config, name: str) -> None:
    """Print a detailed report for one VM.

    The report contains the VM's stored attributes, a resources table
    (provisioned values next to live values when the VM has a Tailscale
    host), platform-specific identifiers, and the agents, workspaces (with
    their sessions) and events recorded for the VM.
    """
    vm = _require_vm(db, name)

    # VM details
    for line in (
        f"Name: {vm.name}",
        f"Created: {vm.created_at}",
        f"Platform: {vm.platform}",
        f"Template: {vm.template or '-'}",
        f"VM Host: {vm.vm_host_name or '-'}",
        f"Admin User: {vm.admin_username}",
        f"Provisioning: {vm.provisioning_status}",
        f"Initialization: {vm.init_status}",
        f"Tailscale IP: {vm.tailscale_host or '-'}",
    ):
        output.info(line)

    # Resources table: Provisioned / Current / Used (Used%).
    # Live values are only queried when the VM has a Tailscale host.
    live = _query_live_resources(vm, config) if vm.tailscale_host is not None else None

    if vm.cpus is not None or live is not None:
        output.info(f"\n{'Resources':<16}{'Provisioned':<14}{'Current':<14}{'Used'}")

        cpu_prov = str(vm.cpus) if vm.cpus else "-"
        cpu_now = live["cpus"] if live else "-"
        cpu_used = "load " + live["load_avg"] if live else "-"
        output.detail(f"{'CPU':<16}{cpu_prov:<14}{cpu_now:<14}{cpu_used}")

        mem_prov = str(vm.memory_gib) + "G" if vm.memory_gib else "-"
        mem_now = live["mem_total"] if live else "-"
        mem_used = live["mem_used"] + " (" + live["mem_pct"] + ")" if live else "-"
        output.detail(f"{'Memory':<16}{mem_prov:<14}{mem_now:<14}{mem_used}")

        swap_prov = str(vm.swap_gib) + "G" if vm.swap_gib else "-"
        swap_now = live["swap_total"] if live else "-"
        swap_used = live["swap_used"] + " (" + live["swap_pct"] + ")" if live else "-"
        output.detail(f"{'Swap':<16}{swap_prov:<14}{swap_now:<14}{swap_used}")

        disk_prov = str(vm.disk_gib) + "G" if vm.disk_gib else "-"
        disk_now = live["disk_total"] if live else "-"
        disk_used = live["disk_used"] + " (" + live["disk_pct"] + ")" if live else "-"
        output.detail(f"{'Disk':<16}{disk_prov:<14}{disk_now:<14}{disk_used}")

    # Optional, platform-specific fields
    if vm.azure_resource_id:
        output.info(f"Azure ID: {vm.azure_resource_id}")
    if vm.wsl_distro_name:
        output.info(f"WSL Distro: {vm.wsl_distro_name}")
    if vm.proxmox_vmid:
        output.info(f"Proxmox VMID: {vm.proxmox_vmid}")
    if vm.last_seen_at:
        output.info(f"Last Seen: {vm.last_seen_at}")

    # Agents on this VM
    agents = db.list_agents(vm_name=name)
    output.info(f"\nAgents ({len(agents)}):")
    if not agents:
        output.detail("(none)")
    for agent in agents:
        grant_count = db.count_agent_grants(agent.name)
        grant_label = "all" if agent.grant_all else str(grant_count)
        output.detail(f"{agent.name} (user: {agent.linux_user}, grants: {grant_label})")

    # Workspaces, each followed by its sessions
    workspaces = db.list_workspaces(vm_name=name)
    output.info(f"\nWorkspaces ({len(workspaces)}):")
    if not workspaces:
        output.detail("(none)")
    for ws in workspaces:
        output.detail(f"{ws.name} ({ws.workspace_path})")

        sessions = db.list_sessions(workspace_name=ws.name)
        if not sessions:
            output.detail("(no sessions)", indent=2)
            continue
        output.detail(f"Sessions ({len(sessions)}):", indent=2)
        for session in sessions:
            mode_label = f"agent:{session.agent_name}" if session.agent_name else "admin"
            output.detail(f"{session.name} [{session.template}] {mode_label}", indent=3)

    # Events
    events = db.list_vm_events(name)
    output.info(f"\nEvents ({len(events)}):")
    if not events:
        output.detail("(none)")
    for event in events:
        evt_detail = f" {event.detail}" if event.detail else ""
        output.detail(f"{event.created_at} {event.event}{evt_detail}")
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def shell_vm(db: Database, config: Config, name: str) -> None:
    """Open an interactive SSH session on a VM as its admin user.

    Connects to the VM's Tailscale host with a forced TTY, using the
    operator's private key when one is configured. This function does not
    return: the process exits with ssh's return code.

    Raises:
        VMError: If the VM has no Tailscale IP recorded.
    """
    import subprocess
    import sys

    vm = _require_vm(db, name)
    _guard_failed_vm(vm)
    if vm.tailscale_host is None:
        raise VMError(f"VM '{name}' has no Tailscale IP (init may not be complete)")

    private_key = config.operator.ssh_private_key
    identity_args = ["-i", str(private_key)] if private_key else []
    argv = ["ssh", "-t", *identity_args, f"{vm.admin_username}@{vm.tailscale_host}"]

    sys.exit(subprocess.call(argv))
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def exec_vm(db: Database, config: Config, name: str, command: list[str]) -> int:
    """Execute a command on a VM via a direct SSH subprocess.

    Uses inherited stdio for streaming output without buffering.

    Args:
        db: Database used to look up the VM.
        config: Supplies the operator's SSH private key, when configured.
        name: Name of the VM to run the command on.
        command: Command to run. A single element is sent verbatim (so it
            may be a full shell string); several elements are joined with
            shell quoting.

    Returns:
        The exit status of the ssh subprocess.

    Raises:
        VMError: If ``command`` is empty or the VM has no Tailscale IP.
    """
    import shlex
    import subprocess

    # An empty command would be sent to ssh as "", which makes ssh start a
    # shell reading from the inherited stdin instead of running anything --
    # with -T/BatchMode that looks like a hang. Reject it up front.
    if not command:
        raise VMError("no command specified")

    vm = _require_vm(db, name)
    _guard_failed_vm(vm)
    if vm.tailscale_host is None:
        raise VMError(f"VM '{name}' has no Tailscale IP (init may not be complete)")

    ssh_cmd = ["ssh", "-T", "-o", "StrictHostKeyChecking=accept-new", "-o", "BatchMode=yes"]
    if config.operator.ssh_private_key:
        ssh_cmd.extend(["-i", str(config.operator.ssh_private_key)])
    ssh_cmd.append(f"{vm.admin_username}@{vm.tailscale_host}")
    # A lone argument is passed through untouched so callers can hand over a
    # ready-made shell string; multiple arguments are quoted individually.
    ssh_cmd.append(command[0] if len(command) == 1 else shlex.join(command))

    return subprocess.call(ssh_cmd)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def add_git_credential(db: Database, config: Config, name: str, credential_name: str) -> None:
    """Add or replace a git credential in the admin user's ~/.git-credentials.

    A token is obtained from the credential's provider, existing entries for
    the same host/path are dropped, the new entries are appended, and git is
    configured globally to use the "store" credential helper.

    Raises:
        VMError: If the VM has no Tailscale IP or ``credential_name`` is not
            present in ``config.git_credentials``.
    """
    from agentworks.ssh import admin_exec_target

    vm = _require_vm(db, name)
    _guard_failed_vm(vm)
    if vm.tailscale_host is None:
        raise VMError(f"VM '{name}' has no Tailscale IP (init may not be complete)")

    if config.git_credentials.get(credential_name) is None:
        raise VMError(f"git credential '{credential_name}' not found in config")

    provider = resolve_git_credential_providers(config, [credential_name])[credential_name]
    fresh_lines = provider.credential_lines(provider.obtain_token(name))

    target = admin_exec_target(vm, config)

    # A missing credentials file reads as empty output, i.e. no existing lines
    current = target.run("cat ~/.git-credentials 2>/dev/null || true").stdout.strip().splitlines()

    def _hostpath(line: str) -> str | None:
        # "https://user:tok@host/path" -> "host/path"; None when there is no "@"
        _, sep, rest = line.partition("@")
        return rest if sep else None

    replaced = {hp for hp in map(_hostpath, fresh_lines) if hp is not None}

    # Keep every old entry except those whose host/path is being replaced
    kept = [line for line in current if _hostpath(line) not in replaced]

    # Write back kept + new
    cred_content = "\n".join(kept + fresh_lines) + "\n"
    target.write_file("~/.git-credentials", cred_content, mode="600")
    target.run("git config --global credential.helper store")

    output.info(f"Git credential '{credential_name}' configured on VM '{name}'")
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def start_vm(db: Database, config: Config, name: str) -> None:
    """Start a VM unless it is already running, then make sure Tailscale is up.

    The Tailscale check runs in both cases, so calling this on a running VM
    still goes through ``_ensure_tailscale``.
    """
    vm = _require_vm(db, name)
    _guard_failed_vm(vm)
    # Pass config through, as rekey_vm does: the Proxmox provisioner is built
    # from config.proxmox, so a lookup without config cannot construct it.
    # NOTE(review): the helper is defined outside this view -- confirm that it
    # needs config for proxmox VMs.
    provisioner = _get_provisioner_for_vm(db, vm, config)
    if provisioner.status(vm) == VMStatus.RUNNING:
        output.info(f"VM '{name}' is already running")
    else:
        provisioner.start(vm)

    _ensure_tailscale(db, config, vm, provisioner)
    output.info(f"VM '{name}' is ready")
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def stop_vm(db: Database, config: Config, name: str) -> None:
    """Stop a VM unless it is already stopped or deallocated."""
    vm = _require_vm(db, name)
    _guard_failed_vm(vm)
    # Pass config through, as rekey_vm does: the Proxmox provisioner is built
    # from config.proxmox, so a lookup without config cannot construct it.
    # NOTE(review): the helper is defined outside this view -- confirm that it
    # needs config for proxmox VMs.
    provisioner = _get_provisioner_for_vm(db, vm, config)
    if provisioner.status(vm) in (VMStatus.STOPPED, VMStatus.DEALLOCATED):
        output.info(f"VM '{name}' is already stopped")
        return
    provisioner.stop(vm)
    output.info(f"VM '{name}' stopped")
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def rekey_vm(
    db: Database,
    config: Config,
    name: str,
    *,
    wait_for_share: bool = False,
    ignore_env: bool = False,
) -> None:
    """Give a running VM a new Tailscale auth key (logout, then rejoin).

    Useful for rotating keys, switching tailnets, or recovering from expired
    ephemeral keys. All remote commands go through the provisioner's
    admin_exec_target (out-of-band transport), because Tailscale
    connectivity drops during the operation. For Azure VMs a public IP is
    attached for the duration and detached again afterwards, even on failure.

    Args:
        db: Database holding the VM record; receives the new Tailscale IP
            and a "rekey" event.
        config: Used for the exec targets and the SSH config sync.
        name: Name of the VM to rekey.
        wait_for_share: Pause for operator confirmation before the final
            connectivity check.
        ignore_env: Skip the TAILSCALE_AUTH_KEY environment variable and
            always prompt for the key.

    Raises:
        VMError: If the VM is not running.
        SSHError: If the provisioning transport stays unreachable after six
            attempts, or ``tailscale ip -4`` does not yield a valid IPv4
            address.
    """
    import ipaddress
    import os
    import shlex
    import time
    from dataclasses import replace

    from agentworks.ssh import SSHError, admin_exec_target, wait_for_reconnect
    from agentworks.ssh_config import sync_ssh_config
    from agentworks.vms.provisioners.azure import AzureProvisioner

    vm = _require_vm(db, name)
    _guard_failed_vm(vm)

    provisioner = _get_provisioner_for_vm(db, vm, config)
    status = provisioner.status(vm)
    if status != VMStatus.RUNNING:
        raise VMError(f"VM '{name}' is not running (status: {status.value})")

    # New auth key: environment first (unless disabled), otherwise prompt
    ts_auth_key = None if ignore_env else os.environ.get("TAILSCALE_AUTH_KEY")
    if ts_auth_key:
        output.detail("Tailscale auth key found in environment")
    else:
        ts_auth_key = output.prompt_secret(
            "Tailscale auth key",
            hint="Generate a key at https://login.tailscale.com/admin/settings/keys",
        )

    output.info(f"Rekeying '{name}'...")

    # For Azure, attach a temporary public IP for out-of-band access
    azure_provisioner = provisioner if isinstance(provisioner, AzureProvisioner) else None
    if azure_provisioner is not None:
        azure_provisioner.attach_public_ip(vm)

    try:
        exec_target = provisioner.admin_exec_target(vm, config=config)

        # Wait for the provisioning transport to be reachable
        output.detail("Waiting for provisioning transport...")
        max_attempts = 6
        for attempt in range(1, max_attempts + 1):
            try:
                exec_target.run("echo ok", timeout=10)
            except SSHError:
                if attempt == max_attempts:
                    raise
                output.detail(f"Attempt {attempt} failed, retrying...")
                time.sleep(5)
            else:
                break
        output.detail("Connected.")

        # Restart, logout, login, restart. The initial restart clears any
        # stale daemon state (a previous interrupted rekey can leave the
        # daemon in a state where `tailscale logout` hangs waiting for a
        # control plane response that never comes). The final restart
        # fixes a Tailscale bug where the node registers but peers can't
        # reach it after rekeying to a different tailnet.
        # Restart command varies by platform. WSL2 may not have systemd.
        is_wsl2 = vm.platform == "wsl2"
        restart_cmd = "service tailscaled restart" if is_wsl2 else "systemctl restart tailscaled"
        stabilize_secs = 15  # pause between steps for daemon/network stability

        def _run_and_settle(remote_cmd: str, timeout: int) -> None:
            # Run one privileged step, then give the daemon time to settle
            exec_target.run(remote_cmd, sudo=True, timeout=timeout)
            time.sleep(stabilize_secs)

        output.detail("Restarting Tailscale daemon...")
        _run_and_settle(restart_cmd, 15)

        output.detail("Logging out of current tailnet...")
        _run_and_settle("tailscale logout", 30)

        output.detail("Joining new tailnet...")
        ts_up_cmd = f"tailscale up --auth-key {shlex.quote(ts_auth_key)}"
        if is_wsl2:
            ts_up_cmd += " --userspace-networking"
        _run_and_settle(ts_up_cmd, 30)

        output.detail("Restarting Tailscale daemon...")
        _run_and_settle(restart_cmd, 15)

        output.detail("Reading new Tailscale IP...")
        raw_ip = exec_target.run("tailscale ip -4", sudo=True, timeout=15).stdout.strip()
        # Only the first output line is treated as the address
        new_ip = raw_ip.splitlines()[0].strip() if raw_ip else ""
        try:
            ipaddress.IPv4Address(new_ip)
        except ValueError:
            raise SSHError(
                f"tailscale ip -4 returned invalid address: {new_ip!r}\nfull output: {raw_ip}"
            ) from None
        output.detail(f"Tailscale IP: {new_ip}")

        # Update DB and SSH config with the new IP (correct regardless of
        # reachability -- the old IP is definitely dead after logout)
        db.update_vm_tailscale(name, new_ip)
        sync_ssh_config(config, db)
        db.insert_vm_event(name, "rekey", f"new_ip={new_ip}")

        # If the operator needs to share the VM back, pause before connectivity check
        if wait_for_share:
            output.pause(
                "Share the VM back to your tailnet, then press Enter to verify connectivity..."
            )

        # Always verify Tailscale SSH connectivity to the new IP. `vm` still
        # carries the old address, so the target's host is overridden.
        output.detail(f"Verifying SSH to {new_ip}...")
        ts_target = admin_exec_target(vm, config)
        assert ts_target.ssh is not None
        ts_target = replace(ts_target, ssh=replace(ts_target.ssh, host=new_ip))
        if wait_for_reconnect(ts_target):
            output.info(f"VM '{name}' rekeyed successfully. Tailscale IP: {new_ip}")
        else:
            output.warn(
                f"VM '{name}' rekeyed but {new_ip} is not reachable via SSH. "
                "Check tailnet sharing/ACLs. Run 'vm rekey' again to retry."
            )

    finally:
        if azure_provisioner is not None:
            azure_provisioner.detach_public_ip(vm)
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def delete_vm(
    db: Database,
    config: Config,
    name: str,
    *,
    force: bool = False,
    yes: bool = False,
) -> None:
    """Delete a VM, cleaning up all associated resources.

    Steps: refuse if the VM still has workspaces (unless ``force``), ask for
    confirmation (unless ``yes`` or ``force``), run platform cleanup
    (best-effort Tailscale logout, then the provisioner's delete), remove the
    VM's log files, delete the DB record and rebuild the SSH config.

    Platform cleanup is best effort: a failure there is reported as a
    warning and the local record is removed anyway.

    Args:
        db: Database holding the VM record.
        config: Used for the provisioner lookup, the Tailscale logout and
            the SSH config sync.
        name: Name of the VM to delete.
        force: Delete even when workspaces exist; also skips confirmation.
        yes: Skip the confirmation prompt.

    Raises:
        VMError: If the VM has workspaces and ``force`` is not set.
        output.UserAbort: If the user declines the confirmation prompt.
    """
    vm = _require_vm(db, name)

    # Check for workspaces (which contain agents and sessions)
    ws_count = db.count_workspaces_on_vm(name)
    ag_count = db.count_agents_on_vm(name)
    ts_count = db.count_sessions_on_vm(name)

    if ws_count > 0 and not force:
        parts = [f"{ws_count} workspace(s)"]
        if ag_count > 0:
            parts.append(f"{ag_count} agent(s)")
        if ts_count > 0:
            parts.append(f"{ts_count} session(s)")
        raise VMError(f"VM '{name}' has {', '.join(parts)}. Delete them first, or use --force.")

    # Reaching the prompt with force unset means the VM has no workspaces
    # (the check above would have raised), so the prompt needs no child
    # summary.
    if not (yes or force) and not output.confirm(f"Delete VM '{name}'?"):
        raise output.UserAbort("delete cancelled")

    # Platform-specific cleanup (also handles Tailscale logout)
    try:
        # Pass config through, as rekey_vm does: the Proxmox provisioner is
        # built from config.proxmox, so a lookup without config cannot
        # construct it. NOTE(review): the helper is defined outside this
        # view -- confirm that it needs config for proxmox VMs.
        provisioner = _get_provisioner_for_vm(db, vm, config)

        # Tailscale logout (best-effort, via provisioning transport)
        if vm.tailscale_host:
            _tailscale_logout(provisioner, vm, config)

        provisioner.delete(vm)
    except Exception as e:
        output.warn(f"platform cleanup failed: {e}")

    # Clean up logs
    from agentworks.ssh import LOG_DIR

    vm_logs = list(LOG_DIR.glob(f"{name}-*.log")) if LOG_DIR.exists() else []
    for log in vm_logs:
        log.unlink(missing_ok=True)
    if vm_logs:
        output.info(f"Cleaned up {len(vm_logs)} log(s)")

    # Remove from DB (cascades workspaces and agents), then rebuild SSH config
    db.delete_vm(name)

    from agentworks.ssh_config import sync_ssh_config

    sync_ssh_config(config, db)
    output.info(f"VM '{name}' deleted")
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def reinit_vm(
    db: Database,
    config: Config,
    name: str,
) -> None:
    """Run initialization again on an already-provisioned VM.

    The VM must have ``provisioning_status`` equal to "complete" and a
    recorded Tailscale host. Git tokens are collected before any remote work
    starts and are registered with the SSH logger for redaction.

    Raises:
        VMError: If the VM is not found, its provisioning is not complete, or
            it has no Tailscale host.
    """
    from agentworks.ssh import SSHLogger, admin_exec_target

    vm = _require_vm(db, name)

    # Swap in the VM's own template settings so init uses matching values.
    uses_custom_template = bool(vm.template) and vm.template != "default"
    if uses_custom_template:
        from dataclasses import replace as _replace

        from agentworks.vms.templates import resolve_template

        config = _replace(config, vm=resolve_template(config, vm.template))

    if vm.provisioning_status != ProvisioningStatus.COMPLETE.value:
        raise VMError(
            f"VM '{name}' provisioning is '{vm.provisioning_status}', not 'complete'. Cannot reinitialize."
        )
    if vm.tailscale_host is None:
        raise VMError(f"VM '{name}' has no Tailscale IP")

    # Pre-flight checks
    verify_tailscale_available()
    providers = resolve_git_credential_providers(config, config.admin.git_credentials)
    verify_git_credential_auth(providers)

    # Gather every git token before touching the VM.
    git_tokens: dict[str, str] = {
        cred_name: provider.obtain_token(name) for cred_name, provider in providers.items()
    }

    # Tailscale SSH target whose log has all tokens redacted.
    logger = SSHLogger(name, "vm-reinit")
    for secret in git_tokens.values():
        logger.add_redaction(secret)
    ts_target = admin_exec_target(vm, config, default_timeout=60, logger=logger)

    home = f"/home/{vm.admin_username}"

    try:
        run_initialization(
            db,
            config,
            name,
            ts_target,
            providers,
            home,
            vm.admin_username,
            logger,
            git_tokens=git_tokens,
        )
    except Exception:
        # Close the log first so the path we point the user at is complete.
        logger.close()
        output.warn(f"Log: {logger.path}")
        raise
    else:
        logger.close()

    refreshed_vm = db.get_vm(name)
    assert refreshed_vm is not None
    if refreshed_vm.init_status != InitStatus.PARTIAL.value:
        output.info(f"VM '{name}' reinitialized successfully!")
        return

    output.info(f"VM '{name}' reinitialized (with warnings -- see above)")
    output.detail(f"Log: {logger.path}")
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def _tailscale_logout(provisioner: VMProvisioner, vm: VMRow, config: Config) -> None:
    """Best-effort: deregister from Tailscale via the provisioning transport.

    Uses the provisioner's admin_exec_target (not Tailscale SSH) because we
    can't ask Tailscale to tear itself down over the connection it provides.
    For Azure VMs, a public IP is attached first so SSH can reach the VM.
    Proxmox raises NotImplementedError (guest agent not yet wired in).

    Every failure is caught and reported as a warning; this function never
    raises, so the caller's deletion flow can continue.
    """
    import time

    from agentworks.vms.provisioners.azure import AzureProvisioner

    max_attempts = 6
    retry_delay_s = 5

    output.info("Deregistering from Tailscale...")
    try:
        if isinstance(provisioner, AzureProvisioner):
            provisioner.attach_public_ip(vm)
        exec_target = provisioner.admin_exec_target(vm, config=config)

        # Wait for SSH to be reachable (public IP may have just been attached).
        # Any failure is retried; the last one propagates to the outer handler.
        for attempt in range(max_attempts):
            try:
                exec_target.run("echo ok", timeout=10)
                break
            except Exception:
                if attempt == max_attempts - 1:
                    raise
                time.sleep(retry_delay_s)

        # Fire and forget: tailscale down + logout can disrupt networking
        # on the VM, killing SSH-based transports before they get a response.
        # Lima/WSL2 use local transports and are unaffected, but the nohup
        # approach works universally.
        exec_target.run(
            "nohup sh -c 'tailscale down && tailscale logout' >/dev/null 2>&1 &",
            sudo=True,
            timeout=10,
        )
        # Reported once the background command has been dispatched; its
        # outcome is not observed.
        output.info("Tailscale node deregistered")
    except Exception as e:
        output.warn(f"Tailscale logout failed (node may remain in admin console): {e}")
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
def _init_log_hint(vm_name: str) -> str:
    """Build a ' See log: <path>' suffix for error messages, or '' if none.

    Picks the log file matching ``<vm_name>-*.log`` that sorts last by path;
    returns an empty string when the log directory or a matching file is
    missing.
    """
    from agentworks.ssh import LOG_DIR

    if LOG_DIR.exists():
        last_by_name = max(LOG_DIR.glob(f"{vm_name}-*.log"), default=None)
        if last_by_name is not None:
            return f" See log: {last_by_name}"
    return ""
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
def _guard_failed_vm(vm: VMRow) -> None:
    """Reject operations on a VM whose provisioning or initialization failed.

    Raises:
        VMError: With guidance on the allowed follow-up commands and, when a
            log file is found, a pointer to it. Failed provisioning is
            checked before failed initialization.
    """
    if vm.provisioning_status == ProvisioningStatus.FAILED.value:
        reason = f"VM '{vm.name}' has failed provisioning. Only 'vm delete' is supported."
    elif vm.init_status == InitStatus.FAILED.value:
        reason = (
            f"VM '{vm.name}' has failed initialization. "
            "Use 'vm reinit' to retry or 'vm delete' to remove."
        )
    else:
        return

    raise VMError(f"{reason}{_init_log_hint(vm.name)}")
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
def _collect_secrets(
    providers: dict[str, GitCredentialProvider],
    vm_name: str,
) -> tuple[str | None, dict[str, str]]:
    """Gather every secret needed for provisioning before it begins.

    The Tailscale auth key comes from the ``TAILSCALE_AUTH_KEY`` environment
    variable when it is set and non-empty; otherwise the user is prompted.
    One git token is then obtained from each credential provider.

    Returns:
        A ``(tailscale_auth_key, git_tokens)`` pair, where ``git_tokens`` maps
        each provider's name to its token.
    """
    import os

    output.info("Collecting credentials...")

    # Tailscale: environment first, interactive prompt as the fallback.
    ts_auth_key = os.environ.get("TAILSCALE_AUTH_KEY")
    if not ts_auth_key:
        ts_auth_key = output.prompt_secret(
            "  Tailscale auth key",
            hint="Generate a key at https://login.tailscale.com/admin/settings/keys",
        )
    else:
        output.detail("Tailscale auth key found in environment")

    # Git credentials: one token per configured provider.
    git_tokens: dict[str, str] = {
        cred_name: provider.obtain_token(vm_name) for cred_name, provider in providers.items()
    }

    return ts_auth_key, git_tokens
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def _query_live_resources(vm: VMRow, config: Config) -> dict[str, str] | None:
    """Fetch live CPU, load, memory, swap and root-disk figures from a VM.

    Runs one chained shell command over the admin exec target and parses its
    five output lines (CPU count, load average, memory, swap, root disk).

    Returns:
        A dict of display strings, or ``None`` when the command fails, errors
        out, or produces output that cannot be parsed.
    """
    from agentworks.ssh import admin_exec_target, run

    def _percent(used: int, total: int) -> str:
        # Integer percentage; a zero total (e.g. no swap) is shown as 0%.
        return f"{used * 100 // total}%" if total > 0 else "0%"

    target = admin_exec_target(vm, config)
    cmd = (
        "nproc && "
        "uptime | grep -oP 'load average: \\K[^,]+' && "
        "free -b | awk '/^Mem:/{print $2,$3} /^Swap:/{print $2,$3}' && "
        "df -h / | awk 'NR==2{print $2,$3,$5}'"
    )

    try:
        outcome = run(target, cmd, check=False, retries=3)
    except Exception:
        return None
    if not outcome.ok:
        return None

    rows = outcome.stdout.strip().splitlines()
    if len(rows) < 5:
        return None

    try:
        cpu_row, load_row, mem_row, swap_row, disk_row = rows[:5]

        # Short or non-numeric rows raise here and fall through to None.
        mem_total_b, mem_used_b = (int(field) for field in mem_row.split()[:2])
        swap_total_b, swap_used_b = (int(field) for field in swap_row.split()[:2])
        disk_total, disk_used, disk_pct = disk_row.split()[:3]

        return {
            "cpus": cpu_row.strip(),
            "load_avg": load_row.strip(),
            "mem_total": _human_bytes(mem_total_b),
            "mem_used": _human_bytes(mem_used_b),
            "mem_pct": _percent(mem_used_b, mem_total_b),
            "swap_total": _human_bytes(swap_total_b),
            "swap_used": _human_bytes(swap_used_b),
            "swap_pct": _percent(swap_used_b, swap_total_b),
            "disk_total": disk_total,
            "disk_used": disk_used,
            "disk_pct": disk_pct,
        }
    except (IndexError, ValueError):
        return None
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def _human_bytes(b: int) -> str:
|
|
936
|
+
"""Format bytes as a human-readable string (e.g. 494M, 8.0G)."""
|
|
937
|
+
if b < 1024:
|
|
938
|
+
return f"{b}B"
|
|
939
|
+
for unit in ("K", "M", "G", "T"):
|
|
940
|
+
b_f = b / 1024
|
|
941
|
+
if b_f < 1024 or unit == "T":
|
|
942
|
+
return f"{b_f:.1f}{unit}" if b_f >= 10 else f"{b_f:.2f}{unit}"
|
|
943
|
+
b = int(b_f)
|
|
944
|
+
return f"{b}T"
|
|
945
|
+
|
|
946
|
+
|
|
947
|
+
def _require_vm(db: Database, name: str) -> VMRow:
|
|
948
|
+
vm = db.get_vm(name)
|
|
949
|
+
if vm is None:
|
|
950
|
+
raise VMError(f"VM '{name}' not found")
|
|
951
|
+
return vm
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def _get_provisioner_for_vm(db: Database, vm: VMRow, config: Config | None = None) -> VMProvisioner:
    """Return the provisioner that manages ``vm``'s platform.

    Proxmox VMs get a ``ProxmoxProvisioner`` built from ``config.proxmox``
    (the configuration is loaded on demand when ``config`` is omitted). All
    other platforms go through ``get_provisioner``, passing the SSH host of
    the VM's registered VM host when one is found.
    """
    if vm.platform != "proxmox":
        # Resolve the optional VM host to its SSH address, if it is known.
        host = db.get_vm_host(vm.vm_host_name) if vm.vm_host_name else None
        vm_host_ssh: str | None = host.ssh_host if host else None
        return get_provisioner(vm.platform, vm_host_ssh)

    from agentworks.vms.provisioners.proxmox import ProxmoxProvisioner

    if config is None:
        from agentworks.config import load_config

        config = load_config()
    return ProxmoxProvisioner(config.proxmox)  # type: ignore[arg-type]
|
|
969
|
+
|
|
970
|
+
|
|
971
|
+
def _is_tailscale_reachable(tailscale_host: str) -> bool:
|
|
972
|
+
"""Quick check whether a Tailscale IP is still reachable."""
|
|
973
|
+
import subprocess
|
|
974
|
+
|
|
975
|
+
try:
|
|
976
|
+
result = subprocess.run(
|
|
977
|
+
["tailscale", "ping", "--timeout=5s", "-c=1", tailscale_host],
|
|
978
|
+
capture_output=True,
|
|
979
|
+
text=True,
|
|
980
|
+
encoding="utf-8",
|
|
981
|
+
errors="replace",
|
|
982
|
+
timeout=10,
|
|
983
|
+
)
|
|
984
|
+
return result.returncode == 0
|
|
985
|
+
except (subprocess.TimeoutExpired, FileNotFoundError):
|
|
986
|
+
return False
|
|
987
|
+
|
|
988
|
+
|
|
989
|
+
def port_forward_vm(
    db: Database,
    config: Config,
    name: str,
    ports: list[str],
    address: str = "localhost",
    verbose: bool = False,
) -> None:
    """Tunnel one or more local ports to a VM over SSH.

    A port spec is either ``REMOTE_PORT`` (the local port defaults to the
    same number) or ``LOCAL_PORT:REMOTE_PORT``, matching kubectl port-forward
    syntax. The SSH process runs in the foreground; when it ends, this
    function exits the interpreter with SSH's return code.

    Args:
        db: Database used to look up the VM.
        config: Active configuration; supplies the optional operator SSH key.
        name: Name of the VM to forward to.
        ports: Port specs as described above.
        address: Local bind address for every forward.
        verbose: Pass ``-v`` to SSH.

    Raises:
        VMError: If the VM is missing, failed, or has no Tailscale host; if a
            port spec is malformed or out of range; or if SSH cannot start.
    """
    import signal
    import subprocess
    import sys

    vm = _require_vm(db, name)
    _guard_failed_vm(vm)
    if vm.tailscale_host is None:
        raise VMError(f"VM '{name}' has no Tailscale IP (init may not be complete)")

    # Turn every spec into a (local_port, remote_port) pair.
    forwards: list[tuple[int, int]] = []
    for spec in ports:
        fields = spec.split(":")
        if len(fields) > 2:
            raise VMError(f"invalid port spec '{spec}' (expected [LOCAL:]REMOTE)")
        try:
            numbers = [int(field) for field in fields]
        except ValueError:
            problem = f"invalid port '{spec}'" if len(fields) == 1 else f"invalid port spec '{spec}'"
            raise VMError(problem) from None
        # A lone number serves as both ends of the forward.
        forwards.append((numbers[0], numbers[-1]))

    # Reject anything outside the valid port range.
    for local_port, remote_port in forwards:
        for label, port in (("local", local_port), ("remote", remote_port)):
            if not 1 <= port <= 65535:
                raise VMError(f"{label} port {port} out of range (1-65535)")

    # Assemble the ssh invocation: options, identity, forwards, then target.
    ssh_cmd = ["ssh", "-N", "-o", "StrictHostKeyChecking=accept-new"]
    if config.operator.ssh_private_key:
        ssh_cmd += ["-i", str(config.operator.ssh_private_key)]
    for local_port, remote_port in forwards:
        ssh_cmd += ["-L", f"{address}:{local_port}:localhost:{remote_port}"]
    if verbose:
        ssh_cmd += ["-v"]
    ssh_cmd += [f"{vm.admin_username}@{vm.tailscale_host}"]

    # Tell the user what is being forwarded.
    for local_port, remote_port in forwards:
        output.info(f"Forwarding {address}:{local_port} -> {vm.tailscale_host}:{remote_port}")
    if not verbose:
        output.info("Use --verbose for detailed SSH output.")

    # Stay in the foreground until SSH exits or we are interrupted.
    try:
        proc = subprocess.Popen(ssh_cmd)

        # Relay SIGINT/SIGTERM to the SSH child so it shuts down cleanly.
        def _relay_to_ssh(sig: int, _frame: object) -> None:
            proc.terminate()

        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, _relay_to_ssh)

        sys.exit(proc.wait())
    except OSError as e:
        raise VMError(f"failed to start SSH: {e}") from e
|
|
1068
|
+
|
|
1069
|
+
|
|
1070
|
+
def _ensure_tailscale(
    db: Database,
    config: Config,
    vm: VMRow,
    provisioner: VMProvisioner,
) -> None:
    """Check Tailscale connectivity after a VM start and rejoin if required.

    If the VM has a recorded Tailscale host that reconnects, nothing else is
    done. Otherwise the stored host is cleared and the VM rejoins Tailscale
    through the provisioner's exec target (with a public IP attached for the
    duration on Azure), after which the SSH config is regenerated.
    """
    from agentworks.ssh import admin_exec_target, wait_for_reconnect

    # Re-read the row: tailscale_host may have been cleared when the VM stopped.
    vm = _require_vm(db, vm.name)

    # With a known Tailscale host, give it a chance to come back after boot.
    # This avoids unnecessarily attaching a public IP on Azure.
    if vm.tailscale_host:
        reconnected = wait_for_reconnect(admin_exec_target(vm, config))
        if reconnected:
            return

        # Tailscale didn't reconnect (ephemeral key expired, etc.)
        output.info(f"Tailscale node {vm.tailscale_host} did not reconnect, rejoining...")
        db.clear_vm_tailscale(vm.name)

    # Azure needs a temporary public IP for the rejoin.
    from agentworks.vms.provisioners.azure import AzureProvisioner

    azure_provisioner = provisioner if isinstance(provisioner, AzureProvisioner) else None
    if azure_provisioner is not None:
        azure_provisioner.attach_public_ip(vm)

    try:
        verify_tailscale_available()
        rejoin_tailscale(
            db,
            vm.name,
            provisioner.admin_exec_target(vm, config=config),
            is_wsl2=(vm.platform == "wsl2"),
        )
    finally:
        # Always drop the temporary public IP, even when the rejoin fails.
        if azure_provisioner is not None:
            azure_provisioner.detach_public_ip(vm)

    # Wait for Tailscale SSH to reconnect after the IP change.
    refreshed = db.get_vm(vm.name)
    if refreshed and refreshed.tailscale_host:
        wait_for_reconnect(admin_exec_target(refreshed, config))

    # Regenerate the SSH config in case the Tailscale IP changed.
    from agentworks.ssh_config import sync_ssh_config

    sync_ssh_config(config, db)
|