agentworks-cli 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. agentworks/__init__.py +1 -0
  2. agentworks/agents/__init__.py +0 -0
  3. agentworks/agents/manager.py +1095 -0
  4. agentworks/agents/templates.py +145 -0
  5. agentworks/catalog.py +264 -0
  6. agentworks/catalog.toml +131 -0
  7. agentworks/cli.py +1462 -0
  8. agentworks/completions/__init__.py +33 -0
  9. agentworks/completions/bash.py +179 -0
  10. agentworks/completions/install.py +122 -0
  11. agentworks/completions/powershell.py +270 -0
  12. agentworks/completions/spec.py +216 -0
  13. agentworks/completions/zsh.py +256 -0
  14. agentworks/config.py +894 -0
  15. agentworks/db.py +1083 -0
  16. agentworks/doctor.py +430 -0
  17. agentworks/git_credentials/__init__.py +0 -0
  18. agentworks/git_credentials/azdo.py +29 -0
  19. agentworks/git_credentials/base.py +71 -0
  20. agentworks/git_credentials/github.py +22 -0
  21. agentworks/nerf-config.yaml +16 -0
  22. agentworks/output.py +296 -0
  23. agentworks/remote_exec.py +286 -0
  24. agentworks/sample-config.toml +289 -0
  25. agentworks/sessions/__init__.py +0 -0
  26. agentworks/sessions/console.py +164 -0
  27. agentworks/sessions/manager.py +1297 -0
  28. agentworks/sessions/templates.py +101 -0
  29. agentworks/sessions/tmux.py +503 -0
  30. agentworks/sources.py +303 -0
  31. agentworks/ssh.py +759 -0
  32. agentworks/ssh_config.py +255 -0
  33. agentworks/vm_hosts/__init__.py +0 -0
  34. agentworks/vm_hosts/manager.py +86 -0
  35. agentworks/vms/__init__.py +0 -0
  36. agentworks/vms/backup.py +409 -0
  37. agentworks/vms/base.py +56 -0
  38. agentworks/vms/bootstrap_script.py +185 -0
  39. agentworks/vms/cloud_init.py +55 -0
  40. agentworks/vms/initializer.py +1523 -0
  41. agentworks/vms/manager.py +1122 -0
  42. agentworks/vms/provisioners/__init__.py +0 -0
  43. agentworks/vms/provisioners/azure.py +602 -0
  44. agentworks/vms/provisioners/lima.py +295 -0
  45. agentworks/vms/provisioners/proxmox.py +279 -0
  46. agentworks/vms/provisioners/proxmox_api.py +261 -0
  47. agentworks/vms/provisioners/wsl2.py +340 -0
  48. agentworks/vms/templates.py +152 -0
  49. agentworks/workspaces/__init__.py +0 -0
  50. agentworks/workspaces/backends/__init__.py +0 -0
  51. agentworks/workspaces/backends/local.py +119 -0
  52. agentworks/workspaces/backends/vm.py +175 -0
  53. agentworks/workspaces/manager.py +1080 -0
  54. agentworks/workspaces/templates.py +76 -0
  55. agentworks/workspaces/tmuxinator.py +80 -0
  56. agentworks_cli-0.2.1.dist-info/METADATA +635 -0
  57. agentworks_cli-0.2.1.dist-info/RECORD +59 -0
  58. agentworks_cli-0.2.1.dist-info/WHEEL +4 -0
  59. agentworks_cli-0.2.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1523 @@
1
+ """VM lifecycle: provisioning (one-time) and initialization (repeatable).
2
+
3
+ Two phases:
4
+ A. Provisioning (over provisioning transport): bootstrap, SSH key, Tailscale join.
5
+ One-time, platform-specific, pass/fail. Tracked via provisioning_status.
6
+ B. Initialization (over Tailscale SSH): packages, install commands, git credentials,
7
+ dotfiles. Repeatable via `vm reinit`. Tracked via init_status.
8
+
9
+ Phase A steps are fatal -- if they fail, the VM is unreachable and useless.
10
+ Phase B steps are non-fatal -- failures produce warnings and a 'partial' status.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import ipaddress
16
+ import shlex
17
+ import subprocess
18
+ import tempfile
19
+ from collections.abc import Callable
20
+ from pathlib import Path
21
+ from typing import TYPE_CHECKING
22
+
23
+ from agentworks import output
24
+ from agentworks.db import InitStatus, ProvisioningStatus
25
+ from agentworks.output import ConnectivityError, VMError
26
+ from agentworks.ssh import ExecTarget, SSHError, SSHLogger, SSHTarget
27
+ from agentworks.vms.cloud_init import INIT_SYSTEM_PACKAGES, PROVISIONING_PACKAGES
28
+
29
+ if TYPE_CHECKING:
30
+ from collections.abc import Mapping
31
+
32
+ from agentworks.catalog import AptSourceEntry, SystemInstallCommandEntry, UserInstallCommandEntry
33
+ from agentworks.config import Config
34
+ from agentworks.db import Database
35
+ from agentworks.git_credentials.base import GitCredentialProvider
36
+
37
+
38
# Filenames of the two agentworks-managed shell fragments. Both are written under the
# remote user's home (`~/<name>`) and sourced from that user's shell startup files.
+ AGENTWORKS_PROFILE = ".agentworks-profile.sh"
39
+ AGENTWORKS_RC = ".agentworks-rc.sh"
40
+
41
+
42
def _write_agentworks_profile(
    target: ExecTarget,
    path_additions: list[str],
    logger: SSHLogger,
) -> None:
    """Write the login-shell fragment that agentworks manages.

    The fragment is written to ``~/.agentworks-profile.sh`` and holds one
    ``export PATH=...`` line per distinct entry of ``path_additions`` (first
    occurrence wins, order is kept, a leading ``~`` becomes ``$HOME``).
    A line sourcing the fragment is appended to ``~/.profile`` and
    ``~/.zprofile`` unless the fragment name already appears in that file.

    The fragment is written even when there are no entries, so a reinit can
    clear paths that were set earlier. An ``SSHError`` is reported as a
    warning instead of being raised.
    """
    # dict.fromkeys drops repeats while keeping first-seen order.
    ordered_paths = list(dict.fromkeys(path_additions))

    logger.step("Shell profile")
    output.detail(f"Writing agentworks profile ({len(ordered_paths)} PATH entries)...")

    def _export_line(entry: str) -> str:
        # Only a leading "~" is rewritten; the rest of the entry is untouched.
        resolved = "$HOME" + entry[1:] if entry.startswith("~") else entry
        return f'export PATH="{resolved}:$PATH"'

    try:
        fragment = ["# Managed by agentworks -- do not edit", *map(_export_line, ordered_paths)]
        target.write_file(f"~/{AGENTWORKS_PROFILE}", "\n".join(fragment) + "\n")

        # Hook the fragment into the login files for bash/sh and zsh.
        include = f". $HOME/{AGENTWORKS_PROFILE}"
        for startup_file in ("$HOME/.profile", "$HOME/.zprofile"):
            has_include = f"grep -q {AGENTWORKS_PROFILE} {startup_file} 2>/dev/null"
            add_include = f"printf '%s\\n' '{include}' >> {startup_file}"
            target.run(f"{has_include} || {add_include}")
    except SSHError as e:
        msg = f"shell profile write failed: {e}"
        logger.warning(msg)
        output.warn(msg)
83
+
84
def _write_agentworks_rc(
    target: ExecTarget,
    shell_snippets: list[str],
    logger: SSHLogger,
) -> None:
    """Write the interactive-shell fragment that agentworks manages.

    The fragment is written to ``~/.agentworks-rc.sh`` and contains the given
    ``shell_snippets`` verbatim, one after another, below a "managed" header.
    A line sourcing the fragment is appended to ``~/.bashrc`` and ``~/.zshrc``
    unless the fragment name already appears in that file.

    The fragment is written even when there are no snippets, so a reinit can
    clear hooks that were set earlier. An ``SSHError`` is reported as a
    warning instead of being raised.
    """
    logger.step("Shell rc")
    output.detail("Writing agentworks rc...")

    try:
        fragment = "\n".join(["# Managed by agentworks -- do not edit", *shell_snippets]) + "\n"
        target.write_file(f"~/{AGENTWORKS_RC}", fragment)

        # Hook the fragment into the per-shell rc files for bash and zsh.
        include = f". $HOME/{AGENTWORKS_RC}"
        for startup_file in ("$HOME/.bashrc", "$HOME/.zshrc"):
            has_include = f"grep -q {AGENTWORKS_RC} {startup_file} 2>/dev/null"
            add_include = f"printf '%s\\n' '{include}' >> {startup_file}"
            target.run(f"{has_include} || {add_include}")
    except SSHError as e:
        msg = f"shell rc write failed: {e}"
        logger.warning(msg)
        output.warn(msg)
113
+
114
+
115
+ # -- Mise installation ---------------------------------------------------------
116
+
117
# Apt repository details for mise: where the signing key is fetched from, where it is
# stored on the VM, and the sources.list.d entry (file + line) that references that key.
+ MISE_GPG_KEY_URL = "https://mise.jdx.dev/gpg-key.pub"
118
+ MISE_GPG_KEY_PATH = "/etc/apt/keyrings/mise-archive-keyring.asc"
119
+ MISE_SOURCE_LINE = f"deb [signed-by={MISE_GPG_KEY_PATH}] https://mise.jdx.dev/deb stable main"
120
+ MISE_SOURCE_FILE = "/etc/apt/sources.list.d/mise.list"
121
+
122
+
123
# Shell snippet that runs `mise activate` for zsh or bash, chosen at runtime from
# $ZSH_VERSION / $BASH_VERSION; any other shell only gets a notice on stderr.
+ MISE_ACTIVATE_LINES = (
124
+ "# agentworks-mise-activate\n"
125
+ 'if [ -n "$ZSH_VERSION" ]; then\n'
126
+ ' eval "$(mise activate zsh)"\n'
127
+ 'elif [ -n "$BASH_VERSION" ]; then\n'
128
+ ' eval "$(mise activate bash)"\n'
129
+ "else\n"
130
+ ' echo "agentworks: mise activate skipped (unsupported shell)" >&2\n'
131
+ "fi"
132
+ )
133
+
134
+
135
+ def _mise_shims_path(home: str) -> list[str]:
136
+ """Return PATH additions for mise shims (for non-interactive contexts)."""
137
+ return [f"{home}/.local/share/mise/shims"]
138
+
139
+
140
def _write_mise_config(
    target: ExecTarget,
    packages: list[str],
    install_before: str,
    home: str,
    logger: SSHLogger,
) -> None:
    """Generate ``<home>/.config/mise/config.toml`` from a package list.

    Each package is either ``name@version`` (split on the last ``@``) or a
    bare name, which is pinned to ``"latest"``. The file gets a ``[settings]``
    table carrying ``install_before`` followed by a ``[tools]`` table.

    Does nothing when ``packages`` is empty. An ``SSHError`` while creating
    the directory or writing the file is reported as a warning.
    """
    if not packages:
        return

    logger.step("Mise config")
    output.detail(f"Writing mise config with {len(packages)} package(s)...")

    def _tool_line(spec: str) -> str:
        # rpartition splits on the LAST "@", so backend paths containing "@" survive.
        name, at_sign, version = spec.rpartition("@")
        if at_sign:
            return f'"{name}" = "{version}"'
        return f'"{spec}" = "latest"'

    document = [
        "[settings]",
        f'install_before = "{install_before}"',
        "",
        "[tools]",
        *(_tool_line(spec) for spec in packages),
    ]
    mise_config = "\n".join(document) + "\n"

    config_dir = f"{home}/.config/mise"
    try:
        target.run(f"mkdir -p {config_dir}")
        target.write_file(f"{config_dir}/config.toml", mise_config)
    except SSHError as e:
        msg = f"mise config write failed: {e}"
        logger.warning(msg)
        output.warn(msg)
177
+
178
+
179
def _fetch_mise_lockfile(
    target: ExecTarget,
    lockfile_source: str,
    home: str,
    logger: SSHLogger,
) -> None:
    """Fetch a mise lockfile from a source reference to ~/.config/mise/mise.lock.

    ``lockfile_source`` is parsed with ``parse_source_ref`` (defaulting the
    filename to ``mise.lock``) and fetched onto ``target`` with ``fetch_file``.

    This is a non-fatal step: both a bad or unfetchable source reference
    (``SourceRefError``) and a failing remote command such as the ``mkdir``
    (``SSHError``) are logged as warnings rather than raised.
    """
    from agentworks.sources import SourceRefError, fetch_file, parse_source_ref

    logger.step("Mise lockfile")
    output.detail(f"Fetching mise lockfile from {lockfile_source}...")

    try:
        ref = parse_source_ref(lockfile_source, default_filename="mise.lock")
        dest = f"{home}/.config/mise/mise.lock"
        target.run(f"mkdir -p {home}/.config/mise")
        fetch_file(ref, target, dest, logger=logger)
    except (SourceRefError, SSHError) as e:
        # SSHError is caught too: target.run() is called directly here, and the
        # sibling mise steps treat remote-command failures as warnings.
        msg = f"mise lockfile fetch failed: {e}"
        logger.warning(msg)
        output.warn(msg)
200
+
201
+
202
+ def _parse_mise_failures(error: SSHError) -> list[str]:
203
+ """Extract failed tool names from mise stderr output.
204
+
205
+ Parses lines like:
206
+ mise ERROR Failed to install aqua:npryce/adr-tools@3.0.0: reason here
207
+ The tool name can contain colons (backend:path@version), so we split
208
+ on ": " (colon-space) to separate tool from reason.
209
+ """
210
+ failures: list[str] = []
211
+ error_str = str(error)
212
+ for line in error_str.splitlines():
213
+ if "Failed to install" in line:
214
+ part = line.split("Failed to install", 1)[1].strip()
215
+ tool = part.split(": ", 1)[0].strip()
216
+ if tool and tool not in failures:
217
+ failures.append(tool)
218
+ return failures
219
+
220
+
221
def _run_mise_install(
    target: ExecTarget,
    shell: str,
    home: str,
    allow_unlocked: bool,
    logger: SSHLogger,
    *,
    prune: bool = True,
) -> None:
    """Run ``mise install`` through a login shell, preferring a locked install.

    When ``<home>/.config/mise/mise.lock`` exists, ``mise install -y --locked``
    is tried first. If that fails and ``allow_unlocked`` is false, a hint is
    shown and the function returns; otherwise a plain ``mise install -y`` is
    attempted. Without a lockfile only the plain install runs.

    After a successful install, and when ``prune`` is true, ``mise prune -y``
    is run and any ``SSHError`` from it is ignored. Install failures are
    reported as warnings, never raised.
    """
    import contextlib

    logger.step("Mise install")

    # A probe that errors out is treated exactly like "no lockfile".
    lock_present = False
    with contextlib.suppress(SSHError):
        lock_present = target.run(f"test -f {home}/.config/mise/mise.lock", check=False).ok

    def _warn_failed_tools(err: SSHError, prefix: str, generic: str) -> None:
        # One warning per tool mise named; a generic pointer to the logs otherwise.
        failed_tools = _parse_mise_failures(err)
        if failed_tools:
            for tool in failed_tools:
                output.warn(f"{prefix}{tool}")
        else:
            output.warn(generic)

    done = False

    if lock_present:
        output.detail("Running mise install (locked)...")
        try:
            target.run(f"{shell} -lc 'mise install -y --locked'", timeout=300)
        except SSHError as e:
            logger.warning(f"mise install --locked failed: {e}")
            _warn_failed_tools(
                e,
                "Locked install failed, not in lockfile: ",
                "mise install --locked failed (see vm logs)",
            )
            if not allow_unlocked:
                output.warn("Hint: set mise_allow_unlocked = true to install unlocked packages")
                return
            output.warn("Retrying unlocked...")
        else:
            output.detail("Mise packages installed (locked)")
            done = True

    if not done:
        output.detail("Running mise install...")
        try:
            target.run(f"{shell} -lc 'mise install -y'", timeout=300)
        except SSHError as e:
            logger.warning(f"mise install failed: {e}")
            _warn_failed_tools(e, "Failed: ", "mise install failed (see vm logs)")
        else:
            output.detail("Mise packages installed")
            done = True

    # Drop tool versions that the current config no longer references.
    if done and prune:
        with contextlib.suppress(SSHError):
            target.run(f"{shell} -lc 'mise prune -y'", timeout=60)
292
+
293
+
294
+ # -- SSH authorized keys ------------------------------------------------------
295
+
296
# Comment header that _reconcile_authorized_keys places at the top of the generated
# authorized_keys file, ahead of the key lines.
+ AUTHORIZED_KEYS_HEADER = """\
297
+ # Managed by agentworks -- manual edits will be overwritten on reinit.
298
+ # To add keys, use operator.extra_ssh_public_keys in your agentworks config.
299
+ """
300
+
301
+
302
def _reconcile_authorized_keys(
    target: ExecTarget,
    config: Config,
    home: str,
    logger: SSHLogger,
) -> None:
    """Rewrite ``<home>/.ssh/authorized_keys`` from the configured key files.

    The file is replaced wholesale with the managed header, the operator's
    primary ``ssh_public_key`` and then each ``extra_ssh_public_keys`` entry,
    so keys removed from the config disappear on reinit. It is written with
    mode ``600``. An ``SSHError`` from the write is reported as a warning.
    """
    logger.step("SSH authorized keys")

    operator = config.operator
    key_files = [operator.ssh_public_key, *operator.extra_ssh_public_keys]
    keys: list[str] = [key_file.read_text().strip() for key_file in key_files]

    extras = len(keys) - 1
    label = "1 primary" if not extras else f"1 primary + {extras} extra"
    output.detail(f"Reconciling authorized_keys ({label})...")

    content = AUTHORIZED_KEYS_HEADER + "\n".join(keys) + "\n"
    try:
        target.write_file(f"{home}/.ssh/authorized_keys", content, mode="600")
    except SSHError as e:
        msg = f"authorized_keys reconciliation failed: {e}"
        logger.warning(msg)
        output.warn(msg)
331
+
332
+
333
+ def _configure_apt_sources(
333
+ target: ExecTarget,
334
+ config: Config,
335
+ catalog: object,
336
+ logger: SSHLogger,
337
+ ) -> None:
338
+ """Configure apt sources required by selected apt_packages. Idempotent."""
# Overview: for every apt source referenced by the selected catalog apt_packages,
# make sure its signing key file exists and its sources.list.d file has the expected
# line, then run one `apt-get update` if any source list file was (re)written.
# SSHError from the key/list/update commands is reported as a warning, not raised.
339
+ from agentworks.catalog import ResolvedCatalog
340
+
341
# `catalog` is typed as object in the signature; this narrows it for the attribute access below.
+ assert isinstance(catalog, ResolvedCatalog)
342
+
343
+ # Collect all apt sources needed by selected apt_packages
344
+ required_sources: dict[str, AptSourceEntry] = {}
345
+ for pkg_name in config.vm.apt_packages:
346
+ pkg = catalog.apt_packages.get(pkg_name)
347
# Package names and source names missing from the catalog are skipped silently here.
+ if pkg is None:
348
+ continue
349
+ for src_name in pkg.apt_sources:
350
+ if src_name not in required_sources:
351
+ src = catalog.apt_sources.get(src_name)
352
+ if src is not None:
353
+ required_sources[src_name] = src
354
+
355
+ if not required_sources:
356
+ return
357
+
358
+ logger.step("Apt sources")
359
+
360
+ # Detect architecture
361
# Falls back to "amd64" when dpkg exits non-zero.
+ arch_result = target.run("dpkg --print-architecture", check=False)
362
+ arch = arch_result.stdout.strip() if arch_result.returncode == 0 else "amd64"
363
+
364
+ newly_configured = False
365
+ for name, src in required_sources.items():
366
+ # Check if GPG key already exists
367
+ key_exists = target.run(f"test -f {shlex.quote(src.key_path)}", check=False).returncode == 0
368
+
369
+ if not key_exists:
370
+ output.detail(f"Configuring apt source '{name}'...")
371
+ try:
372
+ # Ensure parent directory for key_path exists
373
+ from pathlib import PurePosixPath
374
+
375
+ key_dir = str(PurePosixPath(src.key_path).parent)
376
+ target.run(f"install -m 0755 -d {shlex.quote(key_dir)}", sudo=True)
377
+
378
+ # Download GPG key
379
+ if src.key_dearmor:
380
+ # Wrap in sh -c so sudo applies to the entire pipeline,
381
+ # not just the curl on the left side of the pipe.
382
+ inner = f"curl -fsSL {shlex.quote(src.key_url)} | gpg --dearmor -o {shlex.quote(src.key_path)}"
383
+ target.run(
384
+ f"sh -c {shlex.quote(inner)}",
385
+ sudo=True,
386
+ timeout=60,
387
+ )
388
+ else:
389
+ target.run(
390
+ f"curl -fsSL {shlex.quote(src.key_url)} -o {shlex.quote(src.key_path)}",
391
+ sudo=True,
392
+ timeout=60,
393
+ )
394
+ target.run(f"chmod a+r {shlex.quote(src.key_path)}", sudo=True)
395
+ except SSHError as exc:
396
+ msg = f"apt source '{name}' failed: {exc}"
397
+ logger.warning(msg)
398
+ output.warn(msg)
# Without a key the source list is not written for this source; move on to the next one.
399
+ continue
400
+
401
+ # Always ensure the source list file has the correct content,
402
+ # even when the key already existed (the source URL may have changed).
# The catalog's source line may contain an "{arch}" placeholder, filled in here.
403
+ resolved_source = src.source.replace("{arch}", arch)
404
+ source_path = f"/etc/apt/sources.list.d/{src.source_file}"
405
+ expected = resolved_source + "\n"
406
+ current = target.run(f"cat {shlex.quote(source_path)}", check=False)
# NOTE(review): indentation is lost in this view, so it cannot be confirmed here whether
# the `continue` below belongs to `if key_exists:` or to the outer content comparison.
# The difference matters when the key was just downloaded but the list already matches
# (rewrite + apt-get update vs. skip) -- verify against the real file.
407
+ if current.returncode == 0 and current.stdout == expected:
408
+ if key_exists:
409
+ output.detail(f"Apt source '{name}': already configured, skipping")
410
+ logger.output(f"apt source {name}: key and source list up to date, skipping")
411
+ continue
412
+
413
+ if key_exists:
414
+ output.detail(f"Apt source '{name}': updating source list...")
415
+ logger.output(f"apt source {name}: key exists but source list needs update")
416
+
417
+ try:
# The redirect runs inside `bash -c` so that it happens under sudo as well.
# NOTE(review): source_path is interpolated unquoted here, unlike the `cat` above;
# this relies on catalog source_file names being shell-safe -- TODO confirm.
418
+ target.run(
419
+ f"bash -c {shlex.quote(f'printf "%s\\n" {shlex.quote(resolved_source)} > {source_path}')}",
420
+ sudo=True,
421
+ )
422
+ newly_configured = True
423
+ except SSHError as e:
424
+ msg = f"apt source '{name}' failed: {e}"
425
+ logger.warning(msg)
426
+ output.warn(msg)
427
+
# A single index refresh covers every source list written in the loop above.
428
+ if newly_configured:
429
+ output.detail("Running apt-get update...")
430
+ try:
431
+ target.run("apt-get update -qq", sudo=True, timeout=120)
432
+ except SSHError as e:
433
+ msg = f"apt-get update failed after adding sources: {e}"
434
+ logger.warning(msg)
435
+ output.warn(msg)
437
+
438
+
439
def _install_system_packages(
    target: ExecTarget,
    logger: SSHLogger,
) -> None:
    """Install system repos and packages. Always runs on every init/reinit.

    Three steps, each non-fatal (an ``SSHError`` becomes a warning and the
    next step still runs):

    1. Register the mise apt source: create the keyring directory, download
       the signing key and write the ``sources.list.d`` entry.
    2. ``apt-get update``.
    3. ``apt-get install`` of ``INIT_SYSTEM_PACKAGES``.
    """
    from pathlib import PurePosixPath

    logger.step("System packages")

    # Add mise apt source
    try:
        # The keyring directory is not guaranteed to exist on every base image,
        # and curl -o does not create it. Same approach as _configure_apt_sources.
        keyring_dir = str(PurePosixPath(MISE_GPG_KEY_PATH).parent)
        target.run(f"install -m 0755 -d {shlex.quote(keyring_dir)}", sudo=True)
        target.run(
            f"curl -fsSL {MISE_GPG_KEY_URL} -o {MISE_GPG_KEY_PATH}",
            sudo=True,
            timeout=30,
        )
        # Wrapped in sh -c so the redirect itself runs under sudo.
        inner = f"printf '%s\\n' {shlex.quote(MISE_SOURCE_LINE)} > {MISE_SOURCE_FILE}"
        target.run(f"sh -c {shlex.quote(inner)}", sudo=True)
    except SSHError as e:
        msg = f"mise apt source setup failed: {e}"
        logger.warning(msg)
        output.warn(msg)

    output.detail("Running apt-get update...")
    try:
        target.run("apt-get update -qq", sudo=True, timeout=120)
    except SSHError as e:
        msg = f"apt-get update failed: {e}"
        logger.warning(msg)
        output.warn(msg)

    output.detail(f"Installing {len(INIT_SYSTEM_PACKAGES)} system packages...")
    apt_str = " ".join(shlex.quote(p) for p in INIT_SYSTEM_PACKAGES)
    try:
        target.run(
            f"DEBIAN_FRONTEND=noninteractive apt-get install -y -qq -o Dpkg::Options::=--force-confnew {apt_str}",
            sudo=True,
            timeout=300,
        )
    except SSHError as e:
        msg = f"system packages failed: {e}"
        logger.warning(msg)
        output.warn(msg)
480
+
481
+
482
def _install_apt_packages(
    target: ExecTarget,
    config: Config,
    catalog: object,
    logger: SSHLogger,
) -> None:
    """Install the user-selected apt packages in a single ``apt-get install``.

    The package list is ``config.vm.apt`` followed by the ``apt`` lists of
    every ``config.vm.apt_packages`` name that exists in the catalog (unknown
    names are skipped). Nothing happens when the combined list is empty.
    An ``SSHError`` from the install is reported as a warning.
    """
    from agentworks.catalog import ResolvedCatalog

    assert isinstance(catalog, ResolvedCatalog)

    # Direct list first, then whatever the selected catalog entries expand to.
    catalog_entries = (catalog.apt_packages.get(pkg_name) for pkg_name in config.vm.apt_packages)
    wanted: list[str] = [
        *config.vm.apt,
        *(apt_name for entry in catalog_entries if entry is not None for apt_name in entry.apt),
    ]
    if not wanted:
        return

    logger.step("Apt packages")
    output.detail(f"Installing {len(wanted)} apt packages...")
    quoted = " ".join(map(shlex.quote, wanted))
    install_cmd = f"DEBIAN_FRONTEND=noninteractive apt-get install -y -qq -o Dpkg::Options::=--force-confnew {quoted}"
    try:
        target.run(install_cmd, sudo=True, timeout=300)
    except SSHError as e:
        msg = f"apt packages failed: {e}"
        logger.warning(msg)
        output.warn(msg)
516
+
517
+
518
+ def _build_test_command(
519
+ entry: SystemInstallCommandEntry | UserInstallCommandEntry,
520
+ shell: str,
521
+ home: str,
522
+ ) -> str | None:
523
+ """Build a shell command to check if an install command's tool is present.
524
+
525
+ test_exec uses a login shell (-l) with interactive flag (-i) to ensure
526
+ all profile/rc files are sourced, matching a real login session.
527
+ """
528
+ if entry.test_exec:
529
+ return f"{shell} -lic {shlex.quote(f'command -v {shlex.quote(entry.test_exec)}')} > /dev/null 2>&1"
530
+ if entry.test_file:
531
+ path = entry.test_file.replace("~", home, 1) if entry.test_file.startswith("~") else entry.test_file
532
+ return f"test -f {shlex.quote(path)}"
533
+ if entry.test_dir:
534
+ path = entry.test_dir.replace("~", home, 1) if entry.test_dir.startswith("~") else entry.test_dir
535
+ return f"test -d {shlex.quote(path)}"
536
+ return None
537
+
538
+
539
def _run_catalog_commands(
    target: ExecTarget,
    command_names: list[str],
    entries: Mapping[str, SystemInstallCommandEntry | UserInstallCommandEntry],
    shell: str,
    home: str,
    logger: SSHLogger,
    *,
    label: str = "Install command",
) -> list[str]:
    """Run the named catalog install commands and collect their PATH additions.

    For each name, in order:

    * a name missing from ``entries`` produces a warning and is skipped;
    * if the entry has an install probe and it exits 0, the command is not
      run but the entry's ``path`` values are still collected;
    * otherwise the command runs via ``<shell> -lc``; a failure is reported
      as a warning and the entry's ``path`` values are collected regardless.

    Returns the collected PATH additions (empty when ``command_names`` is).
    """
    collected: list[str] = []
    total = len(command_names)

    for position, name in enumerate(command_names, start=1):
        entry = entries.get(name)
        if entry is None:
            msg = f"{label.lower()} '{name}' not found in catalog"
            logger.warning(msg)
            output.warn(msg)
            continue

        progress = f"{label} {position}/{total}"
        logger.step(f"{progress}: {name}")

        # Probe first; it should be near-instant, hence the short timeout.
        probe = _build_test_command(entry, shell, home)
        present = False
        if probe:
            try:
                present = target.run(probe, check=False, timeout=10).returncode == 0
            except SSHError as e:
                # Timeout or connection trouble: carry on as if it were not installed.
                logger.output(f"{name}: install check failed ({e}), assuming not installed")

        if present:
            output.detail(f"{progress} ({name}): already installed, skipping")
            logger.output(f"{name}: already installed ({probe}), skipping")
            collected.extend(entry.path)
            continue

        preview = entry.command[:60]
        output.detail(f"{progress} ({name}): {preview}...")
        try:
            target.run(f"{shlex.quote(shell)} -lc {shlex.quote(entry.command)}", timeout=120)
        except SSHError as e:
            msg = f"{label.lower()} '{name}' failed: {preview}... ({e})"
            logger.warning(msg)
            output.warn(msg)
        collected.extend(entry.path)

    return collected
590
+
591
+
592
def verify_tailscale_available() -> None:
    """Pre-flight check that this machine can talk to Tailscale.

    Runs ``tailscale status`` locally with a 10 second timeout.

    Raises:
        ConnectivityError: the ``tailscale`` binary is missing, the command
            timed out, or it exited with a non-zero status.
    """
    status_cmd = ["tailscale", "status"]
    try:
        completed = subprocess.run(
            status_cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=10,
        )
    except FileNotFoundError:
        raise ConnectivityError("'tailscale' command not found. Install Tailscale on this machine.") from None
    except subprocess.TimeoutExpired:
        raise ConnectivityError("'tailscale status' timed out. Is Tailscale running?") from None

    if completed.returncode == 0:
        return

    raise ConnectivityError(
        "This machine is not connected to Tailscale. "
        "VM initialization requires Tailscale to switch from the provisioning "
        "transport to direct SSH. Run 'tailscale up' first."
    )
609
+
610
+
611
def resolve_git_credential_providers(
    config: Config,
    names: list[str],
) -> dict[str, GitCredentialProvider]:
    """Resolve git credential provider instances from config.

    Names are the credential names to resolve (from admin.config.git_credentials
    or agent.config.git_credentials). Each name is looked up in
    ``config.git_credentials`` and turned into the provider matching its
    ``type`` (``"azdo"`` or ``"github"``).

    Returns:
        Mapping of credential name to provider, in the order of ``names``.
        Empty when ``names`` is empty.

    Raises:
        VMError: a name is not present in the config, an ``azdo`` credential
            has no ``org``, or a credential has a type this function does not
            know how to build.
    """
    from agentworks.git_credentials.azdo import AzDOCredentialProvider
    from agentworks.git_credentials.github import GitHubCredentialProvider

    providers: dict[str, GitCredentialProvider] = {}
    # `or ()` keeps the original tolerance for a falsy `names` value.
    for name in names or ():
        cred_config = config.git_credentials.get(name)
        if cred_config is None:
            raise VMError(f"git credential '{name}' not found in config")
        desc = cred_config.description
        if cred_config.type == "azdo":
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            if cred_config.org is None:
                raise VMError(f"git credential '{name}' has type 'azdo' but no 'org' is set")
            providers[name] = AzDOCredentialProvider(config_name=name, org=cred_config.org, description=desc)
        elif cred_config.type == "github":
            providers[name] = GitHubCredentialProvider(config_name=name, description=desc)
        else:
            # Previously an unrecognised type was dropped silently, leaving the
            # caller with fewer providers than it asked for.
            raise VMError(f"git credential '{name}' has unsupported type '{cred_config.type}'")
    return providers
637
+
638
+
639
def verify_git_credential_auth(providers: dict[str, GitCredentialProvider]) -> None:
    """Pre-flight check that every selected git credential provider is authenticated.

    Providers are checked in mapping order. Once all pass, one info line
    listing their display names is printed; nothing is printed for an empty
    mapping.

    Raises:
        VMError: for the first provider whose ``verify_auth()`` is falsy; the
            message includes that provider's ``auth_hint()``.
    """
    for name, provider in providers.items():
        if provider.verify_auth():
            continue
        raise VMError(f"Authentication check failed for '{name}'. {provider.auth_hint()}")

    if not providers:
        return

    summary = ", ".join([provider.display_name for provider in providers.values()])
    output.info(f"Git credentials configured: {summary}")
647
+
648
+
649
def rejoin_tailscale(
    db: Database,
    vm_name: str,
    exec_target: ExecTarget,
    *,
    is_wsl2: bool = False,
) -> str:
    """Bring a VM back onto the tailnet after its node was lost (e.g. ephemeral key).

    First makes sure the ``tailscale`` binary is present, running the
    upstream install script under sudo when it is not; the outcome of that
    step is not checked. The actual join -- obtaining an auth key,
    ``tailscale up`` and recording the new IP in the DB -- is delegated to
    ``_join_tailscale``.

    Returns the VM's new Tailscale IPv4 address.
    """
    output.info("Tailscale node not reachable. Re-joining tailnet...")

    # Safe to repeat: the installer only runs when `tailscale` is not on PATH.
    ensure_installed = "bash -c 'command -v tailscale >/dev/null || curl -fsSL https://tailscale.com/install.sh | sh'"
    exec_target.run(ensure_installed, sudo=True, check=False)

    new_ip = _join_tailscale(db, vm_name, exec_target, is_wsl2=is_wsl2)
    return new_ip
673
+
674
+
675
def _join_tailscale(
    db: Database,
    vm_name: str,
    exec_target: ExecTarget,
    *,
    is_wsl2: bool = False,
    logger: SSHLogger | None = None,
    tailscale_auth_key: str | None = None,
) -> str:
    """Run ``tailscale up`` on the VM, store its IPv4 in the DB and return it.

    The auth key comes from, in order: the ``tailscale_auth_key`` argument,
    the ``TAILSCALE_AUTH_KEY`` environment variable, or an interactive secret
    prompt. On WSL2 ``--userspace-networking`` is added to the command.

    Raises:
        SSHError: if the first line printed by ``tailscale ip -4`` is not a
            valid IPv4 address (in addition to anything ``exec_target.run``
            raises).
    """
    import os

    # Each source is consulted only if the previous one yielded nothing.
    auth_key = (
        tailscale_auth_key
        or os.environ.get("TAILSCALE_AUTH_KEY")
        or output.prompt_secret(
            " Tailscale auth key",
            hint="Generate a key at https://login.tailscale.com/admin/settings/keys",
        )
    )

    join_cmd = f"tailscale up --auth-key {shlex.quote(auth_key)}"
    if is_wsl2:
        join_cmd = f"{join_cmd} --userspace-networking"

    # Register the key for redaction before the command that carries it is run.
    for sink in (exec_target.logger, logger):
        if sink is not None:
            sink.add_redaction(auth_key)

    exec_target.run(join_cmd, sudo=True)
    ip_result = exec_target.run("tailscale ip -4", sudo=True)

    raw_ip_output = ip_result.stdout.strip()
    # Only the first output line is considered.
    tailscale_ip = next(iter(raw_ip_output.splitlines()), "").strip()
    try:
        ipaddress.IPv4Address(tailscale_ip)
    except ValueError:
        raise SSHError(f"tailscale ip -4 returned invalid address: {raw_ip_output!r}") from None

    output.detail(f"Tailscale IP: {tailscale_ip}")
    db.update_vm_tailscale(vm_name, tailscale_ip)
    return tailscale_ip
716
+
717
+
718
+ def _describe_transport(exec_target: ExecTarget) -> str:
719
+ """Return a short description of the transport used by an ExecTarget."""
720
+ if exec_target.ssh is not None:
721
+ return f"ssh:{exec_target.ssh.host}"
722
+ if exec_target.lima is not None:
723
+ return f"lima:{exec_target.lima.vm_name}"
724
+ if exec_target.remote_lima is not None:
725
+ return f"remote-lima:{exec_target.remote_lima.vm_name}"
726
+ if exec_target.wsl2 is not None:
727
+ return f"wsl2:{exec_target.wsl2.distro_name}"
728
+ return "unknown"
729
+
730
+
731
+ def initialize_vm(
732
+ db: Database,
733
+ config: Config,
734
+ vm_name: str,
735
+ exec_target: ExecTarget,
736
+ providers: dict[str, GitCredentialProvider],
737
+ *,
738
+ is_wsl2: bool = False,
739
+ admin_username: str = "agentworks",
740
+ tailscale_auth_key: str | None = None,
741
+ git_tokens: dict[str, str] | None = None,
742
+ bootstrap_complete: bool = False,
743
+ tailscale_ip: str | None = None,
744
+ on_tailscale_ready: Callable[[], None] | None = None,
745
+ ) -> None:
746
+ """Run the full initialization sequence on a newly provisioned VM.
747
+
748
+ Phase A (bootstrap) steps are fatal -- any failure aborts initialization.
749
+ Phase B (setup) steps are non-fatal -- failures are logged as warnings
750
+ and the VM gets 'partial' status instead of 'complete'.
751
+ """
# Flow: create a per-VM log, run Phase A over `exec_target` (the provisioning transport),
# optionally invoke the caller's `on_tailscale_ready` hook, then run Phase B through
# run_initialization() over the target returned by Phase A.
752
+ from dataclasses import replace
753
+
754
+ from agentworks.ssh import SSHLogger
755
+
# The admin home is derived from the username; assumes accounts live under /home -- TODO confirm.
756
+ home = f"/home/{admin_username}"
757
+ logger = SSHLogger(vm_name, "vm-create")
# Register every secret passed in so the logger can redact it.
758
+ if tailscale_auth_key:
759
+ logger.add_redaction(tailscale_auth_key)
760
+ if git_tokens:
761
+ for token in git_tokens.values():
762
+ logger.add_redaction(token)
763
+
764
+ # Attach logger to the provisioning transport
# dataclasses.replace returns a copy, so the caller's ExecTarget object is left untouched.
765
+ exec_target = replace(exec_target, logger=logger)
766
+
767
+ transport = _describe_transport(exec_target)
768
+
769
+ try:
770
+ db.insert_vm_event(vm_name, "provisioning_started", transport)
771
+ ts_target = _phase_a_bootstrap(
772
+ db,
773
+ config,
774
+ vm_name,
775
+ exec_target,
776
+ home,
777
+ admin_username,
778
+ is_wsl2,
779
+ logger,
780
+ tailscale_auth_key=tailscale_auth_key,
781
+ bootstrap_complete=bootstrap_complete,
782
+ tailscale_ip=tailscale_ip,
783
+ )
784
+ db.insert_vm_event(vm_name, "provisioning_complete", ts_target.ssh.host if ts_target.ssh else None)
# Any exception in the block above (including the event inserts) marks provisioning as
# FAILED, records the error text, closes the log, points the user at it, and re-raises.
785
+ except Exception as e:
786
+ db.update_vm_provisioning_status(vm_name, ProvisioningStatus.FAILED)
787
+ db.insert_vm_event(vm_name, "provisioning_failed", str(e))
788
+ logger.close()
789
+ output.warn(f"Log: {logger.path}")
790
+ raise
791
+
792
+ # Tailscale is up; caller can clean up provisioning-only resources
793
+ # (e.g., detach Azure public IP since Phase B uses Tailscale SSH).
794
+ # Removing the public IP can destabilize the network stack briefly,
795
+ # so we wait for Tailscale SSH to be reliably reachable before
796
+ # proceeding with Phase B.
797
+ if on_tailscale_ready is not None:
798
+ try:
799
+ on_tailscale_ready()
# A failing cleanup hook is only warned about; initialization continues.
800
+ except Exception as e:
801
+ output.warn(f"post-provisioning cleanup failed: {e}")
802
+
803
+ # Wait for Tailscale SSH to reconnect after network changes
# NOTE(review): indentation is lost in this view, so it cannot be confirmed here whether
# the wait below runs only when `on_tailscale_ready` was supplied or unconditionally --
# verify against the real file.
804
+ from agentworks.ssh import wait_for_reconnect
805
+
806
+ wait_for_reconnect(ts_target)
807
+
# Phase B reuses the same logger; run_initialization() is responsible for closing it.
808
+ run_initialization(
809
+ db,
810
+ config,
811
+ vm_name,
812
+ ts_target,
813
+ providers,
814
+ home,
815
+ admin_username,
816
+ logger,
817
+ git_tokens=git_tokens,
818
+ is_first_init=True,
819
+ )
820
+
821
+
822
def run_initialization(
    db: Database,
    config: Config,
    vm_name: str,
    ts_target: ExecTarget,
    providers: dict[str, GitCredentialProvider],
    home: str,
    admin_username: str,
    logger: SSHLogger,
    *,
    git_tokens: dict[str, str] | None = None,
    is_first_init: bool = False,
) -> None:
    """Run Phase B (initialization) with status tracking and event logging.

    This is called both from initialize_vm() after provisioning and
    from reinit_vm() for repeatable re-initialization. Pass
    ``is_first_init=True`` from initialize_vm so steps that expect prior
    state (e.g. tmux socket dirs) can skip warnings on missing state.

    Outcome recorded in the database:

    - ``_phase_b_setup`` raises: init status FAILED plus an ``init_failed``
      event carrying ``str(exception)``; the exception is re-raised.
    - ``logger.has_warnings`` is true: init status PARTIAL plus an
      ``init_partial`` event with the warning count.
    - otherwise: init status COMPLETE plus an ``init_complete`` event.

    The logger is always closed before this function returns or raises,
    including when one of the status/event writes itself fails.
    """
    try:
        db.insert_vm_event(vm_name, "init_started")

        try:
            _phase_b_setup(
                db,
                config,
                vm_name,
                ts_target,
                providers,
                home,
                admin_username,
                logger,
                git_tokens=git_tokens,
                is_first_init=is_first_init,
            )
        except Exception as e:
            # Record the failure, then let the caller see the original error.
            db.update_vm_init_status(vm_name, InitStatus.FAILED)
            db.insert_vm_event(vm_name, "init_failed", str(e))
            raise

        if logger.has_warnings:
            db.update_vm_init_status(vm_name, InitStatus.PARTIAL)
            db.insert_vm_event(vm_name, "init_partial", f"{len(logger.warnings)} warning(s)")
        else:
            db.update_vm_init_status(vm_name, InitStatus.COMPLETE)
            db.insert_vm_event(vm_name, "init_complete")
    finally:
        # Single close point: previously a failing DB write (in either the
        # success bookkeeping or the failure handler) skipped logger.close().
        logger.close()
871
+
872
+
873
def _phase_a_bootstrap(
    db: Database,
    config: Config,
    vm_name: str,
    exec_target: ExecTarget,
    home: str,
    admin_username: str,
    is_wsl2: bool,
    logger: SSHLogger,
    *,
    tailscale_auth_key: str | None = None,
    bootstrap_complete: bool = False,
    tailscale_ip: str | None = None,
) -> ExecTarget:
    """Phase A: Bootstrap (over provisioning transport). All steps are fatal.

    Two paths depending on how much the provisioner already handled:

    1. ``bootstrap_complete`` is true *and* ``tailscale_ip`` is set
       (Lima/Azure): the provisioner already ran the full bootstrap, so only
       the Tailscale IP and provisioning status are recorded.
    2. Otherwise (WSL2): run the full bootstrap script over the provisioning
       transport via ``_run_bootstrap_script`` (user, packages, SSH key,
       swap, Tailscale).

    In both cases a Tailscale SSH target is then built and verified with up
    to five ``echo ok`` attempts, sleeping three seconds between failures.

    Returns the Tailscale ExecTarget for Phase B.

    Raises:
        SSHError: if the fifth verification attempt still fails (re-raised
            from ``ts_target.run``); errors from the bootstrap script
            propagate unchanged.
    """
    import sys
    import time

    db.update_vm_provisioning_status(vm_name, ProvisioningStatus.IN_PROGRESS)

    provisioner_bootstrapped = bootstrap_complete and tailscale_ip
    if not provisioner_bootstrapped:
        # WSL2: run bootstrap script over the provisioning transport
        tailscale_ip = _run_bootstrap_script(
            db,
            config,
            vm_name,
            exec_target,
            admin_username,
            is_wsl2,
            logger,
            tailscale_auth_key=tailscale_auth_key,
        )
    else:
        # Lima/Azure: provisioner already ran the full bootstrap.
        # Just update DB and move on to SSH verification.
        logger.step("Bootstrap (provisioner)")
        logger.output(f"Tailscale IP: {tailscale_ip}")
        db.update_vm_tailscale(vm_name, tailscale_ip)
        db.update_vm_provisioning_status(vm_name, ProvisioningStatus.COMPLETE)

    # Switch to Tailscale SSH, carrying over the SSH logger.
    # On Windows, force TTY to prevent zsh/login shell pipe hangs.
    tailscale_ssh = SSHTarget(
        host=tailscale_ip,
        user=admin_username,
        identity_file=config.operator.ssh_private_key,
        force_tty=sys.platform == "win32",
    )
    ts_target = ExecTarget(
        ssh=tailscale_ssh,
        default_timeout=60,
        logger=exec_target.logger,
    )

    # Verify Tailscale SSH works (retry -- peer connection may take time)
    logger.step("Verify Tailscale SSH")
    output.detail("Verifying Tailscale SSH...")

    last_attempt = 5
    for attempt in range(1, last_attempt + 1):
        try:
            ts_target.run("echo ok", timeout=15)
        except SSHError:
            if attempt == last_attempt:
                raise
            output.detail(f"Tailscale SSH not ready, retrying ({attempt}/5)...")
            time.sleep(3)
        else:
            break

    return ts_target
951
+
952
+
953
def _run_bootstrap_script(
    db: Database,
    config: Config,
    vm_name: str,
    exec_target: ExecTarget,
    admin_username: str,
    is_wsl2: bool,
    logger: SSHLogger,
    *,
    tailscale_auth_key: str | None = None,
) -> str:
    """Generate, copy, and run a bootstrap script on the VM. Returns Tailscale IP.

    Used for WSL2 where the bootstrap cannot be embedded in a provisioner's
    native mechanism (Lima provision block, Azure cloud-init).

    Steps:

    1. Resolve the Tailscale auth key and render the script with
       ``generate_bootstrap_script``.
    2. Write it to a local temp file, copy it to the VM, delete the local copy.
    3. Run it with ``run_detached`` under ``sudo -n`` and remove the remote copy.
    4. Parse the structured output, mirror each step into the logger and the
       console, and log the full raw output.
    5. On success, store the Tailscale IP and mark provisioning COMPLETE.

    Raises:
        SSHError: if the parsed result is not ok (the message includes the
            exit code and the last 500 characters of output), or if the
            script reports success without a Tailscale IP.
    """
    import os
    import tempfile

    from agentworks.remote_exec import run_detached
    from agentworks.vms.bootstrap_script import generate_bootstrap_script, parse_bootstrap_output, vm_hostname

    output.info("Bootstrapping VM (detached)...")

    # Resolve Tailscale auth key
    ts_auth_key = _resolve_tailscale_auth_key(tailscale_auth_key)

    ssh_public_key = config.operator.ssh_public_key.read_text().strip()
    # Determine platform for hostname. Look up the VM record for the actual
    # platform; fall back to transport-based detection.
    platform = "wsl2" if is_wsl2 else "unknown"
    vm_row = db.get_vm(vm_name)
    if vm_row is not None:
        platform = vm_row.platform
    script = generate_bootstrap_script(
        admin_username=admin_username,
        ssh_public_key=ssh_public_key,
        provisioning_packages=PROVISIONING_PACKAGES,
        tailscale_auth_key=ts_auth_key,
        hostname=vm_hostname(platform, vm_name),
        swap=0 if is_wsl2 else config.vm.swap,  # WSL2 provisioner handles swap
        is_wsl2=is_wsl2,
    )

    # Copy script to VM and execute via detached nohup
    remote_script = "/tmp/agentworks-bootstrap.sh"
    # delete=False: the file is closed before copy_to reads it by path, and is
    # removed explicitly afterwards.
    with tempfile.NamedTemporaryFile(mode="wb", suffix=".sh", delete=False) as f:
        f.write(script.encode("utf-8"))
        local_script = f.name

    try:
        exec_target.copy_to(local_script, remote_script)
    finally:
        os.unlink(local_script)

    output.detail("Running bootstrap script...")
    try:
        detached = run_detached(
            exec_target,
            f"sudo -n /bin/bash {remote_script}",
            label="Bootstrap",
            base_path=f"/tmp/agentworks-bootstrap-{vm_name}",
            quiet=True,  # we parse the structured output ourselves
        )
    except Exception:
        # The script was rendered with the Tailscale auth key, so do not leave
        # it in /tmp when the run itself blows up. Best effort only: a cleanup
        # failure must not replace the error being reported.
        try:
            exec_target.run(f"rm -f {remote_script}", sudo=True, check=False)
        except Exception:
            pass
        raise
    exec_target.run(f"rm -f {remote_script}", sudo=True, check=False)

    # Parse structured output
    bootstrap = parse_bootstrap_output(detached.output, detached.exit_code)

    # Feed results into logger and console
    for step in bootstrap.steps:
        logger.step(step.name)
        if step.success_msg:
            output.detail(f"{step.name}: {step.success_msg}")
            logger.output(step.success_msg)
        for warning in step.warnings:
            output.warn(warning)
            logger.warning(warning)
        if step.error:
            output.warn(f"Error: {step.error}")
            logger.log_error(step.error)

    # Log full output for troubleshooting
    if detached.output:
        logger.output(detached.output)

    if not bootstrap.ok:
        msg = f"Bootstrap script failed (exit {detached.exit_code})"
        if detached.output:
            msg += f"\n{detached.output[-500:]}"
        raise SSHError(msg)

    # Update DB with Tailscale info. An explicit check rather than `assert`,
    # which is stripped under `python -O` and would let None flow onward.
    tailscale_ip = bootstrap.tailscale_ip
    if tailscale_ip is None:
        raise SSHError("Bootstrap script succeeded but did not report a Tailscale IP")
    output.detail(f"Tailscale IP: {tailscale_ip}")
    db.update_vm_tailscale(vm_name, tailscale_ip)
    db.update_vm_provisioning_status(vm_name, ProvisioningStatus.COMPLETE)

    return tailscale_ip
1054
+
1055
+
1056
+ def _resolve_tailscale_auth_key(tailscale_auth_key: str | None = None) -> str:
1057
+ """Resolve Tailscale auth key from argument, env var, or prompt."""
1058
+ import os
1059
+
1060
+ key = tailscale_auth_key or os.environ.get("TAILSCALE_AUTH_KEY")
1061
+ if key:
1062
+ return key
1063
+ return output.prompt_secret(
1064
+ " Tailscale auth key",
1065
+ hint="Generate a key at https://login.tailscale.com/admin/settings/keys",
1066
+ )
1067
+
1068
+
1069
def _phase_b_setup(
    db: Database,
    config: Config,
    vm_name: str,
    ts_target: ExecTarget,
    providers: dict[str, GitCredentialProvider],
    home: str,
    admin_username: str,
    logger: SSHLogger,
    *,
    git_tokens: dict[str, str] | None = None,
    is_first_init: bool = False,
) -> None:
    """Phase B: Setup (over Tailscale SSH). Non-fatal steps warn and continue.

    Marks the VM init status IN_PROGRESS, loads and validates the catalog,
    then runs the setup steps in a fixed order (later steps rely on earlier
    ones, as the inline comments explain): system/apt/snap packages, login
    shell, authorized_keys, workspaces directory and ACLs, agent tmux socket
    directories, system install commands, mise config, git safe.directory,
    git credentials, dotfiles, mise lockfile and install, user install
    commands, shell profile and rc files, nerf plugin build/install, and
    Claude marketplaces/plugins.

    Steps handled inline catch their errors, record them through
    ``logger.warning`` and ``output.warn``, and continue. Catalog loading and
    validation are not guarded here, so their errors propagate to the caller.

    Args:
        db: Database used for the status update and to list the VM's agents.
        config: Loaded configuration (``vm``, ``admin``, ``paths`` sections).
        vm_name: Name of the VM being initialized.
        ts_target: Exec target that every remote command runs on.
        providers: Git credential providers by name; skipped when empty.
        home: Admin user's home directory on the VM.
        admin_username: Admin user's login name on the VM.
        logger: SSH logger receiving step markers and warnings.
        git_tokens: Optional pre-collected tokens, forwarded to
            ``_configure_git_credentials``.
        is_first_init: When true, ``ensure_agent_socket_root`` is called with
            ``warn_if_missing=False``.
    """
    from agentworks.catalog import load_catalog, validate_selections

    output.info("Initializing VM...")
    db.update_vm_init_status(vm_name, InitStatus.IN_PROGRESS)
    catalog = load_catalog(config)
    validate_selections(config, catalog)

    # Non-fatal: system repos + packages (mise repo added, then all packages)
    _install_system_packages(ts_target, logger)

    # Non-fatal: apt sources required by selected apt_packages
    _configure_apt_sources(ts_target, config, catalog, logger)

    # Non-fatal: apt packages (direct list + catalog entries)
    _install_apt_packages(ts_target, config, catalog, logger)

    # Non-fatal: snap packages
    if config.vm.snap:
        logger.step("Snap packages")
        output.detail(f"Installing {len(config.vm.snap)} snap packages...")
        for pkg in config.vm.snap:
            try:
                ts_target.run(f"snap install {shlex.quote(pkg)}", sudo=True, timeout=120)
            except SSHError as e:
                msg = f"snap install '{pkg}' failed: {e}"
                logger.warning(msg)
                output.warn(msg)

    # Non-fatal: set default shell (before install commands so installers
    # write to the correct rc file)
    logger.step("Shell configuration")
    admin_shell = config.admin.shell
    output.detail(f"Setting shell to {admin_shell}...")
    try:
        # Touch .zshrc before chsh to prevent zsh's first-run wizard
        # (zsh-newuser-install) from prompting interactively on next login
        if admin_shell == "zsh":
            ts_target.run(f"touch {home}/.zshrc", check=False)
        ts_target.run(
            f"usermod -s $(which {shlex.quote(admin_shell)}) {shlex.quote(admin_username)}",
            sudo=True,
        )
    except SSHError as e:
        msg = f"shell configuration failed: {e}"
        logger.warning(msg)
        output.warn(msg)

    # Non-fatal: reconcile authorized_keys
    _reconcile_authorized_keys(ts_target, config, home, logger)

    # Non-fatal: workspaces directory with ACLs for group-writable files.
    # Default ACLs ensure new files/dirs inherit group rwx regardless of umask.
    # Access ACLs fix existing files. Applied recursively to cover all workspaces.
    # NOTE: workspaces_dir is intentionally left unquoted in the commands below
    # so that a "~"-prefixed value is still expanded by the remote shell.
    workspaces_dir = config.paths.vm_workspaces
    if workspaces_dir.startswith("/home/"):
        output.warn(
            f"vm_workspaces is under /home ({workspaces_dir}). "
            "This may require the home directory to be world-traversable."
        )
    try:
        # acl is now installed as a system package in _install_system_packages
        ts_target.run(f"mkdir -p {workspaces_dir}", sudo=True)
        # Ensure all parent directories are traversable by agents.
        # The extra "." test stops the walk for a relative path, where dirname
        # converges to "." instead of "/" and the loop would never terminate.
        ts_target.run(
            f'sh -c \'p={workspaces_dir}; while [ "$p" != "/" ] && [ "$p" != "." ]; '
            'do chmod a+x "$p"; p=$(dirname "$p"); done\'',
            sudo=True,
        )
        # Default ACLs on directories only (setfacl -R -d warns on files)
        ts_target.run(
            f"find {workspaces_dir} -type d -exec setfacl -d -m g::rwx -m m::rwx {{}} +",
            sudo=True,
            timeout=120,
        )
        # Access ACLs on all existing files and dirs
        ts_target.run(
            f"setfacl -R -m g::rwx -m m::rwx {workspaces_dir}",
            sudo=True,
            timeout=120,
        )
    except SSHError as e:
        msg = f"workspaces directory setup failed: {e}"
        logger.warning(msg)
        output.warn(msg)

    # Non-fatal: agent tmux socket directory infrastructure.
    # Creates the shared group, root directory, and per-agent subdirectories.
    try:
        from agentworks.sessions.tmux import (
            cleanup_stale_sockets,
            ensure_agent_socket_dir,
            ensure_agent_socket_root,
        )

        logger.step("Agent tmux socket directories")
        output.detail("Setting up agent tmux socket infrastructure...")

        ensure_agent_socket_root(ts_target, admin_username, warn_if_missing=not is_first_init)
        for agent in db.list_agents(vm_name):
            ensure_agent_socket_dir(ts_target, agent.linux_user)
            removed = cleanup_stale_sockets(ts_target, agent.linux_user)
            if removed:
                output.detail(f"Cleaned up {removed} stale socket(s) for {agent.linux_user}")
    except SSHError as e:
        msg = f"agent tmux socket setup failed: {e}"
        logger.warning(msg)
        output.warn(msg)

    # Non-fatal: system install commands
    system_path = _run_catalog_commands(
        ts_target,
        config.vm.system_install_commands,
        catalog.system_install_commands,
        admin_shell,
        home,
        logger,
        label="System install command",
    )

    # Non-fatal: mise config (written before dotfiles so dotfiles can override)
    mise_path: list[str] = _mise_shims_path(home)
    if config.admin.mise_packages:
        _write_mise_config(ts_target, config.admin.mise_packages, config.admin.mise_install_before, home, logger)

    # Non-fatal: git safe.directory wildcard (disables ownership checks for the
    # multi-user workspace model where agents access repos owned by admin)
    if config.admin.git_force_safe_directory:
        try:
            ts_target.run("git config --global --add safe.directory '*'")
            output.detail("Git safe.directory wildcard configured")
        except SSHError as e:
            msg = f"git safe.directory setup failed: {e}"
            logger.warning(msg)
            output.warn(msg)

    # Non-fatal: git credentials (before dotfiles and mise lockfile for private repos)
    if providers:
        _configure_git_credentials(vm_name, ts_target, providers, logger, git_tokens=git_tokens)

    # Non-fatal: dotfiles (can override mise config, can provide lockfile)
    if config.admin.dotfiles_source:
        logger.step("Dotfiles")
        dest = config.admin.dotfiles_destination.replace("~", home)
        try:
            from agentworks.sources import fetch_dir, parse_source_ref

            ref = parse_source_ref(config.admin.dotfiles_source)
            output.detail(f"Syncing dotfiles from {config.admin.dotfiles_source}...")
            fetch_dir(ref, ts_target, dest, logger=logger)

            output.detail(f"Running dotfiles install: {config.admin.dotfiles_install_cmd}")
            # dest has had every "~" replaced above, so quoting it is safe and
            # keeps paths containing spaces working.
            ts_target.run(f"cd {shlex.quote(dest)} && {config.admin.dotfiles_install_cmd}", timeout=120)
        except Exception as e:
            # Deliberately broad: any dotfiles problem is a warning, not a
            # failed init. The handler names no class imported inside the try
            # block, so a failing import is reported here as well instead of
            # surfacing as an UnboundLocalError from the except clause.
            msg = f"dotfiles install failed: {e}"
            logger.warning(msg)
            output.warn(msg)

    # Non-fatal: mise lockfile (after git creds and dotfiles; overrides dotfiles lockfile)
    if config.admin.mise_lockfile:
        _fetch_mise_lockfile(ts_target, config.admin.mise_lockfile, home, logger)

    # Non-fatal: mise install (after config + dotfiles + lockfile are all settled)
    prune = config.admin.mise_prune_on_reinit
    if config.admin.mise_packages or config.admin.mise_lockfile:
        _run_mise_install(ts_target, admin_shell, home, config.admin.mise_allow_unlocked, logger, prune=prune)
    else:
        # Nothing configured here, but a mise config may already exist on the
        # VM (e.g. delivered by dotfiles); install from it if so.
        try:
            check = ts_target.run(f"test -f {home}/.config/mise/config.toml", check=False)
            if check.ok:
                _run_mise_install(ts_target, admin_shell, home, config.admin.mise_allow_unlocked, logger, prune=prune)
        except SSHError:
            # Best-effort probe: a transport error here is intentionally ignored.
            pass

    # Non-fatal: user install commands for admin user (may depend on mise tools)
    user_path = _run_catalog_commands(
        ts_target,
        config.admin.user_install_commands,
        catalog.user_install_commands,
        admin_shell,
        home,
        logger,
        label="User install command",
    )

    # Non-fatal: shell profile (PATH exports, sourced at login)
    all_paths = system_path + mise_path + user_path
    _write_agentworks_profile(ts_target, all_paths, logger)

    # Non-fatal: shell rc (interactive shell hooks like mise activate)
    rc_snippets = [MISE_ACTIVATE_LINES] if config.admin.mise_activate else ["# mise activation disabled"]
    _write_agentworks_rc(ts_target, rc_snippets, logger)

    # Non-fatal: nerf tools
    if config.vm.nerf_build_claude_plugin:
        _build_nerf_claude_plugin(ts_target, config, logger)

    # Non-fatal: install nerf Claude plugin for admin user
    if config.admin.nerf_install_claude_plugin:
        _install_nerf_claude_plugin_for_user(ts_target, admin_shell, logger)

    # Non-fatal: Claude Code marketplaces and plugins for admin user
    def _admin_run_cmd(cmd: str, timeout: int) -> object:
        # Run through the admin's login shell so the command sees the login
        # environment.
        inner = shlex.quote(cmd)
        return ts_target.run(f"{admin_shell} -lc {inner}", timeout=timeout)

    install_claude_plugins(
        _admin_run_cmd, config.admin.claude_marketplaces, config.admin.claude_plugins, logger
    )
1290
+
1291
+
1292
def _build_nerf_claude_plugin(
    ts_target: ExecTarget,
    config: Config,
    logger: SSHLogger,
) -> None:
    """Build the nerf Claude Code plugin locally and deploy it to the VM. Non-fatal.

    The plugin is assembled in a local temporary directory from the built-in
    nerftools manifests (unless ``config.vm.skip_nerf_defaults`` is set) plus
    ``config.vm.nerf_addl_manifests``, then copied to
    ``<nerf_home>/claude-plugin`` on the VM. An ``install-plugin`` helper
    script is written next to it, and ``AGENTWORKS_NERF_HOME`` is exported
    from ``/etc/profile.d/agentworks-nerf.sh`` and ``/etc/zsh/zprofile``.

    ``SSHError`` and ``RuntimeError`` (including a missing ``nerftools``
    package and manifest errors, which are wrapped as ``RuntimeError``) are
    downgraded to a warning; other exception types propagate.
    """
    logger.step("Nerf tools (Claude plugin)")
    output.detail("Building nerf Claude Code plugin...")

    nerf_home = config.vm.nerf_home_dir
    plugin_dir = f"{nerf_home}/claude-plugin"
    remote_plugin_dir = shlex.quote(plugin_dir)

    try:
        try:
            from nerftools import BUILTIN_MANIFESTS_DIR  # type: ignore[import-untyped]
            from nerftools.config import load_config, resolve_claude_plugin_meta  # type: ignore[import-untyped]
            from nerftools.formats import build_claude_plugin  # type: ignore[import-untyped]
            from nerftools.manifest import (  # type: ignore[import-untyped]
                ManifestError,
                load_manifest,
                merge_manifests,
            )
        except ImportError as e:
            raise RuntimeError(f"nerftools is not installed: {e}") from e

        # Built-in manifests first (sorted), then any operator-supplied extras.
        manifest_paths: list[Path] = []
        if not config.vm.skip_nerf_defaults and BUILTIN_MANIFESTS_DIR.exists():
            manifest_paths.extend(
                entry
                for entry in sorted(BUILTIN_MANIFESTS_DIR.iterdir())
                if entry.suffix == ".yaml" and entry.is_file()
            )
        manifest_paths.extend(config.vm.nerf_addl_manifests)

        try:
            loaded = [load_manifest(p) for p in manifest_paths]
            manifests = merge_manifests(loaded)
        except ManifestError as e:
            raise RuntimeError(f"nerf manifest error: {e}") from e

        # Plugin metadata from agentworks nerf-config.yaml.
        # Version is fixed (from nerftools defaults) so the plugin path stays
        # stable across rebuilds -- important because Claude Code grants
        # permissions based on absolute tool paths.
        package_root = Path(__file__).resolve().parent.parent
        nerf_config = load_config(package_root / "nerf-config.yaml")
        plugin_meta, marketplace_meta = resolve_claude_plugin_meta(nerf_config)

        with tempfile.TemporaryDirectory() as tmp:
            staging_dir = Path(tmp)
            build_claude_plugin(manifests, staging_dir, plugin_meta, marketplace_meta=marketplace_meta)

            # Clean and create remote dir
            ts_target.run(f"rm -rf {remote_plugin_dir}", sudo=True)
            ts_target.run(f"mkdir -p {remote_plugin_dir}", sudo=True)
            ts_target.run(f"sudo chown -R $(id -un):$(id -un) {remote_plugin_dir}")

            # Copy plugin artifacts
            ts_target.copy_dir_to(staging_dir, plugin_dir, delete=False, timeout=60)

        # Make the entire nerf home world-readable so all users can access the plugin
        ts_target.run(
            f"chmod -R a+rX {shlex.quote(nerf_home)}",
            sudo=True,
        )
        # Fix execute bits on scripts (Windows tarballs lose them, a+rX only sets x on dirs)
        ts_target.run(
            f"find {remote_plugin_dir} -type f"
            r" \( -name 'nerf-*' -o -name 'nerfctl-*' \) -exec chmod a+x {} +"
        )

        # Write an install helper with the plugin/marketplace names baked in
        # so _install_nerf_claude_plugin_for_user can call it without parsing JSON.
        marketplace_name = marketplace_meta.name if marketplace_meta else plugin_meta.name
        p_name = shlex.quote(plugin_meta.name)
        m_name = shlex.quote(marketplace_name)
        # Drop the pre-1.0 marketplace name if a previous build registered it,
        # otherwise `marketplace add` no-ops on the same path under the old name.
        install_script = (
            "#!/usr/bin/env bash\n"
            "set -euo pipefail\n"
            'PLUGIN_DIR="$(cd "$(dirname "$0")/.." && pwd)"\n'
            "claude plugin marketplace remove agentworks-nerf-local >/dev/null 2>&1 || true\n"
            'claude plugin marketplace add "$PLUGIN_DIR"\n'
            f"claude plugin install {p_name}@{m_name} --scope user\n"
        )
        scripts_dir = shlex.quote(plugin_dir + "/scripts")
        quoted_path = shlex.quote(f"{plugin_dir}/scripts/install-plugin")
        quoted_script = shlex.quote(install_script)
        ts_target.run(
            f"mkdir -p {scripts_dir} && printf '%s' {quoted_script} > {quoted_path} && chmod a+x {quoted_path}",
        )

        output.detail(f"Nerf Claude plugin built to {plugin_dir}")

        # System-wide env var so all users can locate nerf home
        env_line = f'export AGENTWORKS_NERF_HOME="{nerf_home}"'
        quoted_env_line = shlex.quote(env_line)
        ts_target.run(
            f"printf '%s\\n' {quoted_env_line} | sudo tee /etc/profile.d/agentworks-nerf.sh > /dev/null",
        )
        # Append to the zsh profile only when the variable is not mentioned there yet.
        ts_target.run(
            f"grep -qF AGENTWORKS_NERF_HOME /etc/zsh/zprofile 2>/dev/null"
            f" || printf '%s\\n' {quoted_env_line} | sudo tee -a /etc/zsh/zprofile > /dev/null",
        )

    except (SSHError, RuntimeError) as e:
        msg = f"nerf Claude plugin build failed: {e}"
        logger.warning(msg)
        output.warn(msg)
1399
+
1400
+
1401
def _install_nerf_claude_plugin_for_user(
    target: ExecTarget,
    shell: str,
    logger: SSHLogger,
) -> None:
    """Install the nerf Claude Code plugin for the user ``target`` runs as. Non-fatal.

    Both commands run through ``{shell} -lc`` so ``$AGENTWORKS_NERF_HOME`` is
    resolved by a login shell. If the ``install-plugin`` helper is not present
    and executable, a console warning is printed and nothing is installed.
    An ``SSHError`` is reported as a warning through both the logger and the
    console.
    """
    logger.step("Nerf plugin install")

    helper = "$AGENTWORKS_NERF_HOME/claude-plugin/scripts/install-plugin"
    probe_cmd = f"{shell} -lc 'test -x {helper}'"
    install_cmd = f"{shell} -lc '{helper}'"

    try:
        # Check that the plugin and install helper exist via the system env var
        probe = target.run(probe_cmd, check=False)
        if probe.ok:
            output.detail("Installing nerf Claude plugin...")
            target.run(install_cmd, timeout=30)
            output.detail("Nerf Claude plugin installed")
        else:
            output.warn(
                "nerf Claude plugin not found on this VM. "
                "Set nerf_build_claude_plugin = true in your VM template and reinit."
            )
    except SSHError as e:
        msg = f"nerf plugin install failed: {e}"
        logger.warning(msg)
        output.warn(msg)
1432
+
1433
+
1434
# Called as run_cmd(command, timeout): the first argument is the shell command
# string, the second an integer timeout that the implementation forwards to the
# underlying runner. Callers in this module ignore the return value and rely on
# the callable raising SSHError when the command fails.
RunCmd = Callable[[str, int], object]
"""Callable that runs a shell command with a timeout. Used to abstract
admin (target.run) vs agent (_run_as_agent) execution."""
1437
+
1438
+
1439
def install_claude_plugins(
    run_cmd: RunCmd,
    marketplaces: list[str],
    plugins: list[str],
    logger: SSHLogger | None = None,
) -> None:
    """Register Claude Code marketplaces and install plugins. Non-fatal.

    ``run_cmd`` supplies the shell/user context:
    - Admin: wraps in login shell via {shell} -lc
    - Agent: wraps in su - via _run_as_agent

    Does nothing when both lists are empty. Otherwise it first checks that the
    ``claude`` CLI is on PATH and returns with a warning if it is not. All
    marketplaces are then registered before any plugin is installed; the first
    ``SSHError`` stops the remaining work and is reported as a warning. When a
    ``logger`` is given, warnings are recorded there as well as on the console.
    """
    if not (marketplaces or plugins):
        return

    def _report(msg: str) -> None:
        # Mirror a warning to the optional logger and to the console.
        if logger:
            logger.warning(msg)
        output.warn(msg)

    if logger:
        logger.step("Claude plugins")

    # Verify claude is available before attempting marketplace/plugin setup
    try:
        run_cmd("command -v claude >/dev/null 2>&1", 10)
    except SSHError as e:
        _report(
            f"claude CLI not available; skipping marketplace/plugin setup ({e}). "
            "Install claude (e.g. via user_install_commands or any other method) and rerun init."
        )
        return

    try:
        for source in marketplaces:
            output.detail(f"Registering Claude marketplace: {source}")
            add_cmd = f"claude plugin marketplace add {shlex.quote(source)}"
            run_cmd(add_cmd, 60)

        for plugin in plugins:
            output.detail(f"Installing Claude plugin: {plugin}")
            install_cmd = f"claude plugin install {shlex.quote(plugin)} --scope user"
            run_cmd(install_cmd, 60)
    except SSHError as e:
        _report(f"Claude plugin install failed: {e}")
1483
+
1484
+
1485
def _configure_git_credentials(
    vm_name: str,
    ts_target: ExecTarget,
    providers: dict[str, GitCredentialProvider],
    logger: SSHLogger,
    git_tokens: dict[str, str] | None = None,
) -> None:
    """Configure git credential store on the VM with pre-collected or prompted tokens.

    For each provider, the token comes from ``git_tokens[name]`` when present
    and non-empty, otherwise from ``provider.obtain_token(vm_name)``. The
    provider's credential lines are collected and written to
    ``~/.git-credentials`` (mode 600), and git is set to use the ``store``
    credential helper.

    Non-fatal: a provider that fails is reported as a warning and skipped, and
    a failure to write the store is reported as a warning. If no provider
    yields any credential line, nothing is written on the VM.

    Args:
        vm_name: VM name, passed to ``provider.obtain_token``.
        ts_target: Exec target the credentials are written to.
        providers: Credential providers keyed by name.
        logger: SSH logger receiving the step marker, warnings and redactions.
        git_tokens: Optional mapping of provider name to an already collected token.
    """
    logger.step("Git credentials")
    output.detail("Configuring git credentials...")

    tokens = git_tokens or {}

    # Collect credential lines from all providers
    credential_lines: list[str] = []
    configured = 0
    for name, provider in providers.items():
        try:
            token = tokens.get(name) or provider.obtain_token(vm_name)
            if token:
                # Tokens obtained here were not known when the logger was set
                # up, so register them before anything containing them is sent
                # over the logged transport.
                logger.add_redaction(token)
            credential_lines.extend(provider.credential_lines(token))
        except Exception as e:
            # Deliberately broad: one misbehaving provider must not block the others.
            msg = f"git credential setup failed for {name}: {e}"
            logger.warning(msg)
            output.warn(msg)
        else:
            configured += 1

    if not credential_lines:
        return

    # Write credentials and configure git on the VM
    try:
        cred_content = "\n".join(credential_lines) + "\n"
        ts_target.write_file("~/.git-credentials", cred_content, mode="600")
        ts_target.run(
            "git config --global credential.helper store",
        )
        # Report only the providers that actually produced credentials, not
        # every provider that was configured.
        output.detail(f"Git credentials configured for {configured} provider(s)")
    except SSHError as e:
        msg = f"git credential store setup failed: {e}"
        logger.warning(msg)
        output.warn(msg)