agentworks-cli 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentworks/__init__.py +1 -0
- agentworks/agents/__init__.py +0 -0
- agentworks/agents/manager.py +1095 -0
- agentworks/agents/templates.py +145 -0
- agentworks/catalog.py +264 -0
- agentworks/catalog.toml +131 -0
- agentworks/cli.py +1462 -0
- agentworks/completions/__init__.py +33 -0
- agentworks/completions/bash.py +179 -0
- agentworks/completions/install.py +122 -0
- agentworks/completions/powershell.py +270 -0
- agentworks/completions/spec.py +216 -0
- agentworks/completions/zsh.py +256 -0
- agentworks/config.py +894 -0
- agentworks/db.py +1083 -0
- agentworks/doctor.py +430 -0
- agentworks/git_credentials/__init__.py +0 -0
- agentworks/git_credentials/azdo.py +29 -0
- agentworks/git_credentials/base.py +71 -0
- agentworks/git_credentials/github.py +22 -0
- agentworks/nerf-config.yaml +16 -0
- agentworks/output.py +296 -0
- agentworks/remote_exec.py +286 -0
- agentworks/sample-config.toml +289 -0
- agentworks/sessions/__init__.py +0 -0
- agentworks/sessions/console.py +164 -0
- agentworks/sessions/manager.py +1297 -0
- agentworks/sessions/templates.py +101 -0
- agentworks/sessions/tmux.py +503 -0
- agentworks/sources.py +303 -0
- agentworks/ssh.py +759 -0
- agentworks/ssh_config.py +255 -0
- agentworks/vm_hosts/__init__.py +0 -0
- agentworks/vm_hosts/manager.py +86 -0
- agentworks/vms/__init__.py +0 -0
- agentworks/vms/backup.py +409 -0
- agentworks/vms/base.py +56 -0
- agentworks/vms/bootstrap_script.py +185 -0
- agentworks/vms/cloud_init.py +55 -0
- agentworks/vms/initializer.py +1523 -0
- agentworks/vms/manager.py +1122 -0
- agentworks/vms/provisioners/__init__.py +0 -0
- agentworks/vms/provisioners/azure.py +602 -0
- agentworks/vms/provisioners/lima.py +295 -0
- agentworks/vms/provisioners/proxmox.py +279 -0
- agentworks/vms/provisioners/proxmox_api.py +261 -0
- agentworks/vms/provisioners/wsl2.py +340 -0
- agentworks/vms/templates.py +152 -0
- agentworks/workspaces/__init__.py +0 -0
- agentworks/workspaces/backends/__init__.py +0 -0
- agentworks/workspaces/backends/local.py +119 -0
- agentworks/workspaces/backends/vm.py +175 -0
- agentworks/workspaces/manager.py +1080 -0
- agentworks/workspaces/templates.py +76 -0
- agentworks/workspaces/tmuxinator.py +80 -0
- agentworks_cli-0.2.1.dist-info/METADATA +635 -0
- agentworks_cli-0.2.1.dist-info/RECORD +59 -0
- agentworks_cli-0.2.1.dist-info/WHEEL +4 -0
- agentworks_cli-0.2.1.dist-info/entry_points.txt +2 -0
agentworks/output.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Output contract between business logic and the presentation layer.
|
|
2
|
+
|
|
3
|
+
Business logic reports data through the handler (info, detail, warn, progress)
|
|
4
|
+
and signals errors by raising exceptions from the hierarchy below. The
|
|
5
|
+
presentation layer (CLI, web, test) sets the handler implementation and
|
|
6
|
+
catches exceptions.
|
|
7
|
+
|
|
8
|
+
Business logic must never import typer, call sys.exit, or format output.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import sys
|
|
14
|
+
import time
|
|
15
|
+
from typing import Protocol
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
# Progress handle
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Progress(Protocol):
    """Handle returned by OutputHandler.progress() for tracking a long operation.

    Business logic calls update() zero or more times while the work is
    running and done() once when it finishes.  How (or whether) anything is
    rendered is up to the handler implementation that created the handle.
    """

    def update(self, current: int | None = None, message: str | None = None) -> None:
        """Report progress.

        Args:
            current: Units completed so far.  Only meaningful when a total
                was provided to OutputHandler.progress().
            message: Optional short status text to show with this update.
        """
        ...

    def done(self, message: str | None = None) -> None:
        """Mark the operation as complete.

        Args:
            message: Optional closing text to show alongside the completion.
        """
        ...
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Handler protocol
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class OutputHandler(Protocol):
    """Contract for all user-facing output from business logic.

    Implementations decide rendering: terminal, web, test capture, etc.
    The module-level functions of the same names forward to whichever
    implementation is currently installed.
    """

    def info(self, message: str) -> None:
        """One-shot status message (top-level)."""
        ...

    def detail(self, message: str, indent: int = 1) -> None:
        """Sub-step or detail message.

        Args:
            message: Text to show.
            indent: Nesting depth (1 = 2 spaces, 2 = 4, etc.).
        """
        ...

    def warn(self, message: str) -> None:
        """Non-fatal warning."""
        ...

    def confirm(self, message: str, default: bool = False) -> bool:
        """Present a yes/no question.

        Args:
            message: The question to ask.
            default: Answer to assume when the user gives none (this is how
                the built-in default handler treats an empty reply).

        Returns:
            True for yes, False for no.
        """
        ...

    def choose(self, message: str, options: list[str]) -> int:
        """Present a list of options.

        Returns:
            The zero-based index into ``options`` of the selected entry.
        """
        ...

    def pause(self, message: str) -> None:
        """Wait for user acknowledgment (press Enter)."""
        ...

    def prompt(self, label: str, default: str | None = None) -> str:
        """Collect a string value.

        If default is provided and the user enters nothing, returns default.
        """
        ...

    def prompt_secret(self, label: str, hint: str | None = None) -> str:
        """Collect a secret value with masked input. Rejects empty values.

        Args:
            label: Prompt label.
            hint: Optional explanatory text to show before prompting.
        """
        ...

    def progress(self, label: str, total: int | None = None) -> Progress:
        """Start a tracked operation. Returns a Progress handle.

        If total is provided, the operation is determinate (percentage-based).
        Otherwise it is indeterminate (elapsed time only).
        """
        ...
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Default handler (plain print, no terminal magic)
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class _DefaultProgress:
    """Progress handle of the default handler: one printed line per event."""

    def __init__(self, label: str, total: int | None = None) -> None:
        """Remember the label/total and start the elapsed-time clock."""
        self._label = label
        self._total = total
        self._start = time.monotonic()

    def update(self, current: int | None = None, message: str | None = None) -> None:
        """Print the label, plus a percentage and/or message when available.

        The percentage is only shown when both ``current`` and a positive
        total are known.
        """
        line = f" {self._label}..."
        determinate = current is not None and self._total is not None and self._total > 0
        if determinate:
            percent = current / self._total * 100
            line += f" {percent:.0f}% ({current}/{self._total})"
        if message:
            line += f" {message}"
        print(line)

    def done(self, message: str | None = None) -> None:
        """Print a completion line with the whole seconds elapsed since creation."""
        seconds = time.monotonic() - self._start
        tail = "" if not message else f" {message}"
        print(f" {self._label} done ({seconds:.0f}s){tail}")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class _DefaultHandler:
    """Fallback OutputHandler built on plain print()/input().

    Status output goes to stdout; warnings and secret-prompt hints go to
    stderr.  Every interactive method turns EOF and Ctrl-C into UserAbort.
    """

    def info(self, message: str) -> None:
        """Print a top-level status message to stdout."""
        print(message)

    def detail(self, message: str, indent: int = 1) -> None:
        """Print an indented sub-step message to stdout.

        Each indent level is two spaces, as promised by the
        OutputHandler.detail contract (1 = 2 spaces, 2 = 4, etc.).
        """
        print(f"{'  ' * indent}{message}")

    def warn(self, message: str) -> None:
        """Print a ``Warning:``-prefixed message to stderr."""
        print(f"Warning: {message}", file=sys.stderr)

    def confirm(self, message: str, default: bool = False) -> bool:
        """Ask a yes/no question on stdin.

        An empty reply returns ``default``; otherwise only "y"/"yes"
        (case-insensitive) count as yes.

        Raises:
            UserAbort: on EOF or Ctrl-C.
        """
        try:
            suffix = " [Y/n]" if default else " [y/N]"
            response = input(message + suffix + " ").strip().lower()
            if not response:
                return default
            return response in ("y", "yes")
        except (EOFError, KeyboardInterrupt):
            raise UserAbort("interrupted") from None

    def choose(self, message: str, options: list[str]) -> int:
        """Print a numbered menu and return the zero-based index chosen.

        Re-prompts until a number between 1 and len(options) is entered.

        Raises:
            ValueError: if ``options`` is empty.
            UserAbort: on EOF or Ctrl-C.
        """
        if not options:
            # No input could ever satisfy the range check below, so the
            # prompt loop would never terminate.
            raise ValueError("choose() requires at least one option")
        try:
            print(message)
            for i, option in enumerate(options, 1):
                print(f" {i}) {option}")
            while True:
                try:
                    choice = int(input("Choice: "))
                    if 1 <= choice <= len(options):
                        return choice - 1
                except ValueError:
                    # Non-numeric input: fall through to the error message.
                    pass
                print(f"Invalid choice. Enter 1-{len(options)}.")
        except (EOFError, KeyboardInterrupt):
            raise UserAbort("interrupted") from None

    def pause(self, message: str) -> None:
        """Show ``message`` and block until the user presses Enter.

        Raises:
            UserAbort: on EOF or Ctrl-C.
        """
        try:
            input(message)
        except (EOFError, KeyboardInterrupt):
            raise UserAbort("interrupted") from None

    def prompt(self, label: str, default: str | None = None) -> str:
        """Read a stripped line from stdin.

        Returns ``default`` (or "" when there is none) if nothing was typed.

        Raises:
            UserAbort: on EOF or Ctrl-C.
        """
        try:
            suffix = f" [{default}]" if default else ""
            value = input(f"{label}{suffix}: ").strip()
            return value if value else (default or "")
        except (EOFError, KeyboardInterrupt):
            raise UserAbort("interrupted") from None

    def prompt_secret(self, label: str, hint: str | None = None) -> str:
        """Read a secret with getpass, re-prompting while it is blank.

        The hint (if any) and the retry notice are written to stderr.  The
        value is returned exactly as typed (not stripped).

        Raises:
            UserAbort: on EOF or Ctrl-C.
        """
        import getpass

        try:
            if hint:
                print(f" {hint}", file=sys.stderr)
            while True:
                value = getpass.getpass(f"{label}: ")
                if value.strip():
                    return value
                print("(empty, try again)", file=sys.stderr)
        except (EOFError, KeyboardInterrupt):
            raise UserAbort("interrupted") from None

    def progress(self, label: str, total: int | None = None) -> Progress:
        """Print the starting line and return a plain-print Progress handle."""
        print(f" {label}...")
        return _DefaultProgress(label, total)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# ---------------------------------------------------------------------------
|
|
182
|
+
# Module API
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
# Handler that the module-level functions below forward to.  Starts as the
# plain-print default and is swapped by set_handler().
_handler: OutputHandler = _DefaultHandler()
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def info(message: str) -> None:
    """Send a top-level status message to the active output handler."""
    active = _handler
    active.info(message)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def detail(message: str, indent: int = 1) -> None:
    """Send an indented detail/sub-step message to the active handler.

    ``indent`` is the nesting depth and is passed through unchanged.
    """
    active = _handler
    active.detail(message, indent)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def warn(message: str) -> None:
    """Send a non-fatal warning to the active output handler."""
    active = _handler
    active.warn(message)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def confirm(message: str, default: bool = False) -> bool:
    """Ask a yes/no question through the active handler.

    Returns:
        The handler's answer: True for yes, False for no.
    """
    answer = _handler.confirm(message, default)
    return answer
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def choose(message: str, options: list[str]) -> int:
    """Offer ``options`` through the active handler.

    Returns:
        The index of the selected option, as reported by the handler.
    """
    selected = _handler.choose(message, options)
    return selected
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def pause(message: str) -> None:
    """Ask the active handler to wait for user acknowledgment (press Enter)."""
    active = _handler
    active.pause(message)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def prompt(label: str, default: str | None = None) -> str:
    """Collect a string value through the active handler.

    Per the handler contract, ``default`` is returned when the user enters
    nothing.
    """
    value = _handler.prompt(label, default)
    return value
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def prompt_secret(label: str, hint: str | None = None) -> str:
    """Collect a secret through the active handler.

    The handler contract calls for masked input and rejection of empty
    values; ``hint`` is passed through unchanged.
    """
    secret = _handler.prompt_secret(label, hint)
    return secret
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def progress(label: str, total: int | None = None) -> Progress:
    """Start a tracked operation on the active handler.

    Returns:
        The Progress handle created by the handler.
    """
    handle = _handler.progress(label, total)
    return handle
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def set_handler(handler: OutputHandler) -> None:
    """Install ``handler`` as the process-wide output handler.

    Intended to be called from the application entrypoint so output is
    routed through the appropriate mechanism (typer.echo for CLI, websocket
    for web, list collector for tests).
    """
    global _handler
    _handler = handler
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def get_handler() -> OutputHandler:
    """Return the output handler that is currently installed."""
    current = _handler
    return current
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# ---------------------------------------------------------------------------
|
|
249
|
+
# Exception hierarchy
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class AgentworksError(Exception):
    """Base exception for all agentworks business logic errors.

    The presentation layer catches this (and subclasses) at the entrypoint
    and decides how to render the error.  Business logic raises a subclass
    instead of printing an error or exiting.
    """
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class VMError(AgentworksError):
    """Error related to VM operations (subclass of AgentworksError)."""
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class WorkspaceError(AgentworksError):
    """Error related to workspace operations (subclass of AgentworksError)."""
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class AgentError(AgentworksError):
    """Error related to agent operations (subclass of AgentworksError)."""
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class SessionError(AgentworksError):
    """Error related to session operations.

    Base class for the more specific BrokenSessionError.
    """
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class BrokenSessionError(SessionError):
    """Session is BROKEN (PID alive but tmux unreachable). Requires --force.

    Being a SessionError, it is also caught by handlers for that class.
    """
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class ConnectivityError(AgentworksError):
    """Error related to network, SSH, or Tailscale connectivity (subclass of AgentworksError)."""
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class BackupError(AgentworksError):
    """Error related to backup-specific failures (subclass of AgentworksError)."""
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class ValidationError(AgentworksError):
    """Invalid user input (name, argument, etc.) (subclass of AgentworksError)."""
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
class UserAbort(AgentworksError):
    """Raised when the user backs out of an interactive step.

    Covers declining a confirmation prompt; the default handler also raises
    it (with the message "interrupted") when any prompt hits EOF or Ctrl-C.
    """
|
|
295
|
+
|
|
296
|
+
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""Detached remote command execution with nohup + poll.
|
|
2
|
+
|
|
3
|
+
Runs long-running commands on remote hosts in a way that survives SSH
|
|
4
|
+
disconnects. The command runs under nohup with output redirected to a file.
|
|
5
|
+
The workstation polls for completion by checking the process status and
|
|
6
|
+
tailing new output.
|
|
7
|
+
|
|
8
|
+
If the workstation reconnects after a drop, it detects the still-running
|
|
9
|
+
process and resumes polling instead of starting a new one.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import contextlib
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
from agentworks import output
|
|
20
|
+
from agentworks.ssh import SSHError
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from agentworks.ssh import ExecTarget
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class DetachedResult:
    """Result of a detached remote command."""

    # Exit status read back from the remote status file; run_detached
    # reports 1 when that file cannot be read or parsed.
    exit_code: int
    # Full contents of the remote output file (the command's stdout and
    # stderr), or "" when it could not be retrieved.
    output: str
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Shell wrapper that writes PID, runs the command, then writes exit status.
#
# The command is wrapped in a brace group so the redirection applies to the
# whole of it.  Without the group, a compound command such as ``a && b`` or
# ``a; b`` would only have its last simple command redirected and earlier
# output would be lost.  A brace group runs in the current shell, so ``$?``
# afterwards is still the status of the last command executed.
#
# Placeholders: {pid_file}, {command}, {output_file}, {status_file}.
_WRAPPER_TEMPLATE = """\
#!/bin/bash
echo $$ > {pid_file}
{{
{command}
}} > {output_file} 2>&1
echo $? > {status_file}
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def run_detached(
    target: ExecTarget,
    command: str,
    *,
    label: str = "Remote command",
    base_path: str = "/tmp/agentworks-detached",
    poll_interval: int = 3,
    quiet_timeout: int = 300,
    timeout: int | None = None,
    as_root: bool = False,
    quiet: bool = False,
) -> DetachedResult:
    """Run a command detached on a remote host and poll until it finishes.

    Three starting situations are handled, checked in this order:

    1. A status file already exists: a previous run finished while we were
       away, so its result is collected.
    2. A PID file exists and the process is alive: polling is resumed.
    3. Otherwise a wrapper script is written and launched under nohup.

    Running as root: prefer ``as_root=True`` to embedding ``sudo -n`` in the
    command.  With ``as_root=True`` the wrapper script itself runs as root,
    so the command, its output, and cleanup all happen with root privileges.
    Inline ``sudo -n`` is only appropriate when parts of a multi-step command
    need different privilege levels.

    Args:
        target: Remote execution target.
        command: Shell command to run.
        label: Human-readable label for progress messages.
        base_path: Base path for output/pid/status files (unique per operation).
        poll_interval: Seconds between polls.
        quiet_timeout: Warn if no new output for this many seconds.
        timeout: Hard timeout in seconds.  The remote process is killed and
            exit code 1 is returned.  Partial output is still captured.
        as_root: Run the wrapper script as root.
        quiet: Suppress progress output (still captured in the result).

    Returns:
        DetachedResult with exit code and full output.
    """
    out_path, pid_path, status_path, script_path = (
        f"{base_path}{ext}" for ext in (".out", ".pid", ".status", ".sh")
    )

    def announce(text: str) -> None:
        # Progress chatter is skipped entirely in quiet mode.
        if not quiet:
            output.detail(f"{label}: {text}")

    if _status_file_exists(target, status_path):
        # Reconnect after the process already finished.
        announce("found completed result from previous run")
    elif _is_running(target, pid_path):
        # Reconnect while the process is still going.
        announce("resuming in-progress operation...")
    else:
        script = _WRAPPER_TEMPLATE.format(
            command=command,
            output_file=out_path,
            pid_file=pid_path,
            status_file=status_path,
        )
        target.write_file(script_path, script)

        # Drop leftovers from an earlier run so they are not mistaken for
        # this run's results.
        target.run(f"rm -f {out_path} {pid_path} {status_path}", sudo=as_root, check=False)

        # nohup stays OUTSIDE sudo so that a SIGHUP from SSH PTY teardown
        # reaches nohup first.  tty=False (no PTY, hence no SIGHUP) is the
        # primary protection; the ordering is defense-in-depth.  sudo=True
        # is not used because it wraps the command in bash -c, which would
        # put nohup inside the sudo'd shell.
        launcher = "nohup sudo -n /bin/bash" if as_root else "nohup /bin/bash"
        target.run(f"{launcher} {script_path} </dev/null >/dev/null 2>&1 &", tty=False, check=False)

        # Give the wrapper a moment to write its PID file.
        time.sleep(0.5)
        announce("started (detached)")

    captured = _poll_until_done(
        target,
        out_path,
        pid_path,
        status_path,
        label=label,
        poll_interval=poll_interval,
        quiet_timeout=quiet_timeout,
        timeout=timeout,
        quiet=quiet,
    )

    # Fetch the exit code, allowing up to six attempts while SSH recovers.
    # It stays at 1 (failure) if every attempt raises.
    exit_code = 1
    attempts_left = 6
    while attempts_left:
        attempts_left -= 1
        try:
            exit_code = _read_exit_code(target, status_path)
        except SSHError:
            time.sleep(5)
        else:
            break

    # Best-effort cleanup; SSH may still be recovering.
    with contextlib.suppress(SSHError):
        target.run(
            f"rm -f {script_path} {pid_path} {status_path} {out_path}",
            sudo=as_root,
            check=False,
        )

    return DetachedResult(exit_code=exit_code, output=captured)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _is_running(target: ExecTarget, pid_file: str) -> bool:
    """Report whether the detached process recorded in ``pid_file`` is alive.

    Two remote checks run in order, and the second is skipped when the first
    fails: the PID file must exist, and ``ps -p`` must find the PID stored
    in it (``ps -p`` works regardless of which user owns the process).
    """
    checks = (
        f"test -f {pid_file}",
        f"ps -p $(cat {pid_file}) > /dev/null 2>&1",
    )
    return all(target.run(cmd, check=False).returncode == 0 for cmd in checks)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _status_file_exists(target: ExecTarget, status_file: str) -> bool:
    """Report whether the status file is present on the remote host.

    The wrapper script writes it only after the command has ended, so its
    presence means the process completed.
    """
    return target.run(f"test -f {status_file}", check=False).returncode == 0
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _poll_until_done(
    target: ExecTarget,
    output_file: str,
    pid_file: str,
    status_file: str,
    *,
    label: str,
    poll_interval: int,
    quiet_timeout: int,
    timeout: int | None = None,
    quiet: bool = False,
) -> str:
    """Poll the remote process until it stops, streaming new output.

    Each cycle sleeps ``poll_interval`` seconds, streams whatever was
    appended to the output file, then checks whether the process has ended
    (status file present) or disappeared (PID no longer alive).  In both
    cases the remaining output is streamed before the loop exits, so the
    last lines of a process that died unexpectedly are still shown.

    SSH failures during a cycle are tolerated: the cycle is retried on the
    next tick, because the wrapper script keeps running on the remote host.

    Args:
        target: Remote execution target.
        output_file: Remote file receiving the command's output.
        pid_file: Remote file holding the wrapper's PID.
        status_file: Remote file written when the command has ended.
        label: Prefix for progress and warning messages.
        poll_interval: Seconds between polls.
        quiet_timeout: Warn once after this many seconds without new output.
        timeout: Hard limit in seconds; when exceeded, a kill is sent to the
            recorded PID and polling stops.
        quiet: Suppress streamed output and connection notices.  Warnings
            are still emitted.

    Returns:
        The full contents of the output file, or "" if it could not be read
        after repeated SSH failures.
    """
    last_size = 0  # bytes of the output file already streamed
    last_output_time = time.monotonic()
    start_time = time.monotonic()
    warned_quiet = False
    ssh_failures = 0

    while True:
        time.sleep(poll_interval)

        # Hard timeout -- kill the remote process to avoid orphans.
        if timeout is not None and (time.monotonic() - start_time) > timeout:
            output.warn(f"{label}: timed out after {timeout}s, killing remote process")
            # NOTE(review): this signals only the PID in the PID file (the
            # wrapper shell) and runs without sudo -- confirm that is enough
            # when the wrapper was started as root or has child processes.
            with contextlib.suppress(SSHError):
                target.run(
                    f"test -f {pid_file} && kill $(cat {pid_file}) 2>/dev/null",
                    check=False,
                )
            break

        # All polling commands go through SSH, which may be temporarily
        # down.  Catch SSHError and retry on the next tick.
        try:
            # Stream output appended since the last poll.
            new_output = _read_new_output(target, output_file, last_size)
            if new_output:
                last_output_time = time.monotonic()
                warned_quiet = False
                if not quiet:
                    for line in new_output.splitlines():
                        output.detail(line)
                last_size += len(new_output.encode("utf-8"))

            # Finished (status file exists) or gone without a status file
            # (unexpected termination, or it finished between the two
            # checks).  Either way, drain what was written after the read
            # above before leaving the loop.
            status_check = target.run(f"test -f {status_file}", check=False)
            if status_check.returncode == 0 or not _is_running(target, pid_file):
                final_output = _read_new_output(target, output_file, last_size)
                if final_output and not quiet:
                    for line in final_output.splitlines():
                        output.detail(line)
                break

            # A full cycle succeeded: report recovery and reset the counter.
            if ssh_failures > 0 and not quiet:
                output.detail(f"{label}: connection restored")
            ssh_failures = 0

        except SSHError:
            ssh_failures += 1
            if not quiet:
                if ssh_failures == 1:
                    output.detail(f"{label}: connection lost, waiting for recovery...")
                elif ssh_failures % 6 == 0:
                    output.detail(f"{label}: still waiting... ({ssh_failures * poll_interval}s)")
            # Don't break -- the wrapper script is still running on the VM.
            continue

        # Warn once per quiet spell if there has been no output for a while.
        quiet_secs = time.monotonic() - last_output_time
        if quiet_secs > quiet_timeout and not warned_quiet:
            output.warn(f"{label}: no output for {int(quiet_secs)}s (still running)...")
            warned_quiet = True

    # Read the full output for the caller.  Retry on SSH failure since the
    # connection may still be recovering after a transient disruption.
    for _read_attempt in range(6):
        try:
            result = target.run(f"cat {output_file} 2>/dev/null", check=False)
            return result.stdout
        except SSHError:
            time.sleep(5)
    output.warn(f"{label}: unable to retrieve remote output after repeated SSH failures")
    return ""
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _read_new_output(target: ExecTarget, output_file: str, offset: int) -> str:
    """Return what the output file contains beyond the first ``offset`` bytes.

    Yields "" when the remote ``tail`` fails.
    """
    # tail -c +N starts at byte N counting from 1, hence the +1.
    first_byte = offset + 1
    result = target.run(f"tail -c +{first_byte} {output_file} 2>/dev/null", check=False)
    if result.returncode != 0:
        return ""
    return result.stdout
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _read_exit_code(target: ExecTarget, status_file: str) -> int:
    """Fetch the exit code stored in the remote status file.

    Returns 1 (assume failure) when the file's content is missing or is not
    an integer.
    """
    result = target.run(f"cat {status_file} 2>/dev/null", check=False)
    try:
        text = result.stdout.strip()
        return int(text)
    except (ValueError, AttributeError):
        # Empty/garbled content, or no stdout at all.
        return 1
|