wafer-cli 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/ssh_keys.py ADDED
@@ -0,0 +1,261 @@
1
+ """SSH Keys CLI - Manage SSH public keys for workspace access.
2
+
3
+ This module provides the implementation for the `wafer ssh-keys` subcommand.
4
+ Users register their SSH public keys here, which are then installed in all
5
+ workspaces they attach to (BYOK - Bring Your Own Key model).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+
14
+ import httpx
15
+
16
+ from .api_client import get_api_url
17
+ from .auth import get_auth_headers
18
+
19
+
20
@dataclass(frozen=True)
class SshKey:
    """Immutable record describing one registered SSH public key."""

    # Identifier of the key record.
    id: str
    # Full text of the public key.
    public_key: str
    # Optional friendly label; None when the key has no name.
    name: str | None
    # Creation timestamp kept as a string (format is not established here).
    created_at: str
28
+
29
+
30
def _get_client() -> tuple[str, dict[str, str]]:
    """Return the API base URL and auth headers used for SSH-key requests.

    Returns:
        A ``(api_url, headers)`` tuple.

    Raises:
        RuntimeError: If the configured API URL is empty or does not start
            with ``http``.
    """
    api_url = get_api_url()
    headers = get_auth_headers()

    # Raise explicitly rather than ``assert``: assertions are stripped under
    # ``python -O``, which would let a missing or malformed URL through.
    if not api_url:
        raise RuntimeError("API URL must be configured")
    if not api_url.startswith("http"):
        raise RuntimeError("API URL must be a valid HTTP(S) URL")

    return api_url, headers
39
+
40
+
41
+ def _get_key_fingerprint(public_key: str) -> str:
42
+ """Extract a short fingerprint from a public key for display.
43
+
44
+ Returns the first 12 characters of the base64 data portion.
45
+ """
46
+ parts = public_key.strip().split()
47
+ if len(parts) >= 2:
48
+ return parts[1][:12] + "..."
49
+ return public_key[:12] + "..."
50
+
51
+
52
+ def _get_key_type(public_key: str) -> str:
53
+ """Extract the key type from a public key."""
54
+ parts = public_key.strip().split()
55
+ if parts:
56
+ return parts[0]
57
+ return "unknown"
58
+
59
+
60
+ def _detect_ssh_keys() -> list[Path]:
61
+ """Detect existing SSH public keys on disk.
62
+
63
+ Returns list of paths to found public key files, in preference order.
64
+ """
65
+ ssh_dir = Path.home() / ".ssh"
66
+ candidates = [
67
+ "id_ed25519.pub", # Preferred (modern, secure, fast)
68
+ "id_rsa.pub", # Legacy but common
69
+ "id_ecdsa.pub", # Less common
70
+ "id_dsa.pub", # Deprecated
71
+ ]
72
+
73
+ found = []
74
+ for filename in candidates:
75
+ key_path = ssh_dir / filename
76
+ if key_path.exists():
77
+ found.append(key_path)
78
+
79
+ return found
80
+
81
+
82
def list_ssh_keys(json_output: bool = False) -> str:
    """Fetch the registered SSH keys and render them for display.

    Args:
        json_output: When true, return the API payload as indented JSON
            instead of the human-readable listing.

    Returns:
        The JSON text, a hint message when no keys are registered, or one
        two-line entry per key.

    Raises:
        RuntimeError: On a 401 response, any other HTTP error status, or a
            failure to reach the API.
    """
    base_url, auth_headers = _get_client()
    endpoint = f"{base_url}/v1/user/ssh-keys"

    try:
        with httpx.Client(timeout=30.0, headers=auth_headers) as session:
            reply = session.get(endpoint)
            reply.raise_for_status()
            registered = reply.json()
    except httpx.HTTPStatusError as err:
        status = err.response.status_code
        if status == 401:
            raise RuntimeError("Not authenticated. Run: wafer login") from err
        raise RuntimeError(f"API error: {status} - {err.response.text}") from err
    except httpx.RequestError as err:
        raise RuntimeError(f"Could not reach API: {err}") from err

    if json_output:
        return json.dumps(registered, indent=2)

    if not registered:
        return "\n".join(
            [
                "No SSH keys registered.",
                "",
                "Add your SSH key:",
                " wafer ssh-keys add",
                "",
                "This will auto-detect your key from ~/.ssh/",
            ]
        )

    rendered = ["SSH Keys:"]
    for entry in registered:
        key_text = entry["public_key"]
        kind = _get_key_type(key_text)
        short = _get_key_fingerprint(key_text)
        label = entry.get("name") or "(no name)"
        rendered += [f" • {label}: {kind} {short}", f" ID: {entry['id']}"]

    return "\n".join(rendered)
124
+
125
+
126
def add_ssh_key(
    pubkey_path: Path | None = None,
    name: str | None = None,
    json_output: bool = False,
) -> str:
    """Register an SSH public key with the API.

    Args:
        pubkey_path: Public key file to upload. When None, the first key
            returned by ``_detect_ssh_keys()`` is used.
        name: Friendly name for the key. When None, a name is derived from
            the key type (e.g. "ed25519" for "ssh-ed25519").
        json_output: Return the API response as indented JSON instead of the
            human-readable summary.

    Returns:
        The JSON-encoded API response or a multi-line summary.

    Raises:
        RuntimeError: If no key can be found or read, the file does not look
            like an OpenSSH public key, the API rejects the request, or the
            API cannot be reached.
    """
    # Auto-detect if no path provided
    if pubkey_path is None:
        detected = _detect_ssh_keys()
        if not detected:
            raise RuntimeError(
                "No SSH key found in ~/.ssh/\n"
                "\n"
                "Generate one with:\n"
                " ssh-keygen -t ed25519\n"
                "\n"
                "Or specify a path:\n"
                " wafer ssh-keys add /path/to/key.pub"
            )
        pubkey_path = detected[0]

    if not pubkey_path.exists():
        raise RuntimeError(f"File not found: {pubkey_path}")

    # A ".pub" suffix always implies "pub" in the file name, so this single
    # test accepts and rejects exactly the same paths as the former
    # two-part condition.
    if "pub" not in pubkey_path.name:
        raise RuntimeError(
            f"Expected a public key file (.pub), got: {pubkey_path}\n"
            "\n"
            "Make sure you're adding the PUBLIC key, not the private key."
        )

    # Only I/O and decoding failures are expected here; anything else is a
    # programming error and should surface unchanged.
    try:
        public_key = pubkey_path.read_text().strip()
    except (OSError, UnicodeError) as e:
        raise RuntimeError(f"Could not read key file: {e}") from e

    # Validate basic format
    if not public_key.startswith(("ssh-", "ecdsa-", "sk-")):
        raise RuntimeError(
            f"Invalid SSH public key format in {pubkey_path}\n"
            "\n"
            "Expected OpenSSH format (e.g., 'ssh-ed25519 AAAAC3... user@host')"
        )

    key_type = _get_key_type(public_key)

    # Default name is the key type without its common prefix
    # (e.g. "ed25519" instead of "ssh-ed25519").
    if name is None:
        name = key_type.replace("ssh-", "").replace("ecdsa-sha2-", "")

    api_url, headers = _get_client()
    request_body = {
        "public_key": public_key,
        "name": name,
    }

    try:
        with httpx.Client(timeout=30.0, headers=headers) as client:
            response = client.post(
                f"{api_url}/v1/user/ssh-keys",
                json=request_body,
            )
            response.raise_for_status()
            key_data = response.json()
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 401:
            raise RuntimeError("Not authenticated. Run: wafer login") from e
        if e.response.status_code == 400:
            # Prefer the structured "detail" field; fall back to the raw body
            # when the payload is not JSON (ValueError) or not an object
            # (AttributeError on ``.get``).
            try:
                detail = e.response.json().get("detail", e.response.text)
            except (ValueError, AttributeError):
                detail = e.response.text
            raise RuntimeError(f"Invalid key: {detail}") from e
        raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
    except httpx.RequestError as e:
        raise RuntimeError(f"Could not reach API: {e}") from e

    if json_output:
        return json.dumps(key_data, indent=2)

    fingerprint = _get_key_fingerprint(public_key)

    return (
        f"✓ SSH key registered: {name}\n"
        f" Type: {key_type}\n"
        f" Fingerprint: {fingerprint}\n"
        f" Source: {pubkey_path}\n"
        f"\n"
        f"Your key will be installed in all workspaces you attach to."
    )
231
+
232
+
233
def remove_ssh_key(key_id: str, json_output: bool = False) -> str:
    """Remove a registered SSH key.

    Args:
        key_id: UUID of the key to remove.
        json_output: Return JSON instead of formatted output.

    Returns:
        A JSON status document or a one-line confirmation.

    Raises:
        RuntimeError: If ``key_id`` is blank, the caller is not
            authenticated, the key does not exist, the API returns another
            error status, or the API cannot be reached.
    """
    from urllib.parse import quote

    # A blank ID would turn the request into DELETE on the collection URL
    # ("/v1/user/ssh-keys/") instead of a single key, so refuse it up front.
    if not key_id or not key_id.strip():
        raise RuntimeError("SSH key ID must not be empty")

    api_url, headers = _get_client()

    # Percent-encode the ID so characters such as "/", "?" or "#" cannot
    # change which endpoint is addressed. UUIDs are unaffected.
    endpoint = f"{api_url}/v1/user/ssh-keys/{quote(key_id, safe='')}"

    try:
        with httpx.Client(timeout=30.0, headers=headers) as client:
            response = client.delete(endpoint)
            response.raise_for_status()
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 401:
            raise RuntimeError("Not authenticated. Run: wafer login") from e
        if e.response.status_code == 404:
            raise RuntimeError(f"SSH key not found: {key_id}") from e
        raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
    except httpx.RequestError as e:
        raise RuntimeError(f"Could not reach API: {e}") from e

    if json_output:
        return json.dumps({"status": "deleted", "key_id": key_id}, indent=2)

    return f"✓ SSH key removed: {key_id}"
wafer/target_lock.py ADDED
@@ -0,0 +1,270 @@
1
+ """Target locking for concurrent access control.
2
+
3
+ Uses file locks (fcntl.flock) to ensure only one process uses a target at a time.
4
+ Locks are automatically released when the process exits or crashes.
5
+
6
+ Usage:
7
+ # Try to acquire a single target
8
+ with try_acquire_target("mi300x-1") as acquired:
9
+ if acquired:
10
+ # Got the lock, run eval
11
+ ...
12
+ else:
13
+ # Target busy
14
+ ...
15
+
16
+ # Acquire first available from a pool
17
+ with acquire_from_pool(["mi300x-1", "mi300x-2", "mi300x-3"]) as target:
18
+ if target:
19
+ # Got a target, run eval
20
+ ...
21
+ else:
22
+ # All targets busy
23
+ ...
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import fcntl
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ from collections.abc import Iterator
34
+ from contextlib import contextmanager
35
+ from datetime import UTC
36
+ from pathlib import Path
37
+
38
+
39
+ def _emit_gpu_event(event_type: str, **data: dict) -> None:
40
+ """Emit structured GPU event to stderr as JSON.
41
+
42
+ Events are written to stderr (not stdout) to avoid interfering with
43
+ command output parsing. Format: JSON with newline.
44
+
45
+ These events can be:
46
+ 1. Parsed from bash output in eval events.jsonl
47
+ 2. Piped to observability systems
48
+ 3. Aggregated for GPU utilization metrics
49
+ """
50
+ from datetime import datetime
51
+
52
+ event = {
53
+ "type": event_type,
54
+ "timestamp": datetime.now(UTC).isoformat(),
55
+ "pid": os.getpid(),
56
+ **data,
57
+ }
58
+ # Write to stderr so it doesn't interfere with stdout capture
59
+ print(f"[GPU_EVENT] {json.dumps(event)}", file=sys.stderr, flush=True)
60
+
61
+
62
# Directory under the user's home where target lock files are kept.
LOCKS_DIR = Path.home().joinpath(".wafer", "locks")
64
+
65
+
66
def _ensure_locks_dir() -> None:
    """Create the lock directory, with any missing parents, if it is absent."""
    os.makedirs(LOCKS_DIR, exist_ok=True)
69
+
70
+
71
def _lock_path(target_name: str) -> Path:
    """Return the ``<target_name>.lock`` path inside the lock directory."""
    filename = f"{target_name}.lock"
    return LOCKS_DIR.joinpath(filename)
74
+
75
+
76
@contextmanager
def try_acquire_target(target_name: str) -> Iterator[bool]:
    """Try to acquire an exclusive, non-blocking lock on a target.

    Args:
        target_name: Name of the target to lock.

    Yields:
        True if the lock was acquired, False if the target is busy.

    The lock is released and the lock file descriptor closed when the
    context exits, whether normally or through an exception. Exceptions
    raised inside the ``with`` body propagate unchanged.
    """
    _ensure_locks_dir()

    # Open or create the lock file
    fd = os.open(str(_lock_path(target_name)), os.O_CREAT | os.O_RDWR)

    try:
        # Keep the BlockingIOError handler around the flock call only. If it
        # also covered the ``yield``, a BlockingIOError raised by the caller's
        # ``with`` body would be mistaken for "target busy" and make this
        # generator yield a second time, which contextlib rejects with
        # RuntimeError.
        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            acquired = False
        else:
            acquired = True

        if not acquired:
            # Lock is held by another process
            yield False
            return

        # Write PID to lock file for debugging
        os.ftruncate(fd, 0)
        os.write(fd, f"{os.getpid()}\n".encode())
        acquire_time = time.time()
        _emit_gpu_event("gpu_acquire", target=target_name)
        try:
            yield True
        finally:
            try:
                hold_duration_ms = (time.time() - acquire_time) * 1000
                _emit_gpu_event(
                    "gpu_release",
                    target=target_name,
                    hold_duration_ms=round(hold_duration_ms, 1),
                )
            finally:
                # Unlock even if emitting the release event failed.
                fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)
119
+
120
+
121
@contextmanager
def acquire_from_pool(
    target_names: list[str],
    timeout: float | None = None,
    poll_interval: float = 1.0,
) -> Iterator[str | None]:
    """Acquire the first available target from a list.

    Tries each target in order and yields the first one whose lock can be
    taken. If all targets are busy and ``timeout`` is set, waits and retries.

    Args:
        target_names: Target names to try, in preference order. An empty
            list yields None immediately.
        timeout: Max seconds to wait for a target. None = no waiting (fail
            immediately). Use float('inf') to wait forever.
        poll_interval: Seconds between retries when waiting. The final wait
            is shortened so the timeout is not overshot.

    Yields:
        Name of the acquired target, or None if all are busy (and the
        timeout expired).

    The lock is released and its descriptor closed when the context exits.
    Exceptions raised inside the ``with`` body propagate unchanged.

    Example:
        # Wait up to 5 minutes for a target
        with acquire_from_pool(["gpu-1", "gpu-2", "gpu-3"], timeout=300) as target:
            if target:
                print(f"Got {target}")
                run_eval(target)
            else:
                print("All targets busy after timeout")
    """
    import math

    # Nothing can ever be acquired from an empty pool, so do not wait for it.
    if not target_names:
        yield None
        return

    _ensure_locks_dir()

    start_time = time.monotonic()

    while True:
        # Try each target in order
        for target_name in target_names:
            fd = os.open(str(_lock_path(target_name)), os.O_CREAT | os.O_RDWR)
            try:
                # Only the flock call may signal "busy". A BlockingIOError
                # raised by the caller's ``with`` body must propagate
                # instead of being treated as a busy target.
                try:
                    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except BlockingIOError:
                    continue  # busy - the outer ``finally`` closes fd

                # Got the lock - write PID and yield
                os.ftruncate(fd, 0)
                os.write(fd, f"{os.getpid()}\n".encode())
                acquire_time = time.time()
                _emit_gpu_event("gpu_acquire", target=target_name, pool=target_names)
                try:
                    yield target_name
                finally:
                    try:
                        hold_duration_ms = (time.time() - acquire_time) * 1000
                        _emit_gpu_event(
                            "gpu_release",
                            target=target_name,
                            pool=target_names,
                            hold_duration_ms=round(hold_duration_ms, 1),
                        )
                    finally:
                        # Unlock even if emitting the release event failed.
                        fcntl.flock(fd, fcntl.LOCK_UN)
                return  # Success - exit after context
            finally:
                # Single close point: runs for busy, success and error paths.
                os.close(fd)

        # All targets busy - check if we should wait
        if timeout is None:
            break

        elapsed = time.monotonic() - start_time
        if elapsed >= timeout:
            break

        remaining = timeout - elapsed
        if math.isinf(remaining):
            # int(inf) raises OverflowError, so omit the countdown when
            # waiting forever.
            print(" All targets busy, waiting...", file=sys.stderr)
        else:
            print(f" All targets busy, waiting... ({int(remaining)}s remaining)", file=sys.stderr)
        time.sleep(min(poll_interval, remaining))

    # All targets busy (timeout expired or no waiting)
    yield None
202
+
203
+
204
def is_target_locked(target_name: str) -> bool:
    """Check if a target is currently locked.

    Note: This is a point-in-time check - the lock status can change
    immediately after this returns. The probe briefly takes the lock itself
    when the target is free.

    Args:
        target_name: Name of the target to check.

    Returns:
        True if the target is locked, False if it is available or has no
        lock file.
    """
    _ensure_locks_dir()

    # Open directly instead of testing exists() first: the file could vanish
    # between the check and the open.
    try:
        fd = os.open(str(_lock_path(target_name)), os.O_RDONLY)
    except (FileNotFoundError, NotADirectoryError):
        return False

    try:
        # Try non-blocking lock
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        return True
    else:
        # Got it - so it wasn't locked; give it straight back.
        fcntl.flock(fd, fcntl.LOCK_UN)
        return False
    finally:
        os.close(fd)
233
+
234
+
235
def get_lock_holder(target_name: str) -> int | None:
    """Get PID of the process holding the lock on a target.

    Args:
        target_name: Name of the target.

    Returns:
        PID of the lock holder, or None if the target is not locked, has no
        lock file, or the file cannot be read or parsed.
    """
    try:
        # The PID written at acquire time is never cleared on release, so
        # the file content alone may name a process that no longer holds the
        # lock. Confirm the lock is held before trusting it.
        if not is_target_locked(target_name):
            return None
        return int(_lock_path(target_name).read_text().strip())
    except (ValueError, OSError):
        return None
254
+
255
+
256
def list_locked_targets() -> list[str]:
    """Return the sorted names of all targets that are locked right now.

    A target name is the stem of each ``*.lock`` file in the lock directory.
    Each one is probed with ``is_target_locked``.
    """
    _ensure_locks_dir()

    candidates = (entry.stem for entry in LOCKS_DIR.glob("*.lock"))
    return sorted(name for name in candidates if is_target_locked(name))