wafer-cli 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/auth.py +85 -0
- wafer/cli.py +1196 -160
- wafer/evaluate.py +1171 -209
- wafer/gpu_run.py +5 -1
- wafer/kernel_scope.py +453 -0
- wafer/problems.py +357 -0
- wafer/target_lock.py +270 -0
- wafer/targets.py +490 -0
- wafer/wevin_cli.py +2 -0
- wafer/workspaces.py +53 -1
- {wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/RECORD +15 -12
- {wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/top_level.txt +0 -0
wafer/targets.py
CHANGED
|
@@ -257,6 +257,220 @@ def get_default_target() -> str | None:
|
|
|
257
257
|
return data.get("default_target")
|
|
258
258
|
|
|
259
259
|
|
|
260
|
+
# ── Pool Management ─────────────────────────────────────────────────────────
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_pool(name: str) -> list[str]:
    """Return the target names stored for the pool called ``name``.

    Pools are tables in ~/.wafer/config.toml, for example:

        [pools.my-pool]
        targets = ["target-1", "target-2", "target-3"]

    Args:
        name: Pool name to look up.

    Returns:
        The value of the pool's ``targets`` key.

    Raises:
        FileNotFoundError: If the config file is missing or defines no such pool.
        ValueError: If the pool exists but its ``targets`` entry is empty or absent.
    """
    if not CONFIG_FILE.exists():
        raise FileNotFoundError(f"Pool not found: {name} (no config file)")

    with open(CONFIG_FILE, "rb") as config_fh:
        config = tomllib.load(config_fh)

    defined_pools = config.get("pools", {})
    if name not in defined_pools:
        # Include a ready-to-paste snippet so the user can define the pool.
        raise FileNotFoundError(
            f"Pool not found: {name}\n"
            f" Define pools in ~/.wafer/config.toml:\n"
            f" [pools.{name}]\n"
            f' targets = ["target-1", "target-2"]'
        )

    members = defined_pools[name].get("targets", [])
    if members:
        return members

    raise ValueError(f"Pool '{name}' has no targets defined")
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def get_target_type(name: str) -> str | None:
    """Read only the ``type`` field of a target's TOML file.

    Args:
        name: Target name; its file location comes from ``_target_path``.

    Returns:
        The value of the ``type`` key (runpod, digitalocean, baremetal, etc.),
        or None when the target file does not exist or has no ``type`` key.
    """
    target_file = _target_path(name)
    if target_file.exists():
        with open(target_file, "rb") as target_fh:
            return tomllib.load(target_fh).get("type")
    return None
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def filter_pool_by_auth(target_names: list[str]) -> tuple[list[str], list[str]]:
    """Split pool targets into those that can be used now and those that cannot.

    A target is skipped when its file is missing (``get_target_type`` returns
    None) or when its type is ``runpod`` / ``digitalocean`` and no API key is
    available for that provider.

    Args:
        target_names: Target names to check, in order.

    Returns:
        Tuple of (usable_targets, skipped_targets), each preserving input order.
    """
    from wafer_core.auth import get_api_key

    # Only these target types are checked for an API key; every other type
    # (baremetal, vm, workspace, modal) is accepted without one.
    key_backed_types = ("runpod", "digitalocean")

    ready: list[str] = []
    rejected: list[str] = []

    for candidate in target_names:
        kind = get_target_type(candidate)
        unusable = kind is None or (kind in key_backed_types and not get_api_key(kind))
        (rejected if unusable else ready).append(candidate)

    return ready, rejected
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def list_pools() -> list[str]:
    """Return the names of all pools defined in the config file.

    Returns:
        Pool names in sorted order; an empty list when the config file does
        not exist or has no ``pools`` table.
    """
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, "rb") as config_fh:
            pools_table = tomllib.load(config_fh).get("pools", {})
        return sorted(pools_table.keys())
    return []
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def save_pool(name: str, targets: list[str]) -> None:
    """Create the pool ``name`` or replace its target list, then rewrite the config.

    Args:
        name: Pool name.
        targets: Target names for the pool; each must appear in ``list_targets()``.

    Raises:
        FileNotFoundError: If any requested target is not a known target.
    """
    # Refuse to store a pool that references targets we cannot find.
    known_targets = list_targets()
    unknown = [candidate for candidate in targets if candidate not in known_targets]
    if unknown:
        raise FileNotFoundError(f"Targets not found: {', '.join(unknown)}")

    _ensure_dirs()

    # Start from whatever is already on disk so other settings are carried over.
    config: dict = {}
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, "rb") as config_fh:
            config = tomllib.load(config_fh)

    config.setdefault("pools", {})[name] = {"targets": targets}

    # The pools table is nested, so it goes through the dedicated writer.
    _write_config_with_pools(config)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
# Characters that must be written as escape sequences inside a TOML basic string.
_TOML_ESCAPES = {
    "\\": "\\\\",
    '"': '\\"',
    "\b": "\\b",
    "\t": "\\t",
    "\n": "\\n",
    "\f": "\\f",
    "\r": "\\r",
}


def _toml_basic_string(text: str) -> str:
    """Quote ``text`` as a TOML basic string, escaping quotes, backslashes and control characters."""
    pieces = []
    for ch in text:
        if ch in _TOML_ESCAPES:
            pieces.append(_TOML_ESCAPES[ch])
        elif ord(ch) < 0x20 or ord(ch) == 0x7F:
            pieces.append(f"\\u{ord(ch):04X}")
        else:
            pieces.append(ch)
    return '"' + "".join(pieces) + '"'


def _toml_key(key: str) -> str:
    """Return ``key`` unquoted when it is a valid bare key, otherwise as a quoted key."""
    is_bare = bool(key) and all(ch.isascii() and (ch.isalnum() or ch in "-_") for ch in key)
    return key if is_bare else _toml_basic_string(key)


def _toml_value(value: object) -> str:
    """Render a scalar, array or inline table as TOML text.

    Raises:
        TypeError: If ``value`` has no TOML representation.
    """
    if isinstance(value, str):
        return _toml_basic_string(value)
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, int):
        return str(value)
    if isinstance(value, float):
        return repr(value)
    if isinstance(value, (list, tuple)):
        return "[" + ", ".join(_toml_value(item) for item in value) + "]"
    if isinstance(value, dict):
        # Dicts found inside arrays are written as inline tables.
        pairs = [f"{_toml_key(k)} = {_toml_value(v)}" for k, v in value.items() if v is not None]
        return "{ " + ", ".join(pairs) + " }" if pairs else "{}"
    if hasattr(value, "isoformat"):
        # date / time / datetime objects, which tomllib produces for TOML date-times.
        return value.isoformat()
    raise TypeError(f"Cannot serialize value of type {type(value).__name__} to TOML")


def _emit_toml_table(path: list[str], table: dict, lines: list[str]) -> None:
    """Append the table at ``path`` and, recursively, all of its sub-tables to ``lines``."""
    plain = {k: v for k, v in table.items() if v is not None and not isinstance(v, dict)}
    nested = {k: v for k, v in table.items() if isinstance(v, dict)}

    # A table holding only sub-tables needs no header of its own; an empty
    # table still gets one so that it survives the round trip.
    if plain or not nested:
        lines.append("")
        lines.append("[" + ".".join(_toml_key(part) for part in path) + "]")
        lines.extend(f"{_toml_key(k)} = {_toml_value(v)}" for k, v in plain.items())

    for child_name, child in nested.items():
        _emit_toml_table([*path, child_name], child, lines)


def _write_config_with_pools(data: dict) -> None:
    """Serialize ``data`` to CONFIG_FILE as TOML, including ``[pools.<name>]`` tables.

    Output order is: top-level keys, then every other table (with its nested
    sub-tables such as ``[environments.foo]``), then the pools. Keys whose
    value is None are omitted. Strings and non-bare keys are quoted and
    escaped, and booleans inside arrays are written as ``true``/``false``.

    Args:
        data: Parsed configuration; ``data["pools"]`` maps pool names to
            dicts that carry a ``targets`` list.

    Raises:
        TypeError: If a value cannot be represented in TOML.
    """
    lines: list[str] = []
    sections: dict[str, dict] = {}

    # Top-level key/value pairs must precede the first table header.
    for key, value in data.items():
        if key == "pools" or value is None:
            continue
        if isinstance(value, dict):
            sections[key] = value
        else:
            lines.append(f"{_toml_key(key)} = {_toml_value(value)}")

    for section_name, section_data in sections.items():
        _emit_toml_table([section_name], section_data, lines)

    for pool_name, pool_config in data.get("pools", {}).items():
        # Always write ``targets`` first; keep any other keys the pool carries.
        pool_table = {"targets": pool_config.get("targets", [])}
        pool_table.update((k, v) for k, v in pool_config.items() if k != "targets")
        _emit_toml_table(["pools", pool_name], pool_table, lines)

    # TOML documents are UTF-8; do not depend on the platform default encoding.
    CONFIG_FILE.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
472
|
+
|
|
473
|
+
|
|
260
474
|
def set_default_target(name: str) -> None:
|
|
261
475
|
"""Set default target.
|
|
262
476
|
|
|
@@ -350,3 +564,279 @@ def get_target_info(target: TargetConfig) -> dict[str, str]:
|
|
|
350
564
|
info["Compute"] = target.compute_capability
|
|
351
565
|
|
|
352
566
|
return info
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
# Probe script run on the target with ``python -c``; it prints one JSON object
# describing the available backends. Every check is best-effort: a broken
# package install must yield a null entry, not a failed probe.
_PROBE_SCRIPT = """
import json
import shutil
import sys

def probe():
    result = {
        "python_version": sys.version.split()[0],
        "backends": {},
        "packages": {},
    }

    # Check Triton (importing a broken install can raise more than ImportError)
    try:
        import triton
        result["backends"]["triton"] = triton.__version__
    except Exception:
        result["backends"]["triton"] = None

    # Check torch
    try:
        import torch
        result["packages"]["torch"] = torch.__version__
        result["backends"]["torch"] = torch.__version__
        result["cuda_available"] = torch.cuda.is_available()
        if result["cuda_available"]:
            result["gpu_name"] = torch.cuda.get_device_name(0)
            props = torch.cuda.get_device_properties(0)
            result["compute_capability"] = f"{props.major}.{props.minor}"
    except Exception:
        # Keep whatever was recorded before the failure; fill in the rest.
        result["packages"].setdefault("torch", None)
        result["backends"].setdefault("torch", None)

    # Check hipcc (AMD)
    hipcc = shutil.which("hipcc")
    result["backends"]["hipcc"] = hipcc

    # Check nvcc (NVIDIA)
    nvcc = shutil.which("nvcc")
    result["backends"]["nvcc"] = nvcc

    # Check ROCm version
    try:
        with open("/opt/rocm/.info/version", "r") as f:
            result["rocm_version"] = f.read().strip()
    except Exception:
        result["rocm_version"] = None

    # Check CUDA version from nvcc
    if nvcc:
        import subprocess
        try:
            out = subprocess.check_output([nvcc, "--version"], text=True, timeout=10)
            for line in out.splitlines():
                if "release" in line.lower():
                    # Parse "Cuda compilation tools, release 12.1, V12.1.105"
                    parts = line.split("release")
                    if len(parts) > 1:
                        result["cuda_version"] = parts[1].split(",")[0].strip()
                        break
        except Exception:
            pass

    print(json.dumps(result))

if __name__ == "__main__":
    probe()
"""
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
class ProbeError(Exception):
    """Raised when probing a target fails; the message carries context and a hint."""
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def _probe_ssh_run(
    ssh_target: str, port: object, key_path: object, cmd: str, timeout_hint: str
) -> tuple[int, str, str]:
    """Run ``cmd`` on ``ssh_target`` with the system ssh client; return (exit code, stdout, stderr)."""
    import subprocess

    try:
        completed = subprocess.run(
            [
                "ssh",
                "-o",
                "StrictHostKeyChecking=no",
                "-o",
                "UserKnownHostsFile=/dev/null",
                "-o",
                "ConnectTimeout=30",
                "-i",
                str(key_path),
                "-p",
                str(port),
                ssh_target,
                cmd,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        raise ProbeError(
            f"SSH connection timed out\n"
            f" Host: {ssh_target}:{port}\n"
            f" Hint: {timeout_hint}"
        ) from None
    return completed.returncode, completed.stdout, completed.stderr


def _decode_probe_output(out: str) -> dict:
    """Decode the probe's JSON report, tolerating stray stdout lines printed before it."""
    import json

    try:
        return json.loads(out)
    except json.JSONDecodeError:
        # The probe prints its report last, so anything a shell startup file
        # echoed first can be ignored by decoding only the final non-blank line.
        non_blank = [line for line in out.splitlines() if line.strip()]
        if not non_blank:
            raise
        return json.loads(non_blank[-1])


async def probe_target_capabilities(target: TargetConfig) -> dict[str, Any]:
    """Probe a target over SSH to discover its compilation backends.

    Runs ``_PROBE_SCRIPT`` with the target's Python interpreter and returns
    the JSON report it prints, covering:
    - Triton availability
    - torch availability
    - HIP/CUDA compiler
    - ROCm/CUDA version
    - GPU info

    Args:
        target: Target config. RunPod, Baremetal and VM targets are supported.

    Returns:
        Dict with capabilities info, as decoded from the probe's output.

    Raises:
        ProbeError: With an actionable message when the connection, the probe
            script, or decoding of its output fails, or when the target type
            is not supported.
    """
    import json
    import shlex

    # The script travels as one shell argument to ``python -c``.
    script_arg = shlex.quote(_PROBE_SCRIPT)

    # NOTE(review): the ssh calls below use subprocess.run, which blocks the
    # calling thread until ssh exits - confirm this is acceptable in this coroutine.

    if isinstance(target, RunPodTarget):
        import trio_asyncio
        from wafer_core.targets.runpod import RunPodError, get_pod_state, runpod_ssh_context

        # Looked up before connecting; only used to enrich SSH error messages.
        pod_state = get_pod_state(target.name)

        try:
            # Need trio_asyncio.open_loop() for asyncssh bridge used by runpod_ssh_context
            async with trio_asyncio.open_loop():
                async with runpod_ssh_context(target) as ssh_info:
                    destination = f"{ssh_info.user}@{ssh_info.host}"

                    def run_ssh_cmd(cmd: str) -> tuple[int, str, str]:
                        return _probe_ssh_run(
                            destination,
                            ssh_info.port,
                            target.ssh_key,
                            cmd,
                            "The pod may be starting up. Try again in 30 seconds.",
                        )

                    # Prefer a conda interpreter when one exists; otherwise
                    # fall back to whatever ``python3`` is on PATH.
                    python_exe = "python3"
                    for candidate in [
                        "/opt/conda/envs/py_3.10/bin/python3",
                        "/opt/conda/bin/python3",
                    ]:
                        code, out, _ = run_ssh_cmd(f"{candidate} --version 2>/dev/null && echo OK")
                        if code == 0 and "OK" in out:
                            python_exe = candidate
                            break

                    # Run probe script
                    code, out, err = run_ssh_cmd(f"{python_exe} -c {script_arg}")
                    if code != 0:
                        raise ProbeError(
                            f"Probe script failed on target\n"
                            f" Exit code: {code}\n"
                            f" Error: {err.strip() if err else 'unknown'}"
                        )

                    try:
                        return _decode_probe_output(out)
                    except json.JSONDecodeError as e:
                        raise ProbeError(
                            f"Failed to parse probe output\n Error: {e}\n Output: {out[:200]}..."
                        ) from None

        except RunPodError as e:
            # RunPod API errors (provisioning, pod not found, etc.)
            raise ProbeError(f"RunPod error for target '{target.name}'\n {e}") from None
        except OSError as e:
            # SSH connection errors
            if pod_state:
                raise ProbeError(
                    f"SSH connection failed to target '{target.name}'\n"
                    f" Host: {pod_state.ssh_username}@{pod_state.public_ip}:{pod_state.ssh_port}\n"
                    f" Error: {e}\n"
                    f" Hint: Check if the pod is still running with 'wafer config targets pods'"
                ) from None
            raise ProbeError(
                f"SSH connection failed to target '{target.name}'\n"
                f" Error: {e}\n"
                f" Hint: No pod found. One will be provisioned on next probe attempt."
            ) from None

    elif isinstance(target, (BaremetalTarget, VMTarget)):
        # Parse ssh_target (user@host:port or user@host). A value without
        # "user@" is passed to ssh as a bare host rather than being mistaken
        # for a user name.
        user, _, host_port = target.ssh_target.rpartition("@")
        host, colon, port = host_port.rpartition(":")
        if not colon:
            host, port = host_port, "22"
        destination = f"{user}@{host}" if user else host

        key_path = target.ssh_key

        def run_ssh_cmd(cmd: str) -> tuple[int, str, str]:
            return _probe_ssh_run(
                destination,
                port,
                key_path,
                cmd,
                "Check if the host is reachable and SSH is running.",
            )

        # Test SSH connection first so connection problems get their own message
        code, out, err = run_ssh_cmd("echo OK")
        if code != 0:
            raise ProbeError(
                f"SSH connection failed to target '{target.name}'\n"
                f" Host: {destination}:{port}\n"
                f" Key: {key_path}\n"
                f" Error: {err.strip() if err else 'connection refused or timeout'}\n"
                f" Hint: Verify the host is reachable and the SSH key is authorized."
            )

        # Run probe script
        code, out, err = run_ssh_cmd(f"python3 -c {script_arg}")
        if code != 0:
            raise ProbeError(
                f"Probe script failed on target '{target.name}'\n"
                f" Exit code: {code}\n"
                f" Error: {err.strip() if err else 'unknown'}\n"
                f" Hint: Ensure python3 is installed on the target."
            )

        try:
            return _decode_probe_output(out)
        except json.JSONDecodeError as e:
            raise ProbeError(
                f"Failed to parse probe output from '{target.name}'\n"
                f" Error: {e}\n"
                f" Output: {out[:200]}..."
            ) from None

    else:
        raise ProbeError(
            f"Probing not supported for target type: {type(target).__name__}\n"
            f" Supported types: RunPod, Baremetal, VM"
        )
|
wafer/wevin_cli.py
CHANGED
|
@@ -253,6 +253,7 @@ def _build_environment(
|
|
|
253
253
|
) -> Environment:
|
|
254
254
|
"""Build a CodingEnvironment from template config."""
|
|
255
255
|
from wafer_core.environments.coding import CodingEnvironment
|
|
256
|
+
from wafer_core.rollouts.templates import DANGEROUS_BASH_COMMANDS
|
|
256
257
|
|
|
257
258
|
working_dir = Path(corpus_path) if corpus_path else Path.cwd()
|
|
258
259
|
resolved_tools = tools_override or tpl.tools
|
|
@@ -260,6 +261,7 @@ def _build_environment(
|
|
|
260
261
|
working_dir=working_dir,
|
|
261
262
|
enabled_tools=resolved_tools,
|
|
262
263
|
bash_allowlist=tpl.bash_allowlist,
|
|
264
|
+
bash_denylist=DANGEROUS_BASH_COMMANDS,
|
|
263
265
|
) # type: ignore[assignment]
|
|
264
266
|
return env
|
|
265
267
|
|
wafer/workspaces.py
CHANGED
|
@@ -396,6 +396,9 @@ Host wafer-{workspace_id}
|
|
|
396
396
|
"Or add to ~/.ssh/config:",
|
|
397
397
|
config_entry,
|
|
398
398
|
f"Then connect with: ssh wafer-{workspace_id}",
|
|
399
|
+
"",
|
|
400
|
+
"To run GPU commands without interactive SSH:",
|
|
401
|
+
f' wafer workspaces exec {workspace_id} "<command>"',
|
|
399
402
|
]
|
|
400
403
|
|
|
401
404
|
return "\n".join(lines)
|
|
@@ -652,6 +655,33 @@ def get_workspace(workspace_id: str, json_output: bool = False) -> str:
|
|
|
652
655
|
return "\n".join(lines)
|
|
653
656
|
|
|
654
657
|
|
|
658
|
+
def _handle_sync_event(sync_type: str) -> None:
|
|
659
|
+
"""Handle sync events and print status to stderr.
|
|
660
|
+
|
|
661
|
+
Sync events:
|
|
662
|
+
- FORWARD:START - Starting workspace → GPU sync
|
|
663
|
+
- FORWARD:DONE:N - Synced N files to GPU
|
|
664
|
+
- FORWARD:WARN:msg - Warning during forward sync
|
|
665
|
+
- REVERSE:START - Starting GPU → workspace sync
|
|
666
|
+
- REVERSE:DONE:N - Synced N artifacts back
|
|
667
|
+
"""
|
|
668
|
+
import sys
|
|
669
|
+
|
|
670
|
+
if sync_type == "FORWARD:START":
|
|
671
|
+
print("[sync] Syncing workspace → GPU...", end="", file=sys.stderr, flush=True)
|
|
672
|
+
elif sync_type.startswith("FORWARD:DONE:"):
|
|
673
|
+
count = sync_type.split(":")[-1]
|
|
674
|
+
print(f" done ({count} files)", file=sys.stderr)
|
|
675
|
+
elif sync_type.startswith("FORWARD:WARN:"):
|
|
676
|
+
msg = sync_type[13:] # Remove "FORWARD:WARN:"
|
|
677
|
+
print(f" warning: {msg}", file=sys.stderr)
|
|
678
|
+
elif sync_type == "REVERSE:START":
|
|
679
|
+
print("[sync] Syncing artifacts back...", end="", file=sys.stderr, flush=True)
|
|
680
|
+
elif sync_type.startswith("REVERSE:DONE:"):
|
|
681
|
+
count = sync_type.split(":")[-1]
|
|
682
|
+
print(f" done ({count} files)", file=sys.stderr)
|
|
683
|
+
|
|
684
|
+
|
|
655
685
|
@dataclass(frozen=True)
|
|
656
686
|
class SSEEvent:
|
|
657
687
|
"""Parsed SSE event result."""
|
|
@@ -659,6 +689,7 @@ class SSEEvent:
|
|
|
659
689
|
output: str | None # Content to print (None = no output)
|
|
660
690
|
exit_code: int | None # Exit code if stream should end (None = continue)
|
|
661
691
|
is_error: bool # Whether output goes to stderr
|
|
692
|
+
sync_event: str | None = None # Sync event type (e.g., "FORWARD:START")
|
|
662
693
|
|
|
663
694
|
|
|
664
695
|
def _parse_sse_content(content: str) -> SSEEvent:
|
|
@@ -680,6 +711,16 @@ def _parse_sse_content(content: str) -> SSEEvent:
|
|
|
680
711
|
if content.startswith("[ERROR]"):
|
|
681
712
|
return SSEEvent(output=content[8:], exit_code=1, is_error=True)
|
|
682
713
|
|
|
714
|
+
# Sync events: [SYNC:FORWARD:START], [SYNC:FORWARD:DONE:5], etc.
|
|
715
|
+
if content.startswith("[SYNC:"):
|
|
716
|
+
# Extract sync type (e.g., "FORWARD:START" or "REVERSE:DONE:5")
|
|
717
|
+
sync_type = content[6:-1] # Remove [SYNC: and ]
|
|
718
|
+
return SSEEvent(output=None, exit_code=None, is_error=False, sync_event=sync_type)
|
|
719
|
+
|
|
720
|
+
# Status events we can ignore (already handled elsewhere)
|
|
721
|
+
if content.startswith("[STATUS:") or content.startswith("[CONTEXT:"):
|
|
722
|
+
return SSEEvent(output=None, exit_code=None, is_error=False)
|
|
723
|
+
|
|
683
724
|
# Regular output
|
|
684
725
|
return SSEEvent(output=content, exit_code=None, is_error=False)
|
|
685
726
|
|
|
@@ -688,13 +729,15 @@ def exec_command(
|
|
|
688
729
|
workspace_id: str,
|
|
689
730
|
command: str,
|
|
690
731
|
timeout_seconds: int | None = None,
|
|
732
|
+
routing: str | None = None,
|
|
691
733
|
) -> int:
|
|
692
|
-
"""Execute a command in workspace
|
|
734
|
+
"""Execute a command in workspace, streaming output.
|
|
693
735
|
|
|
694
736
|
Args:
|
|
695
737
|
workspace_id: Workspace ID or name
|
|
696
738
|
command: Command to execute
|
|
697
739
|
timeout_seconds: Execution timeout (default: 300, from config)
|
|
740
|
+
routing: Routing hint - "auto", "gpu", "cpu", or "baremetal" (default: auto)
|
|
698
741
|
|
|
699
742
|
Returns:
|
|
700
743
|
Exit code (0 = success, non-zero = failure)
|
|
@@ -714,6 +757,10 @@ def exec_command(
|
|
|
714
757
|
if timeout_seconds:
|
|
715
758
|
request_body["timeout_seconds"] = timeout_seconds
|
|
716
759
|
|
|
760
|
+
# Add routing hint if specified
|
|
761
|
+
if routing:
|
|
762
|
+
request_body["requirements"] = {"routing": routing}
|
|
763
|
+
|
|
717
764
|
try:
|
|
718
765
|
# Use streaming request for SSE output
|
|
719
766
|
with httpx.Client(timeout=None, headers=headers) as client:
|
|
@@ -736,6 +783,11 @@ def exec_command(
|
|
|
736
783
|
|
|
737
784
|
event = _parse_sse_content(line[6:])
|
|
738
785
|
|
|
786
|
+
# Handle sync events - display status to stderr
|
|
787
|
+
if event.sync_event:
|
|
788
|
+
_handle_sync_event(event.sync_event)
|
|
789
|
+
continue
|
|
790
|
+
|
|
739
791
|
if event.output is not None:
|
|
740
792
|
print(event.output, file=sys.stderr if event.is_error else sys.stdout)
|
|
741
793
|
|
|
@@ -2,32 +2,35 @@ wafer/GUIDE.md,sha256=Z_jsSgHAS6bFa83VKhG9jxjUK1XpLjR1fEIKapDa_6g,3195
|
|
|
2
2
|
wafer/__init__.py,sha256=kBM_ONCpU6UUMBOH8Tmg4A88sNFnbaD59o61cJs-uYM,90
|
|
3
3
|
wafer/analytics.py,sha256=Xxw3bbY3XLgedSJPwzIOBJIjyycIiornWCpjoWbTKYU,8190
|
|
4
4
|
wafer/api_client.py,sha256=cPULiTxqOAYYSfDTNJgd-6Pqrt3IM4Gm9903U7yGIwY,6163
|
|
5
|
-
wafer/auth.py,sha256=
|
|
5
|
+
wafer/auth.py,sha256=acBVOz-3la6avztDGjtLRopdjNRIqbrV4tRMM1FAmHI,13682
|
|
6
6
|
wafer/autotuner.py,sha256=6gH0Ho7T58EFerMQcHQxshWe3DF4qU7fb5xthAh5SPM,44364
|
|
7
7
|
wafer/billing.py,sha256=jbLB2lI4_9f2KD8uEFDi_ixLlowe5hasC0TIZJyIXRg,7163
|
|
8
|
-
wafer/cli.py,sha256=
|
|
8
|
+
wafer/cli.py,sha256=To50huPpXV0wv5oH2mx0LJ5vTWt2WRyjYsyrhvf-_2c,220933
|
|
9
9
|
wafer/config.py,sha256=h5Eo9_yfWqWGoPNdVQikI9GoZVUeysunSYiixf1mKcw,3411
|
|
10
10
|
wafer/corpus.py,sha256=yTF3UA5bOa8BII2fmcXf-3WsIsM5DX4etysv0AzVknE,8912
|
|
11
|
-
wafer/evaluate.py,sha256=
|
|
11
|
+
wafer/evaluate.py,sha256=ss4847PC2wrua9wtYECkNzBv5Oww_79o9CIBwvVpI94,169607
|
|
12
12
|
wafer/global_config.py,sha256=fhaR_RU3ufMksDmOohH1OLeQ0JT0SDW1hEip_zaP75k,11345
|
|
13
|
-
wafer/gpu_run.py,sha256=
|
|
13
|
+
wafer/gpu_run.py,sha256=TwqXy72T7f2I7e6n5WWod3xgxCPnDhU0BgLsB4CUoQY,9716
|
|
14
14
|
wafer/inference.py,sha256=tZCO5i05FKY27ewis3CSBHFBeFbXY3xwj0DSjdoMY9s,4314
|
|
15
|
+
wafer/kernel_scope.py,sha256=ynfGdIOd2U-jFpyJGPq0pwW7NkGAbz7TWZaLs_TxQy8,16127
|
|
15
16
|
wafer/ncu_analyze.py,sha256=rAWzKQRZEY6E_CL3gAWUaW3uZ4kvQVZskVCPDpsFJuE,24633
|
|
16
17
|
wafer/nsys_analyze.py,sha256=dRsYNYp1IqzGSPrQuEMW5vRbIxr-VrQwQbotLSrPvlY,6795
|
|
18
|
+
wafer/problems.py,sha256=ce2sy10A1nnNUG3VGsseTS8jL7LZsku4dE8zVf9JHQ4,11296
|
|
17
19
|
wafer/rocprof_compute.py,sha256=Tu16Vb05b2grvheFWi1XLGlAr6m48NEDeZoDyw_4Uzw,19885
|
|
18
20
|
wafer/rocprof_sdk.py,sha256=fAYCxpfJa5BZTTkIMBOXg4KsYK4i_wNOKrJJn1ZfypM,10086
|
|
19
21
|
wafer/rocprof_systems.py,sha256=4IWbMcbYk1x_8iS7P3FC_u5sgH6EXADCtR2lV9id80M,18629
|
|
20
|
-
wafer/
|
|
22
|
+
wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
|
|
23
|
+
wafer/targets.py,sha256=9r-iRWoKSH5cQl1LcamaX-T7cNVOg99ngIm_hlRk-qU,26922
|
|
21
24
|
wafer/tracelens.py,sha256=g9ZIeFyNojZn4uTd3skPqIrRiL7aMJOz_-GOd3aiyy4,7998
|
|
22
|
-
wafer/wevin_cli.py,sha256=
|
|
23
|
-
wafer/workspaces.py,sha256=
|
|
25
|
+
wafer/wevin_cli.py,sha256=1_o2P47namZmPkbt47TnyYDmwhEzQYbSg5zjHffu2JQ,16802
|
|
26
|
+
wafer/workspaces.py,sha256=aClxuwi-EgSuXchDR1F2blMiQTb5RV1K2CMpFESE_9Y,28013
|
|
24
27
|
wafer/skills/wafer-guide/SKILL.md,sha256=UfBeIe5GKFzOYcbPmcs8U2nrjbfr-jSMRwg0jQDBfb0,3058
|
|
25
28
|
wafer/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
29
|
wafer/templates/ask_docs.py,sha256=Lxs-faz9v5m4Qa4NjF2X_lE8KwM9ES9MNJkxo7ep56o,2256
|
|
27
30
|
wafer/templates/optimize_kernel.py,sha256=u6AL7Q3uttqlnBLzcoFdsiPq5lV2TV3bgqwCYYlK9gk,2357
|
|
28
31
|
wafer/templates/trace_analyze.py,sha256=XE1VqzVkIUsZbXF8EzQdDYgg-AZEYAOFpr6B_vnRELc,2880
|
|
29
|
-
wafer_cli-0.2.
|
|
30
|
-
wafer_cli-0.2.
|
|
31
|
-
wafer_cli-0.2.
|
|
32
|
-
wafer_cli-0.2.
|
|
33
|
-
wafer_cli-0.2.
|
|
32
|
+
wafer_cli-0.2.9.dist-info/METADATA,sha256=gMP7nuGTR3QTp9lXATbu-tFbQHA3bg1Q0v57lV2vuKQ,559
|
|
33
|
+
wafer_cli-0.2.9.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
34
|
+
wafer_cli-0.2.9.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
|
|
35
|
+
wafer_cli-0.2.9.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
|
|
36
|
+
wafer_cli-0.2.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|