wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +18 -7
- wafer/api_client.py +4 -0
- wafer/auth.py +85 -0
- wafer/cli.py +2339 -404
- wafer/corpus.py +158 -32
- wafer/evaluate.py +1232 -201
- wafer/gpu_run.py +5 -1
- wafer/kernel_scope.py +554 -0
- wafer/nsys_analyze.py +903 -73
- wafer/nsys_profile.py +511 -0
- wafer/output.py +241 -0
- wafer/problems.py +357 -0
- wafer/skills/wafer-guide/SKILL.md +13 -0
- wafer/ssh_keys.py +261 -0
- wafer/target_lock.py +270 -0
- wafer/targets.py +490 -0
- wafer/targets_ops.py +718 -0
- wafer/wevin_cli.py +129 -18
- wafer/workspaces.py +282 -182
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/METADATA +1 -1
- wafer_cli-0.2.10.dist-info/RECORD +40 -0
- wafer_cli-0.2.8.dist-info/RECORD +0 -33
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/top_level.txt +0 -0
wafer/targets.py
CHANGED
|
@@ -257,6 +257,220 @@ def get_default_target() -> str | None:
|
|
|
257
257
|
return data.get("default_target")
|
|
258
258
|
|
|
259
259
|
|
|
260
|
+
# ── Pool Management ─────────────────────────────────────────────────────────
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_pool(name: str) -> list[str]:
    """Return the target names configured for the pool called ``name``.

    Pools are read from the ``pools`` table of the config file, e.g.:

        [pools.my-pool]
        targets = ["target-1", "target-2", "target-3"]

    Args:
        name: Pool name to look up.

    Returns:
        The pool's ``targets`` list as stored in the config.

    Raises:
        FileNotFoundError: If the config file is missing or has no such pool.
        ValueError: If the pool exists but lists no targets.
    """
    if not CONFIG_FILE.exists():
        raise FileNotFoundError(f"Pool not found: {name} (no config file)")

    with open(CONFIG_FILE, "rb") as config_fh:
        config = tomllib.load(config_fh)

    all_pools = config.get("pools", {})
    if name in all_pools:
        members = all_pools[name].get("targets", [])
        if members:
            return members
        raise ValueError(f"Pool '{name}' has no targets defined")

    # Unknown pool: show the user the snippet they would need to add.
    raise FileNotFoundError(
        f"Pool not found: {name}\n"
        f" Define pools in ~/.wafer/config.toml:\n"
        f" [pools.{name}]\n"
        f' targets = ["target-1", "target-2"]'
    )
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def get_target_type(name: str) -> str | None:
    """Read only the ``type`` field from a target's TOML file.

    Args:
        name: Target name.

    Returns:
        The value of the ``type`` key in the target file, or None when the
        target file does not exist (or the file has no ``type`` key).
    """
    target_file = _target_path(name)
    if target_file.exists():
        with open(target_file, "rb") as target_fh:
            return tomllib.load(target_fh).get("type")
    return None
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def filter_pool_by_auth(target_names: list[str]) -> tuple[list[str], list[str]]:
    """Split target names into those usable right now and those to skip.

    A target is skipped when its type cannot be determined (no target file),
    or when it is a ``runpod`` / ``digitalocean`` target and no API key is
    available for that provider. Every other target type is treated as usable
    without a runtime API key.

    Args:
        target_names: Target names to classify, in order.

    Returns:
        Tuple of (usable_targets, skipped_targets), each preserving input order.
    """
    from wafer_core.auth import get_api_key

    # Provider types whose targets need an API key at runtime.
    key_required = ("runpod", "digitalocean")

    usable: list[str] = []
    skipped: list[str] = []

    for candidate in target_names:
        kind = get_target_type(candidate)
        if kind is None:
            # No target file on disk for this name.
            bucket = skipped
        elif kind in key_required and not get_api_key(kind):
            bucket = skipped
        else:
            bucket = usable
        bucket.append(candidate)

    return usable, skipped
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def list_pools() -> list[str]:
    """Return the names of all pools in the config file, sorted.

    Returns:
        Sorted pool names; an empty list when the config file does not exist
        or defines no pools.
    """
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, "rb") as config_fh:
            pools = tomllib.load(config_fh).get("pools", {})
        return sorted(pools.keys())
    return []
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def save_pool(name: str, targets: list[str]) -> None:
    """Create or overwrite the pool ``name`` in the config file.

    Args:
        name: Pool name.
        targets: Target names for the pool; each must be a known target.

    Raises:
        FileNotFoundError: If one or more of ``targets`` is not a known target.
    """
    # Reject the whole request if any referenced target is unknown.
    known = list_targets()
    unknown = [candidate for candidate in targets if candidate not in known]
    if unknown:
        raise FileNotFoundError(f"Targets not found: {', '.join(unknown)}")

    _ensure_dirs()

    # Start from the current config so unrelated settings are carried over.
    config: dict = {}
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, "rb") as config_fh:
            config = tomllib.load(config_fh)

    config.setdefault("pools", {})[name] = {"targets": targets}

    # The pools table is nested, so the dedicated writer handles serialization.
    _write_config_with_pools(config)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _write_config_with_pools(data: dict) -> None:
|
|
410
|
+
"""Write config file with pools support.
|
|
411
|
+
|
|
412
|
+
Handles the nested [pools.name] TOML structure and preserves
|
|
413
|
+
existing nested sections like [default], [api], [environments.*].
|
|
414
|
+
"""
|
|
415
|
+
lines = []
|
|
416
|
+
|
|
417
|
+
# Collect nested sections to write after top-level keys
|
|
418
|
+
nested_sections: dict[str, dict] = {}
|
|
419
|
+
|
|
420
|
+
# Write top-level keys first (except pools and nested dicts)
|
|
421
|
+
for key, value in data.items():
|
|
422
|
+
if key == "pools":
|
|
423
|
+
continue
|
|
424
|
+
if value is None:
|
|
425
|
+
continue
|
|
426
|
+
if isinstance(value, dict):
|
|
427
|
+
# Save nested sections for later
|
|
428
|
+
nested_sections[key] = value
|
|
429
|
+
elif isinstance(value, str):
|
|
430
|
+
lines.append(f'{key} = "{value}"')
|
|
431
|
+
elif isinstance(value, bool):
|
|
432
|
+
lines.append(f"{key} = {str(value).lower()}")
|
|
433
|
+
elif isinstance(value, int | float):
|
|
434
|
+
lines.append(f"{key} = {value}")
|
|
435
|
+
elif isinstance(value, list):
|
|
436
|
+
if all(isinstance(v, int) for v in value):
|
|
437
|
+
lines.append(f"{key} = {value}")
|
|
438
|
+
else:
|
|
439
|
+
formatted = ", ".join(f'"{v}"' if isinstance(v, str) else str(v) for v in value)
|
|
440
|
+
lines.append(f"{key} = [{formatted}]")
|
|
441
|
+
|
|
442
|
+
# Write nested sections (e.g., [default], [api], [environments.foo])
|
|
443
|
+
for section_name, section_data in nested_sections.items():
|
|
444
|
+
lines.append("")
|
|
445
|
+
lines.append(f"[{section_name}]")
|
|
446
|
+
for key, value in section_data.items():
|
|
447
|
+
if value is None:
|
|
448
|
+
continue
|
|
449
|
+
if isinstance(value, str):
|
|
450
|
+
lines.append(f'{key} = "{value}"')
|
|
451
|
+
elif isinstance(value, bool):
|
|
452
|
+
lines.append(f"{key} = {str(value).lower()}")
|
|
453
|
+
elif isinstance(value, int | float):
|
|
454
|
+
lines.append(f"{key} = {value}")
|
|
455
|
+
elif isinstance(value, list):
|
|
456
|
+
if all(isinstance(v, int) for v in value):
|
|
457
|
+
lines.append(f"{key} = {value}")
|
|
458
|
+
else:
|
|
459
|
+
formatted = ", ".join(f'"{v}"' if isinstance(v, str) else str(v) for v in value)
|
|
460
|
+
lines.append(f"{key} = [{formatted}]")
|
|
461
|
+
|
|
462
|
+
# Write pools
|
|
463
|
+
pools = data.get("pools", {})
|
|
464
|
+
for pool_name, pool_config in pools.items():
|
|
465
|
+
lines.append("")
|
|
466
|
+
lines.append(f"[pools.{pool_name}]")
|
|
467
|
+
targets = pool_config.get("targets", [])
|
|
468
|
+
formatted = ", ".join(f'"{t}"' for t in targets)
|
|
469
|
+
lines.append(f"targets = [{formatted}]")
|
|
470
|
+
|
|
471
|
+
CONFIG_FILE.write_text("\n".join(lines) + "\n")
|
|
472
|
+
|
|
473
|
+
|
|
260
474
|
def set_default_target(name: str) -> None:
|
|
261
475
|
"""Set default target.
|
|
262
476
|
|
|
@@ -350,3 +564,279 @@ def get_target_info(target: TargetConfig) -> dict[str, str]:
|
|
|
350
564
|
info["Compute"] = target.compute_capability
|
|
351
565
|
|
|
352
566
|
return info
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
# Probe script to run on target - checks available backends
|
|
570
|
+
# The script is sent to the target as a single `python -c` argument and must
# print exactly one JSON document on stdout. It reports every key even when a
# package is missing, and it never lets a broken torch/triton install or a GPU
# runtime error abort the whole probe.
_PROBE_SCRIPT = """
import json
import shutil
import sys

def probe():
    result = {
        "python_version": sys.version.split()[0],
        "backends": {},
        "packages": {},
    }

    # Check Triton (a broken install can raise more than ImportError)
    try:
        import triton
        result["backends"]["triton"] = triton.__version__
    except Exception:
        result["backends"]["triton"] = None

    # Check torch
    try:
        import torch
        torch_version = torch.__version__
    except Exception:
        torch = None
        torch_version = None
    result["packages"]["torch"] = torch_version
    result["backends"]["torch"] = torch_version
    result["cuda_available"] = False

    # Query the GPU separately so a driver error does not hide torch itself
    if torch is not None:
        try:
            if torch.cuda.is_available():
                result["cuda_available"] = True
                result["gpu_name"] = torch.cuda.get_device_name(0)
                props = torch.cuda.get_device_properties(0)
                result["compute_capability"] = f"{props.major}.{props.minor}"
        except Exception as exc:
            result["gpu_error"] = str(exc)

    # Check hipcc (AMD)
    hipcc = shutil.which("hipcc")
    result["backends"]["hipcc"] = hipcc

    # Check nvcc (NVIDIA)
    nvcc = shutil.which("nvcc")
    result["backends"]["nvcc"] = nvcc

    # Check ROCm version
    try:
        with open("/opt/rocm/.info/version", "r") as f:
            result["rocm_version"] = f.read().strip()
    except Exception:
        result["rocm_version"] = None

    # Check CUDA version from nvcc
    if nvcc:
        import subprocess
        try:
            out = subprocess.check_output([nvcc, "--version"], text=True, timeout=15)
            for line in out.splitlines():
                if "release" in line.lower():
                    # Parse "Cuda compilation tools, release 12.1, V12.1.105"
                    parts = line.split("release")
                    if len(parts) > 1:
                        result["cuda_version"] = parts[1].split(",")[0].strip()
                        break
        except Exception:
            pass

    print(json.dumps(result))

if __name__ == "__main__":
    probe()
"""
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
class ProbeError(Exception):
    """Raised when probing a target fails; the message explains what to do next."""
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def _run_probe_ssh_cmd(
    ssh_target: str,
    port: object,
    key_path: object,
    cmd: str,
    *,
    timeout_hint: str,
) -> tuple[int, str, str]:
    """Run ``cmd`` on ``ssh_target`` through the ``ssh`` client.

    Returns:
        Tuple of (exit code, stdout, stderr).

    Raises:
        ProbeError: If the command does not finish within 60 seconds;
            ``timeout_hint`` is appended to the message as the hint.
    """
    import subprocess

    try:
        completed = subprocess.run(
            [
                "ssh",
                "-o",
                "StrictHostKeyChecking=no",
                "-o",
                "UserKnownHostsFile=/dev/null",
                "-o",
                "ConnectTimeout=30",
                "-i",
                str(key_path),
                "-p",
                str(port),
                ssh_target,
                cmd,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        raise ProbeError(
            f"SSH connection timed out\n"
            f" Host: {ssh_target}:{port}\n"
            f" Hint: {timeout_hint}"
        ) from None
    return completed.returncode, completed.stdout, completed.stderr


async def probe_target_capabilities(target: TargetConfig) -> dict[str, Any]:
    """Probe a target to discover available compilation backends.

    Runs ``_PROBE_SCRIPT`` on the target over SSH and returns the JSON object
    it prints (Triton/torch versions, hipcc/nvcc paths, ROCm/CUDA versions and
    GPU info).

    For RunPod targets the SSH endpoint comes from ``runpod_ssh_context`` and
    a conda Python is preferred over ``python3`` when one responds. For
    baremetal/VM targets the endpoint is parsed from ``target.ssh_target``
    (``user@host`` or ``user@host:port``) and connectivity is checked before
    the probe is run.

    Note: the SSH commands are run with blocking ``subprocess.run`` calls.

    Args:
        target: Target config

    Returns:
        Dict with capabilities info

    Raises:
        ProbeError: With actionable error message on failure
    """
    import json
    import shlex

    # shlex.quote produces the same single-quote escaping the remote shell
    # needs to receive the script as one argument.
    quoted_script = shlex.quote(_PROBE_SCRIPT)

    if isinstance(target, RunPodTarget):
        import trio_asyncio
        from wafer_core.targets.runpod import RunPodError, get_pod_state, runpod_ssh_context

        # Look the pod up before connecting so a connection failure can report
        # where the pod was expected to be.
        pod_state = get_pod_state(target.name)
        pod_hint = "The pod may be starting up. Try again in 30 seconds."

        try:
            # Need trio_asyncio.open_loop() for asyncssh bridge used by runpod_ssh_context
            async with trio_asyncio.open_loop():
                async with runpod_ssh_context(target) as ssh_info:
                    ssh_target = f"{ssh_info.user}@{ssh_info.host}"
                    port = ssh_info.port
                    key_path = target.ssh_key

                    # Prefer a conda interpreter when one answers; else python3.
                    python_exe = "python3"
                    for candidate in [
                        "/opt/conda/envs/py_3.10/bin/python3",
                        "/opt/conda/bin/python3",
                    ]:
                        code, out, _ = _run_probe_ssh_cmd(
                            ssh_target,
                            port,
                            key_path,
                            f"{candidate} --version 2>/dev/null && echo OK",
                            timeout_hint=pod_hint,
                        )
                        if code == 0 and "OK" in out:
                            python_exe = candidate
                            break

                    # Run probe script
                    code, out, err = _run_probe_ssh_cmd(
                        ssh_target,
                        port,
                        key_path,
                        f"{python_exe} -c {quoted_script}",
                        timeout_hint=pod_hint,
                    )
                    if code != 0:
                        raise ProbeError(
                            f"Probe script failed on target\n"
                            f" Exit code: {code}\n"
                            f" Error: {err.strip() if err else 'unknown'}"
                        )

                    try:
                        return json.loads(out)
                    except json.JSONDecodeError as e:
                        raise ProbeError(
                            f"Failed to parse probe output\n Error: {e}\n Output: {out[:200]}..."
                        ) from None

        except RunPodError as e:
            # RunPod API errors (provisioning, pod not found, etc.)
            raise ProbeError(f"RunPod error for target '{target.name}'\n {e}") from None
        except OSError as e:
            # SSH connection errors
            if pod_state:
                raise ProbeError(
                    f"SSH connection failed to target '{target.name}'\n"
                    f" Host: {pod_state.ssh_username}@{pod_state.public_ip}:{pod_state.ssh_port}\n"
                    f" Error: {e}\n"
                    f" Hint: Check if the pod is still running with 'wafer config targets pods'"
                ) from None
            raise ProbeError(
                f"SSH connection failed to target '{target.name}'\n"
                f" Error: {e}\n"
                f" Hint: No pod found. One will be provisioned on next probe attempt."
            ) from None

    elif isinstance(target, (BaremetalTarget, VMTarget)):
        # Parse ssh_target (user@host:port or user@host)
        raw_target = target.ssh_target
        user = raw_target.split("@")[0]
        host_port = raw_target.split("@")[-1]
        if ":" in host_port:
            host, port = host_port.rsplit(":", 1)
            ssh_target = f"{user}@{host}"
        else:
            host, port = host_port, "22"
            ssh_target = raw_target

        key_path = target.ssh_key
        host_hint = "Check if the host is reachable and SSH is running."

        try:
            # Test SSH connection first
            code, out, err = _run_probe_ssh_cmd(
                ssh_target, port, key_path, "echo OK", timeout_hint=host_hint
            )
            if code != 0:
                raise ProbeError(
                    f"SSH connection failed to target '{target.name}'\n"
                    f" Host: {user}@{host}:{port}\n"
                    f" Key: {key_path}\n"
                    f" Error: {err.strip() if err else 'connection refused or timeout'}\n"
                    f" Hint: Verify the host is reachable and the SSH key is authorized."
                )

            # Run probe script
            code, out, err = _run_probe_ssh_cmd(
                ssh_target,
                port,
                key_path,
                f"python3 -c {quoted_script}",
                timeout_hint=host_hint,
            )
        except OSError as e:
            # e.g. the local ssh client is not installed; keep the documented
            # contract of raising ProbeError instead of a raw OSError.
            raise ProbeError(
                f"SSH connection failed to target '{target.name}'\n"
                f" Host: {user}@{host}:{port}\n"
                f" Error: {e}\n"
                f" Hint: Ensure the ssh client is installed and on PATH."
            ) from None

        if code != 0:
            raise ProbeError(
                f"Probe script failed on target '{target.name}'\n"
                f" Exit code: {code}\n"
                f" Error: {err.strip() if err else 'unknown'}\n"
                f" Hint: Ensure python3 is installed on the target."
            )

        try:
            return json.loads(out)
        except json.JSONDecodeError as e:
            raise ProbeError(
                f"Failed to parse probe output from '{target.name}'\n"
                f" Error: {e}\n"
                f" Output: {out[:200]}..."
            ) from None

    else:
        raise ProbeError(
            f"Probing not supported for target type: {type(target).__name__}\n"
            f" Supported types: RunPod, Baremetal, VM"
        )
|