wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/targets.py CHANGED
@@ -257,6 +257,220 @@ def get_default_target() -> str | None:
257
257
  return data.get("default_target")
258
258
 
259
259
 
260
+ # ── Pool Management ─────────────────────────────────────────────────────────
261
+
262
+
263
+ def get_pool(name: str) -> list[str]:
264
+ """Get list of targets in a named pool.
265
+
266
+ Pools are defined in ~/.wafer/config.toml:
267
+ [pools.my-pool]
268
+ targets = ["target-1", "target-2", "target-3"]
269
+
270
+ Args:
271
+ name: Pool name
272
+
273
+ Returns:
274
+ List of target names in the pool
275
+
276
+ Raises:
277
+ FileNotFoundError: If pool doesn't exist
278
+ """
279
+ if not CONFIG_FILE.exists():
280
+ raise FileNotFoundError(f"Pool not found: {name} (no config file)")
281
+
282
+ with open(CONFIG_FILE, "rb") as f:
283
+ data = tomllib.load(f)
284
+
285
+ pools = data.get("pools", {})
286
+ if name not in pools:
287
+ raise FileNotFoundError(
288
+ f"Pool not found: {name}\n"
289
+ f" Define pools in ~/.wafer/config.toml:\n"
290
+ f" [pools.{name}]\n"
291
+ f' targets = ["target-1", "target-2"]'
292
+ )
293
+
294
+ pool_config = pools[name]
295
+ targets = pool_config.get("targets", [])
296
+
297
+ if not targets:
298
+ raise ValueError(f"Pool '{name}' has no targets defined")
299
+
300
+ return targets
301
+
302
+
303
def get_target_type(name: str) -> str | None:
    """Read only the ``type`` field of a target's TOML file.

    Args:
        name: Target name

    Returns:
        The value stored under ``type`` (runpod, digitalocean, baremetal, etc.),
        or None when the target file does not exist or has no ``type`` key
    """
    target_file = _target_path(name)
    if target_file.exists():
        with open(target_file, "rb") as handle:
            return tomllib.load(handle).get("type")
    return None
320
+
321
+
322
def filter_pool_by_auth(target_names: list[str]) -> tuple[list[str], list[str]]:
    """Split pool targets into those that can be used now and those that cannot.

    A target is skipped when its type cannot be determined (no target file /
    no ``type`` field) or when its type needs a provider API key that is not
    configured.

    Args:
        target_names: List of target names to filter

    Returns:
        Tuple of (usable_targets, skipped_targets), each in input order
    """
    from wafer_core.auth import get_api_key

    # Only these target types are gated on an API key; other types
    # (baremetal, vm, workspace, modal) don't need runtime API keys.
    key_gated_types = ("runpod", "digitalocean")

    usable: list[str] = []
    skipped: list[str] = []

    for target_name in target_names:
        kind = get_target_type(target_name)
        unknown_target = kind is None
        missing_key = kind in key_gated_types and not get_api_key(kind)
        bucket = skipped if (unknown_target or missing_key) else usable
        bucket.append(target_name)

    return usable, skipped
357
+
358
+
359
+ def list_pools() -> list[str]:
360
+ """List all configured pool names.
361
+
362
+ Returns:
363
+ Sorted list of pool names
364
+ """
365
+ if not CONFIG_FILE.exists():
366
+ return []
367
+
368
+ with open(CONFIG_FILE, "rb") as f:
369
+ data = tomllib.load(f)
370
+
371
+ return sorted(data.get("pools", {}).keys())
372
+
373
+
374
def save_pool(name: str, targets: list[str]) -> None:
    """Create a pool, or replace an existing pool of the same name.

    Args:
        name: Pool name
        targets: List of target names (must all exist)

    Raises:
        FileNotFoundError: If any target doesn't exist
    """
    # Refuse to save a pool that references targets we don't know about.
    known = list_targets()
    unknown = [candidate for candidate in targets if candidate not in known]
    if unknown:
        raise FileNotFoundError(f"Targets not found: {', '.join(unknown)}")

    _ensure_dirs()

    # Start from the current config (if any) so other settings are carried over.
    config: dict = {}
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, "rb") as handle:
            config = tomllib.load(handle)

    # The pool entry is replaced wholesale, not merged with a previous one.
    config.setdefault("pools", {})[name] = {"targets": targets}

    # Write back - need custom handling for nested structure
    _write_config_with_pools(config)
407
+
408
+
409
+ def _write_config_with_pools(data: dict) -> None:
410
+ """Write config file with pools support.
411
+
412
+ Handles the nested [pools.name] TOML structure and preserves
413
+ existing nested sections like [default], [api], [environments.*].
414
+ """
415
+ lines = []
416
+
417
+ # Collect nested sections to write after top-level keys
418
+ nested_sections: dict[str, dict] = {}
419
+
420
+ # Write top-level keys first (except pools and nested dicts)
421
+ for key, value in data.items():
422
+ if key == "pools":
423
+ continue
424
+ if value is None:
425
+ continue
426
+ if isinstance(value, dict):
427
+ # Save nested sections for later
428
+ nested_sections[key] = value
429
+ elif isinstance(value, str):
430
+ lines.append(f'{key} = "{value}"')
431
+ elif isinstance(value, bool):
432
+ lines.append(f"{key} = {str(value).lower()}")
433
+ elif isinstance(value, int | float):
434
+ lines.append(f"{key} = {value}")
435
+ elif isinstance(value, list):
436
+ if all(isinstance(v, int) for v in value):
437
+ lines.append(f"{key} = {value}")
438
+ else:
439
+ formatted = ", ".join(f'"{v}"' if isinstance(v, str) else str(v) for v in value)
440
+ lines.append(f"{key} = [{formatted}]")
441
+
442
+ # Write nested sections (e.g., [default], [api], [environments.foo])
443
+ for section_name, section_data in nested_sections.items():
444
+ lines.append("")
445
+ lines.append(f"[{section_name}]")
446
+ for key, value in section_data.items():
447
+ if value is None:
448
+ continue
449
+ if isinstance(value, str):
450
+ lines.append(f'{key} = "{value}"')
451
+ elif isinstance(value, bool):
452
+ lines.append(f"{key} = {str(value).lower()}")
453
+ elif isinstance(value, int | float):
454
+ lines.append(f"{key} = {value}")
455
+ elif isinstance(value, list):
456
+ if all(isinstance(v, int) for v in value):
457
+ lines.append(f"{key} = {value}")
458
+ else:
459
+ formatted = ", ".join(f'"{v}"' if isinstance(v, str) else str(v) for v in value)
460
+ lines.append(f"{key} = [{formatted}]")
461
+
462
+ # Write pools
463
+ pools = data.get("pools", {})
464
+ for pool_name, pool_config in pools.items():
465
+ lines.append("")
466
+ lines.append(f"[pools.{pool_name}]")
467
+ targets = pool_config.get("targets", [])
468
+ formatted = ", ".join(f'"{t}"' for t in targets)
469
+ lines.append(f"targets = [{formatted}]")
470
+
471
+ CONFIG_FILE.write_text("\n".join(lines) + "\n")
472
+
473
+
260
474
  def set_default_target(name: str) -> None:
261
475
  """Set default target.
262
476
 
@@ -350,3 +564,279 @@ def get_target_info(target: TargetConfig) -> dict[str, str]:
350
564
  info["Compute"] = target.compute_capability
351
565
 
352
566
  return info
567
+
568
+
569
+ # Probe script to run on target - checks available backends
570
# The script is passed verbatim to `python -c` on the target and prints exactly
# one line of JSON on stdout. Every check is best-effort: a missing or broken
# component is reported as None instead of aborting the probe.
_PROBE_SCRIPT = """
import json
import shutil
import subprocess
import sys


def probe():
    result = {
        "python_version": sys.version.split()[0],
        "backends": {},
        "packages": {},
    }

    # Check Triton. A broken install can fail with errors other than
    # ImportError; treat any failure as "not available".
    try:
        import triton
        result["backends"]["triton"] = triton.__version__
    except Exception:
        result["backends"]["triton"] = None

    # Check torch
    try:
        import torch
    except Exception:
        torch = None

    if torch is None:
        result["packages"]["torch"] = None
        result["backends"]["torch"] = None
    else:
        result["packages"]["torch"] = torch.__version__
        result["backends"]["torch"] = torch.__version__
        try:
            result["cuda_available"] = torch.cuda.is_available()
            if result["cuda_available"]:
                result["gpu_name"] = torch.cuda.get_device_name(0)
                props = torch.cuda.get_device_properties(0)
                result["compute_capability"] = f"{props.major}.{props.minor}"
        except Exception:
            # GPU query failed; keep whatever was gathered so far.
            result.setdefault("cuda_available", False)

    # Check hipcc (AMD)
    hipcc = shutil.which("hipcc")
    result["backends"]["hipcc"] = hipcc

    # Check nvcc (NVIDIA)
    nvcc = shutil.which("nvcc")
    result["backends"]["nvcc"] = nvcc

    # Check ROCm version
    try:
        with open("/opt/rocm/.info/version", "r") as f:
            result["rocm_version"] = f.read().strip()
    except Exception:
        result["rocm_version"] = None

    # Check CUDA version from nvcc
    if nvcc:
        try:
            out = subprocess.check_output([nvcc, "--version"], text=True)
            for line in out.splitlines():
                if "release" in line.lower():
                    # Parse "Cuda compilation tools, release 12.1, V12.1.105"
                    parts = line.split("release")
                    if len(parts) > 1:
                        result["cuda_version"] = parts[1].split(",")[0].strip()
                        break
        except Exception:
            pass

    print(json.dumps(result))


if __name__ == "__main__":
    probe()
"""
637
+
638
+
639
class ProbeError(Exception):
    """Raised when probing a target fails; the message carries actionable context."""
643
+
644
+
645
def _run_probe_ssh(
    ssh_target: str,
    port: int | str,
    key_path: object,
    cmd: str,
    timeout_hint: str,
) -> tuple[int, str, str]:
    """Run *cmd* on a target via the system ``ssh`` client (blocking call).

    Args:
        ssh_target: SSH destination, e.g. ``user@host``
        port: SSH port
        key_path: Private key passed to ``ssh -i``
        cmd: Remote command line
        timeout_hint: Hint text appended to the error raised on timeout

    Returns:
        Tuple of (exit code, stdout, stderr)

    Raises:
        ProbeError: If the command does not finish within 60 seconds, or the
            local ssh client cannot be started
    """
    import subprocess

    argv = [
        "ssh",
        # Host keys are neither verified nor recorded in known_hosts.
        # NOTE(review): this gives up MITM protection - confirm it is intended.
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
        "-o",
        "ConnectTimeout=30",
        "-i",
        str(key_path),
        "-p",
        str(port),
        ssh_target,
        cmd,
    ]
    try:
        result = subprocess.run(argv, capture_output=True, text=True, timeout=60)
    except subprocess.TimeoutExpired:
        raise ProbeError(
            f"SSH connection timed out\n"
            f" Host: {ssh_target}:{port}\n"
            f" Hint: {timeout_hint}"
        ) from None
    except OSError as e:
        # The ssh executable itself could not be launched (missing, not executable).
        raise ProbeError(
            f"Failed to start ssh client\n"
            f" Error: {e}\n"
            f" Hint: Ensure OpenSSH is installed and 'ssh' is on PATH."
        ) from None
    return result.returncode, result.stdout, result.stderr


def _split_ssh_target(ssh_target: str) -> tuple[str, str, str]:
    """Split ``user@host[:port]`` into (ssh destination, host, port).

    The port defaults to "22". The destination omits the port and, when the
    input has no ``user@`` part, is just the host.
    """
    user, at, host_port = ssh_target.rpartition("@")
    host, colon, port = host_port.rpartition(":")
    if not colon:
        host, port = host_port, ""
    destination = f"{user}@{host}" if at else host
    return destination, host, port or "22"


def _load_probe_json(out: str) -> dict:
    """Parse the probe's JSON result from captured stdout.

    The probe prints a single JSON line. If stdout as a whole is not valid
    JSON (e.g. something else printed before it), the last non-empty line is
    tried instead.

    Raises:
        json.JSONDecodeError: If no JSON document can be parsed
    """
    import json

    try:
        return json.loads(out)
    except json.JSONDecodeError:
        candidates = [line for line in out.splitlines() if line.strip()]
        if not candidates:
            raise
        return json.loads(candidates[-1])


async def probe_target_capabilities(target: TargetConfig) -> dict[str, Any]:
    """Probe a target to discover available compilation backends.

    Connects to the target over SSH and runs a probe script to check:
    - Triton availability
    - torch availability
    - HIP/CUDA compiler
    - ROCm/CUDA version
    - GPU info

    Args:
        target: Target config (RunPod, Baremetal or VM)

    Returns:
        Dict with capabilities info, as printed by the probe script

    Raises:
        ProbeError: With actionable error message on failure, including
            unsupported target types
    """
    import json
    import shlex

    # The whole script becomes one shell word for `python -c`.
    quoted_script = shlex.quote(_PROBE_SCRIPT)

    if isinstance(target, RunPodTarget):
        import trio_asyncio
        from wafer_core.targets.runpod import RunPodError, get_pod_state, runpod_ssh_context

        # Check if pod exists before trying to connect; also used below to
        # report where the connection was attempted.
        pod_state = get_pod_state(target.name)

        try:
            # Need trio_asyncio.open_loop() for asyncssh bridge used by runpod_ssh_context
            async with trio_asyncio.open_loop():
                async with runpod_ssh_context(target) as ssh_info:
                    ssh_target = f"{ssh_info.user}@{ssh_info.host}"

                    # Run commands with the ssh client via subprocess (simpler than async ssh)
                    def run_ssh_cmd(cmd: str) -> tuple[int, str, str]:
                        return _run_probe_ssh(
                            ssh_target,
                            ssh_info.port,
                            target.ssh_key,
                            cmd,
                            timeout_hint="The pod may be starting up. Try again in 30 seconds.",
                        )

                    # Find Python: prefer a known conda interpreter, else plain python3
                    python_exe = "python3"
                    for candidate in (
                        "/opt/conda/envs/py_3.10/bin/python3",
                        "/opt/conda/bin/python3",
                    ):
                        code, out, _ = run_ssh_cmd(f"{candidate} --version 2>/dev/null && echo OK")
                        if code == 0 and "OK" in out:
                            python_exe = candidate
                            break

                    # Run probe script
                    code, out, err = run_ssh_cmd(f"{python_exe} -c {quoted_script}")
                    if code != 0:
                        raise ProbeError(
                            f"Probe script failed on target\n"
                            f" Exit code: {code}\n"
                            f" Error: {err.strip() if err else 'unknown'}"
                        )

                    try:
                        return _load_probe_json(out)
                    except json.JSONDecodeError as e:
                        raise ProbeError(
                            f"Failed to parse probe output\n Error: {e}\n Output: {out[:200]}..."
                        ) from None

        except RunPodError as e:
            # RunPod API errors (provisioning, pod not found, etc.)
            raise ProbeError(f"RunPod error for target '{target.name}'\n {e}") from None
        except OSError as e:
            # SSH connection errors
            if pod_state:
                raise ProbeError(
                    f"SSH connection failed to target '{target.name}'\n"
                    f" Host: {pod_state.ssh_username}@{pod_state.public_ip}:{pod_state.ssh_port}\n"
                    f" Error: {e}\n"
                    f" Hint: Check if the pod is still running with 'wafer config targets pods'"
                ) from None
            raise ProbeError(
                f"SSH connection failed to target '{target.name}'\n"
                f" Error: {e}\n"
                f" Hint: No pod found. One will be provisioned on next probe attempt."
            ) from None

    elif isinstance(target, (BaremetalTarget, VMTarget)):
        # Parse ssh_target (user@host:port or user@host)
        destination, _host, port = _split_ssh_target(target.ssh_target)
        key_path = target.ssh_key

        def run_ssh_cmd(cmd: str) -> tuple[int, str, str]:
            return _run_probe_ssh(
                destination,
                port,
                key_path,
                cmd,
                timeout_hint="Check if the host is reachable and SSH is running.",
            )

        # Test SSH connection first so connection problems are reported as
        # such rather than as a probe-script failure.
        code, out, err = run_ssh_cmd("echo OK")
        if code != 0:
            raise ProbeError(
                f"SSH connection failed to target '{target.name}'\n"
                f" Host: {destination}:{port}\n"
                f" Key: {key_path}\n"
                f" Error: {err.strip() if err else 'connection refused or timeout'}\n"
                f" Hint: Verify the host is reachable and the SSH key is authorized."
            )

        # Run probe script
        code, out, err = run_ssh_cmd(f"python3 -c {quoted_script}")
        if code != 0:
            raise ProbeError(
                f"Probe script failed on target '{target.name}'\n"
                f" Exit code: {code}\n"
                f" Error: {err.strip() if err else 'unknown'}\n"
                f" Hint: Ensure python3 is installed on the target."
            )

        try:
            return _load_probe_json(out)
        except json.JSONDecodeError as e:
            raise ProbeError(
                f"Failed to parse probe output from '{target.name}'\n"
                f" Error: {e}\n"
                f" Output: {out[:200]}..."
            ) from None

    else:
        raise ProbeError(
            f"Probing not supported for target type: {type(target).__name__}\n"
            f" Supported types: RunPod, Baremetal, VM"
        )