skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
|
@@ -59,7 +59,6 @@ from sky import task as task_lib
|
|
|
59
59
|
from sky.adaptors import common as adaptors_common
|
|
60
60
|
from sky.client import sdk
|
|
61
61
|
from sky.client.cli import flags
|
|
62
|
-
from sky.client.cli import git
|
|
63
62
|
from sky.data import storage_utils
|
|
64
63
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
65
64
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
@@ -79,7 +78,6 @@ from sky.utils import controller_utils
|
|
|
79
78
|
from sky.utils import dag_utils
|
|
80
79
|
from sky.utils import directory_utils
|
|
81
80
|
from sky.utils import env_options
|
|
82
|
-
from sky.utils import git as git_utils
|
|
83
81
|
from sky.utils import infra_utils
|
|
84
82
|
from sky.utils import log_utils
|
|
85
83
|
from sky.utils import registry
|
|
@@ -783,8 +781,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
|
783
781
|
|
|
784
782
|
# Update the workdir config from the command line parameters.
|
|
785
783
|
# And update the envs and secrets from the workdir.
|
|
786
|
-
|
|
787
|
-
|
|
784
|
+
task.update_workdir(workdir, git_url, git_ref)
|
|
785
|
+
task.update_envs_and_secrets_from_workdir()
|
|
788
786
|
|
|
789
787
|
# job launch specific.
|
|
790
788
|
if job_recovery is not None:
|
|
@@ -799,73 +797,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
|
799
797
|
return task
|
|
800
798
|
|
|
801
799
|
|
|
802
|
-
def _update_task_workdir(task: task_lib.Task, workdir: Optional[str],
|
|
803
|
-
git_url: Optional[str], git_ref: Optional[str]):
|
|
804
|
-
"""Updates the task workdir.
|
|
805
|
-
|
|
806
|
-
Args:
|
|
807
|
-
task: The task to update.
|
|
808
|
-
workdir: The workdir to update.
|
|
809
|
-
git_url: The git url to update.
|
|
810
|
-
git_ref: The git ref to update.
|
|
811
|
-
"""
|
|
812
|
-
if task.workdir is None or isinstance(task.workdir, str):
|
|
813
|
-
if workdir is not None:
|
|
814
|
-
task.workdir = workdir
|
|
815
|
-
return
|
|
816
|
-
if git_url is not None:
|
|
817
|
-
task.workdir = {}
|
|
818
|
-
task.workdir['url'] = git_url
|
|
819
|
-
if git_ref is not None:
|
|
820
|
-
task.workdir['ref'] = git_ref
|
|
821
|
-
return
|
|
822
|
-
return
|
|
823
|
-
if git_url is not None:
|
|
824
|
-
task.workdir['url'] = git_url
|
|
825
|
-
if git_ref is not None:
|
|
826
|
-
task.workdir['ref'] = git_ref
|
|
827
|
-
return
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
def _update_task_workdir_and_secrets_from_workdir(task: task_lib.Task):
|
|
831
|
-
"""Updates the task secrets from the workdir.
|
|
832
|
-
|
|
833
|
-
Args:
|
|
834
|
-
task: The task to update.
|
|
835
|
-
"""
|
|
836
|
-
if task.workdir is None:
|
|
837
|
-
return
|
|
838
|
-
if not isinstance(task.workdir, dict):
|
|
839
|
-
return
|
|
840
|
-
url = task.workdir['url']
|
|
841
|
-
ref = task.workdir.get('ref', '')
|
|
842
|
-
token = os.environ.get(git_utils.GIT_TOKEN_ENV_VAR)
|
|
843
|
-
ssh_key_path = os.environ.get(git_utils.GIT_SSH_KEY_PATH_ENV_VAR)
|
|
844
|
-
try:
|
|
845
|
-
git_repo = git.GitRepo(url, ref, token, ssh_key_path)
|
|
846
|
-
clone_info = git_repo.get_repo_clone_info()
|
|
847
|
-
if clone_info is None:
|
|
848
|
-
return
|
|
849
|
-
task.envs[git_utils.GIT_URL_ENV_VAR] = clone_info.url
|
|
850
|
-
if ref:
|
|
851
|
-
ref_type = git_repo.get_ref_type()
|
|
852
|
-
if ref_type == git.GitRefType.COMMIT:
|
|
853
|
-
task.envs[git_utils.GIT_COMMIT_HASH_ENV_VAR] = ref
|
|
854
|
-
elif ref_type == git.GitRefType.BRANCH:
|
|
855
|
-
task.envs[git_utils.GIT_BRANCH_ENV_VAR] = ref
|
|
856
|
-
elif ref_type == git.GitRefType.TAG:
|
|
857
|
-
task.envs[git_utils.GIT_TAG_ENV_VAR] = ref
|
|
858
|
-
if clone_info.token is None and clone_info.ssh_key is None:
|
|
859
|
-
return
|
|
860
|
-
if clone_info.token is not None:
|
|
861
|
-
task.secrets[git_utils.GIT_TOKEN_ENV_VAR] = clone_info.token
|
|
862
|
-
if clone_info.ssh_key is not None:
|
|
863
|
-
task.secrets[git_utils.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
|
|
864
|
-
except exceptions.GitError as e:
|
|
865
|
-
with ux_utils.print_exception_no_traceback():
|
|
866
|
-
raise ValueError(f'{str(e)}') from None
|
|
867
|
-
|
|
868
|
-
|
|
869
800
|
class _NaturalOrderGroup(click.Group):
|
|
870
801
|
"""Lists commands in the order defined in this script.
|
|
871
802
|
|
sky/client/sdk_async.py
CHANGED
|
@@ -456,6 +456,7 @@ async def download_logs(cluster_name: str,
|
|
|
456
456
|
async def start(
|
|
457
457
|
cluster_name: str,
|
|
458
458
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
459
|
+
wait_for: Optional['autostop_lib.AutostopWaitFor'] = None,
|
|
459
460
|
retry_until_up: bool = False,
|
|
460
461
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
461
462
|
force: bool = False,
|
|
@@ -464,7 +465,8 @@ async def start(
|
|
|
464
465
|
"""Async version of start() that restarts a cluster."""
|
|
465
466
|
request_id = await context_utils.to_thread(sdk.start, cluster_name,
|
|
466
467
|
idle_minutes_to_autostop,
|
|
467
|
-
retry_until_up, down,
|
|
468
|
+
wait_for, retry_until_up, down,
|
|
469
|
+
force)
|
|
468
470
|
if stream_logs is not None:
|
|
469
471
|
return await _stream_and_get(request_id, stream_logs)
|
|
470
472
|
else:
|
|
@@ -504,13 +506,14 @@ async def stop(
|
|
|
504
506
|
async def autostop(
|
|
505
507
|
cluster_name: str,
|
|
506
508
|
idle_minutes: int,
|
|
509
|
+
wait_for: Optional['autostop_lib.AutostopWaitFor'] = None,
|
|
507
510
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
508
511
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
509
512
|
) -> None:
|
|
510
513
|
"""Async version of autostop() that schedules an autostop/autodown for a
|
|
511
514
|
cluster."""
|
|
512
515
|
request_id = await context_utils.to_thread(sdk.autostop, cluster_name,
|
|
513
|
-
idle_minutes, down)
|
|
516
|
+
idle_minutes, wait_for, down)
|
|
514
517
|
if stream_logs is not None:
|
|
515
518
|
return await _stream_and_get(request_id, stream_logs)
|
|
516
519
|
else:
|
sky/clouds/__init__.py
CHANGED
|
@@ -28,6 +28,7 @@ from sky.clouds.oci import OCI
|
|
|
28
28
|
from sky.clouds.paperspace import Paperspace
|
|
29
29
|
from sky.clouds.runpod import RunPod
|
|
30
30
|
from sky.clouds.scp import SCP
|
|
31
|
+
from sky.clouds.seeweb import Seeweb
|
|
31
32
|
from sky.clouds.ssh import SSH
|
|
32
33
|
from sky.clouds.vast import Vast
|
|
33
34
|
from sky.clouds.vsphere import Vsphere
|
|
@@ -58,6 +59,7 @@ __all__ = [
|
|
|
58
59
|
'Fluidstack',
|
|
59
60
|
'Nebius',
|
|
60
61
|
'Hyperbolic',
|
|
62
|
+
'Seeweb',
|
|
61
63
|
# Utility functions
|
|
62
64
|
'cloud_in_iterable',
|
|
63
65
|
]
|
sky/clouds/aws.py
CHANGED
|
@@ -39,9 +39,11 @@ logger = sky_logging.init_logger(__name__)
|
|
|
39
39
|
|
|
40
40
|
# Image ID tags
|
|
41
41
|
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
|
|
42
|
+
_DEFAULT_CPU_ARM64_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-arm64'
|
|
42
43
|
# For GPU-related package version,
|
|
43
44
|
# see sky/catalog/images/provisioners/cuda.sh
|
|
44
45
|
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
|
|
46
|
+
_DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
|
|
45
47
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
|
46
48
|
_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
|
|
47
49
|
|
|
@@ -364,13 +366,22 @@ class AWS(clouds.Cloud):
|
|
|
364
366
|
@classmethod
|
|
365
367
|
def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
|
|
366
368
|
acc = cls.get_accelerators_from_instance_type(instance_type)
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
369
|
+
arch = cls.get_arch_from_instance_type(instance_type)
|
|
370
|
+
if arch == constants.ARM64_ARCH:
|
|
371
|
+
image_id = catalog.get_image_id_from_tag(
|
|
372
|
+
_DEFAULT_CPU_ARM64_IMAGE_ID, region_name, clouds='aws')
|
|
373
|
+
else:
|
|
374
|
+
image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
|
|
372
375
|
region_name,
|
|
373
376
|
clouds='aws')
|
|
377
|
+
if acc is not None:
|
|
378
|
+
if arch == constants.ARM64_ARCH:
|
|
379
|
+
image_id = catalog.get_image_id_from_tag(
|
|
380
|
+
_DEFAULT_GPU_ARM64_IMAGE_ID, region_name, clouds='aws')
|
|
381
|
+
else:
|
|
382
|
+
image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
|
|
383
|
+
region_name,
|
|
384
|
+
clouds='aws')
|
|
374
385
|
assert len(acc) == 1, acc
|
|
375
386
|
acc_name = list(acc.keys())[0]
|
|
376
387
|
if acc_name == 'K80':
|
|
@@ -573,6 +584,13 @@ class AWS(clouds.Cloud):
|
|
|
573
584
|
return catalog.get_accelerators_from_instance_type(instance_type,
|
|
574
585
|
clouds='aws')
|
|
575
586
|
|
|
587
|
+
@classmethod
|
|
588
|
+
def get_arch_from_instance_type(
|
|
589
|
+
cls,
|
|
590
|
+
instance_type: str,
|
|
591
|
+
) -> Optional[str]:
|
|
592
|
+
return catalog.get_arch_from_instance_type(instance_type, clouds='aws')
|
|
593
|
+
|
|
576
594
|
@classmethod
|
|
577
595
|
def get_vcpus_mem_from_instance_type(
|
|
578
596
|
cls,
|
sky/clouds/cloud.py
CHANGED
|
@@ -340,6 +340,14 @@ class Cloud:
|
|
|
340
340
|
"""Returns {acc: acc_count} held by 'instance_type', if any."""
|
|
341
341
|
raise NotImplementedError
|
|
342
342
|
|
|
343
|
+
@classmethod
|
|
344
|
+
def get_arch_from_instance_type(
|
|
345
|
+
cls,
|
|
346
|
+
instance_type: str,
|
|
347
|
+
) -> Optional[str]:
|
|
348
|
+
"""Returns the arch of the instance type, if any."""
|
|
349
|
+
raise NotImplementedError
|
|
350
|
+
|
|
343
351
|
@classmethod
|
|
344
352
|
def get_default_instance_type(cls,
|
|
345
353
|
cpus: Optional[str] = None,
|
sky/clouds/kubernetes.py
CHANGED
|
@@ -841,6 +841,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
841
841
|
from_instance_type(default_instance_type))
|
|
842
842
|
|
|
843
843
|
gpu_task_cpus = k8s_instance_type.cpus
|
|
844
|
+
if resources.cpus is None:
|
|
845
|
+
gpu_task_cpus = gpu_task_cpus * acc_count
|
|
844
846
|
# Special handling to bump up memory multiplier for GPU instances
|
|
845
847
|
gpu_task_memory = (float(resources.memory.strip('+')) if
|
|
846
848
|
resources.memory is not None else gpu_task_cpus *
|
sky/clouds/seeweb.py
ADDED
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
"""Seeweb Cloud
|
|
2
|
+
|
|
3
|
+
History:
|
|
4
|
+
@ Aug 6, 2025: Initial version of the integration.
|
|
5
|
+
- Francesco Massa
|
|
6
|
+
- Marco Cristofanilli (marco.cATseeweb.it)
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import typing
|
|
13
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
|
14
|
+
|
|
15
|
+
from sky import catalog
|
|
16
|
+
from sky import clouds
|
|
17
|
+
from sky.adaptors import seeweb as seeweb_adaptor
|
|
18
|
+
from sky.provision import seeweb as seeweb_provision
|
|
19
|
+
from sky.utils import registry
|
|
20
|
+
from sky.utils import resources_utils
|
|
21
|
+
from sky.utils import ux_utils
|
|
22
|
+
|
|
23
|
+
if typing.TYPE_CHECKING:
|
|
24
|
+
from sky import resources as resources_lib
|
|
25
|
+
from sky.utils import status_lib
|
|
26
|
+
from sky.utils import volume as volume_lib
|
|
27
|
+
|
|
28
|
+
# ---------- key file path -----------------
|
|
29
|
+
_SEEWEB_KEY_FILE = '~/.seeweb_cloud/seeweb_keys'
|
|
30
|
+
# (content: ini-like)
|
|
31
|
+
# api_key = <TOKEN>
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@registry.CLOUD_REGISTRY.register
|
|
35
|
+
class Seeweb(clouds.Cloud):
|
|
36
|
+
"""Seeweb GPU Cloud."""
|
|
37
|
+
|
|
38
|
+
_REPR = 'Seeweb'
|
|
39
|
+
# Define unsupported features to provide clear error messages
|
|
40
|
+
# This helps users understand what Seeweb can and cannot do
|
|
41
|
+
_CLOUD_UNSUPPORTED_FEATURES = {
|
|
42
|
+
clouds.CloudImplementationFeatures.MULTI_NODE:
|
|
43
|
+
('Multi-node not supported. '
|
|
44
|
+
'Seeweb does not support multi-node clusters.'),
|
|
45
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
|
46
|
+
('Custom disk tiers not supported. '
|
|
47
|
+
'Seeweb does not support custom disk tiers.'),
|
|
48
|
+
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
|
|
49
|
+
('Storage mounting not supported. '
|
|
50
|
+
'Seeweb does not support storage mounting.'),
|
|
51
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
|
52
|
+
('High availability controllers not supported. '
|
|
53
|
+
'Seeweb does not support high availability controllers.'),
|
|
54
|
+
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
|
55
|
+
('Spot instances not supported. '
|
|
56
|
+
'Seeweb does not support spot instances.'),
|
|
57
|
+
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
|
58
|
+
('Disk cloning not supported. '
|
|
59
|
+
'Seeweb does not support disk cloning.'),
|
|
60
|
+
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
|
61
|
+
('Docker images not supported. '
|
|
62
|
+
'Seeweb does not support Docker images.'),
|
|
63
|
+
clouds.CloudImplementationFeatures.IMAGE_ID:
|
|
64
|
+
('Custom image IDs not supported. '
|
|
65
|
+
'Seeweb does not support custom image IDs.'),
|
|
66
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
|
67
|
+
('Custom network tiers not supported. '
|
|
68
|
+
'Seeweb does not support custom network tiers.'),
|
|
69
|
+
clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
|
|
70
|
+
('Host controllers not supported. '
|
|
71
|
+
'Seeweb does not support host controllers.'),
|
|
72
|
+
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
|
|
73
|
+
('Custom multi-network not supported. '
|
|
74
|
+
'Seeweb does not support custom multi-network.'),
|
|
75
|
+
}
|
|
76
|
+
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
|
|
77
|
+
_regions: List[clouds.Region] = []
|
|
78
|
+
|
|
79
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
|
80
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
|
81
|
+
|
|
82
|
+
# Enable port support with updatable version
|
|
83
|
+
OPEN_PORTS_VERSION = clouds.OpenPortsVersion.UPDATABLE
|
|
84
|
+
|
|
85
|
+
@classmethod
|
|
86
|
+
def _unsupported_features_for_resources(
|
|
87
|
+
cls, resources: 'resources_lib.Resources'
|
|
88
|
+
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
89
|
+
return cls._CLOUD_UNSUPPORTED_FEATURES
|
|
90
|
+
|
|
91
|
+
@classmethod
|
|
92
|
+
def max_cluster_name_length(cls) -> Optional[int]:
|
|
93
|
+
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
94
|
+
|
|
95
|
+
@classmethod
|
|
96
|
+
def regions(cls) -> List['clouds.Region']:
|
|
97
|
+
"""Return available regions for Seeweb."""
|
|
98
|
+
# Get regions from the catalog system
|
|
99
|
+
# This reads from the CSV files generated by fetch_seeweb.py
|
|
100
|
+
regions = catalog.regions(clouds='seeweb')
|
|
101
|
+
return regions
|
|
102
|
+
|
|
103
|
+
@classmethod
|
|
104
|
+
def regions_with_offering(
|
|
105
|
+
cls,
|
|
106
|
+
instance_type: str,
|
|
107
|
+
accelerators: Optional[Dict[str, int]],
|
|
108
|
+
use_spot: bool,
|
|
109
|
+
region: Optional[str],
|
|
110
|
+
zone: Optional[str],
|
|
111
|
+
) -> List[clouds.Region]:
|
|
112
|
+
assert zone is None, 'Seeweb does not support zones.'
|
|
113
|
+
del zone
|
|
114
|
+
if use_spot:
|
|
115
|
+
return []
|
|
116
|
+
|
|
117
|
+
# Get regions from catalog based on instance type
|
|
118
|
+
# This will read the CSV and return only regions
|
|
119
|
+
# where the instance type exists
|
|
120
|
+
regions = catalog.get_region_zones_for_instance_type(
|
|
121
|
+
instance_type, use_spot, 'seeweb')
|
|
122
|
+
|
|
123
|
+
if region is not None:
|
|
124
|
+
regions = [r for r in regions if r.name == region]
|
|
125
|
+
|
|
126
|
+
return regions
|
|
127
|
+
|
|
128
|
+
@classmethod
|
|
129
|
+
def zones_provision_loop(
|
|
130
|
+
cls,
|
|
131
|
+
*,
|
|
132
|
+
region: str,
|
|
133
|
+
num_nodes: int,
|
|
134
|
+
instance_type: str,
|
|
135
|
+
accelerators: Optional[Dict[str, int]] = None,
|
|
136
|
+
use_spot: bool = False,
|
|
137
|
+
) -> Iterator[None]:
|
|
138
|
+
del num_nodes
|
|
139
|
+
regions = cls.regions_with_offering(instance_type,
|
|
140
|
+
accelerators,
|
|
141
|
+
use_spot,
|
|
142
|
+
region=region,
|
|
143
|
+
zone=None)
|
|
144
|
+
for r in regions:
|
|
145
|
+
assert r.zones is None, r
|
|
146
|
+
yield r.zones
|
|
147
|
+
|
|
148
|
+
@classmethod
|
|
149
|
+
def get_zone_shell_cmd(cls) -> Optional[str]:
|
|
150
|
+
"""Seeweb doesn't support zones."""
|
|
151
|
+
return None
|
|
152
|
+
|
|
153
|
+
def instance_type_to_hourly_cost(
|
|
154
|
+
self,
|
|
155
|
+
instance_type: str,
|
|
156
|
+
use_spot: bool,
|
|
157
|
+
region: Optional[str],
|
|
158
|
+
zone: Optional[str],
|
|
159
|
+
) -> float:
|
|
160
|
+
cost = catalog.get_hourly_cost(instance_type,
|
|
161
|
+
use_spot=use_spot,
|
|
162
|
+
region=region,
|
|
163
|
+
zone=zone,
|
|
164
|
+
clouds='seeweb')
|
|
165
|
+
return cost
|
|
166
|
+
|
|
167
|
+
def accelerators_to_hourly_cost(
|
|
168
|
+
self,
|
|
169
|
+
accelerators: Dict[str, int],
|
|
170
|
+
use_spot: bool,
|
|
171
|
+
region: Optional[str],
|
|
172
|
+
zone: Optional[str],
|
|
173
|
+
) -> float:
|
|
174
|
+
|
|
175
|
+
return 0.0
|
|
176
|
+
|
|
177
|
+
def get_egress_cost(self, num_gigabytes: float):
|
|
178
|
+
return 0.0
|
|
179
|
+
|
|
180
|
+
def make_deploy_resources_variables(
|
|
181
|
+
self,
|
|
182
|
+
resources: 'resources_lib.Resources',
|
|
183
|
+
cluster_name: resources_utils.ClusterName,
|
|
184
|
+
region: 'clouds.Region',
|
|
185
|
+
zones: Optional[List['clouds.Zone']],
|
|
186
|
+
num_nodes: int,
|
|
187
|
+
dryrun: bool = False,
|
|
188
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
189
|
+
) -> Dict[str, Any]:
|
|
190
|
+
"""Create deployment variables for Seeweb."""
|
|
191
|
+
|
|
192
|
+
# Note: Spot instances and multi-node are automatically handled by
|
|
193
|
+
# the framework via _CLOUD_UNSUPPORTED_FEATURES
|
|
194
|
+
|
|
195
|
+
resources = resources.assert_launchable()
|
|
196
|
+
|
|
197
|
+
acc_dict = self.get_accelerators_from_instance_type(
|
|
198
|
+
resources.instance_type)
|
|
199
|
+
|
|
200
|
+
# Standard custom_resources string for Ray
|
|
201
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
|
202
|
+
acc_dict)
|
|
203
|
+
|
|
204
|
+
# Seeweb-specific GPU configuration for the provisioner
|
|
205
|
+
# This tells the provisioner how to configure GPU resources
|
|
206
|
+
seeweb_gpu_config = None
|
|
207
|
+
if resources.accelerators:
|
|
208
|
+
# If the instance has accelerators, prepare GPU configuration
|
|
209
|
+
accelerator_name = list(resources.accelerators.keys())[0]
|
|
210
|
+
accelerator_count = resources.accelerators[accelerator_name]
|
|
211
|
+
seeweb_gpu_config = {
|
|
212
|
+
'gpu': accelerator_count,
|
|
213
|
+
'gpu_label': accelerator_name,
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
# Seeweb uses pre-configured images based on instance type
|
|
217
|
+
# Determine image based on whether the instance type name contains "GPU"
|
|
218
|
+
if resources.instance_type and 'GPU' in resources.instance_type.upper():
|
|
219
|
+
# GPU instance - use image with NVIDIA drivers
|
|
220
|
+
if resources.instance_type in ['ECS1GPU10', 'ECS2GPU10']:
|
|
221
|
+
# H200 GPU instance - use UEFI image with NVIDIA drivers
|
|
222
|
+
image_id = 'ubuntu-2204-uefi-nvidia-driver'
|
|
223
|
+
else:
|
|
224
|
+
# Other GPU instance - use standard image with NVIDIA drivers
|
|
225
|
+
image_id = 'ubuntu-2204-nvidia-driver'
|
|
226
|
+
else:
|
|
227
|
+
# CPU-only instance - use standard Ubuntu image
|
|
228
|
+
image_id = 'ubuntu-2204'
|
|
229
|
+
|
|
230
|
+
result = {
|
|
231
|
+
'instance_type': resources.instance_type,
|
|
232
|
+
'region': region.name,
|
|
233
|
+
'cluster_name': cluster_name,
|
|
234
|
+
'custom_resources': custom_resources,
|
|
235
|
+
'seeweb_gpu_config': seeweb_gpu_config,
|
|
236
|
+
'image_id': image_id,
|
|
237
|
+
}
|
|
238
|
+
return result
|
|
239
|
+
|
|
240
|
+
@classmethod
|
|
241
|
+
def get_vcpus_mem_from_instance_type(
|
|
242
|
+
cls, instance_type: str) -> Tuple[Optional[float], Optional[float]]:
|
|
243
|
+
result = catalog.get_vcpus_mem_from_instance_type(instance_type,
|
|
244
|
+
clouds='seeweb')
|
|
245
|
+
return result
|
|
246
|
+
|
|
247
|
+
@classmethod
|
|
248
|
+
def get_accelerators_from_instance_type(
|
|
249
|
+
cls,
|
|
250
|
+
instance_type: str,
|
|
251
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
|
252
|
+
result = catalog.get_accelerators_from_instance_type(instance_type,
|
|
253
|
+
clouds='seeweb')
|
|
254
|
+
return result
|
|
255
|
+
|
|
256
|
+
@classmethod
|
|
257
|
+
def get_default_instance_type(
|
|
258
|
+
cls,
|
|
259
|
+
cpus: Optional[str] = None,
|
|
260
|
+
memory: Optional[str] = None,
|
|
261
|
+
disk_tier: Optional[resources_utils.DiskTier] = None,
|
|
262
|
+
region: Optional[str] = None,
|
|
263
|
+
zone: Optional[str] = None,
|
|
264
|
+
) -> Optional[str]:
|
|
265
|
+
result = catalog.get_default_instance_type(cpus=cpus,
|
|
266
|
+
memory=memory,
|
|
267
|
+
disk_tier=disk_tier,
|
|
268
|
+
clouds='seeweb')
|
|
269
|
+
return result
|
|
270
|
+
|
|
271
|
+
def _get_feasible_launchable_resources(
|
|
272
|
+
self, resources: 'resources_lib.Resources'
|
|
273
|
+
) -> 'resources_utils.FeasibleResources':
|
|
274
|
+
"""Get feasible resources for Seeweb."""
|
|
275
|
+
if resources.use_spot:
|
|
276
|
+
return resources_utils.FeasibleResources(
|
|
277
|
+
[], [], 'Spot instances not supported on Seeweb')
|
|
278
|
+
|
|
279
|
+
if resources.accelerators and len(resources.accelerators) > 1:
|
|
280
|
+
return resources_utils.FeasibleResources(
|
|
281
|
+
[], [], 'Multiple accelerator types not supported on Seeweb')
|
|
282
|
+
|
|
283
|
+
# If no instance_type is specified, try to get a default one
|
|
284
|
+
if not resources.instance_type:
|
|
285
|
+
# If accelerators are specified, try to find instance
|
|
286
|
+
# type for that accelerator
|
|
287
|
+
if resources.accelerators:
|
|
288
|
+
# Get the first accelerator
|
|
289
|
+
# (we already checked there's only one)
|
|
290
|
+
acc_name, acc_count = list(resources.accelerators.items())[0]
|
|
291
|
+
|
|
292
|
+
# Use catalog to find instance type for this accelerator
|
|
293
|
+
# This leverages the catalog system to find suitable instances
|
|
294
|
+
(
|
|
295
|
+
instance_types,
|
|
296
|
+
fuzzy_candidates,
|
|
297
|
+
) = catalog.get_instance_type_for_accelerator(
|
|
298
|
+
acc_name=acc_name,
|
|
299
|
+
acc_count=acc_count,
|
|
300
|
+
cpus=resources.cpus,
|
|
301
|
+
memory=resources.memory,
|
|
302
|
+
use_spot=resources.use_spot,
|
|
303
|
+
region=resources.region,
|
|
304
|
+
zone=resources.zone,
|
|
305
|
+
clouds='seeweb',
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
if instance_types and len(instance_types) > 0:
|
|
309
|
+
# Use the first (cheapest) instance type
|
|
310
|
+
selected_instance_type = instance_types[0]
|
|
311
|
+
resources = resources.copy(
|
|
312
|
+
instance_type=selected_instance_type)
|
|
313
|
+
else:
|
|
314
|
+
return resources_utils.FeasibleResources(
|
|
315
|
+
[],
|
|
316
|
+
fuzzy_candidates,
|
|
317
|
+
f'No instance type found for accelerator'
|
|
318
|
+
f'{acc_name}:{acc_count} on Seeweb',
|
|
319
|
+
)
|
|
320
|
+
else:
|
|
321
|
+
# No accelerators specified, use default instance type
|
|
322
|
+
default_instance_type = self.get_default_instance_type(
|
|
323
|
+
cpus=resources.cpus,
|
|
324
|
+
memory=resources.memory,
|
|
325
|
+
region=resources.region,
|
|
326
|
+
zone=resources.zone,
|
|
327
|
+
)
|
|
328
|
+
|
|
329
|
+
if default_instance_type:
|
|
330
|
+
# Create new resources with the default instance type
|
|
331
|
+
resources = resources.copy(
|
|
332
|
+
instance_type=default_instance_type)
|
|
333
|
+
else:
|
|
334
|
+
return resources_utils.FeasibleResources(
|
|
335
|
+
[],
|
|
336
|
+
[],
|
|
337
|
+
f'No suitable instance type found for'
|
|
338
|
+
f'cpus={resources.cpus}, memory={resources.memory}',
|
|
339
|
+
)
|
|
340
|
+
|
|
341
|
+
# Check if instance type exists
|
|
342
|
+
if resources.instance_type:
|
|
343
|
+
exists = catalog.instance_type_exists(resources.instance_type,
|
|
344
|
+
clouds='seeweb')
|
|
345
|
+
if not exists:
|
|
346
|
+
return resources_utils.FeasibleResources(
|
|
347
|
+
[],
|
|
348
|
+
[],
|
|
349
|
+
f'Instance type {resources.instance_type}'
|
|
350
|
+
f' not available on Seeweb',
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
# Set the cloud if not already set
|
|
354
|
+
if not resources.cloud:
|
|
355
|
+
resources = resources.copy(cloud=self)
|
|
356
|
+
|
|
357
|
+
# Return the resources as feasible
|
|
358
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
|
359
|
+
|
|
360
|
+
@classmethod
|
|
361
|
+
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
|
|
362
|
+
"""Check Seeweb compute credentials."""
|
|
363
|
+
try:
|
|
364
|
+
result = seeweb_adaptor.check_compute_credentials()
|
|
365
|
+
return result, None
|
|
366
|
+
except Exception as e: # pylint: disable=broad-except
|
|
367
|
+
return False, str(e)
|
|
368
|
+
|
|
369
|
+
@classmethod
|
|
370
|
+
def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
|
|
371
|
+
"""Check Seeweb storage credentials."""
|
|
372
|
+
try:
|
|
373
|
+
result = seeweb_adaptor.check_storage_credentials()
|
|
374
|
+
return result, None
|
|
375
|
+
except Exception as e: # pylint: disable=broad-except
|
|
376
|
+
return False, str(e)
|
|
377
|
+
|
|
378
|
+
@classmethod
|
|
379
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
|
380
|
+
# Seeweb doesn't have user identity concept
|
|
381
|
+
return None
|
|
382
|
+
|
|
383
|
+
@classmethod
|
|
384
|
+
def query_status(
|
|
385
|
+
cls,
|
|
386
|
+
name: str,
|
|
387
|
+
tag_filters: Dict[str, str],
|
|
388
|
+
region: Optional[str],
|
|
389
|
+
zone: Optional[str],
|
|
390
|
+
**kwargs,
|
|
391
|
+
) -> List['status_lib.ClusterStatus']:
|
|
392
|
+
"""Query the status of Seeweb cluster instances."""
|
|
393
|
+
cluster_name_on_cloud = name
|
|
394
|
+
|
|
395
|
+
result = seeweb_provision.instance.query_instances(
|
|
396
|
+
cluster_name=name,
|
|
397
|
+
cluster_name_on_cloud=cluster_name_on_cloud,
|
|
398
|
+
provider_config={},
|
|
399
|
+
non_terminated_only=True)
|
|
400
|
+
# Convert Dict[str, Tuple[Optional[ClusterStatus],
|
|
401
|
+
# Optional[str]]] to List[ClusterStatus]
|
|
402
|
+
return [status for status, _ in result.values() if status is not None]
|
|
403
|
+
|
|
404
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
|
405
|
+
"""Returns the credential files to mount."""
|
|
406
|
+
# Mount the Seeweb API key file to the remote instance
|
|
407
|
+
# This allows the provisioner to authenticate with Seeweb API
|
|
408
|
+
result = {
|
|
409
|
+
_SEEWEB_KEY_FILE: _SEEWEB_KEY_FILE,
|
|
410
|
+
}
|
|
411
|
+
return result
|
|
412
|
+
|
|
413
|
+
def instance_type_exists(self, instance_type: str) -> bool:
|
|
414
|
+
"""Returns whether the instance type exists for Seeweb."""
|
|
415
|
+
result = catalog.instance_type_exists(instance_type, clouds='seeweb')
|
|
416
|
+
return result
|
|
417
|
+
|
|
418
|
+
@classmethod
|
|
419
|
+
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
|
420
|
+
"""Seeweb doesn't support custom images."""
|
|
421
|
+
del image_id, region
|
|
422
|
+
with ux_utils.print_exception_no_traceback():
|
|
423
|
+
raise ValueError(f'Custom images are not supported on {cls._REPR}. '
|
|
424
|
+
'Seeweb clusters use pre-configured images only.')
|
|
425
|
+
|
|
426
|
+
# Image-related methods (not supported)
|
|
427
|
+
@classmethod
|
|
428
|
+
def create_image_from_cluster(
|
|
429
|
+
cls,
|
|
430
|
+
cluster_name: resources_utils.ClusterName,
|
|
431
|
+
region: Optional[str],
|
|
432
|
+
zone: Optional[str],
|
|
433
|
+
) -> str:
|
|
434
|
+
del cluster_name, region, zone # unused
|
|
435
|
+
with ux_utils.print_exception_no_traceback():
|
|
436
|
+
raise ValueError(
|
|
437
|
+
f'Creating images from clusters is not supported on'
|
|
438
|
+
f' {cls._REPR}. Seeweb does not support custom'
|
|
439
|
+
f' image creation.')
|
|
440
|
+
|
|
441
|
+
@classmethod
|
|
442
|
+
def maybe_move_image(
|
|
443
|
+
cls,
|
|
444
|
+
image_id: str,
|
|
445
|
+
source_region: str,
|
|
446
|
+
target_region: str,
|
|
447
|
+
source_zone: Optional[str],
|
|
448
|
+
target_zone: Optional[str],
|
|
449
|
+
) -> str:
|
|
450
|
+
del image_id, source_region, target_region, source_zone, target_zone
|
|
451
|
+
with ux_utils.print_exception_no_traceback():
|
|
452
|
+
raise ValueError(
|
|
453
|
+
f'Moving images between regions is not supported on'
|
|
454
|
+
f' {cls._REPR}. '
|
|
455
|
+
'Seeweb does not support custom images.')
|
|
456
|
+
|
|
457
|
+
@classmethod
|
|
458
|
+
def delete_image(cls, image_id: str, region: Optional[str]) -> None:
|
|
459
|
+
del image_id, region
|
|
460
|
+
with ux_utils.print_exception_no_traceback():
|
|
461
|
+
raise ValueError(
|
|
462
|
+
f'Deleting images is not supported on {cls._REPR}. '
|
|
463
|
+
'Seeweb does not support custom image management.')
|