skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py
CHANGED

@@ -81,6 +81,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource
 
 if typing.TYPE_CHECKING:
     import grpc
@@ -915,8 +916,10 @@ class RetryingVmProvisioner(object):
        elif to_provision.region is not None and to_provision.cloud is not None:
            # For public clouds, provision.region is always set.
            if clouds.SSH().is_same_cloud(to_provision.cloud):
+               ssh_node_pool_name = common_utils.removeprefix(
+                   to_provision.region, 'ssh-')
                message += (
-                   f'in SSH Node Pool ({
+                   f'in SSH Node Pool ({ssh_node_pool_name}) '
                    f'for {requested_resources}. The SSH Node Pool may not '
                    'have enough resources.')
            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
@@ -1176,7 +1179,9 @@ class RetryingVmProvisioner(object):
            if isinstance(to_provision.cloud, clouds.Kubernetes):
                suffix = '.'
                if region.name.startswith('ssh-'):
-
+                   ssh_node_pool_name = common_utils.removeprefix(
+                       region.name, 'ssh-')
+                   suffix = f' ({ssh_node_pool_name})'
                logger.info(
                    ux_utils.starting_message(
                        f'Launching{controller_str} on '
@@ -2732,6 +2737,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
 
+    def get_job_exit_codes(
+        self,
+        request: 'jobsv1_pb2.GetJobExitCodesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobExitCodesResponse':
+        return self._jobs_stub.GetJobExitCodes(request, timeout=timeout)
+
     def tail_logs(
         self,
         request: 'jobsv1_pb2.TailLogsRequest',
@@ -3040,6 +3052,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 'sky api status -v | grep '
                 f'{cluster_name}'))
 
+    def _maybe_clear_external_cluster_failures(
+            self, cluster_name: str,
+            prev_cluster_status: Optional[status_lib.ClusterStatus]) -> None:
+        """Clear any existing cluster failures when reusing a cluster.
+
+        Clear any existing cluster failures when reusing a cluster. This ensures
+        that when a cluster failure is detected (causing the cluster to be
+        marked as INIT), the user can recover the cluster via `sky start` or
+        `sky launch` and clear the failure.
+        """
+        if prev_cluster_status is not None:
+            failures = ExternalFailureSource.clear(cluster_name=cluster_name)
+            if failures:
+                failure_details = [f'"{f["failure_mode"]}"' for f in failures]
+                plural = 's' if len(failures) > 1 else ''
+                logger.info(f'{colorama.Style.DIM}Cleared {len(failures)} '
+                            f'existing cluster failure{plural} for cluster '
+                            f'{cluster_name!r}: {", ".join(failure_details)}'
+                            f'{colorama.Style.RESET_ALL}')
+
     def _locked_provision(
         self,
         lock_id: str,
@@ -3070,6 +3102,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision_config.num_nodes, to_provision_config.resources)
         usage_lib.messages.usage.update_cluster_status(prev_cluster_status)
 
+        self._maybe_clear_external_cluster_failures(cluster_name,
+                                                    prev_cluster_status)
+
         # TODO(suquark): once we have sky on PyPI, we should directly
         # install sky from PyPI.
         # NOTE: can take ~2s.
@@ -3428,7 +3463,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ssh_user=handle.ssh_user,
             docker_user=handle.docker_user)
         cluster_utils.SSHConfigHelper.add_cluster(
-            handle.cluster_name, handle.
+            handle.cluster_name, handle.cluster_name_on_cloud,
+            handle.cached_external_ips, auth_config,
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
@@ -3769,20 +3805,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             up=True,
             stream_logs=False)
 
-
-        mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
-                      f'touch {remote_log_path}')
+        mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
             # JOB_CMD_IDENTIFIER is used for identifying the process
             # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
-            f'{
+            f'{constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
+
+        # For Slurm, we need to wait for the job to complete before exiting,
+        # because Slurm's proctrack/cgroup kills all processes when the srun
+        # job step ends, including child processes launched as a separate
+        # process group.
+        # So this keeps srun alive so the job driver process that was spawned
+        # (and runs in the background) by job_lib.JobScheduler.schedule_step()
+        # does not get killed.
+        # Note: proctrack/cgroup is enabled by default on Nebius' Managed
+        # Soperator.
+        is_slurm = isinstance(handle.launched_resources.cloud, clouds.Slurm)
+        if is_slurm:
+            wait_code = job_lib.JobLibCodeGen.wait_for_job(job_id)
+            code = code + ' && ' + wait_code
+
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
         # Should also be ealier than is_command_length_over_limit
|
@@ -3867,10 +3916,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3867
3916
|
|
|
3868
3917
|
job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
|
|
3869
3918
|
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3919
|
+
# For Slurm, run in background so that SSH returns immediately.
|
|
3920
|
+
# This is needed because we add the wait_for_job code above which
|
|
3921
|
+
# makes the command block until the job completes.
|
|
3922
|
+
returncode, stdout, stderr = self.run_on_head(
|
|
3923
|
+
handle,
|
|
3924
|
+
job_submit_cmd,
|
|
3925
|
+
stream_logs=False,
|
|
3926
|
+
require_outputs=True,
|
|
3927
|
+
run_in_background=is_slurm)
|
|
3874
3928
|
# Happens when someone calls `sky exec` but remote is outdated for
|
|
3875
3929
|
# running a job. Necessitating calling `sky launch`.
|
|
3876
3930
|
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
|
@@ -3887,11 +3941,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3887
3941
|
_dump_code_to_file(codegen)
|
|
3888
3942
|
job_submit_cmd = f'{mkdir_code} && {code}'
|
|
3889
3943
|
job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
|
|
3944
|
+
# See comment above for why run_in_background=is_slurm.
|
|
3890
3945
|
returncode, stdout, stderr = self.run_on_head(
|
|
3891
3946
|
handle,
|
|
3892
3947
|
job_submit_cmd,
|
|
3893
3948
|
stream_logs=False,
|
|
3894
|
-
require_outputs=True
|
|
3949
|
+
require_outputs=True,
|
|
3950
|
+
run_in_background=is_slurm)
|
|
3895
3951
|
|
|
3896
3952
|
subprocess_utils.handle_returncode(
|
|
3897
3953
|
returncode,
|
|
@@ -4950,6 +5006,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4950
5006
|
ports_cleaned_up = True
|
|
4951
5007
|
except exceptions.PortDoesNotExistError:
|
|
4952
5008
|
logger.debug('Ports do not exist. Skipping cleanup.')
|
|
5009
|
+
ports_cleaned_up = True
|
|
4953
5010
|
except Exception as e: # pylint: disable=broad-except
|
|
4954
5011
|
if purge:
|
|
4955
5012
|
msg = common_utils.format_exception(e, use_bracket=True)
|
|
@@ -5022,11 +5079,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
5022
5079
|
config['provider'],
|
|
5023
5080
|
non_terminated_only=False)
|
|
5024
5081
|
|
|
5025
|
-
|
|
5082
|
+
unexpected_nodes = []
|
|
5026
5083
|
for node_id, node_status_tuple in node_status_dict.items():
|
|
5027
5084
|
node_status, reason = node_status_tuple
|
|
5028
|
-
|
|
5029
|
-
logger.debug(f'{node_id} status: {node_status}{
|
|
5085
|
+
reason_str = '' if reason is None else f' ({reason})'
|
|
5086
|
+
logger.debug(f'{node_id} status: {node_status}{reason_str}')
|
|
5030
5087
|
# FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
|
|
5031
5088
|
# between "stopping/stopped" and "terminating/terminated",
|
|
5032
5089
|
# so we allow for either status instead of casing on
|
|
@@ -5034,19 +5091,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
5034
5091
|
if node_status not in [
|
|
5035
5092
|
None, status_lib.ClusterStatus.STOPPED
|
|
5036
5093
|
]:
|
|
5037
|
-
|
|
5038
|
-
break
|
|
5094
|
+
unexpected_nodes.append((node_id, node_status, reason))
|
|
5039
5095
|
|
|
5040
|
-
if
|
|
5096
|
+
if not unexpected_nodes:
|
|
5041
5097
|
break
|
|
5042
5098
|
|
|
5043
5099
|
attempts += 1
|
|
5044
5100
|
if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
|
|
5045
5101
|
time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
|
|
5046
5102
|
else:
|
|
5047
|
-
|
|
5048
|
-
|
|
5049
|
-
|
|
5103
|
+
unexpected_nodes_str = '\n'.join([
|
|
5104
|
+
f' - {node_id}: {node_status}' +
|
|
5105
|
+
(f' ({reason})' if reason else '')
|
|
5106
|
+
for node_id, node_status, reason in unexpected_nodes
|
|
5107
|
+
])
|
|
5108
|
+
raise RuntimeError(f'Instances in unexpected state:\n'
|
|
5109
|
+
f'{unexpected_nodes_str}')
|
|
5050
5110
|
|
|
5051
5111
|
# If cluster_yaml is None, the cluster should ensured to be terminated,
|
|
5052
5112
|
# so we don't need to do the double check.
|
|
@@ -5333,6 +5393,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert handle is not None
         # Cluster already exists.
         self.check_resources_fit_cluster(handle, task)
+
         # Use the existing cluster.
         assert handle.launched_resources is not None, (cluster_name, handle)
         # Take a random resource in order to get resource info that applies
@@ -5384,27 +5445,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         for resource in task.resources:
             assert (resource.cluster_config_overrides ==
                     one_task_resource.cluster_config_overrides)
-
+
+        cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+            cluster_name)
+        cluster_yaml_obj = (yaml_utils.safe_load(cluster_yaml_str)
+                            if cluster_yaml_str is not None else None)
+
+        def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+            return (yaml_obj.get('available_node_types',
+                                 {}).get('ray_head_default',
+                                         {}).get('node_config', {}))
+
+        if isinstance(to_provision.cloud,
+                      clouds.Kubernetes) and cluster_yaml_obj is not None:
             # Warn users if the Kubernetes pod config is different
             # from the existing cluster.
-            cluster_yaml_str = global_user_state.get_cluster_yaml_str(
-                cluster_name)
-            actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
             desired_cluster_yaml_obj = (
                 kubernetes_utils.combine_pod_config_fields_and_metadata(
-
+                    cluster_yaml_obj,
                     cluster_config_overrides=one_task_resource.
                     cluster_config_overrides,
                     cloud=to_provision.cloud,
                     context=to_provision.region))
 
-            def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
-                return (yaml_obj.get('available_node_types',
-                                     {}).get('ray_head_default',
-                                             {}).get('node_config', {}))
-
             if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
-
+                    cluster_yaml_obj):
                 # pylint: disable=line-too-long
                 logger.warning(
                     f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
@@ -5415,6 +5480,101 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
                     f'{colorama.Style.RESET_ALL}')
 
+        # Check for volume mount warnings
+        if task.volume_mounts:
+            # Get existing cluster's volume mounts from cluster yaml
+            existing_volume_names = set()
+            try:
+                if cluster_yaml_obj is not None:
+                    # Extract volume names from existing cluster
+                    node_config = _get_pod_config(cluster_yaml_obj)
+
+                    if isinstance(to_provision.cloud, clouds.Kubernetes):
+                        # Check for K8s-style persistent volumes
+                        # (spec.volumes)
+                        # See sky/templates/kubernetes-ray.yml.j2.
+                        volumes = node_config.get('spec',
+                                                  {}).get('volumes', [])
+                        for vol in volumes:
+                            # Volume from PVC has structure:
+                            # - name: <volume_name>
+                            #   persistentVolumeClaim:
+                            #     claimName: <volume_name_on_cloud>
+                            if 'persistentVolumeClaim' in vol:
+                                pvc = vol.get('persistentVolumeClaim', {})
+                                # Use claimName (volume_name_on_cloud) to
+                                # be consistent with RunPod.
+                                vol_name_on_cloud = pvc.get('claimName')
+                                if vol_name_on_cloud:
+                                    existing_volume_names.add(
+                                        vol_name_on_cloud)
+
+                        # Check for K8s ephemeral volumes
+                        # See sky/templates/kubernetes-ray.yml.j2.
+                        provider_config = cluster_yaml_obj.get(
+                            'provider', {})
+                        ephemeral_specs = provider_config.get(
+                            'ephemeral_volume_specs', [])
+                        for spec in ephemeral_specs:
+                            # For ephemeral volumes, we check the mount
+                            # path.
+                            mount_path = spec.get('path')
+                            if mount_path:
+                                existing_volume_names.add(mount_path)
+
+                    elif isinstance(to_provision.cloud, clouds.RunPod):
+                        # Check for custom VolumeMounts config
+                        # (e.g. RunPod)
+                        # See sky/templates/runpod-ray.yml.j2.
+                        volume_mounts_config = node_config.get(
+                            'VolumeMounts', [])
+                        for vol_mount in volume_mounts_config:
+                            vol_name = vol_mount.get('VolumeNameOnCloud')
+                            if vol_name:
+                                existing_volume_names.add(vol_name)
+            except Exception as e:  # pylint: disable=broad-except
+                # If we can't get the existing volume mounts, log debug
+                # and skip the warning check
+                logger.debug(f'Failed to check existing volume mounts: {e}',
+                             exc_info=True)
+
+            # Check if task has new volumes not in existing cluster
+            new_ephemeral_volumes = []
+            new_persistent_volumes = []
+            for volume_mount in task.volume_mounts:
+                # Compare using volume_name for user-facing name
+                if volume_mount.is_ephemeral:
+                    if volume_mount.path not in existing_volume_names:
+                        new_ephemeral_volumes.append(volume_mount.path)
+                elif (volume_mount.volume_name not in existing_volume_names
+                      and volume_mount.volume_config.name_on_cloud
+                      not in existing_volume_names):
+                    new_persistent_volumes.append(volume_mount.volume_name)
+
+            if new_ephemeral_volumes or new_persistent_volumes:
+                msg_parts = []
+                if new_ephemeral_volumes:
+                    msg_parts.append(f'new ephemeral volume(s) with path '
+                                     f'{", ".join(new_ephemeral_volumes)}')
+                if new_persistent_volumes:
+                    msg_parts.append(
+                        f'new volume(s) {", ".join(new_persistent_volumes)}'
+                    )
+
+                volume_msg = ' and '.join(msg_parts)
+                # Capitalize the first letter of the message
+                volume_msg = volume_msg[0].upper() + volume_msg[1:]
+
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}WARNING: {volume_msg} '
+                    f'specified in task but not '
+                    f'mounted to existing cluster "{cluster_name}". '
+                    f'These volumes will not be mounted to the cluster. '
+                    f'To mount new volumes, either:\n'
+                    f' • Use a new cluster, or\n'
+                    f' • Terminate and recreate this cluster'
+                    f'{colorama.Style.RESET_ALL}')
+
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
             to_provision,
sky/backends/task_codegen.py
CHANGED

@@ -147,6 +147,7 @@ class TaskCodeGen:
            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
               [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
               [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+             FLUSH_START_TIME=$(date +%s)
              flushed=0
              # extra second on top of --vfs-cache-poll-interval to
              # avoid race condition between rclone log line creation and this check.
@@ -159,13 +160,32 @@ class TaskCodeGen:
                  exitcode=0
                  tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
                  if [ $exitcode -ne 0 ]; then
-
+                   ELAPSED=$(($(date +%s) - FLUSH_START_TIME))
+                   # Extract the last vfs cache status line to show what we're waiting for
+                   CACHE_STATUS=$(tac $file | grep "vfs cache: cleaned:" -m 1 | sed 's/.*vfs cache: cleaned: //' 2>/dev/null)
+                   # Extract currently uploading files from recent log lines (show up to 2 files)
+                   UPLOADING_FILES=$(tac $file | head -30 | grep -E "queuing for upload" | head -2 | sed 's/.*INFO : //' | sed 's/: vfs cache:.*//' | tr '\\n' ',' | sed 's/,$//' | sed 's/,/, /g' 2>/dev/null)
+                   # Build status message with available info
+                   if [ -n "$CACHE_STATUS" ] && [ -n "$UPLOADING_FILES" ]; then
+                     echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}] uploading: ${{UPLOADING_FILES}}"
+                   elif [ -n "$CACHE_STATUS" ]; then
+                     echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}]"
+                   else
+                     # Fallback: show last non-empty line from log
+                     LAST_LINE=$(tac $file | grep -v "^$" | head -1 | sed 's/.*INFO : //' | sed 's/.*ERROR : //' | sed 's/.*NOTICE: //' 2>/dev/null)
+                     if [ -n "$LAST_LINE" ]; then
+                       echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) ${{LAST_LINE}}"
+                     else
+                       echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s)"
+                     fi
+                   fi
                    flushed=0
                    break
                  fi
                done
              done
-
+             TOTAL_FLUSH_TIME=$(($(date +%s) - FLUSH_START_TIME))
+             echo "skypilot: cached mount upload complete (took ${{TOTAL_FLUSH_TIME}}s)"
            fi""")
 
     def add_prologue(self, job_id: int) -> None:
@@ -214,6 +234,9 @@ class TaskCodeGen:
         self._code += [
             textwrap.dedent(f"""\
                 if sum(returncodes) != 0:
+                    # Save exit codes to job metadata for potential recovery logic
+                    if int(constants.SKYLET_VERSION) >= 28:
+                        job_lib.set_exit_codes({self.job_id!r}, returncodes)
                     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                     # Schedule the next pending job immediately to make the job
                     # scheduling more efficient.
@@ -483,6 +506,8 @@ class RayCodeGen(TaskCodeGen):
                    msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
                    msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
                    print(msg, flush=True)
+                   if int(constants.SKYLET_VERSION) >= 28:
+                       job_lib.set_exit_codes({self.job_id!r}, setup_returncodes)
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                    # This waits for all streaming logs to finish.
                    time.sleep(1)
@@ -851,7 +876,18 @@ class SlurmCodeGen(TaskCodeGen):
        # $HOME/.local/bin/env (non-executable, from uv installation)
        # shadows /usr/bin/env.
        job_suffix = '-setup' if is_setup else ''
+       # Unset SLURM_* environment variables before running srun.
+       # When this srun runs inside another srun (from
+       # SlurmCommandRunner.run), inherited variables like
+       # SLURM_CPU_BIND, SLURM_NNODES, and SLURM_NODELIST constrain
+       # the inner srun to the parent step's allocation. This causes
+       # "CPU binding outside of job step allocation" errors.
+       # Unsetting all SLURM_* variables allows this srun to access the full job
+       # allocation. See:
+       # https://support.schedmd.com/show_bug.cgi?id=14298
+       # https://github.com/huggingface/datatrove/issues/248
        srun_cmd = (
+           "unset $(env | awk -F= '/^SLURM_/ {{print $1}}') && "
            f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
            f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
            f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
@@ -900,6 +936,8 @@ class SlurmCodeGen(TaskCodeGen):
                    msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
                    print(msg, flush=True)
                    returncodes = [returncode]
+                   if int(constants.SKYLET_VERSION) >= 28:
+                       job_lib.set_exit_codes({self.job_id!r}, returncodes)
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                    sys.exit(1)
                time.sleep(0.1)

sky/catalog/data_fetchers/fetch_gcp.py
CHANGED

@@ -189,6 +189,9 @@ SERIES_TO_DESCRIPTION = {
     'c2': 'Compute optimized',
     'c2d': 'C2D AMD Instance',
     'c3': 'C3 Instance',
+    'c3d': 'C3D Instance',
+    'c4': 'C4 Instance',
+    'c4d': 'C4D Instance',
     'e2': 'E2 Instance',
     'f1': 'Micro Instance with burstable CPU',
     'g1': 'Small Instance with 1 VCPU',
@@ -376,8 +379,13 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
             is_cpu = True
         elif resource_group == 'RAM':
             is_memory = True
+        elif resource_group == 'LocalSSD':
+            # Ignore local SSD pricing for now, as we do not include disk
+            # pricing for instances for now.
+            # TODO(zhwu): Handle local SSD pricing.
+            pass
         else:
-            assert resource_group == 'N1Standard'
+            assert resource_group == 'N1Standard', (resource_group, sku)
             if 'Core' in description:
                 is_cpu = True
             elif 'Ram' in description:

sky/catalog/data_fetchers/fetch_nebius.py
CHANGED

@@ -180,7 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
         presets (List[PresetInfo]): A list of PresetInfo objects to write.
         output_file (str): The path to the output CSV file.
     """
-    os.makedirs(os.path.dirname(output_file))
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
     # Set up the CSV writer to output to stdout
     with open(output_file, 'w', encoding='utf-8') as out:
         header = [

sky/catalog/data_fetchers/fetch_vast.py
CHANGED

@@ -50,7 +50,7 @@ if __name__ == '__main__':
              ('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
              ('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
              ('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
-             ('geolocation', 'Region'))
+             ('geolocation', 'Region'), ('hosting_type', 'HostingType'))
 
    # Vast has a wide variety of machines, some of
    # which will have less diskspace and network
@@ -138,7 +138,9 @@ if __name__ == '__main__':
 
    maxBid = max([x.get('SpotPrice') for x in toList])
    for instance in toList:
-
+       hosting_type = instance.get('HostingType', 0)
+       stub = (f'{instance["InstanceType"]} '
+               f'{instance["Region"][-2:]} {hosting_type}')
        if stub in seen:
            printstub = f'{stub}#print'
            if printstub not in seen:
sky/catalog/seeweb_catalog.py
CHANGED

@@ -7,22 +7,33 @@ query instance types and pricing information for Seeweb.
 import typing
 from typing import Dict, List, Optional, Tuple
 
-import
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 _PULL_FREQUENCY_HOURS = 8
-_df =
-
+_df = None
+
+
+def _get_df():
+    """Get the dataframe, loading it lazily if needed."""
+    global _df
+    if _df is None:
+        _df = common.read_catalog('seeweb/vms.csv',
+                                  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+    return _df
 
 
 def instance_type_exists(instance_type: str) -> bool:
-    result = common.instance_type_exists_impl(
+    result = common.instance_type_exists_impl(_get_df(), instance_type)
     return result
 
 
@@ -33,7 +44,7 @@ def validate_region_zone(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.validate_region_zone_impl('Seeweb',
+    result = common.validate_region_zone_impl('Seeweb', _get_df(), region, zone)
     return result
 
 
@@ -46,14 +57,15 @@ def get_hourly_cost(instance_type: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.get_hourly_cost_impl(
-                                         zone)
+    result = common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
+                                         region, zone)
     return result
 
 
 def get_vcpus_mem_from_instance_type(
         instance_type: str) -> Tuple[Optional[float], Optional[float]]:
-    result = common.get_vcpus_mem_from_instance_type_impl(
+    result = common.get_vcpus_mem_from_instance_type_impl(
+        _get_df(), instance_type)
     return result
 
 
@@ -64,7 +76,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
                               region: Optional[str] = None,
                               zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
-    result = common.get_instance_type_for_cpus_mem_impl(
+    result = common.get_instance_type_for_cpus_mem_impl(_get_df(), cpus, memory,
                                                         region, zone)
     return result
 
@@ -72,7 +84,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
 def get_accelerators_from_instance_type(
         instance_type: str) -> Optional[Dict[str, int]]:
     # Filter the dataframe for the specific instance type
-
+    df = _get_df()
+    df_filtered = df[df['InstanceType'] == instance_type]
     if df_filtered.empty:
         return None
 
@@ -114,7 +127,7 @@ def get_instance_type_for_accelerator(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.get_instance_type_for_accelerator_impl(df=
+    result = common.get_instance_type_for_accelerator_impl(df=_get_df(),
                                                            acc_name=acc_name,
                                                            acc_count=acc_count,
                                                            cpus=cpus,
@@ -126,7 +139,7 @@ def get_instance_type_for_accelerator(
 
 
 def regions() -> List['cloud.Region']:
-    result = common.get_region_zones(
+    result = common.get_region_zones(_get_df(), use_spot=False)
     return result
 
 
@@ -135,7 +148,8 @@ def get_region_zones_for_instance_type(instance_type: str,
                                        ) -> List['cloud.Region']:
     """Returns a list of regions for a given instance type."""
     # Filter the dataframe for the specific instance type
-
+    df = _get_df()
+    df_filtered = df[df['InstanceType'] == instance_type]
     if df_filtered.empty:
         return []
 
@@ -174,7 +188,8 @@ def list_accelerators(
         require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
     """Lists accelerators offered in Seeweb."""
     # Filter out rows with empty or null regions (indicating unavailability)
-
+    df = _get_df()
+    df_filtered = df.dropna(subset=['Region'])
     df_filtered = df_filtered[df_filtered['Region'].str.strip() != '']
 
     result = common.list_accelerators_impl('Seeweb', df_filtered, gpus_only,
sky/catalog/shadeform_catalog.py
CHANGED

@@ -7,12 +7,15 @@ and can be used to query instance types and pricing information for Shadeform.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
-import
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 # We'll use dynamic fetching, so no static CSV file to load
 _df = None
|