skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/backends/task_codegen.py
CHANGED
@@ -4,6 +4,7 @@ import copy
 import inspect
 import json
 import math
+import os
 import textwrap
 from typing import Dict, List, Optional, Tuple
 
@@ -146,6 +147,7 @@ class TaskCodeGen:
             if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
             [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
             [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+              FLUSH_START_TIME=$(date +%s)
              flushed=0
              # extra second on top of --vfs-cache-poll-interval to
              # avoid race condition between rclone log line creation and this check.
@@ -158,13 +160,32 @@ class TaskCodeGen:
                  exitcode=0
                  tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
                  if [ $exitcode -ne 0 ]; then
-
+                    ELAPSED=$(($(date +%s) - FLUSH_START_TIME))
+                    # Extract the last vfs cache status line to show what we're waiting for
+                    CACHE_STATUS=$(tac $file | grep "vfs cache: cleaned:" -m 1 | sed 's/.*vfs cache: cleaned: //' 2>/dev/null)
+                    # Extract currently uploading files from recent log lines (show up to 2 files)
+                    UPLOADING_FILES=$(tac $file | head -30 | grep -E "queuing for upload" | head -2 | sed 's/.*INFO : //' | sed 's/: vfs cache:.*//' | tr '\\n' ',' | sed 's/,$//' | sed 's/,/, /g' 2>/dev/null)
+                    # Build status message with available info
+                    if [ -n "$CACHE_STATUS" ] && [ -n "$UPLOADING_FILES" ]; then
+                      echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}] uploading: ${{UPLOADING_FILES}}"
+                    elif [ -n "$CACHE_STATUS" ]; then
+                      echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}]"
+                    else
+                      # Fallback: show last non-empty line from log
+                      LAST_LINE=$(tac $file | grep -v "^$" | head -1 | sed 's/.*INFO : //' | sed 's/.*ERROR : //' | sed 's/.*NOTICE: //' 2>/dev/null)
+                      if [ -n "$LAST_LINE" ]; then
+                        echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) ${{LAST_LINE}}"
+                      else
+                        echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s)"
+                      fi
+                    fi
                    flushed=0
                    break
                  fi
                done
              done
-
+              TOTAL_FLUSH_TIME=$(($(date +%s) - FLUSH_START_TIME))
+              echo "skypilot: cached mount upload complete (took ${{TOTAL_FLUSH_TIME}}s)"
            fi""")
 
     def add_prologue(self, job_id: int) -> None:
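The added shell logic above keys off rclone's "vfs cache: cleaned:" status line and, while waiting, reports the elapsed time plus whatever is still queued for upload. As a rough illustration of the same completion check in Python (this helper is not part of SkyPilot; only the log-line format is taken from the snippet above):

import re  # not strictly needed; shown for clarity that this is plain string matching

def cache_flushed(log_text: str) -> bool:
    # Scan from the end of the log, mirroring `tac $file | grep -m 1`.
    for line in reversed(log_text.splitlines()):
        if 'vfs cache: cleaned:' in line:
            # Flushed only when nothing is in use, queued, or uploading.
            return 'in use 0, to upload 0, uploading 0' in line
    return False  # No status line yet; keep waiting.

sample = 'INFO : vfs cache: cleaned: objects 3 (was 3) in use 0, to upload 0, uploading 0'
print(cache_flushed(sample))  # True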
@@ -181,8 +202,8 @@ class TaskCodeGen:
         resources_dict: Dict[str, float],
         stable_cluster_internal_ips: List[str],
         env_vars: Dict[str, str],
+        log_dir: str,
         setup_cmd: Optional[str] = None,
-        setup_log_path: Optional[str] = None,
     ) -> None:
         """Generates code to set up the task on each node.
 
@@ -213,6 +234,9 @@ class TaskCodeGen:
         self._code += [
             textwrap.dedent(f"""\
                 if sum(returncodes) != 0:
+                    # Save exit codes to job metadata for potential recovery logic
+                    if int(constants.SKYLET_VERSION) >= 28:
+                        job_lib.set_exit_codes({self.job_id!r}, returncodes)
                     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                     # Schedule the next pending job immediately to make the job
                     # scheduling more efficient.
@@ -379,13 +403,15 @@ class RayCodeGen(TaskCodeGen):
         resources_dict: Dict[str, float],
         stable_cluster_internal_ips: List[str],
         env_vars: Dict[str, str],
+        log_dir: str,
         setup_cmd: Optional[str] = None,
-        setup_log_path: Optional[str] = None,
     ) -> None:
         assert self._has_prologue, ('Call add_prologue() before '
                                     'add_setup().')
         self._has_setup = True
 
+        setup_log_path = os.path.join(log_dir, 'setup.log')
+
         bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
         # Set CPU to avoid ray hanging the resources allocation
         # for remote functions, since the task will request 1 CPU
@@ -480,6 +506,8 @@ class RayCodeGen(TaskCodeGen):
                     msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
                     msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
                     print(msg, flush=True)
+                    if int(constants.SKYLET_VERSION) >= 28:
+                        job_lib.set_exit_codes({self.job_id!r}, setup_returncodes)
                     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                     # This waits for all streaming logs to finish.
                     time.sleep(1)
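These hunks swap the caller-supplied setup_log_path parameter for a log_dir argument (the setup log is now derived as log_dir/setup.log) and record worker exit codes before marking the job FAILED or FAILED_SETUP, but only when the remote skylet is new enough to have job_lib.set_exit_codes. A minimal sketch of that compatibility gate; SKYLET_VERSION and the printed calls are stand-ins for the real constants and job_lib helpers:

import os

SKYLET_VERSION = '28'  # stand-in for sky.skylet.constants.SKYLET_VERSION

def on_setup_failure(job_id, returncodes, log_dir):
    # The setup log path is derived from log_dir rather than passed in.
    setup_log_path = os.path.join(log_dir, 'setup.log')
    print(f'setup log: {setup_log_path}')
    if int(SKYLET_VERSION) >= 28:
        # Older skylets lack set_exit_codes(); guard the call so a newer
        # driver still works against a cluster running an older skylet.
        print(f'set_exit_codes({job_id}, {returncodes})')  # job_lib stand-in
    print(f'set_status({job_id}, FAILED_SETUP)')  # job_lib stand-in

on_setup_failure(1, [137], '~/sky_logs/sky-2026-01-12')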
@@ -631,3 +659,351 @@ class RayCodeGen(TaskCodeGen):
         """Generates code that waits for all tasks, then exits."""
         self._code.append('returncodes, _ = get_or_fail(futures, pg)')
         super().add_epilogue()
+
+
+class SlurmCodeGen(TaskCodeGen):
+    """Code generator for task execution on Slurm using native srun."""
+
+    def __init__(self, slurm_job_id: str):
+        """Initialize SlurmCodeGen
+
+        Args:
+            slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+        """
+        super().__init__()
+        self._slurm_job_id = slurm_job_id
+
+    def add_prologue(self, job_id: int) -> None:
+        assert not self._has_prologue, 'add_prologue() called twice?'
+        self._has_prologue = True
+        self.job_id = job_id
+
+        self._add_common_imports()
+
+        self._code.append(
+            textwrap.dedent("""\
+                import colorama
+                import copy
+                import json
+                import multiprocessing
+                import signal
+                import threading
+                from sky.backends import backend_utils
+                """))
+        self._add_skylet_imports()
+
+        self._add_constants()
+
+        self._add_logging_functions()
+
+        self._code.append(
+            textwrap.dedent(f"""\
+                def _cancel_slurm_job_steps():
+                    slurm_job_id = {self._slurm_job_id!r}
+                    assert slurm_job_id is not None, 'SLURM_JOB_ID is not set'
+                    try:
+                        # Query steps for this job: squeue -s -j JOBID -h -o "%i %j"
+                        # Output format: "JOBID.STEPID STEPNAME"
+                        # TODO(kevin): This assumes that compute node is able
+                        # to run client commands against the controller.
+                        # Validate this assumption.
+                        result = subprocess.run(
+                            ['squeue', '-s', '-j', slurm_job_id, '-h', '-o', '%i %j'],
+                            capture_output=True, text=True, check=False)
+                        for line in result.stdout.strip().split('\\n'):
+                            if not line:
+                                continue
+                            parts = line.split()
+                            assert len(parts) >= 2, 'Expected at least 2 parts'
+                            step_id, step_name = parts[0], parts[1]
+                            if step_name == f'sky-{self.job_id}':
+                                subprocess.run(['scancel', step_id],
+                                               check=False, capture_output=True)
+                    except Exception as e:
+                        print(f'Error in _cancel_slurm_job_steps: {{e}}', flush=True)
+                        pass
+
+                def _slurm_cleanup_handler(signum, _frame):
+                    _cancel_slurm_job_steps()
+                    # Re-raise to let default handler terminate.
+                    signal.signal(signum, signal.SIG_DFL)
+                    os.kill(os.getpid(), signum)
+
+                signal.signal(signal.SIGTERM, _slurm_cleanup_handler)
+                """))
+
+        self._code += [
+            'autostop_lib.set_last_active_time_to_now()',
+            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+        ]
+
+        self._setup_cmd: Optional[str] = None
+        self._setup_envs: Optional[Dict[str, str]] = None
+        self._setup_log_dir: Optional[str] = None
+        self._setup_num_nodes: Optional[int] = None
+
+    def add_setup(
+        self,
+        num_nodes: int,
+        resources_dict: Dict[str, float],
+        stable_cluster_internal_ips: List[str],
+        env_vars: Dict[str, str],
+        log_dir: str,
+        setup_cmd: Optional[str] = None,
+    ) -> None:
+        assert self._has_prologue, ('Call add_prologue() before add_setup().')
+        self._has_setup = True
+        self._cluster_num_nodes = len(stable_cluster_internal_ips)
+        self._stable_cluster_ips = stable_cluster_internal_ips
+
+        self._add_waiting_for_resources_msg(num_nodes)
+
+        # Store setup information for use in add_task().
+        if setup_cmd is not None:
+            setup_envs = env_vars.copy()
+            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+            self._setup_cmd = setup_cmd
+            self._setup_envs = setup_envs
+            self._setup_log_dir = log_dir
+            self._setup_num_nodes = num_nodes
+
+    def add_task(
+        self,
+        num_nodes: int,
+        bash_script: Optional[str],
+        task_name: Optional[str],
+        resources_dict: Dict[str, float],
+        log_dir: str,
+        env_vars: Optional[Dict[str, str]] = None,
+    ) -> None:
+        """Generates code for invoking a bash command
+        using srun within sbatch allocation.
+        """
+        assert self._has_setup, 'Call add_setup() before add_task().'
+        env_vars = env_vars or {}
+        task_name = task_name if task_name is not None else 'task'
+
+        acc_name, acc_count = self._get_accelerator_details(resources_dict)
+        num_gpus = 0
+        if (acc_name is not None and
+                not accelerator_registry.is_schedulable_non_gpu_accelerator(
+                    acc_name)):
+            num_gpus = int(math.ceil(acc_count))
+
+        # Slurm does not support fractional CPUs.
+        task_cpu_demand = int(math.ceil(resources_dict.pop('CPU')))
+
+        sky_env_vars_dict_str = [
+            textwrap.dedent(f"""\
+                sky_env_vars_dict = {{}}
+                sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+                """)
+        ]
+
+        if env_vars:
+            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+                                         for k, v in env_vars.items())
+        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+        rclone_flush_script = self._get_rclone_flush_script()
+        streaming_msg = self._get_job_started_msg()
+        has_setup_cmd = self._setup_cmd is not None
+
+        self._code += [
+            sky_env_vars_dict_str,
+            textwrap.dedent(f"""\
+                script = {bash_script!r}
+                if script is None:
+                    script = ''
+                rclone_flush_script = {rclone_flush_script!r}
+
+                if script or {has_setup_cmd!r}:
+                    script += rclone_flush_script
+                    sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {num_gpus}
+
+                    # Signal files for setup/run synchronization:
+                    # 1. alloc_signal_file: srun has acquired allocation
+                    # 2. setup_done_signal_file: Driver has finished setup, run can proceed
+                    #
+                    # Signal files are stored in home directory, which is
+                    # assumed to be on a shared NFS mount accessible by all nodes.
+                    # To support clusters with non-NFS home directories, we would
+                    # need to let users specify an NFS-backed "working directory"
+                    # or use a different coordination mechanism.
+                    alloc_signal_file = f'~/.sky_alloc_{self._slurm_job_id}_{self.job_id}'
+                    alloc_signal_file = os.path.expanduser(alloc_signal_file)
+                    setup_done_signal_file = f'~/.sky_setup_done_{self._slurm_job_id}_{self.job_id}'
+                    setup_done_signal_file = os.path.expanduser(setup_done_signal_file)
+
+                    # Start exclusive srun in a thread to reserve allocation (similar to ray.get(pg.ready()))
+                    gpu_arg = f'--gpus-per-node={num_gpus}' if {num_gpus} > 0 else ''
+
+                    def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
+                                              task_name=None, is_setup=False,
+                                              alloc_signal=None, setup_done_signal=None):
+                        env_vars_json = json.dumps(env_vars_dict)
+
+                        log_dir = shlex.quote(log_dir)
+                        env_vars = shlex.quote(env_vars_json)
+                        cluster_ips = shlex.quote(",".join({self._stable_cluster_ips!r}))
+
+                        runner_args = f'--log-dir={{log_dir}} --env-vars={{env_vars}} --cluster-num-nodes={self._cluster_num_nodes} --cluster-ips={{cluster_ips}}'
+
+                        if task_name is not None:
+                            runner_args += f' --task-name={{shlex.quote(task_name)}}'
+
+                        if is_setup:
+                            runner_args += ' --is-setup'
+
+                        if alloc_signal is not None:
+                            runner_args += f' --alloc-signal-file={{shlex.quote(alloc_signal)}}'
+
+                        if setup_done_signal is not None:
+                            runner_args += f' --setup-done-signal-file={{shlex.quote(setup_done_signal)}}'
+
+                        script_path = None
+                        prefix = 'sky_setup_' if is_setup else 'sky_task_'
+                        if backend_utils.is_command_length_over_limit(user_script):
+                            with tempfile.NamedTemporaryFile('w', prefix=prefix, suffix='.sh', delete=False) as f:
+                                f.write(user_script)
+                                script_path = f.name
+                            runner_args += f' --script-path={{shlex.quote(script_path)}}'
+                        else:
+                            runner_args += f' --script={{shlex.quote(user_script)}}'
+
+                        # Use /usr/bin/env explicitly to work around a Slurm quirk where
+                        # srun's execvp() doesn't check execute permissions, failing when
+                        # $HOME/.local/bin/env (non-executable, from uv installation)
+                        # shadows /usr/bin/env.
+                        job_suffix = '-setup' if is_setup else ''
+                        # Unset SLURM_* environment variables before running srun.
+                        # When this srun runs inside another srun (from
+                        # SlurmCommandRunner.run), inherited variables like
+                        # SLURM_CPU_BIND, SLURM_NNODES, and SLURM_NODELIST constrain
+                        # the inner srun to the parent step's allocation. This causes
+                        # "CPU binding outside of job step allocation" errors.
+                        # Unsetting all SLURM_* variables allows this srun to access the full job
+                        # allocation. See:
+                        # https://support.schedmd.com/show_bug.cgi?id=14298
+                        # https://github.com/huggingface/datatrove/issues/248
+                        srun_cmd = (
+                            "unset $(env | awk -F= '/^SLURM_/ {{print $1}}') && "
+                            f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
+                            f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
+                            f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+                        )
+                        return srun_cmd, script_path
+
+                    def run_thread_func():
+                        # This blocks until Slurm allocates resources (--exclusive)
+                        # --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
+                        run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
+                        srun_cmd, task_script_path = build_task_runner_cmd(
+                            script, run_flags, {log_dir!r}, sky_env_vars_dict,
+                            task_name={task_name!r},
+                            alloc_signal=alloc_signal_file,
+                            setup_done_signal=setup_done_signal_file
+                        )
+
+                        proc = subprocess.Popen(srun_cmd, shell=True,
+                                                stdout=subprocess.PIPE,
+                                                stderr=subprocess.STDOUT,
+                                                text=True)
+                        for line in proc.stdout:
+                            print(line, end='', flush=True)
+                        proc.wait()
+
+                        if task_script_path is not None:
+                            os.remove(task_script_path)
+                        return {{'return_code': proc.returncode, 'pid': proc.pid}}
+
+                    run_thread_result = {{'result': None}}
+                    def run_thread_wrapper():
+                        run_thread_result['result'] = run_thread_func()
+
+                    run_thread = threading.Thread(target=run_thread_wrapper)
+                    run_thread.start()
+
+                    # Wait for allocation signal from inside srun
+                    while not os.path.exists(alloc_signal_file):
+                        if not run_thread.is_alive():
+                            # srun failed before creating the signal file.
+                            run_thread.join()
+                            result = run_thread_result['result']
+                            returncode = int(result.get('return_code', 1))
+                            pid = result.get('pid', os.getpid())
+                            msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{returncode}} (pid={{pid}}).'
+                            msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+                            print(msg, flush=True)
+                            returncodes = [returncode]
+                            if int(constants.SKYLET_VERSION) >= 28:
+                                job_lib.set_exit_codes({self.job_id!r}, returncodes)
+                            job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+                            sys.exit(1)
+                        time.sleep(0.1)
+
+                    print({streaming_msg!r}, flush=True)
+
+                    if {has_setup_cmd!r}:
+                        job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+                        # The schedule_step should be called after the job status is set to
+                        # non-PENDING, otherwise, the scheduler will think the current job
+                        # is not submitted yet, and skip the scheduling step.
+                        job_lib.scheduler.schedule_step()
+
+                        # --overlap as we have already secured allocation with the srun for the run section,
+                        # and otherwise this srun would get blocked and deadlock.
+                        setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
+                        setup_srun, setup_script_path = build_task_runner_cmd(
+                            {self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
+                            is_setup=True
+                        )
+
+                        # Run setup srun directly, streaming output to driver stdout
+                        setup_proc = subprocess.Popen(setup_srun, shell=True,
+                                                      stdout=subprocess.PIPE,
+                                                      stderr=subprocess.STDOUT,
+                                                      text=True)
+                        for line in setup_proc.stdout:
+                            print(line, end='', flush=True)
+                        setup_proc.wait()
+
+                        if setup_script_path is not None:
+                            os.remove(setup_script_path)
+
+                        setup_returncode = setup_proc.returncode
+                        if setup_returncode != 0:
+                            setup_pid = setup_proc.pid
+                            msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{setup_returncode}} (pid={{setup_pid}}).'
+                            msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+                            print(msg, flush=True)
+                            job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+                            # Cancel the srun spawned by run_thread_func.
+                            _cancel_slurm_job_steps()
+                            sys.exit(1)
+
+                    job_lib.set_job_started({self.job_id!r})
+                    if not {has_setup_cmd!r}:
+                        # Need to call schedule_step() to make sure the scheduler
+                        # schedule the next pending job.
+                        job_lib.scheduler.schedule_step()
+
+                    # Signal run thread to proceed.
+                    pathlib.Path(setup_done_signal_file).touch()
+
+                    # Wait for run thread to complete.
+                    run_thread.join()
+                    result = run_thread_result['result']
+
+                    # Cleanup signal files
+                    if os.path.exists(alloc_signal_file):
+                        os.remove(alloc_signal_file)
+                    if os.path.exists(setup_done_signal_file):
+                        os.remove(setup_done_signal_file)
+
+                    returncodes = [int(result.get('return_code', 1))]
+                else:
+                    returncodes = [0]
+                """),
+        ]
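The generated Slurm driver coordinates the exclusive srun (which reserves the allocation and runs the task) with the setup step through two files on the assumed-shared home directory: judging from the flags passed to the executor, the run step signals an "allocation ready" file once srun has its nodes, and the driver touches a "setup done" file once setup succeeds. A stripped-down, standalone sketch of that handshake on the driver side (the paths and the bash command below are placeholders, not SkyPilot's real executor; it only mimics the control flow on a POSIX system):

import os
import pathlib
import subprocess
import tempfile
import threading
import time

# Placeholder signal paths; the real driver derives them from the Slurm job id
# and the SkyPilot job id under the shared home directory.
alloc_signal = os.path.join(tempfile.gettempdir(), 'alloc_ready')
setup_done_signal = os.path.join(tempfile.gettempdir(), 'setup_done')
result = {'returncode': None}

def run_task():
    # Stand-in for the exclusive srun: touch the alloc file from inside the
    # allocation, then block on the setup-done file before running the task.
    proc = subprocess.Popen(
        ['bash', '-c',
         f'touch {alloc_signal}; '
         f'while [ ! -f {setup_done_signal} ]; do sleep 0.1; done; '
         'echo task runs here'],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    for line in proc.stdout:
        print(line, end='', flush=True)
    result['returncode'] = proc.wait()

thread = threading.Thread(target=run_task)
thread.start()

# Driver: wait until the allocation is actually held before doing setup work.
while not os.path.exists(alloc_signal):
    time.sleep(0.1)

print('setup runs here (an --overlap srun in the real driver)')
pathlib.Path(setup_done_signal).touch()  # unblock the task

thread.join()
os.remove(alloc_signal)
os.remove(setup_done_signal)
print('task return code:', result['returncode'])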
sky/catalog/__init__.py
CHANGED
@@ -127,12 +127,9 @@ def list_accelerator_realtime(
     case_sensitive: bool = True,
 ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
     """Lists all accelerators offered by Sky with their realtime availability.
-
     Realtime availability is the total number of accelerators in the cluster
     and number of accelerators available at the time of the call.
-
     Used for fixed size cluster settings, such as Kubernetes.
-
     Returns:
         A tuple of three dictionaries mapping canonical accelerator names to:
         - A list of available counts. (e.g., [1, 2, 4])
sky/catalog/data_fetchers/fetch_gcp.py
CHANGED
@@ -189,6 +189,9 @@ SERIES_TO_DESCRIPTION = {
     'c2': 'Compute optimized',
     'c2d': 'C2D AMD Instance',
     'c3': 'C3 Instance',
+    'c3d': 'C3D Instance',
+    'c4': 'C4 Instance',
+    'c4d': 'C4D Instance',
     'e2': 'E2 Instance',
     'f1': 'Micro Instance with burstable CPU',
     'g1': 'Small Instance with 1 VCPU',
@@ -376,8 +379,13 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
             is_cpu = True
         elif resource_group == 'RAM':
             is_memory = True
+        elif resource_group == 'LocalSSD':
+            # Ignore local SSD pricing for now, as we do not include disk
+            # pricing for instances for now.
+            # TODO(zhwu): Handle local SSD pricing.
+            pass
         else:
-            assert resource_group == 'N1Standard'
+            assert resource_group == 'N1Standard', (resource_group, sku)
             if 'Core' in description:
                 is_cpu = True
             elif 'Ram' in description:
sky/catalog/data_fetchers/fetch_nebius.py
CHANGED
@@ -180,7 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
         presets (List[PresetInfo]): A list of PresetInfo objects to write.
         output_file (str): The path to the output CSV file.
     """
-    os.makedirs(os.path.dirname(output_file))
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
     # Set up the CSV writer to output to stdout
     with open(output_file, 'w', encoding='utf-8') as out:
         header = [
sky/catalog/data_fetchers/fetch_vast.py
CHANGED
@@ -50,7 +50,7 @@ if __name__ == '__main__':
             ('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
             ('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
             ('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
-            ('geolocation', 'Region'))
+            ('geolocation', 'Region'), ('hosting_type', 'HostingType'))
 
     # Vast has a wide variety of machines, some of
     # which will have less diskspace and network
@@ -138,7 +138,9 @@ if __name__ == '__main__':
 
     maxBid = max([x.get('SpotPrice') for x in toList])
     for instance in toList:
-
+        hosting_type = instance.get('HostingType', 0)
+        stub = (f'{instance["InstanceType"]} '
+                f'{instance["Region"][-2:]} {hosting_type}')
         if stub in seen:
             printstub = f'{stub}#print'
             if printstub not in seen:
sky/catalog/kubernetes_catalog.py
CHANGED
@@ -204,6 +204,9 @@ def _list_accelerators(
     min_quantity_filter = quantity_filter if quantity_filter else 1
 
     for node in nodes:
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
         for key in keys:
             if key in node.metadata.labels:
                 accelerator_name = lf.get_accelerator_from_label_value(
@@ -260,6 +263,15 @@ def _list_accelerators(
                 total_accelerators_capacity[
                     accelerator_name] += quantized_count
 
+                # Initialize the total_accelerators_available to make sure the
+                # key exists in the dictionary.
+                total_accelerators_available[accelerator_name] = (
+                    total_accelerators_available.get(accelerator_name, 0))
+
+                # Skip availability counting for not-ready nodes
+                if not node_is_ready:
+                    continue
+
                 if error_on_get_allocated_gpu_qty_by_node:
                     # If we can't get the allocated GPU quantity by each node,
                     # we can't get the GPU usage.
@@ -268,10 +280,6 @@ def _list_accelerators(
 
                 allocated_qty = allocated_qty_by_node[node.metadata.name]
                 accelerators_available = accelerator_count - allocated_qty
-                # Initialize the total_accelerators_available to make sure the
-                # key exists in the dictionary.
-                total_accelerators_available[accelerator_name] = (
-                    total_accelerators_available.get(accelerator_name, 0))
 
                 if accelerators_available >= min_quantity_filter:
                     quantized_availability = min_quantity_filter * (
sky/catalog/seeweb_catalog.py
CHANGED
@@ -7,22 +7,33 @@ query instance types and pricing information for Seeweb.
 import typing
 from typing import Dict, List, Optional, Tuple
 
-import
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 _PULL_FREQUENCY_HOURS = 8
-_df =
-
+
+_df = None
+
+
+def _get_df():
+    """Get the dataframe, loading it lazily if needed."""
+    global _df
+    if _df is None:
+        _df = common.read_catalog('seeweb/vms.csv',
+                                  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+    return _df
 
 
 def instance_type_exists(instance_type: str) -> bool:
-    result = common.instance_type_exists_impl(
+    result = common.instance_type_exists_impl(_get_df(), instance_type)
     return result
 
 
@@ -33,7 +44,7 @@ def validate_region_zone(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.validate_region_zone_impl('Seeweb',
+    result = common.validate_region_zone_impl('Seeweb', _get_df(), region, zone)
     return result
 
 
@@ -46,14 +57,15 @@ def get_hourly_cost(instance_type: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.get_hourly_cost_impl(
-        zone)
+    result = common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
+                                         region, zone)
     return result
 
 
 def get_vcpus_mem_from_instance_type(
         instance_type: str) -> Tuple[Optional[float], Optional[float]]:
-    result = common.get_vcpus_mem_from_instance_type_impl(
+    result = common.get_vcpus_mem_from_instance_type_impl(
+        _get_df(), instance_type)
     return result
 
 
@@ -64,7 +76,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
                               region: Optional[str] = None,
                               zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
-    result = common.get_instance_type_for_cpus_mem_impl(
+    result = common.get_instance_type_for_cpus_mem_impl(_get_df(), cpus, memory,
                                                         region, zone)
     return result
 
@@ -72,7 +84,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
 def get_accelerators_from_instance_type(
         instance_type: str) -> Optional[Dict[str, int]]:
     # Filter the dataframe for the specific instance type
-
+    df = _get_df()
+    df_filtered = df[df['InstanceType'] == instance_type]
     if df_filtered.empty:
         return None
 
@@ -114,7 +127,7 @@ def get_instance_type_for_accelerator(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.get_instance_type_for_accelerator_impl(df=
+    result = common.get_instance_type_for_accelerator_impl(df=_get_df(),
                                                            acc_name=acc_name,
                                                            acc_count=acc_count,
                                                            cpus=cpus,
@@ -126,7 +139,7 @@ def get_instance_type_for_accelerator(
 
 
 def regions() -> List['cloud.Region']:
-    result = common.get_region_zones(
+    result = common.get_region_zones(_get_df(), use_spot=False)
    return result
 
 
@@ -135,7 +148,8 @@ def get_region_zones_for_instance_type(instance_type: str,
                                        ) -> List['cloud.Region']:
     """Returns a list of regions for a given instance type."""
     # Filter the dataframe for the specific instance type
-
+    df = _get_df()
+    df_filtered = df[df['InstanceType'] == instance_type]
     if df_filtered.empty:
         return []
 
@@ -174,7 +188,8 @@ def list_accelerators(
         require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
     """Lists accelerators offered in Seeweb."""
     # Filter out rows with empty or null regions (indicating unavailability)
-
+    df = _get_df()
+    df_filtered = df.dropna(subset=['Region'])
     df_filtered = df_filtered[df_filtered['Region'].str.strip() != '']
 
     result = common.list_accelerators_impl('Seeweb', df_filtered, gpus_only,