skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/clouds/do.py
CHANGED
@@ -14,6 +14,7 @@ from sky.utils import resources_utils
|
|
14
14
|
|
15
15
|
if typing.TYPE_CHECKING:
|
16
16
|
from sky import resources as resources_lib
|
17
|
+
from sky.volumes import volume as volume_lib
|
17
18
|
|
18
19
|
_CREDENTIAL_FILE = 'config.yaml'
|
19
20
|
|
@@ -175,13 +176,15 @@ class DO(clouds.Cloud):
|
|
175
176
|
return None
|
176
177
|
|
177
178
|
def make_deploy_resources_variables(
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
179
|
+
self,
|
180
|
+
resources: 'resources_lib.Resources',
|
181
|
+
cluster_name: resources_utils.ClusterName,
|
182
|
+
region: 'clouds.Region',
|
183
|
+
zones: Optional[List['clouds.Zone']],
|
184
|
+
num_nodes: int,
|
185
|
+
dryrun: bool = False,
|
186
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
187
|
+
) -> Dict[str, Optional[str]]:
|
185
188
|
del zones, dryrun, cluster_name
|
186
189
|
|
187
190
|
resources = resources.assert_launchable()
|
sky/clouds/fluidstack.py
CHANGED
@@ -21,6 +21,7 @@ if typing.TYPE_CHECKING:
|
|
21
21
|
|
22
22
|
# Renaming to avoid shadowing variables.
|
23
23
|
from sky import resources as resources_lib
|
24
|
+
from sky.volumes import volume as volume_lib
|
24
25
|
else:
|
25
26
|
requests = adaptors_common.LazyImport('requests')
|
26
27
|
|
@@ -188,6 +189,7 @@ class Fluidstack(clouds.Cloud):
|
|
188
189
|
zones: Optional[List[clouds.Zone]],
|
189
190
|
num_nodes: int,
|
190
191
|
dryrun: bool = False,
|
192
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
191
193
|
) -> Dict[str, Optional[str]]:
|
192
194
|
|
193
195
|
assert zones is None, 'FluidStack does not support zones.'
|
sky/clouds/gcp.py
CHANGED
@@ -29,6 +29,7 @@ from sky.utils import ux_utils
|
|
29
29
|
if typing.TYPE_CHECKING:
|
30
30
|
from sky import resources
|
31
31
|
from sky.utils import status_lib
|
32
|
+
from sky.volumes import volume as volume_lib
|
32
33
|
|
33
34
|
logger = sky_logging.init_logger(__name__)
|
34
35
|
|
@@ -465,13 +466,15 @@ class GCP(clouds.Cloud):
|
|
465
466
|
assert False, 'Low disk tier should always be supported on GCP.'
|
466
467
|
|
467
468
|
def make_deploy_resources_variables(
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
469
|
+
self,
|
470
|
+
resources: 'resources.Resources',
|
471
|
+
cluster_name: resources_utils.ClusterName,
|
472
|
+
region: 'clouds.Region',
|
473
|
+
zones: Optional[List['clouds.Zone']],
|
474
|
+
num_nodes: int,
|
475
|
+
dryrun: bool = False,
|
476
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
477
|
+
) -> Dict[str, Optional[str]]:
|
475
478
|
assert zones is not None, (region, zones)
|
476
479
|
|
477
480
|
region_name = region.name
|
sky/clouds/hyperbolic.py
CHANGED
@@ -13,6 +13,7 @@ from sky.utils.resources_utils import DiskTier
|
|
13
13
|
|
14
14
|
if typing.TYPE_CHECKING:
|
15
15
|
from sky import resources as resources_lib
|
16
|
+
from sky.volumes import volume as volume_lib
|
16
17
|
|
17
18
|
|
18
19
|
@registry.CLOUD_REGISTRY.register
|
@@ -244,13 +245,15 @@ class Hyperbolic(clouds.Cloud):
|
|
244
245
|
return 0.0
|
245
246
|
|
246
247
|
def make_deploy_resources_variables(
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
248
|
+
self,
|
249
|
+
resources: 'resources_lib.Resources',
|
250
|
+
cluster_name: resources_utils.ClusterName,
|
251
|
+
region: 'clouds.Region',
|
252
|
+
zones: Optional[List['clouds.Zone']],
|
253
|
+
num_nodes: int,
|
254
|
+
dryrun: bool = False,
|
255
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
256
|
+
) -> Dict[str, Any]:
|
254
257
|
"""Returns a dict of variables for the deployment template."""
|
255
258
|
del dryrun, region, cluster_name # unused
|
256
259
|
assert zones is None, ('Hyperbolic does not support zones', zones)
|
sky/clouds/ibm.py
CHANGED
@@ -18,6 +18,7 @@ from sky.utils import ux_utils
|
|
18
18
|
if typing.TYPE_CHECKING:
|
19
19
|
# renaming to avoid shadowing variables
|
20
20
|
from sky import resources as resources_lib
|
21
|
+
from sky.volumes import volume as volume_lib
|
21
22
|
|
22
23
|
logger = sky_logging.init_logger(__name__)
|
23
24
|
|
@@ -175,6 +176,7 @@ class IBM(clouds.Cloud):
|
|
175
176
|
zones: Optional[List['clouds.Zone']],
|
176
177
|
num_nodes: int,
|
177
178
|
dryrun: bool = False,
|
179
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
178
180
|
) -> Dict[str, Any]:
|
179
181
|
"""Converts planned sky.Resources to cloud-specific resource variables.
|
180
182
|
|
sky/clouds/kubernetes.py
CHANGED
@@ -25,6 +25,7 @@ from sky.utils import common_utils
|
|
25
25
|
from sky.utils import registry
|
26
26
|
from sky.utils import resources_utils
|
27
27
|
from sky.utils import schemas
|
28
|
+
from sky.volumes import volume as volume_lib
|
28
29
|
|
29
30
|
if typing.TYPE_CHECKING:
|
30
31
|
# Renaming to avoid shadowing variables.
|
@@ -394,7 +395,9 @@ class Kubernetes(clouds.Cloud):
|
|
394
395
|
return 0
|
395
396
|
|
396
397
|
@staticmethod
|
397
|
-
def _calculate_provision_timeout(
|
398
|
+
def _calculate_provision_timeout(
|
399
|
+
num_nodes: int,
|
400
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']]) -> int:
|
398
401
|
"""Calculate provision timeout based on number of nodes.
|
399
402
|
|
400
403
|
The timeout scales linearly with the number of nodes to account for
|
@@ -409,19 +412,33 @@ class Kubernetes(clouds.Cloud):
|
|
409
412
|
base_timeout = 10 # Base timeout for single node
|
410
413
|
per_node_timeout = 0.2 # Additional seconds per node
|
411
414
|
max_timeout = 60 # Cap at 1 minute
|
415
|
+
if volume_mounts is not None:
|
416
|
+
for volume_mount in volume_mounts:
|
417
|
+
if (volume_mount.volume_config.type ==
|
418
|
+
volume_lib.VolumeType.PVC.value):
|
419
|
+
if (volume_mount.volume_config.config.get(
|
420
|
+
'access_mode', '') ==
|
421
|
+
volume_lib.VolumeAccessMode.READ_WRITE_MANY.value):
|
422
|
+
# GKE may take several minutes to provision a PV
|
423
|
+
# supporting READ_WRITE_MANY with filestore.
|
424
|
+
base_timeout = 180
|
425
|
+
max_timeout = 240
|
426
|
+
break
|
412
427
|
|
413
428
|
return int(
|
414
429
|
min(base_timeout + (per_node_timeout * (num_nodes - 1)),
|
415
430
|
max_timeout))
|
416
431
|
|
417
432
|
def make_deploy_resources_variables(
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
433
|
+
self,
|
434
|
+
resources: 'resources_lib.Resources',
|
435
|
+
cluster_name: 'resources_utils.ClusterName',
|
436
|
+
region: Optional['clouds.Region'],
|
437
|
+
zones: Optional[List['clouds.Zone']],
|
438
|
+
num_nodes: int,
|
439
|
+
dryrun: bool = False,
|
440
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
441
|
+
) -> Dict[str, Optional[str]]:
|
425
442
|
del cluster_name, zones, dryrun # Unused.
|
426
443
|
if region is None:
|
427
444
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
@@ -562,7 +579,7 @@ class Kubernetes(clouds.Cloud):
|
|
562
579
|
# We use a linear scaling formula to determine the timeout based on the
|
563
580
|
# number of nodes.
|
564
581
|
|
565
|
-
timeout = self._calculate_provision_timeout(num_nodes)
|
582
|
+
timeout = self._calculate_provision_timeout(num_nodes, volume_mounts)
|
566
583
|
timeout = skypilot_config.get_nested(
|
567
584
|
('kubernetes', 'provision_timeout'),
|
568
585
|
timeout,
|
@@ -653,6 +670,7 @@ class Kubernetes(clouds.Cloud):
|
|
653
670
|
(constants.PERSISTENT_RUN_SCRIPT_DIR),
|
654
671
|
'k8s_high_availability_restarting_signal_file':
|
655
672
|
(constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE),
|
673
|
+
'ha_recovery_log_path': constants.HA_PERSISTENT_RECOVERY_LOG_PATH,
|
656
674
|
'sky_python_cmd': constants.SKY_PYTHON_CMD,
|
657
675
|
'k8s_high_availability_storage_class_name':
|
658
676
|
(k8s_ha_storage_class_name),
|
sky/clouds/lambda_cloud.py
CHANGED
@@ -15,6 +15,7 @@ if typing.TYPE_CHECKING:
|
|
15
15
|
|
16
16
|
# Renaming to avoid shadowing variables.
|
17
17
|
from sky import resources as resources_lib
|
18
|
+
from sky.volumes import volume as volume_lib
|
18
19
|
else:
|
19
20
|
requests = adaptors_common.LazyImport('requests')
|
20
21
|
|
@@ -159,13 +160,15 @@ class Lambda(clouds.Cloud):
|
|
159
160
|
return None
|
160
161
|
|
161
162
|
def make_deploy_resources_variables(
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
163
|
+
self,
|
164
|
+
resources: 'resources_lib.Resources',
|
165
|
+
cluster_name: 'resources_utils.ClusterName',
|
166
|
+
region: 'clouds.Region',
|
167
|
+
zones: Optional[List['clouds.Zone']],
|
168
|
+
num_nodes: int,
|
169
|
+
dryrun: bool = False,
|
170
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
171
|
+
) -> Dict[str, Any]:
|
169
172
|
del cluster_name, dryrun # Unused.
|
170
173
|
assert zones is None, 'Lambda does not support zones.'
|
171
174
|
resources = resources.assert_launchable()
|
sky/clouds/nebius.py
CHANGED
@@ -16,6 +16,7 @@ from sky.utils import resources_utils
|
|
16
16
|
|
17
17
|
if typing.TYPE_CHECKING:
|
18
18
|
from sky import resources as resources_lib
|
19
|
+
from sky.volumes import volume as volume_lib
|
19
20
|
|
20
21
|
_INDENT_PREFIX = ' '
|
21
22
|
|
@@ -196,13 +197,15 @@ class Nebius(clouds.Cloud):
|
|
196
197
|
return None
|
197
198
|
|
198
199
|
def make_deploy_resources_variables(
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
200
|
+
self,
|
201
|
+
resources: 'resources_lib.Resources',
|
202
|
+
cluster_name: resources_utils.ClusterName,
|
203
|
+
region: 'clouds.Region',
|
204
|
+
zones: Optional[List['clouds.Zone']],
|
205
|
+
num_nodes: int,
|
206
|
+
dryrun: bool = False,
|
207
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
208
|
+
) -> Dict[str, Any]:
|
206
209
|
del dryrun, cluster_name
|
207
210
|
assert zones is None, ('Nebius does not support zones', zones)
|
208
211
|
|
sky/clouds/oci.py
CHANGED
@@ -40,6 +40,7 @@ from sky.utils import ux_utils
|
|
40
40
|
if typing.TYPE_CHECKING:
|
41
41
|
# Renaming to avoid shadowing variables.
|
42
42
|
from sky import resources as resources_lib
|
43
|
+
from sky.volumes import volume as volume_lib
|
43
44
|
|
44
45
|
logger = logging.getLogger(__name__)
|
45
46
|
|
@@ -207,13 +208,15 @@ class OCI(clouds.Cloud):
|
|
207
208
|
return None
|
208
209
|
|
209
210
|
def make_deploy_resources_variables(
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
211
|
+
self,
|
212
|
+
resources: 'resources_lib.Resources',
|
213
|
+
cluster_name: resources_utils.ClusterName,
|
214
|
+
region: Optional['clouds.Region'],
|
215
|
+
zones: Optional[List['clouds.Zone']],
|
216
|
+
num_nodes: int,
|
217
|
+
dryrun: bool = False,
|
218
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
219
|
+
) -> Dict[str, Any]:
|
217
220
|
del cluster_name, dryrun # Unused.
|
218
221
|
assert region is not None, resources
|
219
222
|
|
sky/clouds/paperspace.py
CHANGED
@@ -14,6 +14,7 @@ if typing.TYPE_CHECKING:
|
|
14
14
|
import requests
|
15
15
|
|
16
16
|
from sky import resources as resources_lib
|
17
|
+
from sky.volumes import volume as volume_lib
|
17
18
|
else:
|
18
19
|
requests = adaptors_common.LazyImport('requests')
|
19
20
|
|
@@ -179,13 +180,15 @@ class Paperspace(clouds.Cloud):
|
|
179
180
|
return None
|
180
181
|
|
181
182
|
def make_deploy_resources_variables(
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
183
|
+
self,
|
184
|
+
resources: 'resources_lib.Resources',
|
185
|
+
cluster_name: resources_utils.ClusterName,
|
186
|
+
region: 'clouds.Region',
|
187
|
+
zones: Optional[List['clouds.Zone']],
|
188
|
+
num_nodes: int,
|
189
|
+
dryrun: bool = False,
|
190
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
191
|
+
) -> Dict[str, Optional[str]]:
|
189
192
|
del zones, dryrun, cluster_name
|
190
193
|
|
191
194
|
resources = resources.assert_launchable()
|
sky/clouds/runpod.py
CHANGED
@@ -10,6 +10,7 @@ from sky.utils import resources_utils
|
|
10
10
|
|
11
11
|
if typing.TYPE_CHECKING:
|
12
12
|
from sky import resources as resources_lib
|
13
|
+
from sky.volumes import volume as volume_lib
|
13
14
|
|
14
15
|
_CREDENTIAL_FILES = [
|
15
16
|
'config.toml',
|
@@ -160,13 +161,15 @@ class RunPod(clouds.Cloud):
|
|
160
161
|
return None
|
161
162
|
|
162
163
|
def make_deploy_resources_variables(
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
164
|
+
self,
|
165
|
+
resources: 'resources_lib.Resources',
|
166
|
+
cluster_name: resources_utils.ClusterName,
|
167
|
+
region: 'clouds.Region',
|
168
|
+
zones: Optional[List['clouds.Zone']],
|
169
|
+
num_nodes: int,
|
170
|
+
dryrun: bool = False,
|
171
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
172
|
+
) -> Dict[str, Optional[Union[str, bool]]]:
|
170
173
|
del dryrun, cluster_name # unused
|
171
174
|
assert zones is not None, (region, zones)
|
172
175
|
|
sky/clouds/scp.py
CHANGED
@@ -19,6 +19,7 @@ from sky.utils import status_lib
|
|
19
19
|
if typing.TYPE_CHECKING:
|
20
20
|
# Renaming to avoid shadowing variables.
|
21
21
|
from sky import resources as resources_lib
|
22
|
+
from sky.volumes import volume as volume_lib
|
22
23
|
|
23
24
|
_CREDENTIAL_FILES = [
|
24
25
|
'scp_credential',
|
@@ -183,13 +184,15 @@ class SCP(clouds.Cloud):
|
|
183
184
|
return None
|
184
185
|
|
185
186
|
def make_deploy_resources_variables(
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
187
|
+
self,
|
188
|
+
resources: 'resources_lib.Resources',
|
189
|
+
cluster_name: 'resources_utils.ClusterName',
|
190
|
+
region: 'clouds.Region',
|
191
|
+
zones: Optional[List['clouds.Zone']],
|
192
|
+
num_nodes: int,
|
193
|
+
dryrun: bool = False,
|
194
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
195
|
+
) -> Dict[str, Optional[str]]:
|
193
196
|
del cluster_name, dryrun # Unused.
|
194
197
|
assert zones is None, 'SCP does not support zones.'
|
195
198
|
|
sky/clouds/vast.py
CHANGED
@@ -10,6 +10,7 @@ from sky.utils import resources_utils
|
|
10
10
|
|
11
11
|
if typing.TYPE_CHECKING:
|
12
12
|
from sky import resources as resources_lib
|
13
|
+
from sky.volumes import volume as volume_lib
|
13
14
|
|
14
15
|
|
15
16
|
@registry.CLOUD_REGISTRY.register
|
@@ -155,13 +156,15 @@ class Vast(clouds.Cloud):
|
|
155
156
|
return None
|
156
157
|
|
157
158
|
def make_deploy_resources_variables(
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
159
|
+
self,
|
160
|
+
resources: 'resources_lib.Resources',
|
161
|
+
cluster_name: resources_utils.ClusterName,
|
162
|
+
region: 'clouds.Region',
|
163
|
+
zones: Optional[List['clouds.Zone']],
|
164
|
+
num_nodes: int,
|
165
|
+
dryrun: bool = False,
|
166
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
167
|
+
) -> Dict[str, Optional[str]]:
|
165
168
|
del zones, dryrun, cluster_name, num_nodes # unused
|
166
169
|
|
167
170
|
resources = resources.assert_launchable()
|
sky/clouds/vsphere.py
CHANGED
@@ -18,6 +18,7 @@ if typing.TYPE_CHECKING:
|
|
18
18
|
|
19
19
|
# Renaming to avoid shadowing variables.
|
20
20
|
from sky import resources as resources_lib
|
21
|
+
from sky.volumes import volume as volume_lib
|
21
22
|
else:
|
22
23
|
requests = adaptors_common.LazyImport('requests')
|
23
24
|
|
@@ -184,6 +185,7 @@ class Vsphere(clouds.Cloud):
|
|
184
185
|
zones: Optional[List['clouds.Zone']],
|
185
186
|
num_nodes: int,
|
186
187
|
dryrun: bool = False,
|
188
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
187
189
|
) -> Dict[str, Optional[str]]:
|
188
190
|
# TODO get image id here.
|
189
191
|
del cluster_name, dryrun # unused
|
sky/core.py
CHANGED
@@ -33,6 +33,7 @@ from sky.utils import admin_policy_utils
|
|
33
33
|
from sky.utils import common
|
34
34
|
from sky.utils import common_utils
|
35
35
|
from sky.utils import controller_utils
|
36
|
+
from sky.utils import resources_utils
|
36
37
|
from sky.utils import rich_utils
|
37
38
|
from sky.utils import status_lib
|
38
39
|
from sky.utils import subprocess_utils
|
@@ -75,6 +76,7 @@ def optimize(
|
|
75
76
|
for a task.
|
76
77
|
exceptions.NoCloudAccessError: if no public clouds are enabled.
|
77
78
|
"""
|
79
|
+
dag.resolve_and_validate_volumes()
|
78
80
|
# TODO: We apply the admin policy only on the first DAG optimization which
|
79
81
|
# is shown on `sky launch`. The optimizer is also invoked during failover,
|
80
82
|
# but we do not apply the admin policy there. We should apply the admin
|
@@ -265,7 +267,7 @@ def endpoints(cluster: str,
|
|
265
267
|
the dictionary will contain all ports:endpoints exposed on the cluster.
|
266
268
|
|
267
269
|
Raises:
|
268
|
-
|
270
|
+
ValueError: if the cluster is not UP or the endpoint is not exposed.
|
269
271
|
RuntimeError: if the cluster has no ports to be exposed or no endpoints
|
270
272
|
are exposed yet.
|
271
273
|
"""
|
@@ -276,7 +278,7 @@ def endpoints(cluster: str,
|
|
276
278
|
|
277
279
|
|
278
280
|
@usage_lib.entrypoint
|
279
|
-
def cost_report() -> List[Dict[str, Any]]:
|
281
|
+
def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
|
280
282
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
281
283
|
"""Get all cluster cost reports, including those that have been downed.
|
282
284
|
|
@@ -294,6 +296,13 @@ def cost_report() -> List[Dict[str, Any]]:
|
|
294
296
|
'cluster_hash': (str) unique hash identifying cluster,
|
295
297
|
'usage_intervals': (List[Tuple[int, int]]) cluster usage times,
|
296
298
|
'total_cost': (float) cost given resources and usage intervals,
|
299
|
+
'cloud': (str) cloud of the cluster,
|
300
|
+
'region': (str) region of the cluster,
|
301
|
+
'cpus': (str) number of vCPUs of the cluster,
|
302
|
+
'memory': (str) memory of the cluster,
|
303
|
+
'accelerators': (str) accelerators of the cluster,
|
304
|
+
'resources_str': (str) resources string of the cluster,
|
305
|
+
'resources_str_full': (str) full resources string of the cluster,
|
297
306
|
}
|
298
307
|
|
299
308
|
The estimated cost column indicates price for the cluster based on the type
|
@@ -303,27 +312,92 @@ def cost_report() -> List[Dict[str, Any]]:
|
|
303
312
|
cache of the cluster status, and may not be accurate for the cluster with
|
304
313
|
autostop/use_spot set or terminated/stopped on the cloud console.
|
305
314
|
|
315
|
+
Args:
|
316
|
+
days: Number of days to look back from now. Active clusters are always
|
317
|
+
included. Historical clusters are only included if they were last
|
318
|
+
used within the past 'days' days. Defaults to 30 days.
|
319
|
+
|
306
320
|
Returns:
|
307
321
|
A list of dicts, with each dict containing the cost information of a
|
308
322
|
cluster.
|
309
323
|
"""
|
310
|
-
|
324
|
+
if days is None:
|
325
|
+
days = constants.COST_REPORT_DEFAULT_DAYS
|
326
|
+
|
327
|
+
cluster_reports = global_user_state.get_clusters_from_history(days=days)
|
328
|
+
logger.debug(
|
329
|
+
f'{len(cluster_reports)} clusters found from history with {days} days.')
|
330
|
+
|
331
|
+
def _process_cluster_report(
|
332
|
+
cluster_report: Dict[str, Any]) -> Dict[str, Any]:
|
333
|
+
"""Process cluster report by calculating cost and adding fields."""
|
334
|
+
# Make a copy to avoid modifying the original
|
335
|
+
report = cluster_report.copy()
|
336
|
+
|
337
|
+
def get_total_cost(cluster_report: dict) -> float:
|
338
|
+
duration = cluster_report['duration']
|
339
|
+
launched_nodes = cluster_report['num_nodes']
|
340
|
+
launched_resources = cluster_report['resources']
|
341
|
+
|
342
|
+
cost = (launched_resources.get_cost(duration) * launched_nodes)
|
343
|
+
return cost
|
344
|
+
|
345
|
+
def _update_record_with_resources(record: Dict[str, Any]) -> None:
|
346
|
+
"""Add resource fields for dashboard compatibility."""
|
347
|
+
if record is None:
|
348
|
+
return
|
349
|
+
resources = record.get('resources')
|
350
|
+
if resources is None:
|
351
|
+
return
|
352
|
+
fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
|
353
|
+
for field in fields:
|
354
|
+
try:
|
355
|
+
record[field] = str(getattr(resources, field))
|
356
|
+
except Exception as e: # pylint: disable=broad-except
|
357
|
+
# Ok to skip the fields as this is just for display
|
358
|
+
# purposes.
|
359
|
+
logger.debug(f'Failed to get resources.{field} for cluster '
|
360
|
+
f'{record["name"]}: {str(e)}')
|
361
|
+
record[field] = None
|
362
|
+
|
363
|
+
# Add resources_str and resources_str_full for dashboard
|
364
|
+
# compatibility
|
365
|
+
num_nodes = record.get('num_nodes', 1)
|
366
|
+
try:
|
367
|
+
resource_str_simple = resources_utils.format_resource(
|
368
|
+
resources, simplify=True)
|
369
|
+
resource_str_full = resources_utils.format_resource(
|
370
|
+
resources, simplify=False)
|
371
|
+
record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
|
372
|
+
record[
|
373
|
+
'resources_str_full'] = f'{num_nodes}x{resource_str_full}'
|
374
|
+
except Exception as e: # pylint: disable=broad-except
|
375
|
+
logger.debug(f'Failed to get resources_str for cluster '
|
376
|
+
f'{record["name"]}: {str(e)}')
|
377
|
+
for field in fields:
|
378
|
+
record[field] = None
|
379
|
+
record['resources_str'] = '-'
|
380
|
+
record['resources_str_full'] = '-'
|
381
|
+
|
382
|
+
try:
|
383
|
+
report['total_cost'] = get_total_cost(report)
|
384
|
+
except Exception as e: # pylint: disable=broad-except
|
385
|
+
# Ok to skip the total cost as this is just for display purposes.
|
386
|
+
logger.warning(f'Failed to get total cost for cluster '
|
387
|
+
f'{report["name"]}: {str(e)}')
|
388
|
+
report['total_cost'] = 0.0
|
311
389
|
|
312
|
-
|
313
|
-
|
314
|
-
launched_nodes = cluster_report['num_nodes']
|
315
|
-
launched_resources = cluster_report['resources']
|
390
|
+
_update_record_with_resources(report)
|
391
|
+
return report
|
316
392
|
|
317
|
-
|
318
|
-
|
393
|
+
# Process clusters in parallel
|
394
|
+
if not cluster_reports:
|
395
|
+
return []
|
319
396
|
|
320
|
-
|
321
|
-
|
322
|
-
cluster_report['cloud'] = str(cluster_report['resources'].cloud)
|
323
|
-
cluster_report['accelerators'] = cluster_report[
|
324
|
-
'resources'].accelerators
|
397
|
+
processed_reports = subprocess_utils.run_in_parallel(
|
398
|
+
_process_cluster_report, cluster_reports)
|
325
399
|
|
326
|
-
return
|
400
|
+
return processed_reports
|
327
401
|
|
328
402
|
|
329
403
|
def _start(
|
sky/dag.py
CHANGED
@@ -83,6 +83,20 @@ class Dag:
|
|
83
83
|
task.validate(skip_file_mounts=skip_file_mounts,
|
84
84
|
skip_workdir=skip_workdir)
|
85
85
|
|
86
|
+
def resolve_and_validate_volumes(self) -> None:
|
87
|
+
for task in self.tasks:
|
88
|
+
task.resolve_and_validate_volumes()
|
89
|
+
|
90
|
+
def pre_mount_volumes(self) -> None:
|
91
|
+
vol_map = {}
|
92
|
+
# Deduplicate volume mounts.
|
93
|
+
for task in self.tasks:
|
94
|
+
if task.volume_mounts is not None:
|
95
|
+
for volume_mount in task.volume_mounts:
|
96
|
+
vol_map[volume_mount.volume_name] = volume_mount
|
97
|
+
for volume_mount in vol_map.values():
|
98
|
+
volume_mount.pre_mount()
|
99
|
+
|
86
100
|
|
87
101
|
class _DagContext(threading.local):
|
88
102
|
"""A thread-local stack of Dags."""
|
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6133dc1e928bd0b5.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-0ef7418d1a3822f3.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"ZWdSYkqVe3WjnFR8ocqoG","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
|
@@ -0,0 +1 @@
|
|
1
|
+
self.__BUILD_MANIFEST=function(s,c,e,a,t,u,n,r,i,j,f,k,b,d){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-6b0d9e5031b70c58.js"],"/_error":["static/chunks/pages/_error-1be831200e60c5c0.js"],"/clusters":["static/chunks/pages/clusters-4aa031d1f42723d8.js"],"/clusters/[cluster]":[s,c,e,a,t,r,j,u,n,f,i,k,b,d,"static/chunks/37-1f1e94f5a561202a.js","static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js"],"/clusters/[cluster]/[job]":[s,c,e,a,t,u,n,"static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js"],"/config":["static/chunks/pages/config-3102d02a188f04b3.js"],"/infra":["static/chunks/pages/infra-fd5dc8a91bd9169a.js"],"/infra/[context]":["static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js"],"/jobs":["static/chunks/pages/jobs-26da173e20af16e4.js"],"/jobs/[job]":[s,c,e,a,t,r,u,n,i,"static/chunks/pages/jobs/[job]-e4b23128db0774cd.js"],"/users":["static/chunks/pages/users-ce29e7420385563d.js"],"/volumes":["static/chunks/pages/volumes-476b670ef33d1ecd.js"],"/workspace/new":["static/chunks/pages/workspace/new-09ae0f6f972aa871.js"],"/workspaces":["static/chunks/pages/workspaces-862b120406461b10.js"],"/workspaces/[name]":[s,c,e,a,t,r,j,u,n,f,i,k,b,d,"static/chunks/843-07d25a7e64462fd8.js","static/chunks/pages/workspaces/[name]-0b4c662a25e4747a.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/config","/infra","/infra/[context]","/jobs","/jobs/[job]","/users","/volumes","/workspace/new","/workspaces","/workspaces/[name]"]}}("static/chunks/616-d6128fa9e7cae6e6.js","static/chunks/230-d6e363362017ff3a.js","static/chunks/799-3625946b2ec2eb30.js","static/chunks/664-047bc03493fda379.js","static/chunks/804-4c9fc53aa74bc191.js","static/chunks/989-db34c16ad7ea6155.js","static/chunks/470-92dd1614396389be.js","static/chunks/798-c0525dc3f21e488d.js","static/chunks/969-d3a0b53f728d280a.js","static/chunks/947-6620842ef80ae879.js","static/chunks/66-66ae330df2d3c1c7.js","static/chunks/856-cdf66268ec878d0c.js","static/chunks/973-5b5019ba333e8d62.js","static/chunks/938-068520cc11738deb.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|