skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'cc8dbb73a2d26e8c017a788b0bbfc63041c78bae'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250625'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/admin_policy.py
CHANGED
@@ -4,11 +4,13 @@ import dataclasses
|
|
4
4
|
import typing
|
5
5
|
from typing import Any, Dict, Optional
|
6
6
|
|
7
|
+
import colorama
|
7
8
|
import pydantic
|
8
9
|
|
9
10
|
import sky
|
10
11
|
from sky import exceptions
|
11
12
|
from sky.adaptors import common as adaptors_common
|
13
|
+
from sky.utils import common_utils
|
12
14
|
from sky.utils import config_utils
|
13
15
|
from sky.utils import ux_utils
|
14
16
|
|
@@ -218,18 +220,27 @@ class RestfulAdminPolicy(PolicyTemplate):
|
|
218
220
|
headers={'Content-Type': 'application/json'},
|
219
221
|
# TODO(aylei): make this configurable
|
220
222
|
timeout=30)
|
223
|
+
if response.status_code == 400:
|
224
|
+
raise exceptions.UserRequestRejectedByPolicy(
|
225
|
+
f'{colorama.Fore.RED}User request is rejected by admin '
|
226
|
+
f'policy {self.policy_url}{colorama.Fore.RESET}: '
|
227
|
+
f'{response.text}')
|
221
228
|
response.raise_for_status()
|
222
229
|
except requests.exceptions.RequestException as e:
|
223
230
|
with ux_utils.print_exception_no_traceback():
|
224
|
-
raise exceptions.
|
225
|
-
f'Failed to
|
226
|
-
f'{self.policy_url}: {e}') from
|
231
|
+
raise exceptions.RestfulPolicyError(
|
232
|
+
f'Failed to call admin policy URL '
|
233
|
+
f'{self.policy_url}: {e}') from None
|
227
234
|
|
228
235
|
try:
|
229
236
|
mutated_user_request = MutatedUserRequest.decode(response.json())
|
230
237
|
except Exception as e: # pylint: disable=broad-except
|
231
238
|
with ux_utils.print_exception_no_traceback():
|
232
|
-
raise exceptions.
|
239
|
+
raise exceptions.RestfulPolicyError(
|
233
240
|
f'Failed to decode response from admin policy URL '
|
234
|
-
f'{self.policy_url}: {e}'
|
241
|
+
f'{self.policy_url}: {common_utils.format_exception(e, use_bracket=True)}'
|
242
|
+
) from None
|
235
243
|
return mutated_user_request
|
244
|
+
|
245
|
+
def __repr__(self):
|
246
|
+
return f'RestfulAdminPolicy(policy_url={self.policy_url})'
|
sky/backends/__init__.py
CHANGED
@@ -3,11 +3,12 @@ from sky.backends.backend import Backend
|
|
3
3
|
from sky.backends.backend import ResourceHandle
|
4
4
|
from sky.backends.cloud_vm_ray_backend import CloudVmRayBackend
|
5
5
|
from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
|
6
|
+
from sky.backends.cloud_vm_ray_backend import LocalResourcesHandle
|
6
7
|
from sky.backends.local_docker_backend import LocalDockerBackend
|
7
8
|
from sky.backends.local_docker_backend import LocalDockerResourceHandle
|
8
9
|
|
9
10
|
__all__ = [
|
10
11
|
'Backend', 'ResourceHandle', 'CloudVmRayBackend',
|
11
|
-
'CloudVmRayResourceHandle', 'LocalDockerBackend',
|
12
|
+
'CloudVmRayResourceHandle', 'LocalResourcesHandle', 'LocalDockerBackend',
|
12
13
|
'LocalDockerResourceHandle'
|
13
14
|
]
|
sky/backends/backend_utils.py
CHANGED
@@ -33,6 +33,7 @@ from sky import provision as provision_lib
|
|
33
33
|
from sky import sky_logging
|
34
34
|
from sky import skypilot_config
|
35
35
|
from sky.adaptors import common as adaptors_common
|
36
|
+
from sky.jobs import utils as managed_job_utils
|
36
37
|
from sky.provision import instance_setup
|
37
38
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
38
39
|
from sky.skylet import constants
|
@@ -65,6 +66,7 @@ if typing.TYPE_CHECKING:
|
|
65
66
|
from sky import task as task_lib
|
66
67
|
from sky.backends import cloud_vm_ray_backend
|
67
68
|
from sky.backends import local_docker_backend
|
69
|
+
from sky.volumes import volume as volume_lib
|
68
70
|
else:
|
69
71
|
yaml = adaptors_common.LazyImport('yaml')
|
70
72
|
requests = adaptors_common.LazyImport('requests')
|
@@ -540,16 +542,18 @@ def get_expirable_clouds(
|
|
540
542
|
# TODO: too many things happening here - leaky abstraction. Refactor.
|
541
543
|
@timeline.event
|
542
544
|
def write_cluster_config(
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
545
|
+
to_provision: 'resources_lib.Resources',
|
546
|
+
num_nodes: int,
|
547
|
+
cluster_config_template: str,
|
548
|
+
cluster_name: str,
|
549
|
+
local_wheel_path: pathlib.Path,
|
550
|
+
wheel_hash: str,
|
551
|
+
region: clouds.Region,
|
552
|
+
zones: Optional[List[clouds.Zone]] = None,
|
553
|
+
dryrun: bool = False,
|
554
|
+
keep_launch_fields_in_existing_config: bool = True,
|
555
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
556
|
+
) -> Dict[str, str]:
|
553
557
|
"""Fills in cluster configuration templates and writes them out.
|
554
558
|
|
555
559
|
Returns:
|
@@ -597,7 +601,7 @@ def write_cluster_config(
|
|
597
601
|
resources_utils.ClusterName(
|
598
602
|
cluster_name,
|
599
603
|
cluster_name_on_cloud,
|
600
|
-
), region, zones, num_nodes, dryrun)
|
604
|
+
), region, zones, num_nodes, dryrun, volume_mounts)
|
601
605
|
config_dict = {}
|
602
606
|
|
603
607
|
specific_reservations = set(
|
@@ -730,6 +734,15 @@ def write_cluster_config(
|
|
730
734
|
high_availability_specified = controller_utils.high_availability_specified(
|
731
735
|
cluster_name)
|
732
736
|
|
737
|
+
volume_mount_vars = []
|
738
|
+
if volume_mounts is not None:
|
739
|
+
for vol in volume_mounts:
|
740
|
+
volume_mount_vars.append({
|
741
|
+
'name': vol.volume_name,
|
742
|
+
'path': vol.path,
|
743
|
+
'volume_name_on_cloud': vol.volume_config.name_on_cloud,
|
744
|
+
})
|
745
|
+
|
733
746
|
# Use a tmp file path to avoid incomplete YAML file being re-used in the
|
734
747
|
# future.
|
735
748
|
tmp_yaml_path = yaml_path + '.tmp'
|
@@ -820,6 +833,9 @@ def write_cluster_config(
|
|
820
833
|
|
821
834
|
# High availability
|
822
835
|
'high_availability': high_availability_specified,
|
836
|
+
|
837
|
+
# Volume mounts
|
838
|
+
'volume_mounts': volume_mount_vars,
|
823
839
|
}),
|
824
840
|
output_path=tmp_yaml_path)
|
825
841
|
config_dict['cluster_name'] = cluster_name
|
@@ -2454,6 +2470,17 @@ def is_controller_accessible(
|
|
2454
2470
|
exceptions.ClusterNotUpError: if the controller is not accessible, or
|
2455
2471
|
failed to be connected.
|
2456
2472
|
"""
|
2473
|
+
if (managed_job_utils.is_consolidation_mode() and
|
2474
|
+
controller == controller_utils.Controllers.JOBS_CONTROLLER):
|
2475
|
+
cn = 'local-controller-consolidation'
|
2476
|
+
return backends.LocalResourcesHandle(
|
2477
|
+
cluster_name=cn,
|
2478
|
+
cluster_name_on_cloud=cn,
|
2479
|
+
cluster_yaml=None,
|
2480
|
+
launched_nodes=1,
|
2481
|
+
launched_resources=sky.Resources(cloud=clouds.Cloud(),
|
2482
|
+
instance_type=cn),
|
2483
|
+
)
|
2457
2484
|
if non_existent_message is None:
|
2458
2485
|
non_existent_message = controller.value.default_hint_if_non_existent
|
2459
2486
|
cluster_name = controller.value.cluster_name
|
@@ -73,6 +73,7 @@ from sky.utils import status_lib
|
|
73
73
|
from sky.utils import subprocess_utils
|
74
74
|
from sky.utils import timeline
|
75
75
|
from sky.utils import ux_utils
|
76
|
+
from sky.volumes import volume as volume_lib
|
76
77
|
|
77
78
|
if typing.TYPE_CHECKING:
|
78
79
|
from sky import dag
|
@@ -1327,6 +1328,7 @@ class RetryingVmProvisioner(object):
|
|
1327
1328
|
prev_handle: Optional['CloudVmRayResourceHandle'],
|
1328
1329
|
prev_cluster_ever_up: bool,
|
1329
1330
|
skip_if_config_hash_matches: Optional[str],
|
1331
|
+
volume_mounts: Optional[List[volume_lib.VolumeMount]],
|
1330
1332
|
) -> Dict[str, Any]:
|
1331
1333
|
"""The provision retry loop.
|
1332
1334
|
|
@@ -1432,7 +1434,9 @@ class RetryingVmProvisioner(object):
|
|
1432
1434
|
region=region,
|
1433
1435
|
zones=zones,
|
1434
1436
|
dryrun=dryrun,
|
1435
|
-
keep_launch_fields_in_existing_config=cluster_exists
|
1437
|
+
keep_launch_fields_in_existing_config=cluster_exists,
|
1438
|
+
volume_mounts=volume_mounts,
|
1439
|
+
)
|
1436
1440
|
except exceptions.ResourcesUnavailableError as e:
|
1437
1441
|
# Failed due to catalog issue, e.g. image not found, or
|
1438
1442
|
# GPUs are requested in a Kubernetes cluster but the cluster
|
@@ -2081,7 +2085,9 @@ class RetryingVmProvisioner(object):
|
|
2081
2085
|
prev_cluster_status=prev_cluster_status,
|
2082
2086
|
prev_handle=prev_handle,
|
2083
2087
|
prev_cluster_ever_up=prev_cluster_ever_up,
|
2084
|
-
skip_if_config_hash_matches=skip_if_config_hash_matches
|
2088
|
+
skip_if_config_hash_matches=skip_if_config_hash_matches,
|
2089
|
+
volume_mounts=task.volume_mounts,
|
2090
|
+
)
|
2085
2091
|
if dryrun:
|
2086
2092
|
return config_dict
|
2087
2093
|
except (exceptions.InvalidClusterNameError,
|
@@ -2435,9 +2441,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2435
2441
|
zip(cluster_internal_ips, cluster_feasible_ips))
|
2436
2442
|
|
2437
2443
|
# Ensure head node is the first element, then sort based on the
|
2438
|
-
# external IPs for stableness
|
2439
|
-
|
2440
|
-
|
2444
|
+
# external IPs for stableness. Skip for k8s nodes since pods
|
2445
|
+
# worker ids are already mapped.
|
2446
|
+
if (cluster_info is not None and
|
2447
|
+
cluster_info.provider_name == 'kubernetes'):
|
2448
|
+
stable_internal_external_ips = internal_external_ips
|
2449
|
+
else:
|
2450
|
+
stable_internal_external_ips = [internal_external_ips[0]] + sorted(
|
2451
|
+
internal_external_ips[1:], key=lambda x: x[1])
|
2441
2452
|
self.stable_internal_external_ips = stable_internal_external_ips
|
2442
2453
|
|
2443
2454
|
@context_utils.cancellation_guard
|
@@ -2696,6 +2707,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2696
2707
|
pass
|
2697
2708
|
|
2698
2709
|
|
2710
|
+
class LocalResourcesHandle(CloudVmRayResourceHandle):
|
2711
|
+
"""A handle for local resources."""
|
2712
|
+
|
2713
|
+
@context_utils.cancellation_guard
|
2714
|
+
@annotations.lru_cache(scope='global')
|
2715
|
+
@timeline.event
|
2716
|
+
def get_command_runners(self,
|
2717
|
+
force_cached: bool = False,
|
2718
|
+
avoid_ssh_control: bool = False
|
2719
|
+
) -> List[command_runner.CommandRunner]:
|
2720
|
+
"""Returns a list of local command runners."""
|
2721
|
+
del force_cached, avoid_ssh_control # Unused.
|
2722
|
+
return [command_runner.LocalProcessCommandRunner()]
|
2723
|
+
|
2724
|
+
|
2699
2725
|
@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
|
2700
2726
|
class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
2701
2727
|
"""Backend: runs on cloud virtual machines, managed by Ray.
|
@@ -4043,19 +4069,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4043
4069
|
# list should already be in descending order
|
4044
4070
|
job_id = job_ids[0]
|
4045
4071
|
|
4046
|
-
|
4047
|
-
|
4048
|
-
|
4049
|
-
|
4050
|
-
|
4051
|
-
|
4052
|
-
|
4053
|
-
|
4054
|
-
|
4055
|
-
|
4056
|
-
|
4057
|
-
|
4058
|
-
|
4072
|
+
if isinstance(handle, LocalResourcesHandle):
|
4073
|
+
# In consolidation mode, we don't submit a ray job, therefore no
|
4074
|
+
# run_timestamp is available. We use a dummy run_timestamp here.
|
4075
|
+
run_timestamps = {
|
4076
|
+
job_id: f'managed-jobs-consolidation-mode-{job_id}'
|
4077
|
+
}
|
4078
|
+
else:
|
4079
|
+
# get the run_timestamp
|
4080
|
+
# the function takes in [job_id]
|
4081
|
+
code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
|
4082
|
+
returncode, run_timestamps_payload, stderr = self.run_on_head(
|
4083
|
+
handle,
|
4084
|
+
code,
|
4085
|
+
stream_logs=False,
|
4086
|
+
require_outputs=True,
|
4087
|
+
separate_stderr=True)
|
4088
|
+
subprocess_utils.handle_returncode(returncode, code,
|
4089
|
+
'Failed to sync logs.', stderr)
|
4090
|
+
# returns with a dict of {job_id: run_timestamp}
|
4091
|
+
run_timestamps = message_utils.decode_payload(
|
4092
|
+
run_timestamps_payload)
|
4059
4093
|
if not run_timestamps:
|
4060
4094
|
logger.info(f'{colorama.Fore.YELLOW}'
|
4061
4095
|
'No matching log directories found'
|