skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '903f8a7f3955084316b26af619b6b043f43de01c'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250616'
+__version__ = '1.0.0.dev20250618'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -81,7 +81,6 @@ _set_http_proxy_env_vars()
 # Keep this order to avoid cyclic imports
 # pylint: disable=wrong-import-position
 from sky import backends
-from sky import benchmark
 from sky import clouds
 from sky.admin_policy import AdminPolicy
 from sky.admin_policy import MutatedUserRequest
@@ -168,7 +167,6 @@ __all__ = [
     'Optimizer',
     'OptimizeTarget',
     'backends',
-    'benchmark',
     'list_accelerators',
     '__root_dir__',
     'Storage',
sky/backends/backend_utils.py
CHANGED
@@ -28,6 +28,7 @@ from sky import check as sky_check
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
+from sky import logs
 from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
@@ -660,6 +661,12 @@ def write_cluster_config(
 
     credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
 
+    logging_agent = logs.get_logging_agent()
+    if logging_agent:
+        for k, v in logging_agent.get_credential_file_mounts().items():
+            assert k not in credentials, f'{k} already in credentials'
+            credentials[k] = v
+
     private_key_path, _ = auth.get_or_generate_keys()
     auth_config = {'ssh_private_key': private_key_path}
     region_name = resources_vars.get('region')
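
The write_cluster_config change above only asks the configured logging agent (from the new sky/logs package) for extra credential file mounts and merges them into the cloud credentials dict, refusing to overwrite an existing mount. A minimal sketch of that merge pattern, assuming the mounts map remote paths on the cluster to local paths to upload; the class name and paths below are illustrative, not the actual sky/logs/agent.py interface:

# Hypothetical sketch, not the shipped implementation.
from typing import Dict


class StubLoggingAgent:
    """Illustrative agent that ships one credential file to the cluster."""

    def get_credential_file_mounts(self) -> Dict[str, str]:
        # Assumed contract: remote path on the cluster -> local path to upload.
        return {'~/.config/logging/creds.json': '~/.config/logging/creds.json'}


def merge_agent_credentials(credentials: Dict[str, str],
                            agent: StubLoggingAgent) -> Dict[str, str]:
    # Same merge pattern as the diff above: assert instead of silently
    # overwriting a mount contributed by a cloud credential.
    for remote, local in agent.get_credential_file_mounts().items():
        assert remote not in credentials, f'{remote} already in credentials'
        credentials[remote] = local
    return credentials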
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -21,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
 
 import colorama
 import filelock
+import yaml
 
 import sky
 from sky import backends
@@ -141,6 +142,7 @@ _MAX_RAY_UP_RETRY = 5
 _MAX_GET_ZONE_RETRY = 3
 
 _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
 
 # Path to the monkey-patched ray up script.
 # We don't do import then __file__ because that script needs to be filled in
@@ -786,34 +788,6 @@ class FailoverCloudErrorHandlerV1:
         setattr(e, 'detailed_reason', detailed_reason)
         raise e
 
-    @staticmethod
-    def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-                     region: 'clouds.Region',
-                     zones: Optional[List['clouds.Zone']], stdout: str,
-                     stderr: str):
-        del zones  # Unused.
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout,
-            stderr,
-            is_error_str_known=lambda x: 'SCPError:' in x.strip())
-
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        style = colorama.Style
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
-        # Sometimes, SCPError will list available regions.
-        for e in errors:
-            if e.find('Regions with capacity available:') != -1:
-                for r in catalog.regions('scp'):
-                    if e.find(r.name) == -1:
-                        _add_to_blocked_resources(
-                            blocked_resources,
-                            launchable_resources.copy(region=r.name, zone=None))
-
     @staticmethod
     def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -1117,6 +1091,21 @@ class FailoverCloudErrorHandlerV2:
         FailoverCloudErrorHandlerV2._default_handler(
             blocked_resources, launchable_resources, region, zones, error)
 
+    @staticmethod
+    def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
+                     launchable_resources: 'resources_lib.Resources',
+                     region: 'clouds.Region',
+                     zones: Optional[List['clouds.Zone']],
+                     error: Exception) -> None:
+        logger.info(f'SCP handler error: {error}')
+        # Block SCP if the credential has expired.
+        if isinstance(error, exceptions.InvalidCloudCredentials):
+            _add_to_blocked_resources(
+                blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
+        else:
+            FailoverCloudErrorHandlerV2._default_handler(
+                blocked_resources, launchable_resources, region, zones, error)
+
     @staticmethod
     def _default_handler(blocked_resources: Set['resources_lib.Resources'],
                          launchable_resources: 'resources_lib.Resources',
@@ -2302,12 +2291,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 clouds.ProvisionerVersion.SKYPILOT):
             provider_name = str(self.launched_resources.cloud).lower()
             config = {}
-
-
-
-
-
-
+            # It is possible that the cluster yaml is not available when
+            # the handle is unpickled for service replicas from the
+            # controller with older version.
+            yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
+            if yaml_str is None:
+                # If the cluster yaml is not available,
+                # we skip updating the cluster info.
+                return
+            config = yaml.safe_load(yaml_str)
             try:
                 cluster_info = provision_lib.get_cluster_info(
                     provider_name,
@@ -2500,6 +2492,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 'Tried to use cached cluster info, but it\'s missing for '
                 f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
+        # For Kubernetes, `KubernetesCommandRunner` want to get the pod names
+        # to run the command. But for high availability serve controller,
+        # the controller pod is part of a deployment, and once the pod is
+        # killed and a new one is created, the pod name changes, so we need
+        # to manually update the cluster info here.
+        # TODO(andyl): See if we can prevent this refresh. Like pass in
+        # deployment name as identifier for KubernetesCommandRunner. Now this
+        # is required for rsync as using deployment in rsync seems to cause
+        # some unknown issues.
+        # TODO(andyl): Should check through the real cluster info. Same as
+        # the TODO in kubernetes/instance.py:terminate_instances
+        if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
+                controller_utils.high_availability_specified(
+                    self.cluster_name)):
+            self._update_cluster_info()
 
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
@@ -3178,7 +3185,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # Capture task YAML and command
             task_config = None
             if task is not None:
-                task_config = task.to_yaml_config()
+                task_config = task.to_yaml_config(redact_secrets=True)
 
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
@@ -3302,7 +3309,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         runners = handle.get_command_runners(avoid_ssh_control=True)
 
         def _setup_node(node_id: int) -> None:
-            setup_envs = task.
+            setup_envs = task.envs_and_secrets
             setup_envs.update(self._skypilot_predefined_env_vars(handle))
             setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
             setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
@@ -3455,10 +3462,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             job_id: int,
             detach_run: bool = False,
             managed_job_dag: Optional['dag.Dag'] = None,
+            remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
-        remote_log_dir
+        if remote_log_dir is None:
+            remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
         cd = f'cd {SKY_REMOTE_WORKDIR}'
@@ -3577,13 +3586,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
-                 job_name: Optional[str],
+                 job_name: Optional[str],
+                 resources_str: str) -> Tuple[int, str]:
         code = job_lib.JobLibCodeGen.add_job(
             job_name=job_name,
             username=common_utils.get_user_hash(),
             run_timestamp=self.run_timestamp,
             resources_str=resources_str)
-        returncode,
+        returncode, result_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
@@ -3597,17 +3607,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to fetch job id.', stderr)
         try:
-            job_id_match = _JOB_ID_PATTERN.search(
+            job_id_match = _JOB_ID_PATTERN.search(result_str)
             if job_id_match is not None:
                 job_id = int(job_id_match.group(1))
             else:
                 # For backward compatibility.
-                job_id = int(
+                job_id = int(result_str)
+            log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+            if log_dir_match is not None:
+                log_dir = log_dir_match.group(1).strip()
+            else:
+                # For backward compatibility, use the same log dir as local.
+                log_dir = self.log_dir
         except ValueError as e:
             logger.error(stderr)
-            raise ValueError(f'Failed to parse job id: {
+            raise ValueError(f'Failed to parse job id: {result_str}; '
                              f'Returncode: {returncode}') from e
-        return job_id
+        return job_id, log_dir
 
     def _execute(
         self,
@@ -3659,15 +3675,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task_copy.name, resources_str)
+        job_id, log_dir = self._add_job(handle, task_copy.name, resources_str)
 
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run
+            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
+                                       log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id, detach_run
+            self._execute_task_one_node(handle, task_copy, job_id, detach_run,
+                                        log_dir)
 
         return job_id
 
@@ -3830,32 +3848,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.
-        returncode,
-
-
-
-
-                                                         separate_stderr=True)
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+        returncode, job_to_dir, stderr = self.run_on_head(handle,
+                                                          code,
+                                                          stream_logs=False,
+                                                          require_outputs=True,
+                                                          separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-
-        if not
+        job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
+        if not job_to_dir:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
                         f'{colorama.Style.RESET_ALL}')
            return {}
 
-        job_ids = list(
-
+        job_ids = list(job_to_dir.keys())
+        dirs = list(job_to_dir.values())
         remote_log_dirs = [
-
-
-
-
-            os.path.join(local_dir, run_timestamp)
-            for run_timestamp in run_timestamps
+            # TODO(aylei): backward compatibility for legacy runtime that
+            # returns run_timestamp only, remove after 0.12.0
+            (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
+                constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
+        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
+                           if constants.SKY_LOGS_DIRECTORY in dir else
+                           os.path.join(local_dir, dir)) for dir in dirs]
 
         runners = handle.get_command_runners()
 
@@ -4027,8 +4045,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.
-            [str(job_id)])
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
@@ -4290,29 +4307,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # successfully removed cluster as no exception was raised
             returncode = 0
 
-        elif terminate and isinstance(cloud, clouds.SCP):
-            # pylint: disable=import-outside-toplevel
-            from sky.skylet.providers.scp import node_provider
-            config['provider']['cache_stopped_nodes'] = not terminate
-            provider = node_provider.SCPNodeProvider(config['provider'],
-                                                     cluster_name_on_cloud)
-            try:
-                if not os.path.exists(provider.metadata.path):
-                    raise node_provider.SCPError(
-                        'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
-                        'Metadata file does not exist.')
-
-                with open(provider.metadata.path, 'r', encoding='utf-8') as f:
-                    metadata = json.load(f)
-                node_id = next(iter(metadata.values())).get(
-                    'creation', {}).get('virtualServerId', None)
-                provider.terminate_node(node_id)
-                returncode = 0
-            except node_provider.SCPError as e:
-                returncode = 1
-                stdout = ''
-                stderr = str(e)
-
         else:
             config['provider']['cache_stopped_nodes'] = not terminate
             with tempfile.NamedTemporaryFile('w',
@@ -5185,7 +5179,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                            handle: CloudVmRayResourceHandle) -> Dict[str, str]:
         """Returns the environment variables for the task."""
-        env_vars = task.
+        env_vars = task.envs_and_secrets
         # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
         # by the controller.
         if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5199,9 +5193,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool) -> None:
+                               detach_run: bool, remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
-        log_dir = os.path.join(
+        log_dir = os.path.join(remote_log_dir, 'tasks')
 
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
@@ -5239,17 +5233,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool) -> None:
+                              detach_run: bool, remote_log_dir: str) -> None:
         # Strategy:
        # ray.init(...)
        # for node:
        # submit _run_cmd(cmd) with resource {node_i: 1}
-
-        log_dir = os.path.join(log_dir_base, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
         assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5295,4 +5289,5 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)