skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'c093624863dbd49ba5acad3590ba8f5c294d37a3'
+_SKYPILOT_COMMIT_SHA = '903f8a7f3955084316b26af619b6b043f43de01c'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250616'
+__version__ = '1.0.0.dev20250618'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -81,7 +81,6 @@ _set_http_proxy_env_vars()
 # Keep this order to avoid cyclic imports
 # pylint: disable=wrong-import-position
 from sky import backends
-from sky import benchmark
 from sky import clouds
 from sky.admin_policy import AdminPolicy
 from sky.admin_policy import MutatedUserRequest
@@ -168,7 +167,6 @@ __all__ = [
     'Optimizer',
     'OptimizeTarget',
     'backends',
-    'benchmark',
     'list_accelerators',
     '__root_dir__',
     'Storage',
sky/backends/backend_utils.py CHANGED
@@ -28,6 +28,7 @@ from sky import check as sky_check
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
+from sky import logs
 from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
@@ -660,6 +661,12 @@ def write_cluster_config(
 
     credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
 
+    logging_agent = logs.get_logging_agent()
+    if logging_agent:
+        for k, v in logging_agent.get_credential_file_mounts().items():
+            assert k not in credentials, f'{k} already in credentials'
+            credentials[k] = v
+
     private_key_path, _ = auth.get_or_generate_keys()
     auth_config = {'ssh_private_key': private_key_path}
     region_name = resources_vars.get('region')
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -21,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
 
 import colorama
 import filelock
+import yaml
 
 import sky
 from sky import backends
@@ -141,6 +142,7 @@ _MAX_RAY_UP_RETRY = 5
 _MAX_GET_ZONE_RETRY = 3
 
 _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
 
 # Path to the monkey-patched ray up script.
 # We don't do import then __file__ because that script needs to be filled in
@@ -786,34 +788,6 @@ class FailoverCloudErrorHandlerV1:
             setattr(e, 'detailed_reason', detailed_reason)
             raise e
 
-    @staticmethod
-    def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-                     region: 'clouds.Region',
-                     zones: Optional[List['clouds.Zone']], stdout: str,
-                     stderr: str):
-        del zones  # Unused.
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout,
-            stderr,
-            is_error_str_known=lambda x: 'SCPError:' in x.strip())
-
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        style = colorama.Style
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
-        # Sometimes, SCPError will list available regions.
-        for e in errors:
-            if e.find('Regions with capacity available:') != -1:
-                for r in catalog.regions('scp'):
-                    if e.find(r.name) == -1:
-                        _add_to_blocked_resources(
-                            blocked_resources,
-                            launchable_resources.copy(region=r.name, zone=None))
-
     @staticmethod
     def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -1117,6 +1091,21 @@ class FailoverCloudErrorHandlerV2:
             FailoverCloudErrorHandlerV2._default_handler(
                 blocked_resources, launchable_resources, region, zones, error)
 
+    @staticmethod
+    def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
+                     launchable_resources: 'resources_lib.Resources',
+                     region: 'clouds.Region',
+                     zones: Optional[List['clouds.Zone']],
+                     error: Exception) -> None:
+        logger.info(f'SCP handler error: {error}')
+        # Block SCP if the credential has expired.
+        if isinstance(error, exceptions.InvalidCloudCredentials):
+            _add_to_blocked_resources(
+                blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
+        else:
+            FailoverCloudErrorHandlerV2._default_handler(
+                blocked_resources, launchable_resources, region, zones, error)
+
     @staticmethod
     def _default_handler(blocked_resources: Set['resources_lib.Resources'],
                          launchable_resources: 'resources_lib.Resources',
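For context: the V1 handler removed above had to scrape 'SCPError:' strings out of stdout/stderr, whereas the V2 handler added here receives a structured exception, so it can block the whole cloud when credentials are invalid. A toy sketch of that decision, assuming stand-in classes rather than the real SkyPilot types:

```python
class InvalidCloudCredentials(Exception):
    """Stand-in for sky.exceptions.InvalidCloudCredentials."""


blocked_resources = set()


def scp_handler(error: Exception) -> None:
    if isinstance(error, InvalidCloudCredentials):
        # Expired/invalid credentials affect every SCP region, so block
        # the cloud as a whole instead of a single region or zone.
        blocked_resources.add('SCP:*')
    else:
        # Anything else falls through to default, per-region handling.
        blocked_resources.add('SCP:current-region')


scp_handler(InvalidCloudCredentials('credential expired'))
assert blocked_resources == {'SCP:*'}
```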
@@ -2302,12 +2291,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 clouds.ProvisionerVersion.SKYPILOT):
             provider_name = str(self.launched_resources.cloud).lower()
             config = {}
-            if os.path.exists(self.cluster_yaml):
-                # It is possible that the cluster yaml is not available when
-                # the handle is unpickled for service replicas from the
-                # controller with older version.
-                config = global_user_state.get_cluster_yaml_dict(
-                    self.cluster_yaml)
+            # It is possible that the cluster yaml is not available when
+            # the handle is unpickled for service replicas from the
+            # controller with older version.
+            yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
+            if yaml_str is None:
+                # If the cluster yaml is not available,
+                # we skip updating the cluster info.
+                return
+            config = yaml.safe_load(yaml_str)
             try:
                 cluster_info = provision_lib.get_cluster_info(
                     provider_name,
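This hunk stops reading the cluster YAML from a path on disk and instead pulls the YAML text from global_user_state, returning early when it is absent. A small sketch of the resulting control flow, with a stubbed get_cluster_yaml_str and a made-up YAML body standing in for the real state-store lookup:

```python
import yaml


def get_cluster_yaml_str(cluster_yaml_path):
    # Stub: the real function looks the YAML up in SkyPilot's state store
    # and may return None for handles pickled by an older version.
    return 'provider:\n  type: external\n  module: sky.provision.aws\n'


def update_cluster_info(cluster_yaml_path):
    yaml_str = get_cluster_yaml_str(cluster_yaml_path)
    if yaml_str is None:
        # Mirrors the early return above: no YAML, skip the update.
        return None
    config = yaml.safe_load(yaml_str)
    return config['provider']


print(update_cluster_info('~/.sky/generated/my-cluster.yml'))
# {'type': 'external', 'module': 'sky.provision.aws'}
```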
@@ -2500,6 +2492,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 'Tried to use cached cluster info, but it\'s missing for '
                 f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
+        # For Kubernetes, `KubernetesCommandRunner` want to get the pod names
+        # to run the command. But for high availability serve controller,
+        # the controller pod is part of a deployment, and once the pod is
+        # killed and a new one is created, the pod name changes, so we need
+        # to manually update the cluster info here.
+        # TODO(andyl): See if we can prevent this refresh. Like pass in
+        # deployment name as identifier for KubernetesCommandRunner. Now this
+        # is required for rsync as using deployment in rsync seems to cause
+        # some unknown issues.
+        # TODO(andyl): Should check through the real cluster info. Same as
+        # the TODO in kubernetes/instance.py:terminate_instances
+        if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
+                controller_utils.high_availability_specified(
+                    self.cluster_name)):
+            self._update_cluster_info()
 
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
@@ -3178,7 +3185,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Capture task YAML and command
         task_config = None
         if task is not None:
-            task_config = task.to_yaml_config()
+            task_config = task.to_yaml_config(redact_secrets=True)
 
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
@@ -3302,7 +3309,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         runners = handle.get_command_runners(avoid_ssh_control=True)
 
         def _setup_node(node_id: int) -> None:
-            setup_envs = task.envs.copy()
+            setup_envs = task.envs_and_secrets
             setup_envs.update(self._skypilot_predefined_env_vars(handle))
             setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
             setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
@@ -3455,10 +3462,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id: int,
         detach_run: bool = False,
         managed_job_dag: Optional['dag.Dag'] = None,
+        remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
-        remote_log_dir = self.log_dir
+        if remote_log_dir is None:
+            remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
         cd = f'cd {SKY_REMOTE_WORKDIR}'
@@ -3577,13 +3586,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
-                 job_name: Optional[str], resources_str: str) -> int:
+                 job_name: Optional[str],
+                 resources_str: str) -> Tuple[int, str]:
         code = job_lib.JobLibCodeGen.add_job(
             job_name=job_name,
             username=common_utils.get_user_hash(),
             run_timestamp=self.run_timestamp,
             resources_str=resources_str)
-        returncode, job_id_str, stderr = self.run_on_head(handle,
+        returncode, result_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
@@ -3597,17 +3607,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to fetch job id.', stderr)
         try:
-            job_id_match = _JOB_ID_PATTERN.search(job_id_str)
+            job_id_match = _JOB_ID_PATTERN.search(result_str)
             if job_id_match is not None:
                 job_id = int(job_id_match.group(1))
             else:
                 # For backward compatibility.
-                job_id = int(job_id_str)
+                job_id = int(result_str)
+            log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+            if log_dir_match is not None:
+                log_dir = log_dir_match.group(1).strip()
+            else:
+                # For backward compatibility, use the same log dir as local.
+                log_dir = self.log_dir
         except ValueError as e:
             logger.error(stderr)
-            raise ValueError(f'Failed to parse job id: {job_id_str}; '
+            raise ValueError(f'Failed to parse job id: {result_str}; '
                              f'Returncode: {returncode}') from e
-        return job_id
+        return job_id, log_dir
 
     def _execute(
         self,
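The _add_job change above now parses both a job id and a remote log directory out of the head node's output, using the _JOB_ID_PATTERN and _LOG_DIR_PATTERN regexes added earlier in this file. A minimal sketch of that parsing, with a hypothetical output string (the real format is emitted by job_lib.JobLibCodeGen.add_job on the remote runtime):

```python
import re

# Patterns taken from the diff above.
_JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')

# Hypothetical head-node output; the real string comes from run_on_head().
result_str = 'Job ID: 42 Log Dir: sky-2025-06-18-10-00-00-000000'

job_id_match = _JOB_ID_PATTERN.search(result_str)
# Older runtimes printed only the bare job id, hence the int() fallback.
job_id = int(job_id_match.group(1)) if job_id_match else int(result_str)

log_dir_match = _LOG_DIR_PATTERN.search(result_str)
# Older runtimes did not report a log dir; the backend then falls back to
# its locally generated run_timestamp directory (self.log_dir in the diff).
log_dir = (log_dir_match.group(1).strip()
           if log_dir_match else 'sky-local-fallback-dir')

assert (job_id, log_dir) == (42, 'sky-2025-06-18-10-00-00-000000')
```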
@@ -3659,15 +3675,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task_copy.name, resources_str)
+        job_id, log_dir = self._add_job(handle, task_copy.name, resources_str)
 
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run)
+            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
+                                       log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id, detach_run)
+            self._execute_task_one_node(handle, task_copy, job_id, detach_run,
+                                        log_dir)
 
         return job_id
 
@@ -3830,32 +3848,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
-        returncode, run_timestamps, stderr = self.run_on_head(
-            handle,
-            code,
-            stream_logs=False,
-            require_outputs=True,
-            separate_stderr=True)
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+        returncode, job_to_dir, stderr = self.run_on_head(handle,
+                                                          code,
+                                                          stream_logs=False,
+                                                          require_outputs=True,
+                                                          separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-        run_timestamps = message_utils.decode_payload(run_timestamps)
-        if not run_timestamps:
+        job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
+        if not job_to_dir:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
                         f'{colorama.Style.RESET_ALL}')
             return {}
 
-        job_ids = list(run_timestamps.keys())
-        run_timestamps = list(run_timestamps.values())
+        job_ids = list(job_to_dir.keys())
+        dirs = list(job_to_dir.values())
         remote_log_dirs = [
-            os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
-            for run_timestamp in run_timestamps
-        ]
-        local_log_dirs = [
-            os.path.join(local_dir, run_timestamp)
-            for run_timestamp in run_timestamps
+            # TODO(aylei): backward compatibility for legacy runtime that
+            # returns run_timestamp only, remove after 0.12.0
+            (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
+                constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
+        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
+                           if constants.SKY_LOGS_DIRECTORY in dir else
+                           os.path.join(local_dir, dir)) for dir in dirs]
 
         runners = handle.get_command_runners()
 
@@ -4027,8 +4045,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
-            [str(job_id)])
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
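The two hunks above move log syncing from bare run_timestamps to full log directories (JobLibCodeGen.get_log_dirs_for_jobs), while still accepting the legacy format. A rough sketch of the remote-to-local directory mapping, assuming constants.SKY_LOGS_DIRECTORY is '~/sky_logs'; the concrete paths below are made up:

```python
import os

SKY_LOGS_DIRECTORY = '~/sky_logs'  # assumed value of constants.SKY_LOGS_DIRECTORY
local_dir = '/tmp/sky-logs-download'  # hypothetical local destination

# New runtimes return full remote dirs; legacy runtimes return bare
# run_timestamps (the backward-compatibility branch in the diff above).
dirs = [
    '~/sky_logs/sky-2025-06-18-10-00-00-000000',  # new style
    'sky-2025-06-17-09-00-00-000000',             # legacy style
]

remote_log_dirs = [
    (d if SKY_LOGS_DIRECTORY in d else os.path.join(SKY_LOGS_DIRECTORY, d))
    for d in dirs
]
local_log_dirs = [
    (d.replace(SKY_LOGS_DIRECTORY, local_dir)
     if SKY_LOGS_DIRECTORY in d else os.path.join(local_dir, d))
    for d in dirs
]

print(remote_log_dirs)
# ['~/sky_logs/sky-2025-06-18-10-00-00-000000',
#  '~/sky_logs/sky-2025-06-17-09-00-00-000000']
print(local_log_dirs)
# ['/tmp/sky-logs-download/sky-2025-06-18-10-00-00-000000',
#  '/tmp/sky-logs-download/sky-2025-06-17-09-00-00-000000']
```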
@@ -4290,29 +4307,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # successfully removed cluster as no exception was raised
             returncode = 0
 
-        elif terminate and isinstance(cloud, clouds.SCP):
-            # pylint: disable=import-outside-toplevel
-            from sky.skylet.providers.scp import node_provider
-            config['provider']['cache_stopped_nodes'] = not terminate
-            provider = node_provider.SCPNodeProvider(config['provider'],
-                                                     cluster_name_on_cloud)
-            try:
-                if not os.path.exists(provider.metadata.path):
-                    raise node_provider.SCPError(
-                        'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
-                        'Metadata file does not exist.')
-
-                with open(provider.metadata.path, 'r', encoding='utf-8') as f:
-                    metadata = json.load(f)
-                node_id = next(iter(metadata.values())).get(
-                    'creation', {}).get('virtualServerId', None)
-                provider.terminate_node(node_id)
-                returncode = 0
-            except node_provider.SCPError as e:
-                returncode = 1
-                stdout = ''
-                stderr = str(e)
-
         else:
             config['provider']['cache_stopped_nodes'] = not terminate
             with tempfile.NamedTemporaryFile('w',
@@ -5185,7 +5179,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                            handle: CloudVmRayResourceHandle) -> Dict[str, str]:
         """Returns the environment variables for the task."""
-        env_vars = task.envs.copy()
+        env_vars = task.envs_and_secrets
         # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
         # by the controller.
         if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5199,9 +5193,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool) -> None:
+                               detach_run: bool, remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
-        log_dir = os.path.join(self.log_dir, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
 
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
@@ -5239,17 +5233,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag)
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool) -> None:
+                              detach_run: bool, remote_log_dir: str) -> None:
         # Strategy:
         #   ray.init(...)
         #   for node:
         #     submit _run_cmd(cmd) with resource {node_i: 1}
-        log_dir_base = self.log_dir
-        log_dir = os.path.join(log_dir_base, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
         assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5295,4 +5289,5 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag)
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)