skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sky/__init__.py +2 -4
  2. sky/backends/cloud_vm_ray_backend.py +43 -60
  3. sky/cli.py +55 -637
  4. sky/client/cli.py +55 -637
  5. sky/clouds/kubernetes.py +3 -0
  6. sky/clouds/scp.py +7 -26
  7. sky/clouds/utils/scp_utils.py +177 -124
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +98 -31
  26. sky/jobs/scheduler.py +37 -29
  27. sky/jobs/server/core.py +36 -3
  28. sky/jobs/state.py +69 -9
  29. sky/jobs/utils.py +11 -0
  30. sky/provision/__init__.py +1 -0
  31. sky/provision/scp/__init__.py +15 -0
  32. sky/provision/scp/config.py +93 -0
  33. sky/provision/scp/instance.py +528 -0
  34. sky/resources.py +164 -29
  35. sky/skylet/constants.py +39 -0
  36. sky/skylet/job_lib.py +8 -0
  37. sky/task.py +171 -21
  38. sky/templates/kubernetes-ray.yml.j2 +51 -4
  39. sky/templates/scp-ray.yml.j2 +3 -50
  40. sky/users/permission.py +19 -36
  41. sky/utils/command_runner.py +1 -1
  42. sky/utils/common_utils.py +16 -14
  43. sky/utils/context.py +1 -1
  44. sky/utils/controller_utils.py +12 -3
  45. sky/utils/dag_utils.py +17 -4
  46. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  47. sky/utils/schemas.py +43 -5
  48. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
  49. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
  50. sky/benchmark/__init__.py +0 -0
  51. sky/benchmark/benchmark_state.py +0 -295
  52. sky/benchmark/benchmark_utils.py +0 -641
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  55. sky/skylet/providers/scp/__init__.py +0 -2
  56. sky/skylet/providers/scp/config.py +0 -149
  57. sky/skylet/providers/scp/node_provider.py +0 -578
  58. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
  59. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'c093624863dbd49ba5acad3590ba8f5c294d37a3'
8
+ _SKYPILOT_COMMIT_SHA = 'fa78e63ee618b8695df1bca87911a231cce3d7da'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250616'
38
+ __version__ = '1.0.0.dev20250617'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -81,7 +81,6 @@ _set_http_proxy_env_vars()
81
81
  # Keep this order to avoid cyclic imports
82
82
  # pylint: disable=wrong-import-position
83
83
  from sky import backends
84
- from sky import benchmark
85
84
  from sky import clouds
86
85
  from sky.admin_policy import AdminPolicy
87
86
  from sky.admin_policy import MutatedUserRequest
@@ -168,7 +167,6 @@ __all__ = [
168
167
  'Optimizer',
169
168
  'OptimizeTarget',
170
169
  'backends',
171
- 'benchmark',
172
170
  'list_accelerators',
173
171
  '__root_dir__',
174
172
  'Storage',
@@ -21,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
21
21
 
22
22
  import colorama
23
23
  import filelock
24
+ import yaml
24
25
 
25
26
  import sky
26
27
  from sky import backends
@@ -786,34 +787,6 @@ class FailoverCloudErrorHandlerV1:
786
787
  setattr(e, 'detailed_reason', detailed_reason)
787
788
  raise e
788
789
 
789
- @staticmethod
790
- def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
791
- launchable_resources: 'resources_lib.Resources',
792
- region: 'clouds.Region',
793
- zones: Optional[List['clouds.Zone']], stdout: str,
794
- stderr: str):
795
- del zones # Unused.
796
- errors = FailoverCloudErrorHandlerV1._handle_errors(
797
- stdout,
798
- stderr,
799
- is_error_str_known=lambda x: 'SCPError:' in x.strip())
800
-
801
- logger.warning(f'Got error(s) in {region.name}:')
802
- messages = '\n\t'.join(errors)
803
- style = colorama.Style
804
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
805
- _add_to_blocked_resources(blocked_resources,
806
- launchable_resources.copy(zone=None))
807
-
808
- # Sometimes, SCPError will list available regions.
809
- for e in errors:
810
- if e.find('Regions with capacity available:') != -1:
811
- for r in catalog.regions('scp'):
812
- if e.find(r.name) == -1:
813
- _add_to_blocked_resources(
814
- blocked_resources,
815
- launchable_resources.copy(region=r.name, zone=None))
816
-
817
790
  @staticmethod
818
791
  def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
819
792
  launchable_resources: 'resources_lib.Resources',
@@ -1117,6 +1090,21 @@ class FailoverCloudErrorHandlerV2:
1117
1090
  FailoverCloudErrorHandlerV2._default_handler(
1118
1091
  blocked_resources, launchable_resources, region, zones, error)
1119
1092
 
1093
+ @staticmethod
1094
+ def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
1095
+ launchable_resources: 'resources_lib.Resources',
1096
+ region: 'clouds.Region',
1097
+ zones: Optional[List['clouds.Zone']],
1098
+ error: Exception) -> None:
1099
+ logger.info(f'SCP handler error: {error}')
1100
+ # Block SCP if the credential has expired.
1101
+ if isinstance(error, exceptions.InvalidCloudCredentials):
1102
+ _add_to_blocked_resources(
1103
+ blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
1104
+ else:
1105
+ FailoverCloudErrorHandlerV2._default_handler(
1106
+ blocked_resources, launchable_resources, region, zones, error)
1107
+
1120
1108
  @staticmethod
1121
1109
  def _default_handler(blocked_resources: Set['resources_lib.Resources'],
1122
1110
  launchable_resources: 'resources_lib.Resources',
@@ -2302,12 +2290,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2302
2290
  clouds.ProvisionerVersion.SKYPILOT):
2303
2291
  provider_name = str(self.launched_resources.cloud).lower()
2304
2292
  config = {}
2305
- if os.path.exists(self.cluster_yaml):
2306
- # It is possible that the cluster yaml is not available when
2307
- # the handle is unpickled for service replicas from the
2308
- # controller with older version.
2309
- config = global_user_state.get_cluster_yaml_dict(
2310
- self.cluster_yaml)
2293
+ # It is possible that the cluster yaml is not available when
2294
+ # the handle is unpickled for service replicas from the
2295
+ # controller with older version.
2296
+ yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
2297
+ if yaml_str is None:
2298
+ # If the cluster yaml is not available,
2299
+ # we skip updating the cluster info.
2300
+ return
2301
+ config = yaml.safe_load(yaml_str)
2311
2302
  try:
2312
2303
  cluster_info = provision_lib.get_cluster_info(
2313
2304
  provider_name,
@@ -2500,6 +2491,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2500
2491
  'Tried to use cached cluster info, but it\'s missing for '
2501
2492
  f'cluster "{self.cluster_name}"')
2502
2493
  self._update_cluster_info()
2494
+ # For Kubernetes, `KubernetesCommandRunner` want to get the pod names
2495
+ # to run the command. But for high availability serve controller,
2496
+ # the controller pod is part of a deployment, and once the pod is
2497
+ # killed and a new one is created, the pod name changes, so we need
2498
+ # to manually update the cluster info here.
2499
+ # TODO(andyl): See if we can prevent this refresh. Like pass in
2500
+ # deployment name as identifier for KubernetesCommandRunner. Now this
2501
+ # is required for rsync as using deployment in rsync seems to cause
2502
+ # some unknown issues.
2503
+ # TODO(andyl): Should check through the real cluster info. Same as
2504
+ # the TODO in kubernetes/instance.py:terminate_instances
2505
+ if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
2506
+ controller_utils.high_availability_specified(
2507
+ self.cluster_name)):
2508
+ self._update_cluster_info()
2503
2509
 
2504
2510
  assert self.cached_cluster_info is not None, self
2505
2511
  runners = provision_lib.get_command_runners(
@@ -3178,7 +3184,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3178
3184
  # Capture task YAML and command
3179
3185
  task_config = None
3180
3186
  if task is not None:
3181
- task_config = task.to_yaml_config()
3187
+ task_config = task.to_yaml_config(redact_secrets=True)
3182
3188
 
3183
3189
  with timeline.Event('backend.provision.post_process'):
3184
3190
  global_user_state.add_or_update_cluster(
@@ -3302,7 +3308,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3302
3308
  runners = handle.get_command_runners(avoid_ssh_control=True)
3303
3309
 
3304
3310
  def _setup_node(node_id: int) -> None:
3305
- setup_envs = task.envs.copy()
3311
+ setup_envs = task.envs_and_secrets
3306
3312
  setup_envs.update(self._skypilot_predefined_env_vars(handle))
3307
3313
  setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
3308
3314
  setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
@@ -4290,29 +4296,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4290
4296
  # successfully removed cluster as no exception was raised
4291
4297
  returncode = 0
4292
4298
 
4293
- elif terminate and isinstance(cloud, clouds.SCP):
4294
- # pylint: disable=import-outside-toplevel
4295
- from sky.skylet.providers.scp import node_provider
4296
- config['provider']['cache_stopped_nodes'] = not terminate
4297
- provider = node_provider.SCPNodeProvider(config['provider'],
4298
- cluster_name_on_cloud)
4299
- try:
4300
- if not os.path.exists(provider.metadata.path):
4301
- raise node_provider.SCPError(
4302
- 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
4303
- 'Metadata file does not exist.')
4304
-
4305
- with open(provider.metadata.path, 'r', encoding='utf-8') as f:
4306
- metadata = json.load(f)
4307
- node_id = next(iter(metadata.values())).get(
4308
- 'creation', {}).get('virtualServerId', None)
4309
- provider.terminate_node(node_id)
4310
- returncode = 0
4311
- except node_provider.SCPError as e:
4312
- returncode = 1
4313
- stdout = ''
4314
- stderr = str(e)
4315
-
4316
4299
  else:
4317
4300
  config['provider']['cache_stopped_nodes'] = not terminate
4318
4301
  with tempfile.NamedTemporaryFile('w',
@@ -5185,7 +5168,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5185
5168
  def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
5186
5169
  handle: CloudVmRayResourceHandle) -> Dict[str, str]:
5187
5170
  """Returns the environment variables for the task."""
5188
- env_vars = task.envs.copy()
5171
+ env_vars = task.envs_and_secrets
5189
5172
  # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
5190
5173
  # by the controller.
5191
5174
  if constants.TASK_ID_ENV_VAR not in env_vars: