skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/cloud_vm_ray_backend.py +43 -60
- sky/cli.py +55 -637
- sky/client/cli.py +55 -637
- sky/clouds/kubernetes.py +3 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/provision/__init__.py +1 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/skylet/constants.py +39 -0
- sky/skylet/job_lib.py +8 -0
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +51 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +19 -36
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +43 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'fa78e63ee618b8695df1bca87911a231cce3d7da'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250616'
|
38
|
+
__version__ = '1.0.0.dev20250617'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -81,7 +81,6 @@ _set_http_proxy_env_vars()
|
|
81
81
|
# Keep this order to avoid cyclic imports
|
82
82
|
# pylint: disable=wrong-import-position
|
83
83
|
from sky import backends
|
84
|
-
from sky import benchmark
|
85
84
|
from sky import clouds
|
86
85
|
from sky.admin_policy import AdminPolicy
|
87
86
|
from sky.admin_policy import MutatedUserRequest
|
@@ -168,7 +167,6 @@ __all__ = [
|
|
168
167
|
'Optimizer',
|
169
168
|
'OptimizeTarget',
|
170
169
|
'backends',
|
171
|
-
'benchmark',
|
172
170
|
'list_accelerators',
|
173
171
|
'__root_dir__',
|
174
172
|
'Storage',
|
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -21,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
|
|
21
21
|
|
22
22
|
import colorama
|
23
23
|
import filelock
|
24
|
+
import yaml
|
24
25
|
|
25
26
|
import sky
|
26
27
|
from sky import backends
|
@@ -786,34 +787,6 @@ class FailoverCloudErrorHandlerV1:
|
|
786
787
|
setattr(e, 'detailed_reason', detailed_reason)
|
787
788
|
raise e
|
788
789
|
|
789
|
-
@staticmethod
|
790
|
-
def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
|
791
|
-
launchable_resources: 'resources_lib.Resources',
|
792
|
-
region: 'clouds.Region',
|
793
|
-
zones: Optional[List['clouds.Zone']], stdout: str,
|
794
|
-
stderr: str):
|
795
|
-
del zones # Unused.
|
796
|
-
errors = FailoverCloudErrorHandlerV1._handle_errors(
|
797
|
-
stdout,
|
798
|
-
stderr,
|
799
|
-
is_error_str_known=lambda x: 'SCPError:' in x.strip())
|
800
|
-
|
801
|
-
logger.warning(f'Got error(s) in {region.name}:')
|
802
|
-
messages = '\n\t'.join(errors)
|
803
|
-
style = colorama.Style
|
804
|
-
logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
|
805
|
-
_add_to_blocked_resources(blocked_resources,
|
806
|
-
launchable_resources.copy(zone=None))
|
807
|
-
|
808
|
-
# Sometimes, SCPError will list available regions.
|
809
|
-
for e in errors:
|
810
|
-
if e.find('Regions with capacity available:') != -1:
|
811
|
-
for r in catalog.regions('scp'):
|
812
|
-
if e.find(r.name) == -1:
|
813
|
-
_add_to_blocked_resources(
|
814
|
-
blocked_resources,
|
815
|
-
launchable_resources.copy(region=r.name, zone=None))
|
816
|
-
|
817
790
|
@staticmethod
|
818
791
|
def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
|
819
792
|
launchable_resources: 'resources_lib.Resources',
|
@@ -1117,6 +1090,21 @@ class FailoverCloudErrorHandlerV2:
|
|
1117
1090
|
FailoverCloudErrorHandlerV2._default_handler(
|
1118
1091
|
blocked_resources, launchable_resources, region, zones, error)
|
1119
1092
|
|
1093
|
+
@staticmethod
|
1094
|
+
def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
|
1095
|
+
launchable_resources: 'resources_lib.Resources',
|
1096
|
+
region: 'clouds.Region',
|
1097
|
+
zones: Optional[List['clouds.Zone']],
|
1098
|
+
error: Exception) -> None:
|
1099
|
+
logger.info(f'SCP handler error: {error}')
|
1100
|
+
# Block SCP if the credential has expired.
|
1101
|
+
if isinstance(error, exceptions.InvalidCloudCredentials):
|
1102
|
+
_add_to_blocked_resources(
|
1103
|
+
blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
|
1104
|
+
else:
|
1105
|
+
FailoverCloudErrorHandlerV2._default_handler(
|
1106
|
+
blocked_resources, launchable_resources, region, zones, error)
|
1107
|
+
|
1120
1108
|
@staticmethod
|
1121
1109
|
def _default_handler(blocked_resources: Set['resources_lib.Resources'],
|
1122
1110
|
launchable_resources: 'resources_lib.Resources',
|
@@ -2302,12 +2290,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2302
2290
|
clouds.ProvisionerVersion.SKYPILOT):
|
2303
2291
|
provider_name = str(self.launched_resources.cloud).lower()
|
2304
2292
|
config = {}
|
2305
|
-
|
2306
|
-
|
2307
|
-
|
2308
|
-
|
2309
|
-
|
2310
|
-
|
2293
|
+
# It is possible that the cluster yaml is not available when
|
2294
|
+
# the handle is unpickled for service replicas from the
|
2295
|
+
# controller with older version.
|
2296
|
+
yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
|
2297
|
+
if yaml_str is None:
|
2298
|
+
# If the cluster yaml is not available,
|
2299
|
+
# we skip updating the cluster info.
|
2300
|
+
return
|
2301
|
+
config = yaml.safe_load(yaml_str)
|
2311
2302
|
try:
|
2312
2303
|
cluster_info = provision_lib.get_cluster_info(
|
2313
2304
|
provider_name,
|
@@ -2500,6 +2491,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2500
2491
|
'Tried to use cached cluster info, but it\'s missing for '
|
2501
2492
|
f'cluster "{self.cluster_name}"')
|
2502
2493
|
self._update_cluster_info()
|
2494
|
+
# For Kubernetes, `KubernetesCommandRunner` want to get the pod names
|
2495
|
+
# to run the command. But for high availability serve controller,
|
2496
|
+
# the controller pod is part of a deployment, and once the pod is
|
2497
|
+
# killed and a new one is created, the pod name changes, so we need
|
2498
|
+
# to manually update the cluster info here.
|
2499
|
+
# TODO(andyl): See if we can prevent this refresh. Like pass in
|
2500
|
+
# deployment name as identifier for KubernetesCommandRunner. Now this
|
2501
|
+
# is required for rsync as using deployment in rsync seems to cause
|
2502
|
+
# some unknown issues.
|
2503
|
+
# TODO(andyl): Should check through the real cluster info. Same as
|
2504
|
+
# the TODO in kubernetes/instance.py:terminate_instances
|
2505
|
+
if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
|
2506
|
+
controller_utils.high_availability_specified(
|
2507
|
+
self.cluster_name)):
|
2508
|
+
self._update_cluster_info()
|
2503
2509
|
|
2504
2510
|
assert self.cached_cluster_info is not None, self
|
2505
2511
|
runners = provision_lib.get_command_runners(
|
@@ -3178,7 +3184,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3178
3184
|
# Capture task YAML and command
|
3179
3185
|
task_config = None
|
3180
3186
|
if task is not None:
|
3181
|
-
task_config = task.to_yaml_config()
|
3187
|
+
task_config = task.to_yaml_config(redact_secrets=True)
|
3182
3188
|
|
3183
3189
|
with timeline.Event('backend.provision.post_process'):
|
3184
3190
|
global_user_state.add_or_update_cluster(
|
@@ -3302,7 +3308,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3302
3308
|
runners = handle.get_command_runners(avoid_ssh_control=True)
|
3303
3309
|
|
3304
3310
|
def _setup_node(node_id: int) -> None:
|
3305
|
-
setup_envs = task.
|
3311
|
+
setup_envs = task.envs_and_secrets
|
3306
3312
|
setup_envs.update(self._skypilot_predefined_env_vars(handle))
|
3307
3313
|
setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
|
3308
3314
|
setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
|
@@ -4290,29 +4296,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4290
4296
|
# successfully removed cluster as no exception was raised
|
4291
4297
|
returncode = 0
|
4292
4298
|
|
4293
|
-
elif terminate and isinstance(cloud, clouds.SCP):
|
4294
|
-
# pylint: disable=import-outside-toplevel
|
4295
|
-
from sky.skylet.providers.scp import node_provider
|
4296
|
-
config['provider']['cache_stopped_nodes'] = not terminate
|
4297
|
-
provider = node_provider.SCPNodeProvider(config['provider'],
|
4298
|
-
cluster_name_on_cloud)
|
4299
|
-
try:
|
4300
|
-
if not os.path.exists(provider.metadata.path):
|
4301
|
-
raise node_provider.SCPError(
|
4302
|
-
'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
|
4303
|
-
'Metadata file does not exist.')
|
4304
|
-
|
4305
|
-
with open(provider.metadata.path, 'r', encoding='utf-8') as f:
|
4306
|
-
metadata = json.load(f)
|
4307
|
-
node_id = next(iter(metadata.values())).get(
|
4308
|
-
'creation', {}).get('virtualServerId', None)
|
4309
|
-
provider.terminate_node(node_id)
|
4310
|
-
returncode = 0
|
4311
|
-
except node_provider.SCPError as e:
|
4312
|
-
returncode = 1
|
4313
|
-
stdout = ''
|
4314
|
-
stderr = str(e)
|
4315
|
-
|
4316
4299
|
else:
|
4317
4300
|
config['provider']['cache_stopped_nodes'] = not terminate
|
4318
4301
|
with tempfile.NamedTemporaryFile('w',
|
@@ -5185,7 +5168,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
5185
5168
|
def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
|
5186
5169
|
handle: CloudVmRayResourceHandle) -> Dict[str, str]:
|
5187
5170
|
"""Returns the environment variables for the task."""
|
5188
|
-
env_vars = task.
|
5171
|
+
env_vars = task.envs_and_secrets
|
5189
5172
|
# If it is a managed job, the TASK_ID_ENV_VAR will have been already set
|
5190
5173
|
# by the controller.
|
5191
5174
|
if constants.TASK_ID_ENV_VAR not in env_vars:
|