skypilot-nightly 1.0.0.dev20250818__py3-none-any.whl → 1.0.0.dev20250820__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +5 -3
- sky/backends/cloud_vm_ray_backend.py +6 -13
- sky/backends/wheel_utils.py +2 -1
- sky/catalog/data_fetchers/fetch_aws.py +2 -0
- sky/client/cli/command.py +20 -16
- sky/core.py +1 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-6cb1af4ec7fb1e19.js → 8969-23c8fbdb8b397d59.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-a46c8b62df807ec1.js → webpack-008593a02784a2df.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +6 -1
- sky/global_user_state.py +18 -11
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +43 -34
- sky/jobs/server/utils.py +2 -1
- sky/jobs/utils.py +56 -9
- sky/models.py +1 -0
- sky/provision/aws/config.py +11 -11
- sky/provision/aws/instance.py +30 -27
- sky/provision/do/utils.py +2 -2
- sky/provision/kubernetes/network_utils.py +3 -3
- sky/provision/kubernetes/utils.py +2 -2
- sky/provision/kubernetes/volume.py +2 -0
- sky/provision/provisioner.py +10 -6
- sky/serve/replica_managers.py +7 -0
- sky/serve/server/impl.py +1 -1
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/serializers/encoders.py +29 -5
- sky/server/server.py +37 -1
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +17 -11
- sky/skylet/ray_patches/__init__.py +18 -4
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/utils/common.py +27 -7
- sky/utils/common_utils.py +13 -9
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +3 -0
- sky/utils/kubernetes/gpu_labeler.py +3 -3
- sky/utils/schemas.py +1 -0
- sky/utils/serialize_utils.py +16 -0
- sky/volumes/client/sdk.py +10 -7
- sky/volumes/server/core.py +12 -3
- sky/volumes/volume.py +17 -3
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/METADATA +21 -13
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/RECORD +72 -63
- sky/dashboard/out/_next/static/chunks/3015-471d67c9302d4027.js +0 -1
- /sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py
CHANGED
@@ -498,8 +498,8 @@ def _vpc_id_from_security_group_ids(ec2: 'mypy_boto3_ec2.ServiceResource',
     return vpc_ids[0]
 
 
-def
-
+def get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
+                       region: str) -> str:
     """Returns the VPC ID of the unique VPC with a given name.
 
     Exits with code 1 if:
@@ -532,7 +532,7 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
                            use_internal_ips: bool,
                            vpc_name: Optional[str]) -> Tuple[Any, str]:
     if vpc_name is not None:
-        vpc_id_of_sg =
+        vpc_id_of_sg = get_vpc_id_by_name(ec2, vpc_name, region)
     elif security_group_ids:
         vpc_id_of_sg = _vpc_id_from_security_group_ids(ec2, security_group_ids)
     else:
@@ -614,8 +614,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
        due to AWS service issues.
     """
     # Figure out which security groups with this name exist for each VPC...
-    security_group =
-
+    security_group = get_security_group_from_vpc_id(ec2, vpc_id,
+                                                    expected_sg_name)
     if security_group is not None:
         return security_group
 
@@ -631,7 +631,7 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
         # The security group already exists, but we didn't see it
         # because of eventual consistency.
         logger.warning(f'{expected_sg_name} already exists when creating.')
-        security_group =
+        security_group = get_security_group_from_vpc_id(
             ec2, vpc_id, expected_sg_name)
         assert (security_group is not None and
                 security_group.group_name == expected_sg_name), (
@@ -646,8 +646,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
             logger.warning(message)
             raise exceptions.NoClusterLaunchedError(message) from e
 
-    security_group =
-
+    security_group = get_security_group_from_vpc_id(ec2, vpc_id,
+                                                    expected_sg_name)
     assert security_group is not None, 'Failed to create security group'
     logger.info(f'Created new security group {colorama.Style.BRIGHT}'
                 f'{security_group.group_name}{colorama.Style.RESET_ALL} '
@@ -655,9 +655,9 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
     return security_group
 
 
-def
-
-
+def get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
+                                   vpc_id: str,
+                                   group_name: str) -> Optional[Any]:
     """Get security group by VPC ID and group name."""
     existing_groups = list(
         ec2.security_groups.filter(Filters=[{
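The new helpers replace name-only lookups with VPC-scoped ones. A minimal standalone sketch of the same boto3 pattern (the diff truncates the filter body, so the exact filter keys here are an assumption, and the function name is illustrative, not SkyPilot's):

import boto3

def find_security_group(vpc_id: str, group_name: str):
    ec2 = boto3.resource('ec2')
    # 'vpc-id' and 'group-name' are standard EC2 filter names; filtering on
    # both pins the lookup to a single VPC instead of matching across VPCs.
    groups = list(
        ec2.security_groups.filter(Filters=[
            {'Name': 'vpc-id', 'Values': [vpc_id]},
            {'Name': 'group-name', 'Values': [group_name]},
        ]))
    return groups[0] if groups else None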
sky/provision/aws/instance.py
CHANGED
@@ -18,6 +18,7 @@ from sky.clouds import aws as aws_cloud
 from sky.clouds.utils import aws_utils
 from sky.provision import common
 from sky.provision import constants
+from sky.provision.aws import config as aws_config
 from sky.provision.aws import utils
 from sky.utils import common_utils
 from sky.utils import resources_utils
@@ -685,7 +686,9 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
-    default_sg =
+    default_sg = aws_config.get_security_group_from_vpc_id(
+        ec2, _get_vpc_id(provider_config),
+        aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
     if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
         # Case 1: The default SG is used, we don't need to ensure instance are
         # terminated.
@@ -727,30 +730,6 @@ def terminate_instances(
     # of most cloud implementations (including AWS).
 
 
-def _get_sg_from_name(
-    ec2: Any,
-    sg_name: str,
-) -> Any:
-    # GroupNames will only filter SGs in the default VPC, so we need to use
-    # Filters here. Ref:
-    # https://boto3.amazonaws.com/v1/documentation/api/1.26.112/reference/services/ec2/service-resource/security_groups.html # pylint: disable=line-too-long
-    sgs = ec2.security_groups.filter(Filters=[{
-        'Name': 'group-name',
-        'Values': [sg_name]
-    }])
-    num_sg = len(list(sgs))
-    if num_sg == 0:
-        logger.warning(f'Expected security group {sg_name} not found. ')
-        return None
-    if num_sg > 1:
-        # TODO(tian): Better handle this case. Maybe we can check when creating
-        # the SG and throw an error if there is already an existing SG with the
-        # same name.
-        logger.warning(f'Found {num_sg} security groups with name {sg_name}. ')
-        return None
-    return list(sgs)[0]
-
-
 def _maybe_move_to_new_sg(
     instance: Any,
     expected_sg: Any,
@@ -803,7 +782,9 @@ def open_ports(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Instance with cluster name '
                              f'{cluster_name_on_cloud} not found.')
-    sg =
+    sg = aws_config.get_security_group_from_vpc_id(ec2,
+                                                   _get_vpc_id(provider_config),
+                                                   sg_name)
     if sg is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Cannot find new security group '
@@ -899,7 +880,9 @@ def cleanup_ports(
        # We only want to delete the SG that is dedicated to this cluster (i.e.,
        # this cluster have opened some ports).
        return
-    sg =
+    sg = aws_config.get_security_group_from_vpc_id(ec2,
+                                                   _get_vpc_id(provider_config),
+                                                   sg_name)
     if sg is None:
         logger.warning(
             'Find security group failed. Skip cleanup security group.')
@@ -1010,3 +993,23 @@ def get_cluster_info(
         provider_name='aws',
         provider_config=provider_config,
     )
+
+
+def _get_vpc_id(provider_config: Dict[str, Any]) -> str:
+    region = provider_config['region']
+    ec2 = _default_ec2_resource(provider_config['region'])
+    if 'vpc_name' in provider_config:
+        return aws_config.get_vpc_id_by_name(ec2, provider_config['vpc_name'],
+                                             region)
+    else:
+        # Retrieve the default VPC name from the region.
+        response = ec2.meta.client.describe_vpcs(Filters=[{
+            'Name': 'isDefault',
+            'Values': ['true']
+        }])
+        if len(response['Vpcs']) == 0:
+            raise ValueError(f'No default VPC found in region {region}')
+        elif len(response['Vpcs']) > 1:
+            raise ValueError(f'Multiple default VPCs found in region {region}')
+        else:
+            return response['Vpcs'][0]['VpcId']
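Note how `_get_vpc_id` reaches the low-level client through `ec2.meta.client`, which is the standard boto3 way to issue client calls from a service resource. A minimal standalone sketch of the same default-VPC lookup (region is illustrative):

import boto3

ec2 = boto3.resource('ec2', region_name='us-east-1')
# Every boto3 resource carries its underlying client on .meta.client.
response = ec2.meta.client.describe_vpcs(
    Filters=[{'Name': 'isDefault', 'Values': ['true']}])
default_vpcs = response['Vpcs']  # normally exactly one per region
if default_vpcs:
    print(default_vpcs[0]['VpcId'])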
sky/provision/do/utils.py
CHANGED
@@ -30,7 +30,7 @@ POSSIBLE_CREDENTIALS_PATHS = [
 INITIAL_BACKOFF_SECONDS = 10
 MAX_BACKOFF_FACTOR = 10
 MAX_ATTEMPTS = 6
-
+SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'
 
 _client = None
 _ssh_key_id = None
@@ -125,7 +125,7 @@ def ssh_key_id(public_key: str):
 
     request = {
         'public_key': public_key,
-        'name':
+        'name': SSH_KEY_NAME_ON_DO_PREFIX + common_utils.get_user_hash(),
     }
     _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
     return _ssh_key_id
sky/provision/kubernetes/network_utils.py
CHANGED
@@ -4,13 +4,13 @@ import time
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
-import sky
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import kubernetes
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import directory_utils
 from sky.utils import kubernetes_enums
 from sky.utils import ux_utils
 
@@ -80,7 +80,7 @@ def get_networking_mode(
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict:
-    template_path = os.path.join(
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  _LOADBALANCER_TEMPLATE_NAME)
     if not os.path.exists(template_path):
         raise FileNotFoundError(
@@ -116,7 +116,7 @@ def fill_ingress_template(namespace: str, context: Optional[str],
                           service_details: List[Tuple[str, int,
                                                       str]], ingress_name: str,
                           selector_key: str, selector_value: str) -> Dict:
-    template_path = os.path.join(
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  _INGRESS_TEMPLATE_NAME)
     if not os.path.exists(template_path):
         raise FileNotFoundError(
sky/provision/kubernetes/utils.py
CHANGED
@@ -14,7 +14,6 @@ import typing
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
 
-import sky
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
@@ -31,6 +30,7 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
+from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -2444,7 +2444,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
 
 def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
                            ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  'kubernetes-ssh-jump.yml.j2')
     if not os.path.exists(template_path):
         raise FileNotFoundError(
sky/provision/kubernetes/volume.py
CHANGED
@@ -203,6 +203,8 @@ def _get_pvc_spec(namespace: str,
             },
         }
     }
+    if config.labels:
+        pvc_spec['metadata']['labels'].update(config.labels)
     storage_class = config.config.get('storage_class_name')
     if storage_class is not None:
         pvc_spec['spec']['storageClassName'] = storage_class
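The volume.py change merges user-supplied labels into the PVC metadata before the spec is submitted. A standalone sketch of the merge (the field values are hypothetical and the real pvc_spec carries more fields):

pvc_spec = {'metadata': {'name': 'sky-volume', 'labels': {'app': 'skypilot'}}}
config_labels = {'team': 'ml-infra'}  # hypothetical labels from the volume config
if config_labels:
    # dict.update overwrites duplicate keys, so user labels win on conflict.
    pvc_spec['metadata']['labels'].update(config_labels)
assert pvc_spec['metadata']['labels'] == {'app': 'skypilot', 'team': 'ml-infra'}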
sky/provision/provisioner.py
CHANGED
@@ -167,7 +167,7 @@ def bulk_provision(
             # This error is a user error instead of a provisioning failure.
             # And there is no possibility to fix it by teardown.
             raise
-        except Exception:  # pylint: disable=broad-except
+        except Exception as exc:  # pylint: disable=broad-except
             zone_str = 'all zones'
             if zones:
                 zone_str = ','.join(zone.name for zone in zones)
@@ -189,14 +189,18 @@ def bulk_provision(
                     provider_config=original_config['provider'])
                 break
             except NotImplementedError as e:
-
+                assert not terminate, (
+                    'Terminating must be supported by all clouds')
+                exc_msg = common_utils.format_exception(exc).replace(
+                    '\n', ' ')
                 # If the underlying cloud does not support stopping
                 # instances, we should stop failover as well.
                 raise provision_common.StopFailoverError(
-                    '
-                    f'
-
-                    f'
+                    f'Provisioning cluster {cluster_name.display_name} '
+                    f'failed: {exc_msg}. Failover is stopped for safety '
+                    'because the cluster was previously in UP state but '
+                    f'{cloud} does not support stopping instances to '
+                    'preserve the cluster state. Please try launching the '
                     'cluster again, or terminate it with: '
                     f'sky down {cluster_name.display_name}') from e
             except Exception as e:  # pylint: disable=broad-except
sky/serve/replica_managers.py
CHANGED
@@ -48,6 +48,13 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
+# TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
+# 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
+# old ReplicaInfo in database will still tries to unpickle using ProcessStatus
+# in replica_managers. We set this alias to avoid breaking changes. See #6729
+# for more details.
+ProcessStatus = common_utils.ProcessStatus
+
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
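The alias works because pickle stores a class by its defining module and qualified name, not by value, so old database rows that reference the class's former location fail to load once it moves. A standalone illustration:

import pickle
import pickletools

class ProcessStatus:  # stand-in for the class that moved to common_utils
    pass

blob = pickle.dumps(ProcessStatus())
# The stream records the strings '__main__' and 'ProcessStatus'; loading
# re-imports that module and looks the attribute up by name.
pickletools.dis(blob)
# If the class moves and no alias remains at the old path, pickle.loads(blob)
# raises AttributeError. The module-level alias above avoids exactly that.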
sky/serve/server/impl.py
CHANGED
@@ -129,11 +129,11 @@ def up(
             f'{constants.CLUSTER_NAME_VALID_REGEX}')
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
-    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(dag)
+    dag.resolve_and_validate_volumes()
     dag.pre_mount_volumes()
     task = dag.tasks[0]
     assert task.service is not None
sky/server/requests/payloads.py
CHANGED
@@ -453,6 +453,7 @@ class VolumeApplyBody(RequestBody):
     zone: Optional[str] = None
     size: Optional[str] = None
     config: Optional[Dict[str, Any]] = None
+    labels: Optional[Dict[str, str]] = None
 
 
 class VolumeDeleteBody(RequestBody):
@@ -503,6 +504,7 @@ class JobsQueueBody(RequestBody):
     pool_match: Optional[str] = None
     page: Optional[int] = None
     limit: Optional[int] = None
+    statuses: Optional[List[str]] = None
 
 
 class JobsCancelBody(RequestBody):
sky/server/requests/serializers/encoders.py
CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 from sky.schemas.api import responses
 from sky.server import constants as server_constants
+from sky.utils import serialize_utils
 
 if typing.TYPE_CHECKING:
     from sky import backends
@@ -22,6 +23,9 @@ handlers: Dict[str, Any] = {}
 
 def pickle_and_encode(obj: Any) -> str:
     try:
+        # Apply backwards compatibility processing at the lowest level
+        # to catch any handles that might have bypassed the encoders
+        obj = serialize_utils.prepare_handle_for_backwards_compatibility(obj)
         return base64.b64encode(pickle.dumps(obj)).decode('utf-8')
     except TypeError as e:
         raise ValueError(f'Failed to pickle object: {obj}') from e
@@ -58,7 +62,9 @@ def encode_status(
     for cluster in clusters:
         response_cluster = cluster.model_dump()
         response_cluster['status'] = cluster['status'].value
-
+        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
+            cluster['handle'])
+        response_cluster['handle'] = pickle_and_encode(handle)
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
             response_cluster['storage_mounts_metadata'])
         response.append(response_cluster)
@@ -70,6 +76,7 @@ def encode_launch(
     job_id_handle: Tuple[Optional[int], Optional['backends.ResourceHandle']]
 ) -> Dict[str, Any]:
     job_id, handle = job_id_handle
+    handle = serialize_utils.prepare_handle_for_backwards_compatibility(handle)
     return {
         'job_id': job_id,
         'handle': pickle_and_encode(handle),
@@ -78,6 +85,9 @@ def encode_launch(
 
 @register_encoder('start')
 def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
+    resource_handle = (
+        serialize_utils.prepare_handle_for_backwards_compatibility(
+            resource_handle))
     return pickle_and_encode(resource_handle)
 
 
@@ -113,8 +123,15 @@ def encode_status_kubernetes(
 @register_encoder('jobs.queue')
 def encode_jobs_queue(jobs_or_tuple):
     # Support returning either a plain jobs list or a (jobs, total) tuple
-
-
+    status_counts = {}
+    if isinstance(jobs_or_tuple, tuple):
+        if len(jobs_or_tuple) == 2:
+            jobs, total = jobs_or_tuple
+            total_no_filter = total
+        elif len(jobs_or_tuple) == 4:
+            jobs, total, status_counts, total_no_filter = jobs_or_tuple
+        else:
+            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')
     else:
         jobs = jobs_or_tuple
         total = None
@@ -122,7 +139,12 @@ def encode_jobs_queue(jobs_or_tuple):
         job['status'] = job['status'].value
     if total is None:
         return jobs
-    return {
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }
 
 
 def _encode_serve_status(
@@ -131,7 +153,9 @@ def _encode_serve_status(
     service_status['status'] = service_status['status'].value
     for replica_info in service_status.get('replica_info', []):
         replica_info['status'] = replica_info['status'].value
-
+        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
+            replica_info['handle'])
+        replica_info['handle'] = pickle_and_encode(handle)
     return service_statuses
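The encoder pipeline's core is unchanged: pickle the object, then base64 it so the bytes survive a JSON response. A self-contained round trip mirroring `pickle_and_encode` (the compatibility shim is elided; `decode_and_unpickle` is a name invented here for symmetry):

import base64
import pickle

def pickle_and_encode(obj) -> str:
    return base64.b64encode(pickle.dumps(obj)).decode('utf-8')

def decode_and_unpickle(payload: str):
    return pickle.loads(base64.b64decode(payload.encode('utf-8')))

assert decode_and_unpickle(pickle_and_encode({'job_id': 42})) == {'job_id': 42}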
sky/server/server.py
CHANGED
@@ -83,6 +83,8 @@ else:
 
 P = ParamSpec('P')
 
+_SERVER_USER_HASH_KEY = 'server_user_hash'
+
 
 def _add_timestamp_prefix_for_server_logs() -> None:
     server_logger = sky_logging.init_logger('sky.server')
@@ -1650,7 +1652,10 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
 
-
+    # Run core.status in another thread to avoid blocking the event loop.
+    cluster_records = await context_utils.to_thread(core.status,
+                                                    cluster_name,
+                                                    all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
@@ -1818,6 +1823,35 @@ async def root():
     return fastapi.responses.RedirectResponse(url='/dashboard/')
 
 
+def _init_or_restore_server_user_hash():
+    """Restores the server user hash from the global user state db.
+
+    The API server must have a stable user hash across restarts and potential
+    multiple replicas. Thus we persist the user hash in db and restore it on
+    startup. When upgrading from old version, the user hash will be read from
+    the local file (if any) to keep the user hash consistent.
+    """
+
+    def apply_user_hash(user_hash: str) -> None:
+        # For local API server, the user hash in db and local file should be
+        # same so there is no harm to override here.
+        common_utils.set_user_hash_locally(user_hash)
+        # Refresh the server user hash for current process after restore or
+        # initialize the user hash in db, child processes will get the correct
+        # server id from the local cache file.
+        common_lib.refresh_server_id()
+
+    user_hash = global_user_state.get_system_config(_SERVER_USER_HASH_KEY)
+    if user_hash is not None:
+        apply_user_hash(user_hash)
+        return
+
+    # Initial deployment, generate a user hash and save it to the db.
+    user_hash = common_utils.get_user_hash()
+    global_user_state.set_system_config(_SERVER_USER_HASH_KEY, user_hash)
+    apply_user_hash(user_hash)
+
+
 if __name__ == '__main__':
     import uvicorn
 
@@ -1827,6 +1861,8 @@ if __name__ == '__main__':
     global_user_state.initialize_and_get_db()
     # Initialize request db
     requests_lib.reset_db_and_logs()
+    # Restore the server user hash
+    _init_or_restore_server_user_hash()
 
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', default='127.0.0.1')
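The `kubernetes_pod_ssh_proxy` change moves a blocking `core.status` call off the event loop. SkyPilot routes this through its own `context_utils.to_thread`; a sketch of the underlying pattern using the stdlib equivalent (the lookup function is a stand-in):

import asyncio
import time

def blocking_status_lookup(cluster_name: str) -> dict:
    time.sleep(1)  # stands in for the slow, synchronous core.status() call
    return {'name': cluster_name, 'status': 'UP'}

async def handler() -> None:
    # The event loop keeps serving other requests while the thread blocks.
    record = await asyncio.to_thread(blocking_status_lookup, 'my-cluster')
    print(record)

asyncio.run(handler())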
sky/setup_files/MANIFEST.in
CHANGED
@@ -9,6 +9,7 @@ include sky/skylet/providers/ibm/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
+include sky/skylet/ray_patches/*.diff
 include sky/jobs/dashboard/*
 include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
sky/setup_files/dependencies.py
CHANGED
@@ -72,12 +72,27 @@ install_requires = [
     'aiohttp',
 ]
 
+# See requirements-dev.txt for the version of grpc and protobuf
+# used to generate the code during development.
+# The grpc version at runtime has to be newer than the version
+# used to generate the code.
+GRPC = 'grpcio>=1.63.0'
+# >= 5.26.1 because the runtime version can't be older than the version
+# used to generate the code.
+# < 7.0.0 because code generated for a major version V will be supported by
+# protobuf runtimes of version V and V+1.
+# https://protobuf.dev/support/cross-version-runtime-guarantee
+PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
+
 server_dependencies = [
     'casbin',
     'sqlalchemy_adapter',
     'passlib',
     'pyjwt',
     'aiohttp',
+    GRPC,
+    PROTOBUF,
 ]
 
 local_ray = [
@@ -88,18 +103,9 @@ local_ray = [
     'ray[default] >= 2.2.0, != 2.6.0',
 ]
 
-# See requirements-dev.txt for the version of grpc and protobuf
-# used to generate the code during development.
 remote = [
-
-
-    'grpcio>=1.63.0',
-    # >= 5.26.1 because the runtime version can't be older than the version
-    # used to generate the code.
-    # < 7.0.0 because code generated for a major version V will be supported by
-    # protobuf runtimes of version V and V+1.
-    # https://protobuf.dev/support/cross-version-runtime-guarantee
-    'protobuf >= 5.26.1, < 7.0.0',
+    GRPC,
+    PROTOBUF,
 ]
 
 # NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
sky/skylet/ray_patches/__init__.py
CHANGED
@@ -40,15 +40,29 @@ def _run_patch(target_file,
     """Applies a patch if it has not been applied already."""
     # .orig is the original file that is not patched.
     orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
+    # Get diff filename by replacing .patch with .diff
+    diff_file = patch_file.replace('.patch', '.diff')
+
     script = f"""\
-        which patch >/dev/null 2>&1 || sudo yum install -y patch ||
-        which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
+        which patch >/dev/null 2>&1 || sudo yum install -y patch || true
         if [ ! -f {orig_file} ]; then
             echo Create backup file {orig_file}
             cp {target_file} {orig_file}
         fi
-
-
+        if which patch >/dev/null 2>&1; then
+            # System patch command is available, use it
+            # It is ok to patch again from the original file.
+            patch {orig_file} -i {patch_file} -o {target_file}
+        else
+            # System patch command not available, use Python patch library
+            echo "System patch command not available, using Python patch library..."
+            python -m pip install patch
+            # Get target directory
+            target_dir="$(dirname {target_file})"
+            # Execute python patch command
+            echo "Executing python -m patch -d $target_dir {diff_file}"
+            python -m patch -d "$target_dir" "{diff_file}"
+        fi
     """
     subprocess.run(script, shell=True, check=True)
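The fallback branch relies on the PyPI `patch` package, whose `python -m patch -d DIR FILE.diff` command line the script invokes. The same package also exposes a small programmatic API; the sketch below assumes that package's documented `fromfile`/`apply` interface, and the target directory is hypothetical:

import patch  # pip install patch

patch_set = patch.fromfile('worker.py.diff')  # parse a unified diff; False on error
if not patch_set or not patch_set.apply(root='/path/to/ray/_private'):
    raise RuntimeError('Failed to apply worker.py.diff')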
sky/skylet/ray_patches/autoscaler.py.diff
ADDED
@@ -0,0 +1,18 @@
+--- a/autoscaler.py
++++ b/autoscaler.py
+@@ -1,3 +1,6 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
++# Sky patch changes:
++# - enable upscaling_speed to be 0.0
+ import copy
+ import logging
+ import math
+@@ -1071,7 +1074,7 @@
+     upscaling_speed = self.config.get("upscaling_speed")
+     aggressive = self.config.get("autoscaling_mode") == "aggressive"
+     target_utilization_fraction = self.config.get("target_utilization_fraction")
+-    if upscaling_speed:
++    if upscaling_speed is not None:  # NOTE(sky): enable 0.0
+         upscaling_speed = float(upscaling_speed)
+     # TODO(ameer): consider adding (if users ask) an option of
+     # initial_upscaling_num_workers.
sky/skylet/ray_patches/cli.py.diff
ADDED
@@ -0,0 +1,19 @@
+--- a/cli.py
++++ b/cli.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
++# Otherwise, the output redirection ">" will not work.
++
+ import json
+ import os
+ import sys
+@@ -270,7 +274,7 @@
+         working_dir=working_dir,
+     )
+     job_id = client.submit_job(
+-        entrypoint=list2cmdline(entrypoint),
++        entrypoint=" ".join(entrypoint),
+         submission_id=submission_id,
+         runtime_env=final_runtime_env,
+         metadata=metadata_json,
sky/skylet/ray_patches/command_runner.py.diff
ADDED
@@ -0,0 +1,17 @@
+--- a/command_runner.py
++++ b/command_runner.py
+@@ -1,3 +1,5 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
++
+ import hashlib
+ import json
+ import logging
+@@ -137,7 +139,7 @@
+     {
+         "ControlMaster": "auto",
+         "ControlPath": "{}/%C".format(control_path),
+-        "ControlPersist": "10s",
++        "ControlPersist": "300s",
+     }
+ )
+ self.arg_dict.update(kwargs)
sky/skylet/ray_patches/log_monitor.py.diff
ADDED
@@ -0,0 +1,20 @@
+--- a/log_monitor.py
++++ b/log_monitor.py
+@@ -1,3 +1,7 @@
++# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
++# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
++# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
++
+ import argparse
+ import errno
+ import glob
+@@ -374,7 +378,8 @@
+     next_line = next_line.decode("utf-8", "replace")
+     if next_line == "":
+         break
+-    next_line = next_line.rstrip("\r\n")
++    if next_line.endswith("\n"):
++        next_line = next_line[:-1]
+
+     if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
+         flush()  # Possible change of task/actor name.