skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250831__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +24 -2
- sky/backends/backend_utils.py +152 -59
- sky/backends/cloud_vm_ray_backend.py +56 -3
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +17 -6
- sky/client/common.py +5 -4
- sky/client/sdk.py +5 -0
- sky/client/sdk_async.py +8 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +8 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +58 -10
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/docker_utils.py +1 -1
- sky/provision/kubernetes/utils.py +48 -26
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/server/common.py +1 -2
- sky/server/daemons.py +6 -0
- sky/server/requests/executor.py +2 -1
- sky/server/requests/payloads.py +4 -1
- sky/server/server.py +67 -58
- sky/setup_files/dependencies.py +25 -8
- sky/setup_files/setup.py +2 -0
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +6 -0
- sky/utils/timeline.py +24 -93
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/METADATA +36 -48
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/RECORD +59 -57
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
|
@@ -206,6 +206,7 @@ cluster_event_table = sqlalchemy.Table(
|
|
|
206
206
|
sqlalchemy.Column('reason', sqlalchemy.Text, primary_key=True),
|
|
207
207
|
sqlalchemy.Column('transitioned_at', sqlalchemy.Integer, primary_key=True),
|
|
208
208
|
sqlalchemy.Column('type', sqlalchemy.Text),
|
|
209
|
+
sqlalchemy.Column('request_id', sqlalchemy.Text, server_default=None),
|
|
209
210
|
)
|
|
210
211
|
|
|
211
212
|
ssh_key_table = sqlalchemy.Table(
|
|
@@ -745,6 +746,7 @@ def add_cluster_event(cluster_name: str,
|
|
|
745
746
|
elif last_event == reason:
|
|
746
747
|
return
|
|
747
748
|
try:
|
|
749
|
+
request_id = common_utils.get_current_request_id()
|
|
748
750
|
session.execute(
|
|
749
751
|
insert_func(cluster_event_table).values(
|
|
750
752
|
cluster_hash=cluster_hash,
|
|
@@ -754,6 +756,7 @@ def add_cluster_event(cluster_name: str,
|
|
|
754
756
|
reason=reason,
|
|
755
757
|
transitioned_at=transitioned_at,
|
|
756
758
|
type=event_type.value,
|
|
759
|
+
request_id=request_id,
|
|
757
760
|
))
|
|
758
761
|
session.commit()
|
|
759
762
|
except sqlalchemy.exc.IntegrityError as e:
|
|
@@ -2082,19 +2085,51 @@ def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
|
|
|
2082
2085
|
row = session.query(cluster_yaml_table).filter_by(
|
|
2083
2086
|
cluster_name=cluster_name).first()
|
|
2084
2087
|
if row is None:
|
|
2085
|
-
|
|
2086
|
-
# on the local file system and migrate it to the database.
|
|
2087
|
-
# TODO(syang): remove this check once we have a way to migrate the
|
|
2088
|
-
# cluster from file to database. Remove on v0.12.0.
|
|
2089
|
-
if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
|
|
2090
|
-
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2091
|
-
yaml_str = f.read()
|
|
2092
|
-
set_cluster_yaml(cluster_name, yaml_str)
|
|
2093
|
-
return yaml_str
|
|
2094
|
-
return None
|
|
2088
|
+
return _set_cluster_yaml_from_file(cluster_yaml_path, cluster_name)
|
|
2095
2089
|
return row.yaml
|
|
2096
2090
|
|
|
2097
2091
|
|
|
2092
|
+
def get_cluster_yaml_str_multiple(cluster_yaml_paths: List[str]) -> List[str]:
|
|
2093
|
+
"""Get the cluster yaml from the database or the local file system.
|
|
2094
|
+
"""
|
|
2095
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
2096
|
+
cluster_names_to_yaml_paths = {}
|
|
2097
|
+
for cluster_yaml_path in cluster_yaml_paths:
|
|
2098
|
+
cluster_name, _ = os.path.splitext(os.path.basename(cluster_yaml_path))
|
|
2099
|
+
cluster_names_to_yaml_paths[cluster_name] = cluster_yaml_path
|
|
2100
|
+
|
|
2101
|
+
cluster_names = list(cluster_names_to_yaml_paths.keys())
|
|
2102
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
2103
|
+
rows = session.query(cluster_yaml_table).filter(
|
|
2104
|
+
cluster_yaml_table.c.cluster_name.in_(cluster_names)).all()
|
|
2105
|
+
row_cluster_names_to_yaml = {row.cluster_name: row.yaml for row in rows}
|
|
2106
|
+
|
|
2107
|
+
yaml_strs = []
|
|
2108
|
+
for cluster_name in cluster_names:
|
|
2109
|
+
if cluster_name in row_cluster_names_to_yaml:
|
|
2110
|
+
yaml_strs.append(row_cluster_names_to_yaml[cluster_name])
|
|
2111
|
+
else:
|
|
2112
|
+
yaml_str = _set_cluster_yaml_from_file(
|
|
2113
|
+
cluster_names_to_yaml_paths[cluster_name], cluster_name)
|
|
2114
|
+
yaml_strs.append(yaml_str)
|
|
2115
|
+
return yaml_strs
|
|
2116
|
+
|
|
2117
|
+
|
|
2118
|
+
def _set_cluster_yaml_from_file(cluster_yaml_path: str,
|
|
2119
|
+
cluster_name: str) -> Optional[str]:
|
|
2120
|
+
"""Set the cluster yaml in the database from a file."""
|
|
2121
|
+
# If the cluster yaml is not in the database, check if it exists
|
|
2122
|
+
# on the local file system and migrate it to the database.
|
|
2123
|
+
# TODO(syang): remove this check once we have a way to migrate the
|
|
2124
|
+
# cluster from file to database. Remove on v0.12.0.
|
|
2125
|
+
if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
|
|
2126
|
+
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2127
|
+
yaml_str = f.read()
|
|
2128
|
+
set_cluster_yaml(cluster_name, yaml_str)
|
|
2129
|
+
return yaml_str
|
|
2130
|
+
return None
|
|
2131
|
+
|
|
2132
|
+
|
|
2098
2133
|
def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
|
|
2099
2134
|
"""Get the cluster yaml as a dictionary from the database.
|
|
2100
2135
|
|
|
@@ -2106,6 +2141,19 @@ def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
|
|
|
2106
2141
|
return yaml_utils.safe_load(yaml_str)
|
|
2107
2142
|
|
|
2108
2143
|
|
|
2144
|
+
def get_cluster_yaml_dict_multiple(
|
|
2145
|
+
cluster_yaml_paths: List[str]) -> List[Dict[str, Any]]:
|
|
2146
|
+
"""Get the cluster yaml as a dictionary from the database."""
|
|
2147
|
+
yaml_strs = get_cluster_yaml_str_multiple(cluster_yaml_paths)
|
|
2148
|
+
yaml_dicts = []
|
|
2149
|
+
for idx, yaml_str in enumerate(yaml_strs):
|
|
2150
|
+
if yaml_str is None:
|
|
2151
|
+
raise ValueError(
|
|
2152
|
+
f'Cluster yaml {cluster_yaml_paths[idx]} not found.')
|
|
2153
|
+
yaml_dicts.append(yaml_utils.safe_load(yaml_str))
|
|
2154
|
+
return yaml_dicts
|
|
2155
|
+
|
|
2156
|
+
|
|
2109
2157
|
@_init_db
|
|
2110
2158
|
def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
|
|
2111
2159
|
"""Set the cluster yaml in the database."""
|
sky/provision/aws/config.py
CHANGED
|
@@ -87,6 +87,9 @@ def bootstrap_instances(
|
|
|
87
87
|
use_internal_ips=config.provider_config.get('use_internal_ips', False),
|
|
88
88
|
vpc_name=config.provider_config.get('vpc_name'))
|
|
89
89
|
|
|
90
|
+
max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
|
|
91
|
+
enable_efa = max_efa_interfaces > 0
|
|
92
|
+
|
|
90
93
|
# Cluster workers should be in a security group that permits traffic within
|
|
91
94
|
# the group, and also SSH access from outside.
|
|
92
95
|
if security_group_ids is None:
|
|
@@ -103,7 +106,8 @@ def bootstrap_instances(
|
|
|
103
106
|
extended_ip_rules = []
|
|
104
107
|
security_group_ids = _configure_security_group(ec2, vpc_id,
|
|
105
108
|
expected_sg_name,
|
|
106
|
-
extended_ip_rules
|
|
109
|
+
extended_ip_rules,
|
|
110
|
+
enable_efa)
|
|
107
111
|
if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
|
|
108
112
|
logger.debug('Attempting to create the default security group.')
|
|
109
113
|
# Attempt to create the default security group. This is needed
|
|
@@ -114,7 +118,7 @@ def bootstrap_instances(
|
|
|
114
118
|
try:
|
|
115
119
|
_configure_security_group(ec2, vpc_id,
|
|
116
120
|
aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
|
|
117
|
-
[])
|
|
121
|
+
[], enable_efa)
|
|
118
122
|
logger.debug('Default security group created.')
|
|
119
123
|
except exceptions.NoClusterLaunchedError as e:
|
|
120
124
|
if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
|
|
@@ -148,6 +152,37 @@ def bootstrap_instances(
|
|
|
148
152
|
return config
|
|
149
153
|
|
|
150
154
|
|
|
155
|
+
def _configure_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
156
|
+
placement_group_name: str):
|
|
157
|
+
"""Configure placement group for the cluster."""
|
|
158
|
+
# Create the placement group
|
|
159
|
+
logger.info(f'Creating placement group {placement_group_name}.')
|
|
160
|
+
try:
|
|
161
|
+
ec2.meta.client.create_placement_group(GroupName=placement_group_name,
|
|
162
|
+
Strategy='cluster')
|
|
163
|
+
except aws.botocore_exceptions().ClientError as exc:
|
|
164
|
+
if exc.response.get(
|
|
165
|
+
'Error', {}).get('Code') == 'InvalidPlacementGroup.Duplicate':
|
|
166
|
+
logger.debug(
|
|
167
|
+
f'Placement group {placement_group_name} already exists.')
|
|
168
|
+
else:
|
|
169
|
+
raise exc
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def delete_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
173
|
+
placement_group_name: str):
|
|
174
|
+
"""Delete the placement group."""
|
|
175
|
+
try:
|
|
176
|
+
ec2.meta.client.delete_placement_group(GroupName=placement_group_name)
|
|
177
|
+
except aws.botocore_exceptions().ClientError as exc:
|
|
178
|
+
if exc.response.get('Error',
|
|
179
|
+
{}).get('Code') == 'InvalidPlacementGroup.Unknown':
|
|
180
|
+
logger.debug(
|
|
181
|
+
f'Placement group {placement_group_name} does not exist.')
|
|
182
|
+
else:
|
|
183
|
+
raise exc
|
|
184
|
+
|
|
185
|
+
|
|
151
186
|
def _configure_iam_role(iam) -> Dict[str, Any]:
|
|
152
187
|
|
|
153
188
|
def _get_instance_profile(profile_name: str):
|
|
@@ -557,7 +592,8 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
557
592
|
|
|
558
593
|
def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
559
594
|
vpc_id: str, expected_sg_name: str,
|
|
560
|
-
extended_ip_rules: List
|
|
595
|
+
extended_ip_rules: List,
|
|
596
|
+
enable_efa: bool) -> List[str]:
|
|
561
597
|
security_group = _get_or_create_vpc_security_group(ec2, vpc_id,
|
|
562
598
|
expected_sg_name)
|
|
563
599
|
sg_ids = [security_group.id]
|
|
@@ -583,16 +619,55 @@ def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
583
619
|
},
|
|
584
620
|
*extended_ip_rules,
|
|
585
621
|
]
|
|
622
|
+
outbound_rules = []
|
|
623
|
+
if enable_efa:
|
|
624
|
+
# EFA requires that outbound rules permit the same security group to
|
|
625
|
+
# communicate with each other
|
|
626
|
+
# Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-setup # pylint: disable=line-too-long
|
|
627
|
+
outbound_rules.append({
|
|
628
|
+
'FromPort': -1,
|
|
629
|
+
'ToPort': -1,
|
|
630
|
+
'IpProtocol': '-1',
|
|
631
|
+
'UserIdGroupPairs': [{
|
|
632
|
+
'GroupId': i
|
|
633
|
+
} for i in sg_ids],
|
|
634
|
+
})
|
|
586
635
|
# upsert the default security group
|
|
587
636
|
if not security_group.ip_permissions:
|
|
588
637
|
# If users specify security groups, we should not change the rules
|
|
589
638
|
# of these security groups. Here we change it because it is the default
|
|
590
639
|
# security group for SkyPilot.
|
|
591
640
|
security_group.authorize_ingress(IpPermissions=inbound_rules)
|
|
641
|
+
if _need_to_update_outbound_rules(security_group, outbound_rules):
|
|
642
|
+
security_group.authorize_egress(IpPermissions=outbound_rules)
|
|
592
643
|
|
|
593
644
|
return sg_ids
|
|
594
645
|
|
|
595
646
|
|
|
647
|
+
def _need_to_update_outbound_rules(
|
|
648
|
+
security_group: Any,
|
|
649
|
+
outbound_rules: List[Dict[str, Any]],
|
|
650
|
+
) -> bool:
|
|
651
|
+
"""Check if we need to update the outbound rules of the security group."""
|
|
652
|
+
if not security_group.ip_permissions_egress:
|
|
653
|
+
return True # No outbound rules, we need to add them
|
|
654
|
+
existing_group_ids = []
|
|
655
|
+
for rule in security_group.ip_permissions_egress:
|
|
656
|
+
if 'UserIdGroupPairs' in rule:
|
|
657
|
+
group_pairs = rule['UserIdGroupPairs']
|
|
658
|
+
for pair in group_pairs:
|
|
659
|
+
existing_group_ids.append(pair['GroupId'])
|
|
660
|
+
logger.debug(f'Existing group ids: {existing_group_ids}')
|
|
661
|
+
for rule in outbound_rules:
|
|
662
|
+
if 'UserIdGroupPairs' in rule:
|
|
663
|
+
group_pairs = rule['UserIdGroupPairs']
|
|
664
|
+
for pair in group_pairs:
|
|
665
|
+
if pair['GroupId'] not in existing_group_ids:
|
|
666
|
+
logger.debug(f'New group id: {pair["GroupId"]}')
|
|
667
|
+
return True # New group id, we need to add it
|
|
668
|
+
return False # No need to update
|
|
669
|
+
|
|
670
|
+
|
|
596
671
|
def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
597
672
|
vpc_id: str,
|
|
598
673
|
expected_sg_name: str) -> Any:
|
sky/provision/aws/instance.py
CHANGED
|
@@ -184,9 +184,15 @@ def _merge_tag_specs(tag_specs: List[Dict[str, Any]],
|
|
|
184
184
|
tag_specs += [user_tag_spec]
|
|
185
185
|
|
|
186
186
|
|
|
187
|
-
def _create_instances(
|
|
188
|
-
|
|
189
|
-
|
|
187
|
+
def _create_instances(
|
|
188
|
+
ec2_fail_fast,
|
|
189
|
+
cluster_name: str,
|
|
190
|
+
node_config: Dict[str, Any],
|
|
191
|
+
tags: Dict[str, str],
|
|
192
|
+
count: int,
|
|
193
|
+
associate_public_ip_address: bool,
|
|
194
|
+
max_efa_interfaces: int,
|
|
195
|
+
) -> List:
|
|
190
196
|
tags = {
|
|
191
197
|
'Name': cluster_name,
|
|
192
198
|
constants.TAG_RAY_CLUSTER_NAME: cluster_name,
|
|
@@ -239,7 +245,36 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
|
|
|
239
245
|
# Whether the VM(s) should have a public IP.
|
|
240
246
|
'AssociatePublicIpAddress': associate_public_ip_address,
|
|
241
247
|
'Groups': security_group_ids,
|
|
248
|
+
'InterfaceType': 'efa'
|
|
249
|
+
if max_efa_interfaces > 0 else 'interface',
|
|
242
250
|
}]
|
|
251
|
+
# Due to AWS limitation, if an instance type supports multiple
|
|
252
|
+
# network cards, we cannot assign public IP addresses to the
|
|
253
|
+
# instance during creation, which will raise the following error:
|
|
254
|
+
# (InvalidParameterCombination) when calling the RunInstances
|
|
255
|
+
# operation: The associatePublicIPAddress parameter cannot be
|
|
256
|
+
# specified when launching with multiple network interfaces.
|
|
257
|
+
# So we only attach multiple network interfaces if public IP is
|
|
258
|
+
# not required.
|
|
259
|
+
# TODO(hailong): support attaching/detaching elastic IP to expose
|
|
260
|
+
# public IP in this case.
|
|
261
|
+
if max_efa_interfaces > 1 and not associate_public_ip_address:
|
|
262
|
+
instance_type = conf['InstanceType']
|
|
263
|
+
for i in range(1, max_efa_interfaces):
|
|
264
|
+
interface_type = 'efa-only'
|
|
265
|
+
# Special handling for P5 instances
|
|
266
|
+
# Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 for more details. # pylint: disable=line-too-long
|
|
267
|
+
if (instance_type == 'p5.48xlarge' or
|
|
268
|
+
instance_type == 'p5e.48xlarge'):
|
|
269
|
+
interface_type = 'efa' if i % 4 == 0 else 'efa-only'
|
|
270
|
+
network_interfaces.append({
|
|
271
|
+
'SubnetId': subnet_id,
|
|
272
|
+
'DeviceIndex': 1,
|
|
273
|
+
'NetworkCardIndex': i,
|
|
274
|
+
'AssociatePublicIpAddress': False,
|
|
275
|
+
'Groups': security_group_ids,
|
|
276
|
+
'InterfaceType': interface_type,
|
|
277
|
+
})
|
|
243
278
|
conf['NetworkInterfaces'] = network_interfaces
|
|
244
279
|
|
|
245
280
|
instances = _ec2_call_with_retry_on_server_error(
|
|
@@ -289,6 +324,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
289
324
|
zone = None
|
|
290
325
|
resumed_instance_ids: List[str] = []
|
|
291
326
|
created_instance_ids: List[str] = []
|
|
327
|
+
max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
|
|
292
328
|
|
|
293
329
|
# sort tags by key to support deterministic unit test stubbing
|
|
294
330
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
@@ -504,7 +540,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
504
540
|
tags,
|
|
505
541
|
reservation_count,
|
|
506
542
|
associate_public_ip_address=(
|
|
507
|
-
not config.provider_config['use_internal_ips'])
|
|
543
|
+
not config.provider_config['use_internal_ips']),
|
|
544
|
+
max_efa_interfaces=max_efa_interfaces)
|
|
508
545
|
created_instances.extend(created_reserved_instances)
|
|
509
546
|
to_start_count -= reservation_count
|
|
510
547
|
if to_start_count <= 0:
|
|
@@ -527,7 +564,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
527
564
|
tags,
|
|
528
565
|
to_start_count,
|
|
529
566
|
associate_public_ip_address=(
|
|
530
|
-
not config.provider_config['use_internal_ips'])
|
|
567
|
+
not config.provider_config['use_internal_ips']),
|
|
568
|
+
max_efa_interfaces=max_efa_interfaces)
|
|
531
569
|
|
|
532
570
|
created_instances.extend(created_remaining_instances)
|
|
533
571
|
created_instances.sort(key=lambda x: x.id)
|
|
@@ -686,6 +724,7 @@ def terminate_instances(
|
|
|
686
724
|
filters,
|
|
687
725
|
included_instances=None,
|
|
688
726
|
excluded_instances=None)
|
|
727
|
+
instance_list = list(instances)
|
|
689
728
|
default_sg = aws_config.get_security_group_from_vpc_id(
|
|
690
729
|
ec2, _get_vpc_id(provider_config),
|
|
691
730
|
aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
|
|
@@ -719,7 +758,7 @@ def terminate_instances(
|
|
|
719
758
|
# exist. We must block on instance termination so that we can
|
|
720
759
|
# delete the security group.
|
|
721
760
|
instances.terminate()
|
|
722
|
-
for instance in
|
|
761
|
+
for instance in instance_list:
|
|
723
762
|
instance.wait_until_terminated()
|
|
724
763
|
|
|
725
764
|
# TODO(suquark): Currently, the implementation of GCP and Azure will
|
sky/provision/docker_utils.py
CHANGED
|
@@ -371,7 +371,7 @@ class DockerInitializer:
|
|
|
371
371
|
'mkdir -p ~/.ssh;'
|
|
372
372
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
|
373
373
|
'sudo service ssh start;'
|
|
374
|
-
'sudo sed -i "s/mesg n/tty -s
|
|
374
|
+
'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
|
|
375
375
|
f'{SETUP_ENV_VARS_CMD}',
|
|
376
376
|
run_env='docker')
|
|
377
377
|
|
sky/provision/kubernetes/utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Kubernetes utilities for SkyPilot."""
|
|
2
|
+
import copy
|
|
2
3
|
import dataclasses
|
|
3
4
|
import datetime
|
|
4
5
|
import enum
|
|
@@ -1082,6 +1083,14 @@ class KarpenterAutoscaler(Autoscaler):
|
|
|
1082
1083
|
can_query_backend: bool = False
|
|
1083
1084
|
|
|
1084
1085
|
|
|
1086
|
+
class CoreweaveAutoscaler(Autoscaler):
|
|
1087
|
+
"""CoreWeave autoscaler
|
|
1088
|
+
"""
|
|
1089
|
+
|
|
1090
|
+
label_formatter: Any = CoreWeaveLabelFormatter
|
|
1091
|
+
can_query_backend: bool = False
|
|
1092
|
+
|
|
1093
|
+
|
|
1085
1094
|
class GenericAutoscaler(Autoscaler):
|
|
1086
1095
|
"""Generic autoscaler
|
|
1087
1096
|
"""
|
|
@@ -1094,6 +1103,7 @@ class GenericAutoscaler(Autoscaler):
|
|
|
1094
1103
|
AUTOSCALER_TYPE_TO_AUTOSCALER = {
|
|
1095
1104
|
kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
|
|
1096
1105
|
kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
|
|
1106
|
+
kubernetes_enums.KubernetesAutoscalerType.COREWEAVE: CoreweaveAutoscaler,
|
|
1097
1107
|
kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
|
|
1098
1108
|
}
|
|
1099
1109
|
|
|
@@ -2706,11 +2716,11 @@ def get_endpoint_debug_message(context: Optional[str] = None) -> str:
|
|
|
2706
2716
|
|
|
2707
2717
|
|
|
2708
2718
|
def combine_pod_config_fields(
|
|
2709
|
-
|
|
2719
|
+
cluster_yaml_obj: Dict[str, Any],
|
|
2710
2720
|
cluster_config_overrides: Dict[str, Any],
|
|
2711
2721
|
cloud: Optional[clouds.Cloud] = None,
|
|
2712
2722
|
context: Optional[str] = None,
|
|
2713
|
-
) ->
|
|
2723
|
+
) -> Dict[str, Any]:
|
|
2714
2724
|
"""Adds or updates fields in the YAML with fields from the
|
|
2715
2725
|
~/.sky/config.yaml's kubernetes.pod_spec dict.
|
|
2716
2726
|
This can be used to add fields to the YAML that are not supported by
|
|
@@ -2749,9 +2759,7 @@ def combine_pod_config_fields(
|
|
|
2749
2759
|
- name: my-secret
|
|
2750
2760
|
```
|
|
2751
2761
|
"""
|
|
2752
|
-
|
|
2753
|
-
yaml_content = f.read()
|
|
2754
|
-
yaml_obj = yaml_utils.safe_load(yaml_content)
|
|
2762
|
+
merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
|
|
2755
2763
|
# We don't use override_configs in `get_effective_region_config`, as merging
|
|
2756
2764
|
# the pod config requires special handling.
|
|
2757
2765
|
if isinstance(cloud, clouds.SSH):
|
|
@@ -2778,26 +2786,20 @@ def combine_pod_config_fields(
|
|
|
2778
2786
|
|
|
2779
2787
|
# Merge the kubernetes config into the YAML for both head and worker nodes.
|
|
2780
2788
|
config_utils.merge_k8s_configs(
|
|
2781
|
-
|
|
2782
|
-
kubernetes_config)
|
|
2783
|
-
|
|
2784
|
-
# Write the updated YAML back to the file
|
|
2785
|
-
yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2789
|
+
merged_cluster_yaml_obj['available_node_types']['ray_head_default']
|
|
2790
|
+
['node_config'], kubernetes_config)
|
|
2791
|
+
return merged_cluster_yaml_obj
|
|
2786
2792
|
|
|
2787
2793
|
|
|
2788
|
-
def combine_metadata_fields(
|
|
2794
|
+
def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
|
|
2789
2795
|
cluster_config_overrides: Dict[str, Any],
|
|
2790
|
-
context: Optional[str] = None) ->
|
|
2796
|
+
context: Optional[str] = None) -> Dict[str, Any]:
|
|
2791
2797
|
"""Updates the metadata for all Kubernetes objects created by SkyPilot with
|
|
2792
2798
|
fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
|
|
2793
2799
|
|
|
2794
2800
|
Obeys the same add or update semantics as combine_pod_config_fields().
|
|
2795
2801
|
"""
|
|
2796
|
-
|
|
2797
|
-
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2798
|
-
yaml_content = f.read()
|
|
2799
|
-
yaml_obj = yaml_utils.safe_load(yaml_content)
|
|
2800
|
-
|
|
2802
|
+
merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
|
|
2801
2803
|
# Get custom_metadata from global config
|
|
2802
2804
|
custom_metadata = skypilot_config.get_effective_region_config(
|
|
2803
2805
|
cloud='kubernetes',
|
|
@@ -2819,22 +2821,42 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2819
2821
|
# List of objects in the cluster YAML to be updated
|
|
2820
2822
|
combination_destinations = [
|
|
2821
2823
|
# Service accounts
|
|
2822
|
-
|
|
2823
|
-
|
|
2824
|
-
|
|
2825
|
-
|
|
2826
|
-
# Pod spec
|
|
2827
|
-
yaml_obj['available_node_types']['ray_head_default']['node_config']
|
|
2824
|
+
merged_cluster_yaml_obj['provider']['autoscaler_service_account']
|
|
2825
|
+
['metadata'],
|
|
2826
|
+
merged_cluster_yaml_obj['provider']['autoscaler_role']['metadata'],
|
|
2827
|
+
merged_cluster_yaml_obj['provider']['autoscaler_role_binding']
|
|
2828
2828
|
['metadata'],
|
|
2829
|
+
merged_cluster_yaml_obj['provider']['autoscaler_service_account']
|
|
2830
|
+
['metadata'],
|
|
2831
|
+
# Pod spec
|
|
2832
|
+
merged_cluster_yaml_obj['available_node_types']['ray_head_default']
|
|
2833
|
+
['node_config']['metadata'],
|
|
2829
2834
|
# Services for pods
|
|
2830
|
-
*[
|
|
2835
|
+
*[
|
|
2836
|
+
svc['metadata']
|
|
2837
|
+
for svc in merged_cluster_yaml_obj['provider']['services']
|
|
2838
|
+
]
|
|
2831
2839
|
]
|
|
2832
2840
|
|
|
2833
2841
|
for destination in combination_destinations:
|
|
2834
2842
|
config_utils.merge_k8s_configs(destination, custom_metadata)
|
|
2835
2843
|
|
|
2836
|
-
|
|
2837
|
-
|
|
2844
|
+
return merged_cluster_yaml_obj
|
|
2845
|
+
|
|
2846
|
+
|
|
2847
|
+
def combine_pod_config_fields_and_metadata(
|
|
2848
|
+
cluster_yaml_obj: Dict[str, Any],
|
|
2849
|
+
cluster_config_overrides: Dict[str, Any],
|
|
2850
|
+
cloud: Optional[clouds.Cloud] = None,
|
|
2851
|
+
context: Optional[str] = None) -> Dict[str, Any]:
|
|
2852
|
+
"""Combines pod config fields and metadata fields"""
|
|
2853
|
+
combined_yaml_obj = combine_pod_config_fields(cluster_yaml_obj,
|
|
2854
|
+
cluster_config_overrides,
|
|
2855
|
+
cloud, context)
|
|
2856
|
+
combined_yaml_obj = combine_metadata_fields(combined_yaml_obj,
|
|
2857
|
+
cluster_config_overrides,
|
|
2858
|
+
context)
|
|
2859
|
+
return combined_yaml_obj
|
|
2838
2860
|
|
|
2839
2861
|
|
|
2840
2862
|
def merge_custom_metadata(
|
sky/schemas/db/global_user_state/007_cluster_event_request_id.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Add request_id to cluster_events.
|
|
2
|
+
|
|
3
|
+
Revision ID: 007
|
|
4
|
+
Revises: 006
|
|
5
|
+
Create Date: 2025-08-28
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
from typing import Sequence, Union
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
|
|
14
|
+
from sky.utils.db import db_utils
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '007'
|
|
18
|
+
down_revision: Union[str, Sequence[str], None] = '006'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade():
|
|
24
|
+
"""Add request_id column to cluster_events."""
|
|
25
|
+
with op.get_context().autocommit_block():
|
|
26
|
+
db_utils.add_column_to_table_alembic('cluster_events',
|
|
27
|
+
'request_id',
|
|
28
|
+
sa.Text(),
|
|
29
|
+
server_default=None)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def downgrade():
|
|
33
|
+
"""No-op for backward compatibility."""
|
|
34
|
+
pass
|
sky/server/common.py
CHANGED
|
@@ -911,8 +911,7 @@ def reload_for_new_request(client_entrypoint: Optional[str],
|
|
|
911
911
|
|
|
912
912
|
# Clear cache should be called before reload_logger and usage reset,
|
|
913
913
|
# otherwise, the latest env var will not be used.
|
|
914
|
-
|
|
915
|
-
func.cache_clear()
|
|
914
|
+
annotations.clear_request_level_cache()
|
|
916
915
|
|
|
917
916
|
# We need to reset usage message, so that the message is up-to-date with the
|
|
918
917
|
# latest information in the context, e.g. client entrypoint and run id.
|
sky/server/daemons.py
CHANGED
|
@@ -7,8 +7,10 @@ from typing import Callable
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky import skypilot_config
|
|
9
9
|
from sky.server import constants as server_constants
|
|
10
|
+
from sky.utils import annotations
|
|
10
11
|
from sky.utils import common
|
|
11
12
|
from sky.utils import env_options
|
|
13
|
+
from sky.utils import timeline
|
|
12
14
|
from sky.utils import ux_utils
|
|
13
15
|
|
|
14
16
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -67,6 +69,10 @@ class InternalRequestDaemon:
|
|
|
67
69
|
sky_logging.reload_logger()
|
|
68
70
|
level = self.refresh_log_level()
|
|
69
71
|
self.event_fn()
|
|
72
|
+
# Clear request level cache after each run to avoid
|
|
73
|
+
# using too much memory.
|
|
74
|
+
annotations.clear_request_level_cache()
|
|
75
|
+
timeline.save_timeline()
|
|
70
76
|
except Exception: # pylint: disable=broad-except
|
|
71
77
|
# It is OK to fail to run the event, as the event is not
|
|
72
78
|
# critical, but we should log the error.
|
sky/server/requests/executor.py
CHANGED
|
@@ -383,7 +383,8 @@ def _request_execution_wrapper(request_id: str,
|
|
|
383
383
|
# config, as there can be some logs during override that needs to be
|
|
384
384
|
# captured in the log file.
|
|
385
385
|
try:
|
|
386
|
-
with
|
|
386
|
+
with sky_logging.add_debug_log_handler(request_id), \
|
|
387
|
+
override_request_env_and_config(request_body, request_id), \
|
|
387
388
|
tempstore.tempdir():
|
|
388
389
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
389
390
|
config = skypilot_config.to_dict()
|
sky/server/requests/payloads.py
CHANGED
|
@@ -71,7 +71,9 @@ EXTERNAL_LOCAL_ENV_VARS = [
|
|
|
71
71
|
def request_body_env_vars() -> dict:
|
|
72
72
|
env_vars = {}
|
|
73
73
|
for env_var in os.environ:
|
|
74
|
-
if env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX)
|
|
74
|
+
if (env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX) and
|
|
75
|
+
not env_var.startswith(
|
|
76
|
+
constants.SKYPILOT_SERVER_ENV_VAR_PREFIX)):
|
|
75
77
|
env_vars[env_var] = os.environ[env_var]
|
|
76
78
|
if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
|
|
77
79
|
env_vars[env_var] = os.environ[env_var]
|
|
@@ -307,6 +309,7 @@ class StatusBody(RequestBody):
|
|
|
307
309
|
cluster_names: Optional[List[str]] = None
|
|
308
310
|
refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
|
|
309
311
|
all_users: bool = True
|
|
312
|
+
include_credentials: bool = False
|
|
310
313
|
|
|
311
314
|
|
|
312
315
|
class StartBody(RequestBody):
|