skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250831__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +24 -2
  3. sky/backends/backend_utils.py +152 -59
  4. sky/backends/cloud_vm_ray_backend.py +56 -3
  5. sky/backends/wheel_utils.py +35 -8
  6. sky/client/cli/command.py +17 -6
  7. sky/client/common.py +5 -4
  8. sky/client/sdk.py +5 -0
  9. sky/client/sdk_async.py +8 -2
  10. sky/clouds/aws.py +118 -1
  11. sky/core.py +8 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/global_user_state.py +58 -10
  30. sky/provision/aws/config.py +78 -3
  31. sky/provision/aws/instance.py +45 -6
  32. sky/provision/docker_utils.py +1 -1
  33. sky/provision/kubernetes/utils.py +48 -26
  34. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  35. sky/server/common.py +1 -2
  36. sky/server/daemons.py +6 -0
  37. sky/server/requests/executor.py +2 -1
  38. sky/server/requests/payloads.py +4 -1
  39. sky/server/server.py +67 -58
  40. sky/setup_files/dependencies.py +25 -8
  41. sky/setup_files/setup.py +2 -0
  42. sky/sky_logging.py +28 -0
  43. sky/skylet/constants.py +6 -0
  44. sky/templates/aws-ray.yml.j2 +1 -0
  45. sky/utils/annotations.py +8 -2
  46. sky/utils/cluster_utils.py +3 -3
  47. sky/utils/db/migration_utils.py +1 -1
  48. sky/utils/kubernetes_enums.py +1 -0
  49. sky/utils/lock_events.py +94 -0
  50. sky/utils/schemas.py +6 -0
  51. sky/utils/timeline.py +24 -93
  52. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/METADATA +36 -48
  53. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/RECORD +59 -57
  54. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_buildManifest.js +0 -0
  55. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -206,6 +206,7 @@ cluster_event_table = sqlalchemy.Table(
     sqlalchemy.Column('reason', sqlalchemy.Text, primary_key=True),
     sqlalchemy.Column('transitioned_at', sqlalchemy.Integer, primary_key=True),
     sqlalchemy.Column('type', sqlalchemy.Text),
+    sqlalchemy.Column('request_id', sqlalchemy.Text, server_default=None),
 )
 
 ssh_key_table = sqlalchemy.Table(
@@ -745,6 +746,7 @@ def add_cluster_event(cluster_name: str,
     elif last_event == reason:
         return
     try:
+        request_id = common_utils.get_current_request_id()
         session.execute(
             insert_func(cluster_event_table).values(
                 cluster_hash=cluster_hash,
@@ -754,6 +756,7 @@ def add_cluster_event(cluster_name: str,
                 reason=reason,
                 transitioned_at=transitioned_at,
                 type=event_type.value,
+                request_id=request_id,
             ))
         session.commit()
     except sqlalchemy.exc.IntegrityError as e:
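
Note: add_cluster_event now records which API request triggered each cluster event; get_current_request_id presumably reads the request ID from the request context set up by the API server, and the column is nullable so events written outside a request (or rows from before migration 007 below) simply carry NULL. A hedged sketch of the kind of read-side query this enables; the table and column names come from the hunks above, while the sample values and the throwaway in-memory table are illustrative only:

```python
# Illustration: list the state transitions recorded for a single API request.
# A toy in-memory table stands in for SkyPilot's real cluster_events schema.
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
with engine.begin() as conn:
    conn.execute(sqlalchemy.text(
        'CREATE TABLE cluster_events (cluster_hash TEXT, reason TEXT, '
        'transitioned_at INTEGER, type TEXT, request_id TEXT)'))
    conn.execute(sqlalchemy.text(
        "INSERT INTO cluster_events VALUES "
        "('abc123', 'launch requested', 1724800000, 'status', 'req-1234')"))
    rows = conn.execute(
        sqlalchemy.text('SELECT cluster_hash, reason, transitioned_at '
                        'FROM cluster_events WHERE request_id = :rid'),
        {'rid': 'req-1234'}).fetchall()
    print(rows)
```
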
@@ -2082,19 +2085,51 @@ def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
         row = session.query(cluster_yaml_table).filter_by(
             cluster_name=cluster_name).first()
         if row is None:
-            # If the cluster yaml is not in the database, check if it exists
-            # on the local file system and migrate it to the database.
-            # TODO(syang): remove this check once we have a way to migrate the
-            # cluster from file to database. Remove on v0.12.0.
-            if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
-                with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
-                    yaml_str = f.read()
-                set_cluster_yaml(cluster_name, yaml_str)
-                return yaml_str
-            return None
+            return _set_cluster_yaml_from_file(cluster_yaml_path, cluster_name)
         return row.yaml
 
 
+def get_cluster_yaml_str_multiple(cluster_yaml_paths: List[str]) -> List[str]:
+    """Get the cluster yaml from the database or the local file system.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    cluster_names_to_yaml_paths = {}
+    for cluster_yaml_path in cluster_yaml_paths:
+        cluster_name, _ = os.path.splitext(os.path.basename(cluster_yaml_path))
+        cluster_names_to_yaml_paths[cluster_name] = cluster_yaml_path
+
+    cluster_names = list(cluster_names_to_yaml_paths.keys())
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_yaml_table).filter(
+            cluster_yaml_table.c.cluster_name.in_(cluster_names)).all()
+        row_cluster_names_to_yaml = {row.cluster_name: row.yaml for row in rows}
+
+    yaml_strs = []
+    for cluster_name in cluster_names:
+        if cluster_name in row_cluster_names_to_yaml:
+            yaml_strs.append(row_cluster_names_to_yaml[cluster_name])
+        else:
+            yaml_str = _set_cluster_yaml_from_file(
+                cluster_names_to_yaml_paths[cluster_name], cluster_name)
+            yaml_strs.append(yaml_str)
+    return yaml_strs
+
+
+def _set_cluster_yaml_from_file(cluster_yaml_path: str,
+                                cluster_name: str) -> Optional[str]:
+    """Set the cluster yaml in the database from a file."""
+    # If the cluster yaml is not in the database, check if it exists
+    # on the local file system and migrate it to the database.
+    # TODO(syang): remove this check once we have a way to migrate the
+    # cluster from file to database. Remove on v0.12.0.
+    if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
+        with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
+            yaml_str = f.read()
+        set_cluster_yaml(cluster_name, yaml_str)
+        return yaml_str
+    return None
+
+
 def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
     """Get the cluster yaml as a dictionary from the database.
 
@@ -2106,6 +2141,19 @@ def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
     return yaml_utils.safe_load(yaml_str)
 
 
+def get_cluster_yaml_dict_multiple(
+        cluster_yaml_paths: List[str]) -> List[Dict[str, Any]]:
+    """Get the cluster yaml as a dictionary from the database."""
+    yaml_strs = get_cluster_yaml_str_multiple(cluster_yaml_paths)
+    yaml_dicts = []
+    for idx, yaml_str in enumerate(yaml_strs):
+        if yaml_str is None:
+            raise ValueError(
+                f'Cluster yaml {cluster_yaml_paths[idx]} not found.')
+        yaml_dicts.append(yaml_utils.safe_load(yaml_str))
+    return yaml_dicts
+
+
 @_init_db
 def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
     """Set the cluster yaml in the database."""
sky/provision/aws/config.py CHANGED
@@ -87,6 +87,9 @@ def bootstrap_instances(
         use_internal_ips=config.provider_config.get('use_internal_ips', False),
         vpc_name=config.provider_config.get('vpc_name'))
 
+    max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
+    enable_efa = max_efa_interfaces > 0
+
     # Cluster workers should be in a security group that permits traffic within
     # the group, and also SSH access from outside.
     if security_group_ids is None:
@@ -103,7 +106,8 @@
         extended_ip_rules = []
         security_group_ids = _configure_security_group(ec2, vpc_id,
                                                        expected_sg_name,
-                                                       extended_ip_rules)
+                                                       extended_ip_rules,
+                                                       enable_efa)
         if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
             logger.debug('Attempting to create the default security group.')
             # Attempt to create the default security group. This is needed
@@ -114,7 +118,7 @@
             try:
                 _configure_security_group(ec2, vpc_id,
                                           aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
-                                          [])
+                                          [], enable_efa)
                 logger.debug('Default security group created.')
             except exceptions.NoClusterLaunchedError as e:
                 if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
@@ -148,6 +152,37 @@
     return config
 
 
+def _configure_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
+                               placement_group_name: str):
+    """Configure placement group for the cluster."""
+    # Create the placement group
+    logger.info(f'Creating placement group {placement_group_name}.')
+    try:
+        ec2.meta.client.create_placement_group(GroupName=placement_group_name,
+                                               Strategy='cluster')
+    except aws.botocore_exceptions().ClientError as exc:
+        if exc.response.get(
+                'Error', {}).get('Code') == 'InvalidPlacementGroup.Duplicate':
+            logger.debug(
+                f'Placement group {placement_group_name} already exists.')
+        else:
+            raise exc
+
+
+def delete_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
+                           placement_group_name: str):
+    """Delete the placement group."""
+    try:
+        ec2.meta.client.delete_placement_group(GroupName=placement_group_name)
+    except aws.botocore_exceptions().ClientError as exc:
+        if exc.response.get('Error',
+                            {}).get('Code') == 'InvalidPlacementGroup.Unknown':
+            logger.debug(
+                f'Placement group {placement_group_name} does not exist.')
+        else:
+            raise exc
+
+
 def _configure_iam_role(iam) -> Dict[str, Any]:
 
     def _get_instance_profile(profile_name: str):
@@ -557,7 +592,8 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
 
 def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
                               vpc_id: str, expected_sg_name: str,
-                              extended_ip_rules: List) -> List[str]:
+                              extended_ip_rules: List,
+                              enable_efa: bool) -> List[str]:
     security_group = _get_or_create_vpc_security_group(ec2, vpc_id,
                                                        expected_sg_name)
     sg_ids = [security_group.id]
@@ -583,16 +619,55 @@ def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
         },
         *extended_ip_rules,
     ]
+    outbound_rules = []
+    if enable_efa:
+        # EFA requires that outbound rules permit the same security group to
+        # communicate with each other
+        # Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-setup # pylint: disable=line-too-long
+        outbound_rules.append({
+            'FromPort': -1,
+            'ToPort': -1,
+            'IpProtocol': '-1',
+            'UserIdGroupPairs': [{
+                'GroupId': i
+            } for i in sg_ids],
+        })
     # upsert the default security group
     if not security_group.ip_permissions:
         # If users specify security groups, we should not change the rules
         # of these security groups. Here we change it because it is the default
         # security group for SkyPilot.
         security_group.authorize_ingress(IpPermissions=inbound_rules)
+    if _need_to_update_outbound_rules(security_group, outbound_rules):
+        security_group.authorize_egress(IpPermissions=outbound_rules)
 
     return sg_ids
 
 
+def _need_to_update_outbound_rules(
+    security_group: Any,
+    outbound_rules: List[Dict[str, Any]],
+) -> bool:
+    """Check if we need to update the outbound rules of the security group."""
+    if not security_group.ip_permissions_egress:
+        return True  # No outbound rules, we need to add them
+    existing_group_ids = []
+    for rule in security_group.ip_permissions_egress:
+        if 'UserIdGroupPairs' in rule:
+            group_pairs = rule['UserIdGroupPairs']
+            for pair in group_pairs:
+                existing_group_ids.append(pair['GroupId'])
+    logger.debug(f'Existing group ids: {existing_group_ids}')
+    for rule in outbound_rules:
+        if 'UserIdGroupPairs' in rule:
+            group_pairs = rule['UserIdGroupPairs']
+            for pair in group_pairs:
+                if pair['GroupId'] not in existing_group_ids:
+                    logger.debug(f'New group id: {pair["GroupId"]}')
+                    return True  # New group id, we need to add it
+    return False  # No need to update
+
+
 def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
                                       vpc_id: str,
                                       expected_sg_name: str) -> Any:
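
Note: _need_to_update_outbound_rules only looks at the security-group IDs referenced by UserIdGroupPairs, so the default egress rule AWS creates (allow all traffic to 0.0.0.0/0) does not stop the self-referencing EFA rule from being added. A self-contained sketch of that comparison, using plain dicts in place of the boto3 SecurityGroup resource (the group ID below is illustrative):

```python
# Sketch of the outbound-rule comparison, with plain dicts standing in for
# the boto3 SecurityGroup attributes used in the real code.
from typing import Any, Dict, List


def need_update(existing_egress: List[Dict[str, Any]],
                desired: List[Dict[str, Any]]) -> bool:
    if not existing_egress:
        return True  # No egress rules at all: add ours.
    existing_ids = [
        pair['GroupId']
        for rule in existing_egress
        for pair in rule.get('UserIdGroupPairs', [])
    ]
    return any(pair['GroupId'] not in existing_ids
               for rule in desired
               for pair in rule.get('UserIdGroupPairs', []))


# AWS's default egress rule has no UserIdGroupPairs, so the self-referencing
# EFA rule is still considered missing and authorize_egress would be called.
existing = [{'IpProtocol': '-1', 'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}]
efa_rule = [{'IpProtocol': '-1', 'FromPort': -1, 'ToPort': -1,
             'UserIdGroupPairs': [{'GroupId': 'sg-0123456789abcdef0'}]}]
assert need_update(existing, efa_rule)
```
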
sky/provision/aws/instance.py CHANGED
@@ -184,9 +184,15 @@ def _merge_tag_specs(tag_specs: List[Dict[str, Any]],
     tag_specs += [user_tag_spec]
 
 
-def _create_instances(ec2_fail_fast, cluster_name: str,
-                      node_config: Dict[str, Any], tags: Dict[str, str],
-                      count: int, associate_public_ip_address: bool) -> List:
+def _create_instances(
+    ec2_fail_fast,
+    cluster_name: str,
+    node_config: Dict[str, Any],
+    tags: Dict[str, str],
+    count: int,
+    associate_public_ip_address: bool,
+    max_efa_interfaces: int,
+) -> List:
     tags = {
         'Name': cluster_name,
         constants.TAG_RAY_CLUSTER_NAME: cluster_name,
@@ -239,7 +245,36 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
             # Whether the VM(s) should have a public IP.
             'AssociatePublicIpAddress': associate_public_ip_address,
             'Groups': security_group_ids,
+            'InterfaceType': 'efa'
+                             if max_efa_interfaces > 0 else 'interface',
         }]
+        # Due to AWS limitation, if an instance type supports multiple
+        # network cards, we cannot assign public IP addresses to the
+        # instance during creation, which will raise the following error:
+        # (InvalidParameterCombination) when calling the RunInstances
+        # operation: The associatePublicIPAddress parameter cannot be
+        # specified when launching with multiple network interfaces.
+        # So we only attach multiple network interfaces if public IP is
+        # not required.
+        # TODO(hailong): support attaching/detaching elastic IP to expose
+        # public IP in this case.
+        if max_efa_interfaces > 1 and not associate_public_ip_address:
+            instance_type = conf['InstanceType']
+            for i in range(1, max_efa_interfaces):
+                interface_type = 'efa-only'
+                # Special handling for P5 instances
+                # Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 for more details. # pylint: disable=line-too-long
+                if (instance_type == 'p5.48xlarge' or
+                        instance_type == 'p5e.48xlarge'):
+                    interface_type = 'efa' if i % 4 == 0 else 'efa-only'
+                network_interfaces.append({
+                    'SubnetId': subnet_id,
+                    'DeviceIndex': 1,
+                    'NetworkCardIndex': i,
+                    'AssociatePublicIpAddress': False,
+                    'Groups': security_group_ids,
+                    'InterfaceType': interface_type,
+                })
         conf['NetworkInterfaces'] = network_interfaces
 
     instances = _ec2_call_with_retry_on_server_error(
@@ -289,6 +324,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     zone = None
     resumed_instance_ids: List[str] = []
     created_instance_ids: List[str] = []
+    max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
 
     # sort tags by key to support deterministic unit test stubbing
     tags = dict(sorted(copy.deepcopy(config.tags).items()))
@@ -504,7 +540,8 @@
                     tags,
                     reservation_count,
                     associate_public_ip_address=(
-                        not config.provider_config['use_internal_ips']))
+                        not config.provider_config['use_internal_ips']),
+                    max_efa_interfaces=max_efa_interfaces)
                 created_instances.extend(created_reserved_instances)
                 to_start_count -= reservation_count
                 if to_start_count <= 0:
@@ -527,7 +564,8 @@
                 tags,
                 to_start_count,
                 associate_public_ip_address=(
-                    not config.provider_config['use_internal_ips']))
+                    not config.provider_config['use_internal_ips']),
+                max_efa_interfaces=max_efa_interfaces)
 
             created_instances.extend(created_remaining_instances)
         created_instances.sort(key=lambda x: x.id)
@@ -686,6 +724,7 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
+    instance_list = list(instances)
     default_sg = aws_config.get_security_group_from_vpc_id(
         ec2, _get_vpc_id(provider_config),
         aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
@@ -719,7 +758,7 @@
         # exist. We must block on instance termination so that we can
         # delete the security group.
         instances.terminate()
-        for instance in instances:
+        for instance in instance_list:
             instance.wait_until_terminated()
 
     # TODO(suquark): Currently, the implementation of GCP and Azure will
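
Note: with EFA enabled, the primary network interface (card 0) is created as 'efa', and additional cards are 'efa-only', except on p5.48xlarge / p5e.48xlarge where every fourth card gets a full 'efa' interface per the AWS P5 guidance linked in the hunk. A small sketch of the resulting layout (the instance type and interface count below are illustrative; no AWS calls are made):

```python
# Recreates the per-card interface types _create_instances builds for a
# multi-EFA launch.
def interface_types(instance_type: str, max_efa_interfaces: int) -> list:
    types = ['efa' if max_efa_interfaces > 0 else 'interface']  # card 0
    for i in range(1, max_efa_interfaces):
        if instance_type in ('p5.48xlarge', 'p5e.48xlarge') and i % 4 == 0:
            types.append('efa')       # full EFA + ENA interface
        else:
            types.append('efa-only')  # EFA-only, no IP/ENA traffic
    return types


print(interface_types('p5.48xlarge', 8))
# ['efa', 'efa-only', 'efa-only', 'efa-only', 'efa', 'efa-only', 'efa-only', 'efa-only']
```

As the inline comment in the hunk notes, these secondary interfaces are only attached when no public IP is requested.
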
sky/provision/docker_utils.py CHANGED
@@ -371,7 +371,7 @@ class DockerInitializer:
             'mkdir -p ~/.ssh;'
             'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
             'sudo service ssh start;'
-            'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'
+            'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
             f'{SETUP_ENV_VARS_CMD}',
             run_env='docker')
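
Note: this change only makes the backslash escaping explicit. In a regular (non-raw) Python string, '\&' is an unrecognized escape that the interpreter preserves but warns about on recent versions, while '\\&' unambiguously yields a single backslash, so the command handed to sed is unchanged:

```python
# Both literals produce identical characters; the second form avoids the
# "invalid escape sequence" SyntaxWarning on newer Python versions.
old = 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'   # noqa: W605
new = 'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
assert old == new
```
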
 
sky/provision/kubernetes/utils.py CHANGED
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import copy
 import dataclasses
 import datetime
 import enum
@@ -1082,6 +1083,14 @@ class KarpenterAutoscaler(Autoscaler):
     can_query_backend: bool = False
 
 
+class CoreweaveAutoscaler(Autoscaler):
+    """CoreWeave autoscaler
+    """
+
+    label_formatter: Any = CoreWeaveLabelFormatter
+    can_query_backend: bool = False
+
+
 class GenericAutoscaler(Autoscaler):
     """Generic autoscaler
     """
@@ -1094,6 +1103,7 @@ class GenericAutoscaler(Autoscaler):
 AUTOSCALER_TYPE_TO_AUTOSCALER = {
     kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
     kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.COREWEAVE: CoreweaveAutoscaler,
     kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
 }
 
@@ -2706,11 +2716,11 @@ def get_endpoint_debug_message(context: Optional[str] = None) -> str:
 
 
 def combine_pod_config_fields(
-    cluster_yaml_path: str,
+    cluster_yaml_obj: Dict[str, Any],
     cluster_config_overrides: Dict[str, Any],
     cloud: Optional[clouds.Cloud] = None,
     context: Optional[str] = None,
-) -> None:
+) -> Dict[str, Any]:
     """Adds or updates fields in the YAML with fields from the
     ~/.sky/config.yaml's kubernetes.pod_spec dict.
     This can be used to add fields to the YAML that are not supported by
@@ -2749,9 +2759,7 @@ def combine_pod_config_fields(
               - name: my-secret
     ```
     """
-    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
-        yaml_content = f.read()
-    yaml_obj = yaml_utils.safe_load(yaml_content)
+    merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # We don't use override_configs in `get_effective_region_config`, as merging
     # the pod config requires special handling.
     if isinstance(cloud, clouds.SSH):
@@ -2778,26 +2786,20 @@
 
     # Merge the kubernetes config into the YAML for both head and worker nodes.
     config_utils.merge_k8s_configs(
-        yaml_obj['available_node_types']['ray_head_default']['node_config'],
-        kubernetes_config)
-
-    # Write the updated YAML back to the file
-    yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
+        merged_cluster_yaml_obj['available_node_types']['ray_head_default']
+        ['node_config'], kubernetes_config)
+    return merged_cluster_yaml_obj
 
 
-def combine_metadata_fields(cluster_yaml_path: str,
+def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
                             cluster_config_overrides: Dict[str, Any],
-                            context: Optional[str] = None) -> None:
+                            context: Optional[str] = None) -> Dict[str, Any]:
     """Updates the metadata for all Kubernetes objects created by SkyPilot with
     fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
 
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
-
-    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
-        yaml_content = f.read()
-    yaml_obj = yaml_utils.safe_load(yaml_content)
-
+    merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
         cloud='kubernetes',
@@ -2819,22 +2821,42 @@ def combine_metadata_fields(cluster_yaml_path: str,
     # List of objects in the cluster YAML to be updated
     combination_destinations = [
         # Service accounts
-        yaml_obj['provider']['autoscaler_service_account']['metadata'],
-        yaml_obj['provider']['autoscaler_role']['metadata'],
-        yaml_obj['provider']['autoscaler_role_binding']['metadata'],
-        yaml_obj['provider']['autoscaler_service_account']['metadata'],
-        # Pod spec
-        yaml_obj['available_node_types']['ray_head_default']['node_config']
+        merged_cluster_yaml_obj['provider']['autoscaler_service_account']
+        ['metadata'],
+        merged_cluster_yaml_obj['provider']['autoscaler_role']['metadata'],
+        merged_cluster_yaml_obj['provider']['autoscaler_role_binding']
         ['metadata'],
+        merged_cluster_yaml_obj['provider']['autoscaler_service_account']
+        ['metadata'],
+        # Pod spec
+        merged_cluster_yaml_obj['available_node_types']['ray_head_default']
+        ['node_config']['metadata'],
         # Services for pods
-        *[svc['metadata'] for svc in yaml_obj['provider']['services']]
+        *[
+            svc['metadata']
+            for svc in merged_cluster_yaml_obj['provider']['services']
+        ]
     ]
 
     for destination in combination_destinations:
         config_utils.merge_k8s_configs(destination, custom_metadata)
 
-    # Write the updated YAML back to the file
-    yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
+    return merged_cluster_yaml_obj
+
+
+def combine_pod_config_fields_and_metadata(
+        cluster_yaml_obj: Dict[str, Any],
+        cluster_config_overrides: Dict[str, Any],
+        cloud: Optional[clouds.Cloud] = None,
+        context: Optional[str] = None) -> Dict[str, Any]:
+    """Combines pod config fields and metadata fields"""
+    combined_yaml_obj = combine_pod_config_fields(cluster_yaml_obj,
+                                                  cluster_config_overrides,
+                                                  cloud, context)
+    combined_yaml_obj = combine_metadata_fields(combined_yaml_obj,
+                                                cluster_config_overrides,
+                                                context)
+    return combined_yaml_obj
 
 
 def merge_custom_metadata(
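
Note: combine_pod_config_fields and combine_metadata_fields now take the cluster YAML as an in-memory dict and return a merged copy instead of reading and rewriting the YAML file themselves; combine_pod_config_fields_and_metadata chains the two. A hedged caller-side sketch; the file path is hypothetical, the real call sites are not part of this excerpt, and yaml_utils refers to the helpers already used in the hunks above:

```python
# Hypothetical caller: load the cluster YAML once, merge pod_spec and
# custom_metadata overrides in memory, then write the result once.
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import yaml_utils

with open('cluster.yml', 'r', encoding='utf-8') as f:
    cluster_yaml_obj = yaml_utils.safe_load(f.read())

merged = kubernetes_utils.combine_pod_config_fields_and_metadata(
    cluster_yaml_obj,
    cluster_config_overrides={},  # per-task overrides, if any
    cloud=None,
    context=None)

yaml_utils.dump_yaml('cluster.yml', merged)
```
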
sky/schemas/db/global_user_state/007_cluster_event_request_id.py ADDED
@@ -0,0 +1,34 @@
+"""Add request_id to cluster_events.
+
+Revision ID: 007
+Revises: 006
+Create Date: 2025-08-28
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '007'
+down_revision: Union[str, Sequence[str], None] = '006'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add request_id column to cluster_events."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('cluster_events',
+                                             'request_id',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
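
Note: revision 007 only adds a nullable text column with no server default, and the downgrade is intentionally a no-op so older code can keep reading the table. On SQLite the effect is roughly the following (illustration on a throwaway table; the real cluster_events schema has more columns and the change is applied through Alembic):

```python
# Illustration only: the effect of the migration on a toy SQLite table.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE cluster_events (cluster_hash TEXT, reason TEXT)')
conn.execute('ALTER TABLE cluster_events ADD COLUMN request_id TEXT DEFAULT NULL')
conn.execute("INSERT INTO cluster_events (cluster_hash, reason) VALUES ('h', 'r')")
print(conn.execute('SELECT request_id FROM cluster_events').fetchone())  # (None,)
```
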
sky/server/common.py CHANGED
@@ -911,8 +911,7 @@ def reload_for_new_request(client_entrypoint: Optional[str],
 
     # Clear cache should be called before reload_logger and usage reset,
     # otherwise, the latest env var will not be used.
-    for func in annotations.FUNCTIONS_NEED_RELOAD_CACHE:
-        func.cache_clear()
+    annotations.clear_request_level_cache()
 
     # We need to reset usage message, so that the message is up-to-date with the
     # latest information in the context, e.g. client entrypoint and run id.
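
Note: annotations.clear_request_level_cache() replaces the inline loop over annotations.FUNCTIONS_NEED_RELOAD_CACHE and is reused by the daemons change below. A minimal sketch of the underlying pattern, assuming the helper simply walks the same registry the removed loop iterated (the decorator name here is illustrative; the real implementation is in sky/utils/annotations.py, which is not shown in this diff):

```python
# Sketch of a request-level cache registry: functions cached per request are
# registered once, and clear_request_level_cache() resets them all at once.
import functools

FUNCTIONS_NEED_RELOAD_CACHE = []


def cached_per_request(func):
    cached = functools.lru_cache(maxsize=None)(func)
    FUNCTIONS_NEED_RELOAD_CACHE.append(cached)
    return cached


def clear_request_level_cache():
    for func in FUNCTIONS_NEED_RELOAD_CACHE:
        func.cache_clear()
```
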
sky/server/daemons.py CHANGED
@@ -7,8 +7,10 @@ from typing import Callable
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
+from sky.utils import annotations
 from sky.utils import common
 from sky.utils import env_options
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 logger = sky_logging.init_logger(__name__)
@@ -67,6 +69,10 @@ class InternalRequestDaemon:
                 sky_logging.reload_logger()
                 level = self.refresh_log_level()
                 self.event_fn()
+                # Clear request level cache after each run to avoid
+                # using too much memory.
+                annotations.clear_request_level_cache()
+                timeline.save_timeline()
             except Exception:  # pylint: disable=broad-except
                 # It is OK to fail to run the event, as the event is not
                 # critical, but we should log the error.
sky/server/requests/executor.py CHANGED
@@ -383,7 +383,8 @@ def _request_execution_wrapper(request_id: str,
     # config, as there can be some logs during override that needs to be
     # captured in the log file.
     try:
-        with override_request_env_and_config(request_body, request_id), \
+        with sky_logging.add_debug_log_handler(request_id), \
+                override_request_env_and_config(request_body, request_id), \
                 tempstore.tempdir():
             if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
                 config = skypilot_config.to_dict()
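
Note: sky_logging.add_debug_log_handler(request_id) comes from the sky/sky_logging.py changes in the file list above (+28 lines) and is entered before the env/config override so that logs emitted during the override are captured too. Its implementation is not included in this excerpt; a hedged sketch of the general shape, assuming it attaches a per-request DEBUG file handler and removes it on exit:

```python
# Sketch only: the function name suffix, logger name, and log path are
# assumptions, not SkyPilot's actual API.
import contextlib
import logging


@contextlib.contextmanager
def add_debug_log_handler_sketch(request_id: str):
    handler = logging.FileHandler(f'/tmp/sky-request-{request_id}.debug.log')
    handler.setLevel(logging.DEBUG)
    logger = logging.getLogger('sky')
    logger.addHandler(handler)
    try:
        yield
    finally:
        logger.removeHandler(handler)
        handler.close()
```
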
sky/server/requests/payloads.py CHANGED
@@ -71,7 +71,9 @@ EXTERNAL_LOCAL_ENV_VARS = [
 def request_body_env_vars() -> dict:
     env_vars = {}
     for env_var in os.environ:
-        if env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX):
+        if (env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX) and
+                not env_var.startswith(
+                    constants.SKYPILOT_SERVER_ENV_VAR_PREFIX)):
             env_vars[env_var] = os.environ[env_var]
         if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
             env_vars[env_var] = os.environ[env_var]
@@ -307,6 +309,7 @@ class StatusBody(RequestBody):
     cluster_names: Optional[List[str]] = None
     refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
     all_users: bool = True
+    include_credentials: bool = False
 
 
 class StartBody(RequestBody):
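
Note: request_body_env_vars() now stops forwarding server-side SKYPILOT_SERVER_* variables into request bodies while still forwarding other SKYPILOT_* variables. A standalone check of the new predicate (the literal prefixes and sample variable names are assumptions matching the constant names used in the hunk; the real constants are defined in SkyPilot's constants module):

```python
# Assumed prefix values mirroring constants.SKYPILOT_ENV_VAR_PREFIX and
# constants.SKYPILOT_SERVER_ENV_VAR_PREFIX from the diff above.
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'


def should_forward(env_var: str) -> bool:
    return (env_var.startswith(SKYPILOT_ENV_VAR_PREFIX) and
            not env_var.startswith(SKYPILOT_SERVER_ENV_VAR_PREFIX))


assert should_forward('SKYPILOT_DEBUG')
assert not should_forward('SKYPILOT_SERVER_FOO')
assert not should_forward('HOME')
```
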