skypilot-nightly 1.0.0.dev20250818__py3-none-any.whl → 1.0.0.dev20250820__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (73)
  1. sky/__init__.py +5 -3
  2. sky/backends/cloud_vm_ray_backend.py +6 -13
  3. sky/backends/wheel_utils.py +2 -1
  4. sky/catalog/data_fetchers/fetch_aws.py +2 -0
  5. sky/client/cli/command.py +20 -16
  6. sky/core.py +1 -1
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/{8969-6cb1af4ec7fb1e19.js → 8969-23c8fbdb8b397d59.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/{webpack-a46c8b62df807ec1.js → webpack-008593a02784a2df.js} +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  21. sky/dashboard/out/jobs.html +1 -1
  22. sky/dashboard/out/users.html +1 -1
  23. sky/dashboard/out/volumes.html +1 -1
  24. sky/dashboard/out/workspace/new.html +1 -1
  25. sky/dashboard/out/workspaces/[name].html +1 -1
  26. sky/dashboard/out/workspaces.html +1 -1
  27. sky/exceptions.py +6 -1
  28. sky/global_user_state.py +18 -11
  29. sky/jobs/constants.py +1 -1
  30. sky/jobs/server/core.py +43 -34
  31. sky/jobs/server/utils.py +2 -1
  32. sky/jobs/utils.py +56 -9
  33. sky/models.py +1 -0
  34. sky/provision/aws/config.py +11 -11
  35. sky/provision/aws/instance.py +30 -27
  36. sky/provision/do/utils.py +2 -2
  37. sky/provision/kubernetes/network_utils.py +3 -3
  38. sky/provision/kubernetes/utils.py +2 -2
  39. sky/provision/kubernetes/volume.py +2 -0
  40. sky/provision/provisioner.py +10 -6
  41. sky/serve/replica_managers.py +7 -0
  42. sky/serve/server/impl.py +1 -1
  43. sky/server/requests/payloads.py +2 -0
  44. sky/server/requests/serializers/encoders.py +29 -5
  45. sky/server/server.py +37 -1
  46. sky/setup_files/MANIFEST.in +1 -0
  47. sky/setup_files/dependencies.py +17 -11
  48. sky/skylet/ray_patches/__init__.py +18 -4
  49. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  50. sky/skylet/ray_patches/cli.py.diff +19 -0
  51. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  52. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  53. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  54. sky/skylet/ray_patches/updater.py.diff +18 -0
  55. sky/skylet/ray_patches/worker.py.diff +41 -0
  56. sky/utils/common.py +27 -7
  57. sky/utils/common_utils.py +13 -9
  58. sky/utils/directory_utils.py +12 -0
  59. sky/utils/env_options.py +3 -0
  60. sky/utils/kubernetes/gpu_labeler.py +3 -3
  61. sky/utils/schemas.py +1 -0
  62. sky/utils/serialize_utils.py +16 -0
  63. sky/volumes/client/sdk.py +10 -7
  64. sky/volumes/server/core.py +12 -3
  65. sky/volumes/volume.py +17 -3
  66. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/METADATA +21 -13
  67. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/RECORD +72 -63
  68. sky/dashboard/out/_next/static/chunks/3015-471d67c9302d4027.js +0 -1
  69. /sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_ssgManifest.js +0 -0
  70. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/WHEEL +0 -0
  71. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py CHANGED
@@ -498,8 +498,8 @@ def _vpc_id_from_security_group_ids(ec2: 'mypy_boto3_ec2.ServiceResource',
     return vpc_ids[0]


-def _get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
-                        region: str) -> str:
+def get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
+                       region: str) -> str:
     """Returns the VPC ID of the unique VPC with a given name.

     Exits with code 1 if:
@@ -532,7 +532,7 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
                            use_internal_ips: bool,
                            vpc_name: Optional[str]) -> Tuple[Any, str]:
     if vpc_name is not None:
-        vpc_id_of_sg = _get_vpc_id_by_name(ec2, vpc_name, region)
+        vpc_id_of_sg = get_vpc_id_by_name(ec2, vpc_name, region)
     elif security_group_ids:
         vpc_id_of_sg = _vpc_id_from_security_group_ids(ec2, security_group_ids)
     else:
@@ -614,8 +614,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
         due to AWS service issues.
     """
     # Figure out which security groups with this name exist for each VPC...
-    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
-                                                     expected_sg_name)
+    security_group = get_security_group_from_vpc_id(ec2, vpc_id,
+                                                    expected_sg_name)
     if security_group is not None:
         return security_group

@@ -631,7 +631,7 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
             # The security group already exists, but we didn't see it
             # because of eventual consistency.
             logger.warning(f'{expected_sg_name} already exists when creating.')
-            security_group = _get_security_group_from_vpc_id(
+            security_group = get_security_group_from_vpc_id(
                 ec2, vpc_id, expected_sg_name)
             assert (security_group is not None and
                     security_group.group_name == expected_sg_name), (
@@ -646,8 +646,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
             logger.warning(message)
             raise exceptions.NoClusterLaunchedError(message) from e

-    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
-                                                     expected_sg_name)
+    security_group = get_security_group_from_vpc_id(ec2, vpc_id,
+                                                    expected_sg_name)
     assert security_group is not None, 'Failed to create security group'
     logger.info(f'Created new security group {colorama.Style.BRIGHT}'
                 f'{security_group.group_name}{colorama.Style.RESET_ALL} '
@@ -655,9 +655,9 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
     return security_group


-def _get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
-                                    vpc_id: str,
-                                    group_name: str) -> Optional[Any]:
+def get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
+                                   vpc_id: str,
+                                   group_name: str) -> Optional[Any]:
     """Get security group by VPC ID and group name."""
     existing_groups = list(
         ec2.security_groups.filter(Filters=[{
sky/provision/aws/instance.py CHANGED
@@ -18,6 +18,7 @@ from sky.clouds import aws as aws_cloud
 from sky.clouds.utils import aws_utils
 from sky.provision import common
 from sky.provision import constants
+from sky.provision.aws import config as aws_config
 from sky.provision.aws import utils
 from sky.utils import common_utils
 from sky.utils import resources_utils
@@ -685,7 +686,9 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
-    default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+    default_sg = aws_config.get_security_group_from_vpc_id(
+        ec2, _get_vpc_id(provider_config),
+        aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
     if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
         # Case 1: The default SG is used, we don't need to ensure instance are
         # terminated.
@@ -727,30 +730,6 @@
     # of most cloud implementations (including AWS).


-def _get_sg_from_name(
-    ec2: Any,
-    sg_name: str,
-) -> Any:
-    # GroupNames will only filter SGs in the default VPC, so we need to use
-    # Filters here. Ref:
-    # https://boto3.amazonaws.com/v1/documentation/api/1.26.112/reference/services/ec2/service-resource/security_groups.html # pylint: disable=line-too-long
-    sgs = ec2.security_groups.filter(Filters=[{
-        'Name': 'group-name',
-        'Values': [sg_name]
-    }])
-    num_sg = len(list(sgs))
-    if num_sg == 0:
-        logger.warning(f'Expected security group {sg_name} not found. ')
-        return None
-    if num_sg > 1:
-        # TODO(tian): Better handle this case. Maybe we can check when creating
-        # the SG and throw an error if there is already an existing SG with the
-        # same name.
-        logger.warning(f'Found {num_sg} security groups with name {sg_name}. ')
-        return None
-    return list(sgs)[0]
-
-
 def _maybe_move_to_new_sg(
     instance: Any,
     expected_sg: Any,
@@ -803,7 +782,9 @@ def open_ports(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Instance with cluster name '
                              f'{cluster_name_on_cloud} not found.')
-    sg = _get_sg_from_name(ec2, sg_name)
+    sg = aws_config.get_security_group_from_vpc_id(ec2,
+                                                   _get_vpc_id(provider_config),
+                                                   sg_name)
     if sg is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Cannot find new security group '
@@ -899,7 +880,9 @@ def cleanup_ports(
         # We only want to delete the SG that is dedicated to this cluster (i.e.,
         # this cluster have opened some ports).
         return
-    sg = _get_sg_from_name(ec2, sg_name)
+    sg = aws_config.get_security_group_from_vpc_id(ec2,
+                                                   _get_vpc_id(provider_config),
+                                                   sg_name)
     if sg is None:
         logger.warning(
             'Find security group failed. Skip cleanup security group.')
@@ -1010,3 +993,23 @@ def get_cluster_info(
         provider_name='aws',
         provider_config=provider_config,
     )
+
+
+def _get_vpc_id(provider_config: Dict[str, Any]) -> str:
+    region = provider_config['region']
+    ec2 = _default_ec2_resource(provider_config['region'])
+    if 'vpc_name' in provider_config:
+        return aws_config.get_vpc_id_by_name(ec2, provider_config['vpc_name'],
+                                             region)
+    else:
+        # Retrieve the default VPC name from the region.
+        response = ec2.meta.client.describe_vpcs(Filters=[{
+            'Name': 'isDefault',
+            'Values': ['true']
+        }])
+        if len(response['Vpcs']) == 0:
+            raise ValueError(f'No default VPC found in region {region}')
+        elif len(response['Vpcs']) > 1:
+            raise ValueError(f'Multiple default VPCs found in region {region}')
+        else:
+            return response['Vpcs'][0]['VpcId']
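Net effect of the two AWS files above: the removed name-only lookup (_get_sg_from_name) gave up whenever several VPCs in a region held a security group with the same name, so the config.py helpers were made public and every lookup is now scoped to one VPC. A minimal standalone sketch of such a scoped query, assuming boto3 and illustrative arguments (this is not the packaged code):

import boto3


def find_sg(region: str, vpc_id: str, group_name: str):
    """Return the security group named `group_name` in `vpc_id`, or None."""
    ec2 = boto3.resource('ec2', region_name=region)
    # Filtering on 'group-name' alone can match one SG per VPC; adding
    # 'vpc-id' pins the result to a single VPC.
    groups = list(
        ec2.security_groups.filter(Filters=[
            {'Name': 'vpc-id', 'Values': [vpc_id]},
            {'Name': 'group-name', 'Values': [group_name]},
        ]))
    return groups[0] if groups else None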
sky/provision/do/utils.py CHANGED
@@ -30,7 +30,7 @@ POSSIBLE_CREDENTIALS_PATHS = [
 INITIAL_BACKOFF_SECONDS = 10
 MAX_BACKOFF_FACTOR = 10
 MAX_ATTEMPTS = 6
-SSH_KEY_NAME_ON_DO = f'sky-key-{common_utils.get_user_hash()}'
+SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'

 _client = None
 _ssh_key_id = None
@@ -125,7 +125,7 @@ def ssh_key_id(public_key: str):

        request = {
            'public_key': public_key,
-            'name': SSH_KEY_NAME_ON_DO,
+            'name': SSH_KEY_NAME_ON_DO_PREFIX + common_utils.get_user_hash(),
        }
        _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
        return _ssh_key_id
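The rename from SSH_KEY_NAME_ON_DO to SSH_KEY_NAME_ON_DO_PREFIX moves the get_user_hash() call from import time to request time, presumably so the key name reflects the user hash in effect when the key is created (this release also makes the server user hash restorable at startup; see sky/server/server.py below). A trivial standalone illustration of the difference:

def get_user_hash() -> str:  # stand-in for common_utils.get_user_hash
    return 'abc123'


# Old: evaluated once at import, frozen even if the hash changes later.
SSH_KEY_NAME_ON_DO = f'sky-key-{get_user_hash()}'

# New: the prefix is constant; the hash is read when the request is built.
SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'
name = SSH_KEY_NAME_ON_DO_PREFIX + get_user_hash()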
sky/provision/kubernetes/network_utils.py CHANGED
@@ -4,13 +4,13 @@ import time
 import typing
 from typing import Dict, List, Optional, Tuple, Union

-import sky
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import kubernetes
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import directory_utils
 from sky.utils import kubernetes_enums
 from sky.utils import ux_utils

@@ -80,7 +80,7 @@ def get_networking_mode(
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict:
-    template_path = os.path.join(sky.__root_dir__, 'templates',
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  _LOADBALANCER_TEMPLATE_NAME)
     if not os.path.exists(template_path):
         raise FileNotFoundError(
@@ -116,7 +116,7 @@ def fill_ingress_template(namespace: str, context: Optional[str],
                           service_details: List[Tuple[str, int,
                                                       str]], ingress_name: str,
                           selector_key: str, selector_value: str) -> Dict:
-    template_path = os.path.join(sky.__root_dir__, 'templates',
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  _INGRESS_TEMPLATE_NAME)
     if not os.path.exists(template_path):
         raise FileNotFoundError(
sky/provision/kubernetes/utils.py CHANGED
@@ -14,7 +14,6 @@ import typing
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse

-import sky
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
@@ -31,6 +30,7 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
+from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -2444,7 +2444,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],

 def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
                            ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(sky.__root_dir__, 'templates',
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  'kubernetes-ssh-jump.yml.j2')
     if not os.path.exists(template_path):
         raise FileNotFoundError(
sky/provision/kubernetes/volume.py CHANGED
@@ -203,6 +203,8 @@ def _get_pvc_spec(namespace: str,
             },
         }
     }
+    if config.labels:
+        pvc_spec['metadata']['labels'].update(config.labels)
     storage_class = config.config.get('storage_class_name')
     if storage_class is not None:
         pvc_spec['spec']['storageClassName'] = storage_class
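For illustration, a standalone sketch of the merge this hunk performs; the baseline metadata and label values here are assumptions, not the packaged defaults:

# Hypothetical PVC metadata before the merge.
pvc_spec = {'metadata': {'name': 'my-volume', 'labels': {'parent': 'skypilot'}}}
# User-supplied labels from the volume config (assumed values).
config_labels = {'team': 'ml-infra'}
if config_labels:
    # Same update as in the hunk: user labels are layered on top of the
    # labels already present in the spec.
    pvc_spec['metadata']['labels'].update(config_labels)
assert pvc_spec['metadata']['labels'] == {'parent': 'skypilot',
                                          'team': 'ml-infra'}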
sky/provision/provisioner.py CHANGED
@@ -167,7 +167,7 @@ def bulk_provision(
         # This error is a user error instead of a provisioning failure.
         # And there is no possibility to fix it by teardown.
         raise
-    except Exception:  # pylint: disable=broad-except
+    except Exception as exc:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
             zone_str = ','.join(zone.name for zone in zones)
@@ -189,14 +189,18 @@
                         provider_config=original_config['provider'])
                     break
                 except NotImplementedError as e:
-                    verb = 'terminate' if terminate else 'stop'
+                    assert not terminate, (
+                        'Terminating must be supported by all clouds')
+                    exc_msg = common_utils.format_exception(exc).replace(
+                        '\n', ' ')
                     # If the underlying cloud does not support stopping
                     # instances, we should stop failover as well.
                     raise provision_common.StopFailoverError(
-                        'During provisioner\'s failover, '
-                        f'{terminate_str.lower()} {cluster_name!r} failed. '
-                        f'We cannot {verb} the resources launched, as it is '
-                        f'not supported by {cloud}. Please try launching the '
+                        f'Provisioning cluster {cluster_name.display_name} '
+                        f'failed: {exc_msg}. Failover is stopped for safety '
+                        'because the cluster was previously in UP state but '
+                        f'{cloud} does not support stopping instances to '
+                        'preserve the cluster state. Please try launching the '
                         'cluster again, or terminate it with: '
                         f'sky down {cluster_name.display_name}') from e
                 except Exception as e:  # pylint: disable=broad-except
sky/serve/replica_managers.py CHANGED
@@ -48,6 +48,13 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120

+# TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
+# 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
+# old ReplicaInfo in database will still tries to unpickle using ProcessStatus
+# in replica_managers. We set this alias to avoid breaking changes. See #6729
+# for more details.
+ProcessStatus = common_utils.ProcessStatus
+

 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
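The alias works because pickle stores a class by its defining module and attribute name and resolves that dotted name again at load time; a minimal standalone demonstration (not SkyPilot code):

import pickle


class ProcessStatus:  # imagine this class was originally defined here
    pass


blob = pickle.dumps(ProcessStatus())  # records '<module>.ProcessStatus'

# If the class later moves to another module and the old attribute is
# removed, pickle.loads(blob) raises AttributeError. Keeping an alias in
# the old module (ProcessStatus = common_utils.ProcessStatus) lets the
# stored name resolve to the relocated class, so old rows still unpickle.
restored = pickle.loads(blob)
assert isinstance(restored, ProcessStatus)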
sky/serve/server/impl.py CHANGED
@@ -129,11 +129,11 @@ def up(
                          f'{constants.CLUSTER_NAME_VALID_REGEX}')

     dag = dag_utils.convert_entrypoint_to_dag(task)
-    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(dag)
+    dag.resolve_and_validate_volumes()
     dag.pre_mount_volumes()
     task = dag.tasks[0]
     assert task.service is not None
sky/server/requests/payloads.py CHANGED
@@ -453,6 +453,7 @@ class VolumeApplyBody(RequestBody):
     zone: Optional[str] = None
     size: Optional[str] = None
     config: Optional[Dict[str, Any]] = None
+    labels: Optional[Dict[str, str]] = None


 class VolumeDeleteBody(RequestBody):
@@ -503,6 +504,7 @@ class JobsQueueBody(RequestBody):
     pool_match: Optional[str] = None
     page: Optional[int] = None
     limit: Optional[int] = None
+    statuses: Optional[List[str]] = None


 class JobsCancelBody(RequestBody):
sky/server/requests/serializers/encoders.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple

 from sky.schemas.api import responses
 from sky.server import constants as server_constants
+from sky.utils import serialize_utils

 if typing.TYPE_CHECKING:
     from sky import backends
@@ -22,6 +23,9 @@ handlers: Dict[str, Any] = {}

 def pickle_and_encode(obj: Any) -> str:
     try:
+        # Apply backwards compatibility processing at the lowest level
+        # to catch any handles that might have bypassed the encoders
+        obj = serialize_utils.prepare_handle_for_backwards_compatibility(obj)
         return base64.b64encode(pickle.dumps(obj)).decode('utf-8')
     except TypeError as e:
         raise ValueError(f'Failed to pickle object: {obj}') from e
@@ -58,7 +62,9 @@ def encode_status(
     for cluster in clusters:
         response_cluster = cluster.model_dump()
         response_cluster['status'] = cluster['status'].value
-        response_cluster['handle'] = pickle_and_encode(cluster['handle'])
+        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
+            cluster['handle'])
+        response_cluster['handle'] = pickle_and_encode(handle)
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
             response_cluster['storage_mounts_metadata'])
         response.append(response_cluster)
@@ -70,6 +76,7 @@ def encode_launch(
     job_id_handle: Tuple[Optional[int], Optional['backends.ResourceHandle']]
 ) -> Dict[str, Any]:
     job_id, handle = job_id_handle
+    handle = serialize_utils.prepare_handle_for_backwards_compatibility(handle)
     return {
         'job_id': job_id,
         'handle': pickle_and_encode(handle),
@@ -78,6 +85,9 @@

 @register_encoder('start')
 def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
+    resource_handle = (
+        serialize_utils.prepare_handle_for_backwards_compatibility(
+            resource_handle))
     return pickle_and_encode(resource_handle)


@@ -113,8 +123,15 @@ def encode_status_kubernetes(
 @register_encoder('jobs.queue')
 def encode_jobs_queue(jobs_or_tuple):
     # Support returning either a plain jobs list or a (jobs, total) tuple
-    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
-        jobs, total = jobs_or_tuple
+    status_counts = {}
+    if isinstance(jobs_or_tuple, tuple):
+        if len(jobs_or_tuple) == 2:
+            jobs, total = jobs_or_tuple
+            total_no_filter = total
+        elif len(jobs_or_tuple) == 4:
+            jobs, total, status_counts, total_no_filter = jobs_or_tuple
+        else:
+            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')
     else:
         jobs = jobs_or_tuple
         total = None
@@ -122,7 +139,12 @@ def encode_jobs_queue(jobs_or_tuple):
         job['status'] = job['status'].value
     if total is None:
         return jobs
-    return {'jobs': jobs, 'total': total}
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }


 def _encode_serve_status(
@@ -131,7 +153,9 @@
     service_status['status'] = service_status['status'].value
     for replica_info in service_status.get('replica_info', []):
         replica_info['status'] = replica_info['status'].value
-        replica_info['handle'] = pickle_and_encode(replica_info['handle'])
+        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
+            replica_info['handle'])
+        replica_info['handle'] = pickle_and_encode(handle)
     return service_statuses

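A hypothetical client-side sketch of handling both response shapes encode_jobs_queue can now emit; the field names come from the hunk above, everything else is assumed:

def summarize_jobs_response(payload):
    if isinstance(payload, list):
        # Legacy shape: a plain list of jobs, no totals.
        return len(payload), None, None
    # New shape: total for the filtered view, the unfiltered total, and
    # per-status counts such as {'RUNNING': 3, 'FAILED': 1}.
    return (payload['total'], payload['total_no_filter'],
            payload['status_counts'])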
 
sky/server/server.py CHANGED
@@ -83,6 +83,8 @@ else:

 P = ParamSpec('P')

+_SERVER_USER_HASH_KEY = 'server_user_hash'
+

 def _add_timestamp_prefix_for_server_logs() -> None:
     server_logger = sky_logging.init_logger('sky.server')
@@ -1650,7 +1652,10 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

-    cluster_records = core.status(cluster_name, all_users=True)
+    # Run core.status in another thread to avoid blocking the event loop.
+    cluster_records = await context_utils.to_thread(core.status,
+                                                    cluster_name,
+                                                    all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
@@ -1818,6 +1823,35 @@ async def root():
     return fastapi.responses.RedirectResponse(url='/dashboard/')


+def _init_or_restore_server_user_hash():
+    """Restores the server user hash from the global user state db.
+
+    The API server must have a stable user hash across restarts and potential
+    multiple replicas. Thus we persist the user hash in db and restore it on
+    startup. When upgrading from old version, the user hash will be read from
+    the local file (if any) to keep the user hash consistent.
+    """
+
+    def apply_user_hash(user_hash: str) -> None:
+        # For local API server, the user hash in db and local file should be
+        # same so there is no harm to override here.
+        common_utils.set_user_hash_locally(user_hash)
+        # Refresh the server user hash for current process after restore or
+        # initialize the user hash in db, child processes will get the correct
+        # server id from the local cache file.
+        common_lib.refresh_server_id()
+
+    user_hash = global_user_state.get_system_config(_SERVER_USER_HASH_KEY)
+    if user_hash is not None:
+        apply_user_hash(user_hash)
+        return
+
+    # Initial deployment, generate a user hash and save it to the db.
+    user_hash = common_utils.get_user_hash()
+    global_user_state.set_system_config(_SERVER_USER_HASH_KEY, user_hash)
+    apply_user_hash(user_hash)
+
+
 if __name__ == '__main__':
     import uvicorn

@@ -1827,6 +1861,8 @@ if __name__ == '__main__':
     global_user_state.initialize_and_get_db()
     # Initialize request db
     requests_lib.reset_db_and_logs()
+    # Restore the server user hash
+    _init_or_restore_server_user_hash()

     parser = argparse.ArgumentParser()
     parser.add_argument('--host', default='127.0.0.1')
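The websocket fix is the usual pattern for keeping a blocking call off the event loop; a generic sketch using asyncio.to_thread (SkyPilot routes this through its own context_utils.to_thread wrapper, and the function and cluster name here are stand-ins):

import asyncio
import time


def blocking_status(cluster_name: str) -> str:
    time.sleep(1)  # stands in for a slow, synchronous status query
    return f'{cluster_name}: UP'


async def handler() -> None:
    # The sync call runs in a worker thread, so the event loop stays free
    # to service other websocket connections in the meantime.
    record = await asyncio.to_thread(blocking_status, 'my-cluster')
    print(record)


asyncio.run(handler())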
sky/setup_files/MANIFEST.in CHANGED
@@ -9,6 +9,7 @@ include sky/skylet/providers/ibm/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
+include sky/skylet/ray_patches/*.diff
 include sky/jobs/dashboard/*
 include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
sky/setup_files/dependencies.py CHANGED
@@ -72,12 +72,27 @@ install_requires = [
     'aiohttp',
 ]

+# See requirements-dev.txt for the version of grpc and protobuf
+# used to generate the code during development.
+
+# The grpc version at runtime has to be newer than the version
+# used to generate the code.
+GRPC = 'grpcio>=1.63.0'
+# >= 5.26.1 because the runtime version can't be older than the version
+# used to generate the code.
+# < 7.0.0 because code generated for a major version V will be supported by
+# protobuf runtimes of version V and V+1.
+# https://protobuf.dev/support/cross-version-runtime-guarantee
+PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
+
 server_dependencies = [
     'casbin',
     'sqlalchemy_adapter',
     'passlib',
     'pyjwt',
     'aiohttp',
+    GRPC,
+    PROTOBUF,
 ]

 local_ray = [
@@ -88,18 +103,9 @@ local_ray = [
     'ray[default] >= 2.2.0, != 2.6.0',
 ]

-# See requirements-dev.txt for the version of grpc and protobuf
-# used to generate the code during development.
 remote = [
-    # The grpc version at runtime has to be newer than the version
-    # used to generate the code.
-    'grpcio>=1.63.0',
-    # >= 5.26.1 because the runtime version can't be older than the version
-    # used to generate the code.
-    # < 7.0.0 because code generated for a major version V will be supported by
-    # protobuf runtimes of version V and V+1.
-    # https://protobuf.dev/support/cross-version-runtime-guarantee
-    'protobuf >= 5.26.1, < 7.0.0',
+    GRPC,
+    PROTOBUF,
 ]

 # NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
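The refactor defines each pin once and reuses it, so the server and remote extras cannot drift apart. A reduced sketch of the pattern (the extras layout below is illustrative, not the full dependencies.py):

GRPC = 'grpcio>=1.63.0'
PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'

extras_require = {
    # Both extras share the exact same pins by construction.
    'server': ['casbin', 'passlib', GRPC, PROTOBUF],
    'remote': [GRPC, PROTOBUF],
}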
sky/skylet/ray_patches/__init__.py CHANGED
@@ -40,15 +40,29 @@ def _run_patch(target_file,
     """Applies a patch if it has not been applied already."""
     # .orig is the original file that is not patched.
     orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
+    # Get diff filename by replacing .patch with .diff
+    diff_file = patch_file.replace('.patch', '.diff')
+
     script = f"""\
-        which patch >/dev/null 2>&1 || sudo yum install -y patch || sudo dnf install patch -y || true
-        which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
+        which patch >/dev/null 2>&1 || sudo yum install -y patch || true
         if [ ! -f {orig_file} ]; then
             echo Create backup file {orig_file}
             cp {target_file} {orig_file}
         fi
-        # It is ok to patch again from the original file.
-        patch {orig_file} -i {patch_file} -o {target_file}
+        if which patch >/dev/null 2>&1; then
+            # System patch command is available, use it
+            # It is ok to patch again from the original file.
+            patch {orig_file} -i {patch_file} -o {target_file}
+        else
+            # System patch command not available, use Python patch library
+            echo "System patch command not available, using Python patch library..."
+            python -m pip install patch
+            # Get target directory
+            target_dir="$(dirname {target_file})"
+            # Execute python patch command
+            echo "Executing python -m patch -d $target_dir {diff_file}"
+            python -m patch -d "$target_dir" "{diff_file}"
+        fi
     """
     subprocess.run(script, shell=True, check=True)
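The new fallback drives the third-party patch.py library (installed as `patch`) when the system `patch` binary is unavailable. A rough sketch of the equivalent programmatic call, with an assumed target directory (the .diff files below use a/ and b/ prefixes, which patch.py resolves when locating files):

# Requires: python -m pip install patch
import patch  # third-party patch.py, the same library behind `python -m patch`

pset = patch.fromfile('log_monitor.py.diff')  # parse the unified diff
if pset is False:
    raise RuntimeError('Failed to parse the diff file')
# Apply relative to the directory containing the target file, mirroring
# `python -m patch -d "$target_dir" file.diff` in the shell snippet above.
if not pset.apply(root='/path/to/ray/_private'):  # assumed directory
    raise RuntimeError('Patch did not apply cleanly')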
 
sky/skylet/ray_patches/autoscaler.py.diff ADDED
@@ -0,0 +1,18 @@
+--- a/autoscaler.py
++++ b/autoscaler.py
+@@ -1,3 +1,6 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
++# Sky patch changes:
++# - enable upscaling_speed to be 0.0
+ import copy
+ import logging
+ import math
+@@ -1071,7 +1074,7 @@
+         upscaling_speed = self.config.get("upscaling_speed")
+         aggressive = self.config.get("autoscaling_mode") == "aggressive"
+         target_utilization_fraction = self.config.get("target_utilization_fraction")
+-        if upscaling_speed:
++        if upscaling_speed is not None:  # NOTE(sky): enable 0.0
+             upscaling_speed = float(upscaling_speed)
+         # TODO(ameer): consider adding (if users ask) an option of
+         # initial_upscaling_num_workers.
sky/skylet/ray_patches/cli.py.diff ADDED
@@ -0,0 +1,19 @@
+--- a/cli.py
++++ b/cli.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
++# Otherwise, the output redirection ">" will not work.
++
+ import json
+ import os
+ import sys
+@@ -270,7 +274,7 @@
+         working_dir=working_dir,
+     )
+     job_id = client.submit_job(
+-        entrypoint=list2cmdline(entrypoint),
++        entrypoint=" ".join(entrypoint),
+         submission_id=submission_id,
+         runtime_env=final_runtime_env,
+         metadata=metadata_json,
sky/skylet/ray_patches/command_runner.py.diff ADDED
@@ -0,0 +1,17 @@
+--- a/command_runner.py
++++ b/command_runner.py
+@@ -1,3 +1,5 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
++
+ import hashlib
+ import json
+ import logging
+@@ -137,7 +139,7 @@
+             {
+                 "ControlMaster": "auto",
+                 "ControlPath": "{}/%C".format(control_path),
+-                "ControlPersist": "10s",
++                "ControlPersist": "300s",
+             }
+         )
+         self.arg_dict.update(kwargs)
sky/skylet/ray_patches/log_monitor.py.diff ADDED
@@ -0,0 +1,20 @@
+--- a/log_monitor.py
++++ b/log_monitor.py
+@@ -1,3 +1,7 @@
++# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
++# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
++# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
++
+ import argparse
+ import errno
+ import glob
+@@ -374,7 +378,8 @@
+                 next_line = next_line.decode("utf-8", "replace")
+                 if next_line == "":
+                     break
+-                next_line = next_line.rstrip("\r\n")
++                if next_line.endswith("\n"):
++                    next_line = next_line[:-1]
+
+                 if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
+                     flush()  # Possible change of task/actor name.