skypilot_nightly-1.0.0.dev20250812-py3-none-any.whl → skypilot_nightly-1.0.0.dev20250814-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (102)
  1. sky/__init__.py +4 -2
  2. sky/backends/backend_utils.py +69 -6
  3. sky/backends/cloud_vm_ray_backend.py +156 -25
  4. sky/catalog/cudo_catalog.py +1 -1
  5. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  6. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  7. sky/client/cli/command.py +40 -77
  8. sky/client/common.py +1 -1
  9. sky/client/sdk.py +19 -19
  10. sky/client/sdk_async.py +5 -4
  11. sky/clouds/aws.py +52 -1
  12. sky/clouds/kubernetes.py +14 -0
  13. sky/dag.py +1 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  16. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/storage.py +11 -1
  36. sky/exceptions.py +5 -0
  37. sky/global_user_state.py +63 -7
  38. sky/jobs/constants.py +1 -1
  39. sky/jobs/controller.py +0 -1
  40. sky/jobs/recovery_strategy.py +3 -3
  41. sky/jobs/scheduler.py +23 -68
  42. sky/jobs/server/core.py +18 -12
  43. sky/jobs/state.py +6 -2
  44. sky/jobs/utils.py +8 -0
  45. sky/provision/__init__.py +1 -0
  46. sky/provision/aws/config.py +9 -0
  47. sky/provision/aws/instance.py +36 -13
  48. sky/provision/azure/instance.py +2 -0
  49. sky/provision/cudo/cudo_wrapper.py +1 -1
  50. sky/provision/cudo/instance.py +2 -0
  51. sky/provision/do/instance.py +2 -0
  52. sky/provision/fluidstack/instance.py +2 -0
  53. sky/provision/gcp/instance.py +2 -0
  54. sky/provision/hyperbolic/instance.py +2 -1
  55. sky/provision/kubernetes/instance.py +133 -0
  56. sky/provision/lambda_cloud/instance.py +2 -0
  57. sky/provision/nebius/instance.py +2 -0
  58. sky/provision/oci/instance.py +2 -0
  59. sky/provision/paperspace/instance.py +2 -1
  60. sky/provision/paperspace/utils.py +1 -1
  61. sky/provision/runpod/instance.py +2 -0
  62. sky/provision/runpod/utils.py +1 -1
  63. sky/provision/scp/instance.py +2 -0
  64. sky/provision/vast/instance.py +2 -0
  65. sky/provision/vsphere/instance.py +2 -0
  66. sky/resources.py +1 -2
  67. sky/schemas/__init__.py +0 -0
  68. sky/schemas/api/__init__.py +0 -0
  69. sky/schemas/api/responses.py +70 -0
  70. sky/schemas/generated/__init__.py +0 -0
  71. sky/schemas/generated/autostopv1_pb2.py +36 -0
  72. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  73. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  74. sky/serve/constants.py +3 -7
  75. sky/serve/replica_managers.py +15 -16
  76. sky/serve/serve_state.py +10 -0
  77. sky/serve/serve_utils.py +21 -20
  78. sky/serve/server/impl.py +15 -19
  79. sky/serve/service.py +31 -16
  80. sky/server/server.py +20 -14
  81. sky/setup_files/dependencies.py +11 -10
  82. sky/skylet/autostop_lib.py +38 -5
  83. sky/skylet/constants.py +3 -1
  84. sky/skylet/services.py +44 -0
  85. sky/skylet/skylet.py +49 -4
  86. sky/task.py +19 -16
  87. sky/templates/aws-ray.yml.j2 +2 -2
  88. sky/templates/jobs-controller.yaml.j2 +6 -0
  89. sky/utils/command_runner.py +1 -1
  90. sky/utils/config_utils.py +29 -5
  91. sky/utils/controller_utils.py +73 -0
  92. sky/utils/db/db_utils.py +17 -0
  93. sky/utils/schemas.py +3 -0
  94. sky/volumes/server/core.py +2 -2
  95. sky/volumes/server/server.py +2 -2
  96. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  97. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
  98. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  99. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  100. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  101. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  102. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/schemas/generated/autostopv1_pb2_grpc.py ADDED
@@ -0,0 +1,146 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+import warnings
+
+from sky.schemas.generated import autostopv1_pb2 as sky_dot_schemas_dot_generated_dot_autostopv1__pb2
+
+GRPC_GENERATED_VERSION = '1.63.0'
+GRPC_VERSION = grpc.__version__
+EXPECTED_ERROR_RELEASE = '1.65.0'
+SCHEDULED_RELEASE_DATE = 'June 25, 2024'
+_version_not_supported = False
+
+try:
+    from grpc._utilities import first_version_is_lower
+    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
+except ImportError:
+    _version_not_supported = True
+
+if _version_not_supported:
+    warnings.warn(
+        f'The grpc package installed is at version {GRPC_VERSION},'
+        + f' but the generated code in sky/schemas/generated/autostopv1_pb2_grpc.py depends on'
+        + f' grpcio>={GRPC_GENERATED_VERSION}.'
+        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
+        + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
+        + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
+        RuntimeWarning
+    )
+
+
+class AutostopServiceStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.SetAutostop = channel.unary_unary(
+            '/autostop.v1.AutostopService/SetAutostop',
+            request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
+            response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
+            _registered_method=True)
+        self.IsAutostopping = channel.unary_unary(
+            '/autostop.v1.AutostopService/IsAutostopping',
+            request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
+            response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
+            _registered_method=True)
+
+
+class AutostopServiceServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def SetAutostop(self, request, context):
+        """Set autostop configuration for the cluster.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def IsAutostopping(self, request, context):
+        """Check if the cluster is currently autostopping.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_AutostopServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+        'SetAutostop': grpc.unary_unary_rpc_method_handler(
+            servicer.SetAutostop,
+            request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.FromString,
+            response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.SerializeToString,
+        ),
+        'IsAutostopping': grpc.unary_unary_rpc_method_handler(
+            servicer.IsAutostopping,
+            request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.FromString,
+            response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.SerializeToString,
+        ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+        'autostop.v1.AutostopService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+# This class is part of an EXPERIMENTAL API.
+class AutostopService(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def SetAutostop(request,
+                    target,
+                    options=(),
+                    channel_credentials=None,
+                    call_credentials=None,
+                    insecure=False,
+                    compression=None,
+                    wait_for_ready=None,
+                    timeout=None,
+                    metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/autostop.v1.AutostopService/SetAutostop',
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def IsAutostopping(request,
+                       target,
+                       options=(),
+                       channel_credentials=None,
+                       call_credentials=None,
+                       insecure=False,
+                       compression=None,
+                       wait_for_ready=None,
+                       timeout=None,
+                       metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/autostop.v1.AutostopService/IsAutostopping',
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
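
The file above is standard grpcio-tools output for a new autostop.v1.AutostopService with two unary RPCs, SetAutostop and IsAutostopping. For orientation, a minimal client sketch follows; the target address is an assumption for illustration (this diff does not say where skylet serves the endpoint), and the response field name is inferred, not confirmed:

# Hypothetical usage sketch for the generated stub above. The target
# address and the `is_autostopping` response field name are assumptions;
# only the service and method names come from this diff.
import grpc

from sky.schemas.generated import autostopv1_pb2
from sky.schemas.generated import autostopv1_pb2_grpc


def is_cluster_autostopping(target: str = 'localhost:50051') -> bool:
    # Open a channel to the (assumed) skylet gRPC server and issue the
    # unary IsAutostopping RPC defined in autostopv1_pb2.
    with grpc.insecure_channel(target) as channel:
        stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
        response = stub.IsAutostopping(
            autostopv1_pb2.IsAutostoppingRequest())
        return response.is_autostopping
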
sky/serve/constants.py CHANGED
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
     'down': False,
 }
 
-# Due to the CPU/memory usage of the controller process launched with a job on
-# controller VM (use ray job under the hood), we need to reserve some CPU/memory
-# for each serve controller process.
-# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
-# services.
-CONTROLLER_MEMORY_USAGE_GB = 1.0
-
 # A period of time to initialize your service. Any readiness probe failures
 # during this period will be ignored.
 DEFAULT_INITIAL_DELAY_SECONDS = 1200
@@ -115,3 +108,6 @@ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
 
 # Dummy run command for cluster pool.
 POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
+
+# Error message for max number of services reached.
+MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
sky/serve/replica_managers.py CHANGED
@@ -13,16 +13,16 @@ import typing
 from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
-import psutil
+import filelock
 import requests
 
-import sky
 from sky import backends
 from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
@@ -41,7 +41,6 @@ from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky import resources
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
@@ -51,10 +50,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
-# Since sky.launch is very resource demanding, we limit the number of
-# concurrent sky.launch process to avoid overloading the machine.
-_MAX_NUM_LAUNCH = psutil.cpu_count() * 2
-
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +76,7 @@ def launch_cluster(replica_id: int,
     try:
         config = common_utils.read_yaml(
             os.path.expanduser(service_task_yaml_path))
-        task = sky.Task.from_yaml_config(config)
+        task = task_lib.Task.from_yaml_config(config)
         if resources_override is not None:
             resources = task.resources
             overrided_resources = [
@@ -177,7 +172,7 @@ def terminate_cluster(cluster_name: str,
 
 def _get_resources_ports(service_task_yaml_path: str) -> str:
     """Get the resources ports used by the task."""
-    task = sky.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
@@ -195,7 +190,7 @@ def _should_use_spot(service_task_yaml_path: str,
     if use_spot_override is not None:
         assert isinstance(use_spot_override, bool)
         return use_spot_override
-    task = sky.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     spot_use_resources = [
        resources for resources in task.resources if resources.use_spot
     ]
@@ -688,7 +683,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  service_task_yaml_path: str) -> None:
         super().__init__(service_name, spec)
         self.service_task_yaml_path = service_task_yaml_path
-        task = sky.Task.from_yaml(service_task_yaml_path)
+        task = task_lib.Task.from_yaml(service_task_yaml_path)
         self._spot_placer: Optional[spot_placer.SpotPlacer] = (
             spot_placer.SpotPlacer.from_task(spec, task))
         # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -872,8 +867,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'replica_jobs')
-        job_log_file_name = (controller_utils.download_and_stream_job_log(
-            backend, handle, replica_job_logs_dir))
+        job_ids = ['1'] if self._is_pool else None
+        job_log_file_name = controller_utils.download_and_stream_job_log(
+            backend, handle, replica_job_logs_dir, job_ids)
         if job_log_file_name is not None:
             logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
             with open(log_file_name, 'a',
@@ -981,7 +977,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         # To avoid `dictionary changed size during iteration` error.
         launch_process_pool_snapshot = list(self._launch_process_pool.items())
         for replica_id, p in launch_process_pool_snapshot:
-            if not p.is_alive():
+            if p.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
@@ -989,8 +987,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                 schedule_next_jobs = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                    if (serve_state.total_number_provisioning_replicas() <
-                            _MAX_NUM_LAUNCH):
+                    if controller_utils.can_provision():
                         p.start()
                         info.status_property.sky_launch_status = (
                             ProcessStatus.RUNNING)
@@ -1044,6 +1041,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._terminate_replica(replica_id,
                                         sync_down_logs=True,
                                         replica_drain_delay_seconds=0)
+        # Try schedule next job after acquiring the lock.
+        jobs_scheduler.maybe_schedule_next_jobs()
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
             if not p.is_alive():
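
The hunks above swap the process-local _MAX_NUM_LAUNCH cap (2x psutil.cpu_count()) for capacity checks in controller_utils, taken under a file lock so the serve controller and the jobs scheduler see a consistent view. A minimal sketch of that check-then-act pattern follows; the bodies of get_resources_lock_path() and can_provision() are assumptions, since only their call sites appear in this diff:

# Sketch of the lock-guarded provisioning gate. Only the call signatures of
# can_provision() and get_resources_lock_path() come from this diff; the
# bodies below are placeholders.
import multiprocessing

import filelock


def get_resources_lock_path() -> str:
    # Placeholder: a well-known lock file shared by controller processes
    # on the same machine.
    return '/tmp/skypilot_controller_resources.lock'


def can_provision() -> bool:
    # Placeholder: would compare current provisioning load to capacity.
    return True


def maybe_start_launch(process: multiprocessing.Process) -> bool:
    # Holding the lock makes the capacity check and process.start() atomic
    # with respect to other processes using the same lock file.
    with filelock.FileLock(get_resources_lock_path()):
        if can_provision():
            process.start()
            return True
    return False
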
sky/serve/serve_state.py CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
     return records
 
 
+@init_db
+def get_num_services() -> int:
+    """Get the number of services."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        return session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(services_table)).fetchone()[0]
+
+
 @init_db
 def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
     """Get all existing service records."""
sky/serve/serve_utils.py CHANGED
@@ -57,14 +57,6 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
-
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
 # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
@@ -524,6 +516,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
 
 
 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'
 
 
@@ -796,9 +790,13 @@ def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
-def num_replicas(service_name: str) -> int:
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
     logger.info(f'Get number of replicas for pool {service_name!r}')
-    return len(serve_state.get_replica_infos(service_name))
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
 
 
 def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
@@ -823,12 +821,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
         logger.error(f'Service {service_name!r} is not a cluster pool.')
         return None
     with filelock.FileLock(get_service_filelock_path(service_name)):
-
         logger.debug(f'Get next cluster name for pool {service_name!r}')
-        ready_replicas = [
-            info for info in serve_state.get_replica_infos(service_name)
-            if info.status == serve_state.ReplicaStatus.READY
-        ]
+        ready_replicas = get_ready_replicas(service_name)
         idle_replicas: List['replica_managers.ReplicaInfo'] = []
         for replica_info in ready_replicas:
             jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
@@ -1044,11 +1038,18 @@ def wait_service_registration(service_name: str, job_id: int,
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-        elif len(serve_state.get_services()) >= get_num_service_threshold():
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Max number of services reached. '
-                                   'To spin up more services, please '
-                                   'tear down some existing services.')
+        else:
+            controller_log_path = os.path.expanduser(
+                generate_remote_controller_log_file_name(service_name))
+            if os.path.exists(controller_log_path):
+                with open(controller_log_path, 'r', encoding='utf-8') as f:
+                    log_content = f.read()
+                if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                        in log_content):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError('Max number of services reached. '
+                                           'To spin up more services, please '
+                                           'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
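
The last hunk changes how wait_service_registration reports the capacity error: rather than recomputing a memory-based threshold, it scans the controller log for the MAX_NUMBER_OF_SERVICES_REACHED_ERROR sentinel that the controller now writes (see sky/serve/service.py below). A minimal sketch of that poll-for-sentinel pattern; the helper name, path handling, and timeout are placeholders, not taken from the diff:

# Sketch of polling a log file for a sentinel error string. The helper
# name and timeout value are placeholders for illustration.
import os
import time

SENTINEL = 'Max number of services reached.'


def wait_or_raise(log_path: str, timeout_seconds: float = 60.0) -> None:
    start = time.time()
    while time.time() - start < timeout_seconds:
        if os.path.exists(log_path):
            with open(log_path, 'r', encoding='utf-8') as f:
                if SENTINEL in f.read():
                    # The controller refused the service; surface the error
                    # to the caller instead of waiting out the timeout.
                    raise RuntimeError(SENTINEL)
        time.sleep(1)
    raise TimeoutError(f'No registration recorded in {log_path!r}')
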
sky/serve/server/impl.py CHANGED
@@ -11,7 +11,6 @@ import uuid
 import colorama
 import filelock
 
-import sky
 from sky import backends
 from sky import exceptions
 from sky import execution
@@ -25,6 +24,7 @@ from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants
+from sky.skylet import job_lib
 from sky.utils import admin_policy_utils
 from sky.utils import command_runner
 from sky.utils import common
@@ -39,7 +39,7 @@ logger = sky_logging.init_logger(__name__)
 
 
 def _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name: str, task: 'sky.Task') -> Dict[str, Any]:
+        service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
     """Rewrite the paths of TLS credentials in the task.
 
     Args:
@@ -103,15 +103,11 @@ def _get_service_record(
 
 
 def up(
-    task: 'sky.Task',
+    task: 'task_lib.Task',
     service_name: Optional[str] = None,
     pool: bool = False,
 ) -> Tuple[str, str]:
     """Spins up a service or a pool."""
-    if pool and not serve_utils.is_consolidation_mode(pool):
-        raise ValueError(
-            'Pool is only supported in consolidation mode. To fix, set '
-            '`jobs.controller.consolidation_mode: true` in SkyPilot config.')
     task.validate()
     serve_utils.validate_service_task(task, pool=pool)
     assert task.service is not None
@@ -191,8 +187,7 @@ def up(
     controller_log_file = (
         serve_utils.generate_remote_controller_log_file_name(service_name))
     controller_resources = controller_utils.get_controller_resources(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        task_resources=task.resources)
+        controller=controller, task_resources=task.resources)
     controller_job_id = None
     if serve_utils.is_consolidation_mode(pool):
         # We need a unique integer per sky.serve.up call to avoid name
@@ -228,10 +223,11 @@ def up(
     # balancer port from the controller? So we don't need to open so many
     # ports here. Or, we should have a nginx traffic control to refuse
     # any connection to the unregistered ports.
-    controller_resources = {
-        r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
-        for r in controller_resources
-    }
+    if not pool:
+        controller_resources = {
+            r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
+            for r in controller_resources
+        }
     controller_task.set_resources(controller_resources)
 
     # # Set service_name so the backend will know to modify default ray
@@ -325,7 +321,7 @@ def up(
                 [controller_job_id],
                 stream_logs=False)
             controller_job_status = list(statuses.values())[0]
-            if controller_job_status == sky.JobStatus.PENDING:
+            if controller_job_status == job_lib.JobStatus.PENDING:
                 # Max number of services reached due to vCPU constraint.
                 # The controller job is pending due to ray job scheduling.
                 # We manually cancel the job here.
@@ -350,7 +346,7 @@ def up(
         else:
            lb_port = serve_utils.load_service_initialization_result(
                 lb_port_payload)
-        if not serve_utils.is_consolidation_mode(pool):
+        if not serve_utils.is_consolidation_mode(pool) and not pool:
             socket_endpoint = backend_utils.get_endpoints(
                 controller_handle.cluster_name,
                 lb_port,
@@ -374,10 +370,10 @@ def up(
             f'\n📋 Useful Commands'
             f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
             f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
-            f'<run-command>{ux_utils.RESET_BOLD}'
+            f'<yaml_file>{ux_utils.RESET_BOLD}'
             f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
             f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
-            f'--num-jobs 10 <run-command>{ux_utils.RESET_BOLD}'
+            f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
             f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
             f'{ux_utils.BOLD}sky jobs pool status {service_name}'
             f'{ux_utils.RESET_BOLD}'
@@ -421,7 +417,7 @@ def up(
 
 
 def update(
-    task: 'sky.Task',
+    task: 'task_lib.Task',
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
@@ -576,7 +572,7 @@
 
 
 def apply(
-    task: 'sky.Task',
+    task: 'task_lib.Task',
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
sky/serve/service.py CHANGED
@@ -15,11 +15,13 @@ import filelock
 
 from sky import authentication
 from sky import exceptions
+from sky import global_user_state
 from sky import sky_logging
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
+from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants
 from sky.serve import controller
 from sky.serve import load_balancer
@@ -28,6 +30,7 @@ from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants as skylet_constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
@@ -120,7 +123,16 @@ def _cleanup(service_name: str) -> bool:
     replica_infos = serve_state.get_replica_infos(service_name)
     info2proc: Dict[replica_managers.ReplicaInfo,
                     multiprocessing.Process] = dict()
+    # NOTE(dev): This relies on `sky/serve/serve_utils.py::
+    # generate_replica_cluster_name`. Change it if you change the function.
+    existing_cluster_names = global_user_state.get_cluster_names_start_with(
+        service_name)
     for info in replica_infos:
+        if info.cluster_name not in existing_cluster_names:
+            logger.info(f'Cluster {info.cluster_name} for replica '
+                        f'{info.replica_id} not found. Might be a failed '
+                        'cluster. Skipping.')
+            continue
         p = multiprocessing.Process(target=replica_managers.terminate_cluster,
                                     args=(info.cluster_name,))
         p.start()
@@ -214,22 +226,25 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
                                                      service_name, version)
 
     if not is_recovery:
-        if (len(serve_state.get_services()) >=
-                serve_utils.get_num_service_threshold()):
-            cleanup_storage(tmp_task_yaml)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Max number of services reached.')
-        success = serve_state.add_service(
-            service_name,
-            controller_job_id=job_id,
-            policy=service_spec.autoscaling_policy_str(),
-            requested_resources_str=backend_utils.get_task_resources_str(task),
-            load_balancing_policy=service_spec.load_balancing_policy,
-            status=serve_state.ServiceStatus.CONTROLLER_INIT,
-            tls_encrypted=service_spec.tls_credential is not None,
-            pool=service_spec.pool,
-            controller_pid=os.getpid(),
-            entrypoint=entrypoint)
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
+            if not controller_utils.can_start_new_process():
+                cleanup_storage(tmp_task_yaml)
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
+            success = serve_state.add_service(
+                service_name,
+                controller_job_id=job_id,
+                policy=service_spec.autoscaling_policy_str(),
+                requested_resources_str=backend_utils.get_task_resources_str(
+                    task),
+                load_balancing_policy=service_spec.load_balancing_policy,
+                status=serve_state.ServiceStatus.CONTROLLER_INIT,
+                tls_encrypted=service_spec.tls_credential is not None,
+                pool=service_spec.pool,
+                controller_pid=os.getpid(),
+                entrypoint=entrypoint)
+        jobs_scheduler.maybe_schedule_next_jobs()
     # Directly throw an error here. See sky/serve/api.py::up
     # for more details.
     if not success:
sky/server/server.py CHANGED
@@ -17,7 +17,7 @@ import resource
 import shutil
 import sys
 import threading
-from typing import Any, Dict, List, Literal, Optional, Set, Tuple
+from typing import Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
 
@@ -42,6 +42,7 @@ from sky.data import storage_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
@@ -1531,8 +1532,12 @@ async def api_status(
     return encoded_request_tasks
 
 
-@app.get('/api/health')
-async def health(request: fastapi.Request) -> Dict[str, Any]:
+@app.get(
+    '/api/health',
+    # response_model_exclude_unset omits unset fields
+    # in the response JSON.
+    response_model_exclude_unset=True)
+async def health(request: fastapi.Request) -> responses.APIHealthResponse:
     """Checks the health of the API server.
 
     Returns:
@@ -1570,7 +1575,8 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
         # - There is no harm when an malicious client calls /api/health
         #   without authentication since no sensitive information is
         #   returned.
-        return {'status': common.ApiServerStatus.HEALTHY}
+        return responses.APIHealthResponse(
+            status=common.ApiServerStatus.HEALTHY,)
     # TODO(aylei): remove this after min_compatible_api_version >= 14.
     if client_version < 14:
         # For Client with API version < 14, the NEEDS_AUTH status is not
@@ -1579,19 +1585,19 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
                 detail='Authentication required')
 
     logger.debug(f'Health endpoint: request.state.auth_user = {user}')
-    return {
-        'status': server_status,
+    return responses.APIHealthResponse(
+        status=server_status,
         # Kept for backward compatibility, clients before 0.11.0 will read this
         # field to check compatibility and hint the user to upgrade the CLI.
         # TODO(aylei): remove this field after 0.13.0
-        'api_version': str(server_constants.API_VERSION),
-        'version': sky.__version__,
-        'version_on_disk': common.get_skypilot_version_on_disk(),
-        'commit': sky.__commit__,
-        'user': user.to_dict() if user is not None else None,
-        'basic_auth_enabled': os.environ.get(
-            constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false').lower() == 'true',
-    }
+        api_version=str(server_constants.API_VERSION),
+        version=sky.__version__,
+        version_on_disk=common.get_skypilot_version_on_disk(),
+        commit=sky.__commit__,
+        basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
+                                          'false').lower() == 'true',
+        user=user if user is not None else None,
+    )
 
 
 @app.websocket('/kubernetes-pod-ssh-proxy')
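
The health endpoint now returns a typed responses.APIHealthResponse instead of a plain dict; the model lives in the new sky/schemas/api/responses.py (+70 lines), which this diff does not show. A plausible sketch, with field names inferred from the keyword arguments above and all types and defaults assumed:

# Hypothetical sketch of APIHealthResponse from sky/schemas/api/responses.py.
# Field names are inferred from the keyword arguments in the hunks above;
# types and defaults are assumptions. With response_model_exclude_unset=True,
# fields never set (e.g. on the early HEALTHY return) are omitted from the
# response JSON.
from typing import Any, Optional

import pydantic


class APIHealthResponse(pydantic.BaseModel):
    status: str
    api_version: Optional[str] = None
    version: Optional[str] = None
    version_on_disk: Optional[str] = None
    commit: Optional[str] = None
    user: Optional[Any] = None  # Likely a user model; exact type unknown.
    basic_auth_enabled: Optional[bool] = None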