skypilot-nightly 1.0.0.dev20250908__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (70)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +35 -1
  4. sky/backends/cloud_vm_ray_backend.py +2 -2
  5. sky/client/sdk.py +20 -0
  6. sky/client/sdk_async.py +18 -16
  7. sky/clouds/aws.py +3 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-47c64cc05717f8a3.js → webpack-1d7e11230da3ca89.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +5 -1
  26. sky/execution.py +21 -14
  27. sky/jobs/constants.py +3 -0
  28. sky/jobs/controller.py +732 -310
  29. sky/jobs/recovery_strategy.py +251 -129
  30. sky/jobs/scheduler.py +247 -174
  31. sky/jobs/server/core.py +20 -4
  32. sky/jobs/server/utils.py +2 -2
  33. sky/jobs/state.py +702 -511
  34. sky/jobs/utils.py +94 -39
  35. sky/provision/aws/config.py +4 -1
  36. sky/provision/gcp/config.py +6 -1
  37. sky/provision/kubernetes/utils.py +17 -8
  38. sky/provision/provisioner.py +1 -0
  39. sky/serve/replica_managers.py +0 -7
  40. sky/serve/serve_utils.py +5 -0
  41. sky/serve/server/impl.py +1 -2
  42. sky/serve/service.py +0 -2
  43. sky/server/common.py +8 -3
  44. sky/server/config.py +43 -24
  45. sky/server/constants.py +1 -0
  46. sky/server/daemons.py +7 -11
  47. sky/server/metrics.py +60 -9
  48. sky/server/requests/executor.py +33 -32
  49. sky/server/requests/serializers/encoders.py +1 -1
  50. sky/server/server.py +57 -4
  51. sky/server/uvicorn.py +4 -0
  52. sky/setup_files/dependencies.py +4 -2
  53. sky/skylet/attempt_skylet.py +1 -0
  54. sky/skylet/constants.py +3 -1
  55. sky/skylet/events.py +2 -10
  56. sky/utils/command_runner.pyi +3 -3
  57. sky/utils/common_utils.py +19 -7
  58. sky/utils/controller_utils.py +5 -0
  59. sky/utils/db/db_utils.py +31 -2
  60. sky/utils/rich_utils.py +3 -1
  61. sky/utils/subprocess_utils.py +9 -0
  62. sky/volumes/volume.py +2 -0
  63. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +37 -35
  64. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +70 -70
  65. /sky/dashboard/out/_next/static/{qikl6zGuwe8BKavteei3r → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
  66. /sky/dashboard/out/_next/static/{qikl6zGuwe8BKavteei3r → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
  67. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
  68. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
  69. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
  70. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/server/daemons.py CHANGED
@@ -11,6 +11,7 @@ from sky.utils import annotations
  from sky.utils import common
  from sky.utils import common_utils
  from sky.utils import env_options
+ from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils

@@ -74,6 +75,10 @@ class InternalRequestDaemon:
  # using too much memory.
  annotations.clear_request_level_cache()
  timeline.save_timeline()
+ # Kill all children processes related to this request.
+ # Each executor handles a single request, so we can safely
+ # kill all children processes related to this request.
+ subprocess_utils.kill_children_processes()
  common_utils.release_memory()
  except Exception: # pylint: disable=broad-except
  # It is OK to fail to run the event, as the event is not
@@ -123,21 +128,16 @@ def managed_job_status_refresh_event():
  """Refresh the managed job status for controller consolidation mode."""
  # pylint: disable=import-outside-toplevel
  from sky.jobs import utils as managed_job_utils
- from sky.utils import controller_utils

  # We run the recovery logic before starting the event loop as those two are
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
- if controller_utils.high_availability_specified(
- controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
- managed_job_utils.ha_recovery_for_consolidation_mode()
+ managed_job_utils.ha_recovery_for_consolidation_mode()

  # After recovery, we start the event loop.
  from sky.skylet import events
  refresh_event = events.ManagedJobEvent()
- scheduling_event = events.ManagedJobSchedulingEvent()
  logger.info('=== Running managed job event ===')
  refresh_event.run()
- scheduling_event.run()
  time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)

@@ -152,14 +152,10 @@ def _serve_status_refresh_event(pool: bool):
  """Refresh the sky serve status for controller consolidation mode."""
  # pylint: disable=import-outside-toplevel
  from sky.serve import serve_utils
- from sky.utils import controller_utils

  # We run the recovery logic before starting the event loop as those two are
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
- controller = controller_utils.get_controller_for_pool(pool)
- if controller_utils.high_availability_specified(
- controller.value.cluster_name):
- serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
+ serve_utils.ha_recovery_for_consolidation_mode(pool=pool)

  # After recovery, we start the event loop.
  from sky.skylet import events
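
The new cleanup step in InternalRequestDaemon reaps every process a request spawned once the daemon event finishes. The signature of subprocess_utils.kill_children_processes is not shown in this diff, so the following is only a minimal psutil-based sketch of the same idea, with illustrative names:

    import psutil

    def kill_children_sketch(grace_period: float = 5.0) -> None:
        # Terminate every descendant of the current process, then force-kill
        # any that are still alive after the grace period.
        children = psutil.Process().children(recursive=True)
        for child in children:
            try:
                child.terminate()
            except psutil.NoSuchProcess:
                pass
        _, alive = psutil.wait_procs(children, timeout=grace_period)
        for child in alive:
            try:
                child.kill()
            except psutil.NoSuchProcess:
                pass
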
sky/server/metrics.py CHANGED
@@ -2,6 +2,7 @@

  import contextlib
  import functools
+ import multiprocessing
  import os
  import time

@@ -9,6 +10,7 @@ import fastapi
  from prometheus_client import generate_latest
  from prometheus_client import multiprocess
  import prometheus_client as prom
+ import psutil
  import starlette.middleware.base
  import uvicorn

@@ -38,15 +40,6 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
  60.0, 120.0, float('inf')),
  )

- # Time spent processing requests in executor.
- SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
- 'sky_apiserver_request_execution_duration_seconds',
- 'Time spent executing requests in executor',
- ['request', 'worker'],
- buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
- float('inf')),
- )
-
  # Time spent processing a piece of code, refer to time_it().
  SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
  'sky_apiserver_code_duration_seconds',
@@ -64,6 +57,41 @@ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
  60.0, float('inf')),
  )

+ SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+ 'sky_apiserver_websocket_connections',
+ 'Number of websocket connections',
+ ['pid'],
+ multiprocess_mode='livesum',
+ )
+
+ SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+ 'sky_apiserver_websocket_closed_total',
+ 'Number of websocket closed',
+ ['pid', 'reason'],
+ )
+
+ # The number of execution starts in each worker process, we do not record
+ # histogram here as the duration has been measured in
+ # SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+ # Recording histogram WITH worker label will cause high cardinality.
+ SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+ 'sky_apiserver_process_execution_start_total',
+ 'Total number of execution starts in each worker process',
+ ['request', 'pid'],
+ )
+
+ SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+ 'sky_apiserver_process_peak_rss',
+ 'Peak RSS we saw in each process in last 30 seconds',
+ ['pid', 'type'],
+ )
+
+ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+ 'sky_apiserver_process_cpu_total',
+ 'Total CPU times a worker process has been running',
+ ['pid', 'type', 'mode'],
+ )
+
  metrics_app = fastapi.FastAPI()

@@ -178,3 +206,26 @@ def time_me_async(func):
  return await func(*args, **kwargs)

  return async_wrapper
+
+
+ def process_monitor(process_type: str):
+ pid = multiprocessing.current_process().pid
+ proc = psutil.Process(pid)
+ peak_rss = 0
+ last_bucket_end = time.time()
+ while True:
+ if time.time() - last_bucket_end >= 30:
+ # Reset peak RSS every 30 seconds.
+ last_bucket_end = time.time()
+ peak_rss = 0
+ peak_rss = max(peak_rss, proc.memory_info().rss)
+ SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
+ type=process_type).set(peak_rss)
+ ctimes = proc.cpu_times()
+ SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+ type=process_type,
+ mode='user').set(ctimes.user)
+ SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+ type=process_type,
+ mode='system').set(ctimes.system)
+ time.sleep(1)
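
process_monitor is meant to run for the lifetime of a process: it samples RSS and cumulative CPU times once per second and resets the peak-RSS window every 30 seconds. The executor and uvicorn changes below start it in a daemon thread, along these lines (the 'worker:short' label is only an example):

    import threading

    from sky.server import metrics as metrics_lib

    # Daemon thread, so the sampler never blocks interpreter shutdown.
    threading.Thread(target=metrics_lib.process_monitor,
                     args=('worker:short',),
                     daemon=True).start()

Each worker exports its own pid-labeled series; on the websocket gauge, multiprocess_mode='livesum' tells prometheus_client's multiprocess collector to sum the values of currently-live processes at scrape time.
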
sky/server/requests/executor.py CHANGED
@@ -130,6 +130,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
  def executor_initializer(proc_group: str):
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
  f'{multiprocessing.current_process().pid}')
+ threading.Thread(target=metrics_lib.process_monitor,
+ args=(f'worker:{proc_group}',),
+ daemon=True).start()


  class RequestWorker:
@@ -281,34 +284,34 @@ def override_request_env_and_config(
  request_id: str) -> Generator[None, None, None]:
  """Override the environment and SkyPilot config for a request."""
  original_env = os.environ.copy()
- # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
- # affecting client requests. If set on the client side, it will be
- # overridden by the request body.
- os.environ.pop('SKYPILOT_DEBUG', None)
- # Remove the db connection uri from client supplied env vars, as the
- # client should not set the db string on server side.
- request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
- os.environ.update(request_body.env_vars)
- # Note: may be overridden by AuthProxyMiddleware.
- # TODO(zhwu): we need to make the entire request a context available to the
- # entire request execution, so that we can access info like user through
- # the execution.
- user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
- name=request_body.env_vars[constants.USER_ENV_VAR])
- global_user_state.add_or_update_user(user)
- # Refetch the user to get the latest user info, including the created_at
- # field.
- user = global_user_state.get_user(user.id)
-
- # Force color to be enabled.
- os.environ['CLICOLOR_FORCE'] = '1'
- server_common.reload_for_new_request(
- client_entrypoint=request_body.entrypoint,
- client_command=request_body.entrypoint_command,
- using_remote_api_server=request_body.using_remote_api_server,
- user=user,
- request_id=request_id)
  try:
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
+ # server affecting client requests. If set on the client side, it will
+ # be overridden by the request body.
+ os.environ.pop('SKYPILOT_DEBUG', None)
+ # Remove the db connection uri from client supplied env vars, as the
+ # client should not set the db string on server side.
+ request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
+ os.environ.update(request_body.env_vars)
+ # Note: may be overridden by AuthProxyMiddleware.
+ # TODO(zhwu): we need to make the entire request a context available to
+ # the entire request execution, so that we can access info like user
+ # through the execution.
+ user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
+ name=request_body.env_vars[constants.USER_ENV_VAR])
+ global_user_state.add_or_update_user(user)
+ # Refetch the user to get the latest user info, including the created_at
+ # field.
+ user = global_user_state.get_user(user.id)
+
+ # Force color to be enabled.
+ os.environ['CLICOLOR_FORCE'] = '1'
+ server_common.reload_for_new_request(
+ client_entrypoint=request_body.entrypoint,
+ client_command=request_body.entrypoint_command,
+ using_remote_api_server=request_body.using_remote_api_server,
+ user=user,
+ request_id=request_id)
  logger.debug(
  f'override path: {request_body.override_skypilot_config_path}')
  with skypilot_config.override_skypilot_config(
@@ -401,6 +404,8 @@ def _request_execution_wrapper(request_id: str,
  config = skypilot_config.to_dict()
  logger.debug(f'request config: \n'
  f'{yaml_utils.dump_yaml_str(dict(config))}')
+ metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
+ request=request_name, pid=pid).inc()
  with metrics_lib.time_it(name=request_name,
  group='request_execution'):
  return_value = func(**request_body.to_kwargs())
@@ -439,11 +444,7 @@ def _request_execution_wrapper(request_id: str,
  logger.info(f'Request {request_id} finished')
  finally:
  with metrics_lib.time_it(name='release_memory', group='internal'):
- try:
- common_utils.release_memory()
- except Exception as e: # pylint: disable=broad-except
- logger.error(f'Failed to release memory: '
- f'{common_utils.format_exception(e)}')
+ common_utils.release_memory()


  async def execute_request_coroutine(request: api_requests.Request):
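
The reshuffle in override_request_env_and_config moves all environment and config mutation inside the try block; the likely intent is that whatever restore logic follows in the function's finally clause (outside this hunk) also runs when applying the overrides fails partway. A minimal sketch of that pattern, not the actual SkyPilot implementation:

    import contextlib
    import os
    from typing import Dict, Iterator

    @contextlib.contextmanager
    def override_env_sketch(overrides: Dict[str, str]) -> Iterator[None]:
        # Snapshot before mutating; every mutation happens inside try so the
        # snapshot is restored even if an override raises halfway through.
        original_env = os.environ.copy()
        try:
            os.environ.update(overrides)
            yield
        finally:
            os.environ.clear()
            os.environ.update(original_env)
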
sky/server/requests/serializers/encoders.py CHANGED
@@ -131,7 +131,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
  def encode_jobs_queue_v2(
  jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
  # Support returning either a plain jobs list or a (jobs, total) tuple
- status_counts = {}
+ status_counts: Dict[str, int] = {}
  if isinstance(jobs_or_tuple, tuple):
  if len(jobs_or_tuple) == 2:
  jobs, total = jobs_or_tuple
sky/server/server.py CHANGED
@@ -625,6 +625,9 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
  app.include_router(ssh_node_pools_rest.router,
  prefix='/ssh_node_pools',
  tags=['ssh_node_pools'])
+ # increase the resource limit for the server
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+ resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

  # Increase the limit of files we can open to our hard limit. This fixes bugs
  # where we can not aquire file locks or open enough logs and the API server
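
For reference, an unprivileged process can raise its soft RLIMIT_NOFILE up to, but not beyond, the hard limit, so (hard, hard) is the largest setting that succeeds without extra privileges. A standalone illustration of the same calls:

    import resource

    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    print(f'before: soft={soft} hard={hard}')
    # Raise the soft limit to the hard limit; raising the hard limit itself
    # would need elevated privileges (e.g. CAP_SYS_RESOURCE on Linux).
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
    print('after:', resource.getrlimit(resource.RLIMIT_NOFILE))
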
@@ -1734,7 +1737,12 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  return

  logger.info(f'Starting port-forward to local port: {local_port}')
+ conn_gauge = metrics.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
+ pid=os.getpid())
+ ssh_failed = False
+ websocket_closed = False
  try:
+ conn_gauge.inc()
  # Connect to the local port
  reader, writer = await asyncio.open_connection('127.0.0.1', local_port)

@@ -1742,9 +1750,21 @@
  try:
  async for message in websocket.iter_bytes():
  writer.write(message)
- await writer.drain()
+ try:
+ await writer.drain()
+ except Exception as e: # pylint: disable=broad-except
+ # Typically we will not reach here, if the ssh to pod
+ # is disconnected, ssh_to_websocket will exit first.
+ # But just in case.
+ logger.error('Failed to write to pod through '
+ f'port-forward connection: {e}')
+ nonlocal ssh_failed
+ ssh_failed = True
+ break
  except fastapi.WebSocketDisconnect:
  pass
+ nonlocal websocket_closed
+ websocket_closed = True
  writer.close()

  async def ssh_to_websocket():
@@ -1752,15 +1772,44 @@
  while True:
  data = await reader.read(1024)
  if not data:
+ if not websocket_closed:
+ logger.warning('SSH connection to pod is '
+ 'disconnected before websocket '
+ 'connection is closed')
+ nonlocal ssh_failed
+ ssh_failed = True
  break
  await websocket.send_bytes(data)
  except Exception: # pylint: disable=broad-except
  pass
- await websocket.close()
+ try:
+ await websocket.close()
+ except Exception: # pylint: disable=broad-except
+ # The websocket might has been closed by the client.
+ pass

  await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
  finally:
- proc.terminate()
+ conn_gauge.dec()
+ reason = ''
+ try:
+ logger.info('Terminating kubectl port-forward process')
+ proc.terminate()
+ except ProcessLookupError:
+ stdout = await proc.stdout.read()
+ logger.error('kubectl port-forward was terminated before the '
+ 'ssh websocket connection was closed. Remaining '
+ f'output: {str(stdout)}')
+ reason = 'KubectlPortForwardExit'
+ metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+ pid=os.getpid(), reason='KubectlPortForwardExit').inc()
+ else:
+ if ssh_failed:
+ reason = 'SSHToPodDisconnected'
+ else:
+ reason = 'ClientClosed'
+ metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+ pid=os.getpid(), reason=reason).inc()


  @app.get('/all_contexts')
@@ -1780,7 +1829,7 @@ async def all_contexts(request: fastapi.Request) -> None:
  async def gpu_metrics() -> fastapi.Response:
  """Gets the GPU metrics from multiple external k8s clusters"""
  contexts = core.get_all_contexts()
- all_metrics = []
+ all_metrics: List[str] = []
  successful_contexts = 0

  tasks = [
@@ -1795,6 +1844,10 @@
  if isinstance(result, Exception):
  logger.error(
  f'Failed to get metrics for context {contexts[i]}: {result}')
+ elif isinstance(result, BaseException):
+ # Avoid changing behavior for non-Exception BaseExceptions
+ # like KeyboardInterrupt/SystemExit: re-raise them.
+ raise result
  else:
  metrics_text = result
  all_metrics.append(metrics_text)
sky/server/uvicorn.py CHANGED
@@ -19,6 +19,7 @@ from uvicorn.supervisors import multiprocess

  from sky import sky_logging
  from sky.server import daemons
+ from sky.server import metrics as metrics_lib
  from sky.server import state
  from sky.server.requests import requests as requests_lib
  from sky.skylet import constants
@@ -212,6 +213,9 @@ class Server(uvicorn.Server):
  # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
  event_loop.set_debug(True)
  event_loop.slow_callback_duration = lag_threshold
+ threading.Thread(target=metrics_lib.process_monitor,
+ args=('server',),
+ daemon=True).start()
  with self.capture_signals():
  asyncio.run(self.serve(*args, **kwargs))

sky/setup_files/dependencies.py CHANGED
@@ -63,6 +63,8 @@ install_requires = [
  'setproctitle',
  'sqlalchemy',
  'psycopg2-binary',
+ 'aiosqlite',
+ 'asyncpg',
  # TODO(hailong): These three dependencies should be removed after we make
  # the client-side actually not importing them.
  'casbin',
@@ -108,9 +110,9 @@ server_dependencies = [
  local_ray = [
  # Lower version of ray will cause dependency conflict for
  # click/grpcio/protobuf.
- # Excluded 2.6.0 as it has a bug in the cluster launcher:
+ # Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
  # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
- 'ray[default] >= 2.2.0, != 2.6.0',
+ 'ray[default] >= 2.6.1',
  ]

  remote = [
sky/skylet/attempt_skylet.py CHANGED
@@ -12,6 +12,7 @@ def restart_skylet():
  # Kills old skylet if it is running.
  # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
  # skylet to exit, instead of directly killing it.
+
  subprocess.run(
  # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
  # because need to handle the backward compatibility of the old skylet
sky/skylet/constants.py CHANGED
@@ -91,7 +91,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
  # cluster yaml is updated.
  #
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
- SKYLET_VERSION = '17'
+ SKYLET_VERSION = '18'
  # The version of the lib files that skylet/jobs use. Whenever there is an API
  # change for the job_lib or log_lib, we need to bump this version, so that the
  # user can be notified to update their SkyPilot version on the remote cluster.
@@ -374,6 +374,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
  ('ssh', 'pod_config'),
  ('kubernetes', 'custom_metadata'),
  ('kubernetes', 'pod_config'),
+ ('kubernetes', 'context_configs'),
  ('kubernetes', 'provision_timeout'),
  ('kubernetes', 'dws'),
  ('kubernetes', 'kueue'),
@@ -421,6 +422,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
  # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
  # Environment variable that is set to 'true' if this is a skypilot server.
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'

  # Environment variable that is set to 'true' if metrics are enabled.
  ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
sky/skylet/events.py CHANGED
@@ -11,7 +11,7 @@ import psutil
  from sky import clouds
  from sky import sky_logging
  from sky.backends import cloud_vm_ray_backend
- from sky.jobs import scheduler as managed_job_scheduler
+ from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
  from sky.jobs import utils as managed_job_utils
  from sky.serve import serve_utils
@@ -76,15 +76,7 @@ class ManagedJobEvent(SkyletEvent):
  def _run(self):
  logger.info('=== Updating managed job status ===')
  managed_job_utils.update_managed_jobs_statuses()
-
-
- class ManagedJobSchedulingEvent(SkyletEvent):
- """Skylet event for scheduling managed jobs."""
- EVENT_INTERVAL_SECONDS = 20
-
- def _run(self):
- logger.info('=== Scheduling next jobs ===')
- managed_job_scheduler.maybe_schedule_next_jobs()
+ scheduler.maybe_start_controllers()


  class ServiceUpdateEvent(SkyletEvent):
sky/utils/command_runner.pyi CHANGED
@@ -36,9 +36,9 @@ def ssh_options_list(


  class SshMode(enum.Enum):
- NON_INTERACTIVE: int
- INTERACTIVE: int
- LOGIN: int
+ NON_INTERACTIVE = ...
+ INTERACTIVE = ...
+ LOGIN = ...


  class CommandRunner:
sky/utils/common_utils.py CHANGED
@@ -996,7 +996,17 @@ def get_mem_size_gb() -> float:
  except ValueError as e:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(
- f'Failed to parse the memory size from {mem_size}') from e
+ f'Failed to parse the memory size from {mem_size} (GB)'
+ ) from e
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
+ if mem_size is not None:
+ try:
+ return float(mem_size) / (1024**3)
+ except ValueError as e:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Failed to parse the memory size from {mem_size} (bytes)'
+ ) from e
  return _mem_size_gb()


@@ -1098,13 +1108,15 @@ def release_memory():
  """Release the process memory"""
  # Do the best effort to release the python heap and let malloc_trim
  # be more efficient.
- gc.collect()
- if sys.platform.startswith('linux'):
- try:
+ try:
+ gc.collect()
+ if sys.platform.startswith('linux'):
  # Will fail on musl (alpine), but at least it works on our
  # offical docker images.
  libc = ctypes.CDLL('libc.so.6')
  return libc.malloc_trim(0)
- except (AttributeError, OSError):
- return 0
- return 0
+ return 0
+ except Exception as e: # pylint: disable=broad-except
+ logger.error(f'Failed to release memory: '
+ f'{format_exception(e)}')
+ return 0
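
The new fallback in get_mem_size_gb reads a byte-denominated limit (the first environment variable appears to be in GB, per the updated error messages) and converts it with a plain division. A quick check of the arithmetic with an assumed 8 GiB pod limit:

    import os

    # Hypothetical value: an 8 GiB limit expressed in bytes, as a container
    # runtime might inject it.
    os.environ['SKYPILOT_POD_MEMORY_BYTES_LIMIT'] = str(8 * 1024**3)
    bytes_limit = float(os.environ['SKYPILOT_POD_MEMORY_BYTES_LIMIT'])
    print(bytes_limit / (1024**3))  # 8.0
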
sky/utils/controller_utils.py CHANGED
@@ -228,6 +228,11 @@ def get_controller_for_pool(pool: bool) -> Controllers:
  def high_availability_specified(cluster_name: Optional[str]) -> bool:
  """Check if the controller high availability is specified in user config.
  """
+ # pylint: disable=import-outside-toplevel
+ from sky.jobs import utils as managed_job_utils
+ if managed_job_utils.is_consolidation_mode():
+ return True
+
  controller = Controllers.from_name(cluster_name)
  if controller is None:
  return False
sky/utils/db/db_utils.py CHANGED
@@ -7,12 +7,13 @@ import pathlib
  import sqlite3
  import threading
  import typing
- from typing import Any, Callable, Dict, Iterable, Optional
+ from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union

  import aiosqlite
  import aiosqlite.context
  import sqlalchemy
  from sqlalchemy import exc as sqlalchemy_exc
+ from sqlalchemy.ext import asyncio as sqlalchemy_async

  from sky import sky_logging
  from sky.skylet import constants
@@ -375,11 +376,34 @@ def get_max_connections():
  return _max_connections


- def get_engine(db_name: str):
+ @typing.overload
+ def get_engine(
+ db_name: str,
+ async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
+ ...
+
+
+ @typing.overload
+ def get_engine(db_name: str,
+ async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
+ ...
+
+
+ def get_engine(
+ db_name: str,
+ async_engine: bool = False
+ ) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
  conn_string = None
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
  conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
  if conn_string:
+ if async_engine:
+ conn_string = conn_string.replace('postgresql://',
+ 'postgresql+asyncpg://')
+ # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+ # so we should not put it in the cache. Instead, just return.
+ return sqlalchemy_async.create_async_engine(
+ conn_string, poolclass=sqlalchemy.NullPool)
  with _db_creation_lock:
  if conn_string not in _postgres_engine_cache:
  if _max_connections == 0:
@@ -401,6 +425,11 @@ def get_engine(db_name: str):
  else:
  db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
  pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
+ if async_engine:
+ # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+ # so we should not put it in the cache. Instead, just return.
+ return sqlalchemy_async.create_async_engine(
+ 'sqlite+aiosqlite:///' + db_path, connect_args={'timeout': 30})
  if db_path not in _sqlite_engine_cache:
  _sqlite_engine_cache[db_path] = sqlalchemy.create_engine(
  'sqlite:///' + db_path)
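
With async_engine=True, get_engine now returns a SQLAlchemy AsyncEngine backed by asyncpg (Postgres) or aiosqlite (SQLite) and, per the comments above, skips the engine cache, so the caller owns its lifecycle. A hedged usage sketch; the database name and table here are placeholders, not real SkyPilot schema:

    import asyncio

    import sqlalchemy

    from sky.utils.db import db_utils

    async def count_rows() -> None:
        engine = db_utils.get_engine('example_db', async_engine=True)
        try:
            async with engine.connect() as conn:
                result = await conn.execute(
                    sqlalchemy.text('SELECT COUNT(*) FROM example_table'))
                print(result.scalar())
        finally:
            # Async engines are not cached by get_engine, so dispose explicitly.
            await engine.dispose()

    asyncio.run(count_rows())
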
sky/utils/rich_utils.py CHANGED
@@ -421,7 +421,7 @@ async def decode_rich_status_async(
  undecoded_buffer = b''

  # Iterate over the response content in chunks
- async for chunk in response.content.iter_chunked(8192):
+ async for chunk, _ in response.content.iter_chunks():
  if chunk is None:
  return

@@ -481,6 +481,8 @@
  line = line[:-2] + '\n'
  is_payload, line = message_utils.decode_payload(
  line, raise_for_mismatch=False)
+ if line is None:
+ continue
  control = None
  if is_payload:
  control, encoded_status = Control.decode(line)
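
The switch from iter_chunked(8192) to iter_chunks() matters because aiohttp's iter_chunks() yields (data, end_of_http_chunk) tuples rather than bare bytes, which is why the loop now unpacks and discards the second element. A small standalone illustration:

    import asyncio

    import aiohttp

    async def stream(url: str) -> None:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Each item is (bytes, bool); the flag marks the end of an
                # HTTP chunk for chunked transfer encoding.
                async for data, end_of_chunk in response.content.iter_chunks():
                    print(len(data), end_of_chunk)

    asyncio.run(stream('https://example.com'))
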
sky/utils/subprocess_utils.py CHANGED
@@ -437,3 +437,12 @@ def slow_start_processes(processes: List[Startable],
  break
  batch_size = min(batch_size * 2, max_batch_size)
  time.sleep(delay)
+
+
+ def is_process_alive(pid: int) -> bool:
+ """Check if a process is alive."""
+ try:
+ process = psutil.Process(pid)
+ return process.is_running()
+ except psutil.NoSuchProcess:
+ return False
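
A usage sketch for the new helper. One caveat worth knowing: psutil reports a process as running as long as its PID still exists, including zombie processes, so callers that need stricter liveness may also want to inspect the process status.

    import os

    from sky.utils import subprocess_utils

    print(subprocess_utils.is_process_alive(os.getpid()))  # True: this process.
    # A PID that almost certainly does not exist; psutil raises NoSuchProcess
    # internally and the helper returns False.
    print(subprocess_utils.is_process_alive(99999999))
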
sky/volumes/volume.py CHANGED
@@ -150,6 +150,8 @@ class Volume:
  self.region, self.zone = cloud_obj.validate_region_zone(
  self.region, self.zone)

+ # Name must be set by factory before validation.
+ assert self.name is not None
  valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
  if not valid:
  raise ValueError(f'Invalid volume name: {err_msg}')