skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (97) hide show
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +160 -23
  4. sky/backends/cloud_vm_ray_backend.py +226 -74
  5. sky/catalog/__init__.py +7 -0
  6. sky/catalog/aws_catalog.py +4 -0
  7. sky/catalog/common.py +18 -0
  8. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  9. sky/client/cli/command.py +2 -71
  10. sky/client/sdk.py +20 -0
  11. sky/client/sdk_async.py +23 -18
  12. sky/clouds/aws.py +26 -6
  13. sky/clouds/cloud.py +8 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  17. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  18. sky/dashboard/out/clusters/[cluster].html +1 -1
  19. sky/dashboard/out/clusters.html +1 -1
  20. sky/dashboard/out/config.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra/[context].html +1 -1
  23. sky/dashboard/out/infra.html +1 -1
  24. sky/dashboard/out/jobs/[job].html +1 -1
  25. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/volumes.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/data/storage.py +5 -1
  33. sky/execution.py +21 -14
  34. sky/global_user_state.py +34 -0
  35. sky/jobs/client/sdk_async.py +4 -2
  36. sky/jobs/constants.py +3 -0
  37. sky/jobs/controller.py +734 -310
  38. sky/jobs/recovery_strategy.py +251 -129
  39. sky/jobs/scheduler.py +247 -174
  40. sky/jobs/server/core.py +20 -4
  41. sky/jobs/server/utils.py +2 -2
  42. sky/jobs/state.py +709 -508
  43. sky/jobs/utils.py +90 -40
  44. sky/logs/agent.py +10 -2
  45. sky/provision/aws/config.py +4 -1
  46. sky/provision/gcp/config.py +6 -1
  47. sky/provision/kubernetes/config.py +7 -2
  48. sky/provision/kubernetes/instance.py +84 -41
  49. sky/provision/kubernetes/utils.py +17 -8
  50. sky/provision/provisioner.py +1 -0
  51. sky/provision/vast/instance.py +1 -1
  52. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  53. sky/serve/replica_managers.py +0 -7
  54. sky/serve/serve_utils.py +5 -0
  55. sky/serve/server/impl.py +1 -2
  56. sky/serve/service.py +0 -2
  57. sky/server/common.py +8 -3
  58. sky/server/config.py +55 -27
  59. sky/server/constants.py +1 -0
  60. sky/server/daemons.py +7 -11
  61. sky/server/metrics.py +41 -8
  62. sky/server/requests/executor.py +41 -4
  63. sky/server/requests/serializers/encoders.py +1 -1
  64. sky/server/server.py +9 -1
  65. sky/server/uvicorn.py +11 -5
  66. sky/setup_files/dependencies.py +4 -2
  67. sky/skylet/attempt_skylet.py +1 -0
  68. sky/skylet/constants.py +14 -7
  69. sky/skylet/events.py +2 -10
  70. sky/skylet/log_lib.py +11 -0
  71. sky/skylet/log_lib.pyi +9 -0
  72. sky/task.py +62 -0
  73. sky/templates/kubernetes-ray.yml.j2 +120 -3
  74. sky/utils/accelerator_registry.py +3 -1
  75. sky/utils/command_runner.py +35 -11
  76. sky/utils/command_runner.pyi +25 -3
  77. sky/utils/common_utils.py +11 -1
  78. sky/utils/context_utils.py +15 -2
  79. sky/utils/controller_utils.py +5 -0
  80. sky/utils/db/db_utils.py +31 -2
  81. sky/utils/db/migration_utils.py +1 -1
  82. sky/utils/git.py +559 -1
  83. sky/utils/resource_checker.py +8 -7
  84. sky/utils/rich_utils.py +3 -1
  85. sky/utils/subprocess_utils.py +9 -0
  86. sky/volumes/volume.py +2 -0
  87. sky/workspaces/core.py +57 -21
  88. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
  89. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
  90. sky/client/cli/git.py +0 -549
  91. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  92. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  93. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  94. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  95. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
@@ -22,7 +22,6 @@ from sky import global_user_state
22
22
  from sky import sky_logging
23
23
  from sky import task as task_lib
24
24
  from sky.backends import backend_utils
25
- from sky.jobs import scheduler as jobs_scheduler
26
25
  from sky.serve import constants as serve_constants
27
26
  from sky.serve import serve_state
28
27
  from sky.serve import serve_utils
@@ -1052,7 +1051,6 @@ class SkyPilotReplicaManager(ReplicaManager):
1052
1051
  self._service_name, replica_id)
1053
1052
  assert info is not None, replica_id
1054
1053
  error_in_sky_launch = False
1055
- schedule_next_jobs = False
1056
1054
  if info.status == serve_state.ReplicaStatus.PENDING:
1057
1055
  # sky.launch not started yet
1058
1056
  if controller_utils.can_provision():
@@ -1080,7 +1078,6 @@ class SkyPilotReplicaManager(ReplicaManager):
1080
1078
  else:
1081
1079
  info.status_property.sky_launch_status = (
1082
1080
  common_utils.ProcessStatus.SUCCEEDED)
1083
- schedule_next_jobs = True
1084
1081
  if self._spot_placer is not None and info.is_spot:
1085
1082
  # TODO(tian): Currently, we set the location to
1086
1083
  # preemptive if the launch process failed. This is
@@ -1100,16 +1097,12 @@ class SkyPilotReplicaManager(ReplicaManager):
1100
1097
  self._spot_placer.set_active(location)
1101
1098
  serve_state.add_or_update_replica(self._service_name,
1102
1099
  replica_id, info)
1103
- if schedule_next_jobs and self._is_pool:
1104
- jobs_scheduler.maybe_schedule_next_jobs()
1105
1100
  if error_in_sky_launch:
1106
1101
  # Teardown after update replica info since
1107
1102
  # _terminate_replica will update the replica info too.
1108
1103
  self._terminate_replica(replica_id,
1109
1104
  sync_down_logs=True,
1110
1105
  replica_drain_delay_seconds=0)
1111
- # Try schedule next job after acquiring the lock.
1112
- jobs_scheduler.maybe_schedule_next_jobs()
1113
1106
  down_process_pool_snapshot = list(self._down_process_pool.items())
1114
1107
  for replica_id, p in down_process_pool_snapshot:
1115
1108
  if p.is_alive():
sky/serve/serve_utils.py CHANGED
@@ -294,6 +294,11 @@ def is_consolidation_mode(pool: bool = False) -> bool:
294
294
  # We should only do this check on API server, as the controller will not
295
295
  # have related config and will always seemingly be disabled for consolidation
296
296
  # mode. Check #6611 for more details.
297
+ if (os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None
298
+ and controller.controller_type == 'jobs'):
299
+ # if we are in the job controller, we must always be in consolidation
300
+ # mode.
301
+ return True
297
302
  if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
298
303
  _validate_consolidation_mode_config(consolidation_mode, pool)
299
304
  return consolidation_mode
sky/serve/server/impl.py CHANGED
@@ -280,8 +280,7 @@ def up(
280
280
  ]
281
281
  run_script = '\n'.join(env_cmds + [run_script])
282
282
  # Dump script for high availability recovery.
283
- if controller_utils.high_availability_specified(controller_name):
284
- serve_state.set_ha_recovery_script(service_name, run_script)
283
+ serve_state.set_ha_recovery_script(service_name, run_script)
285
284
  backend.run_on_head(controller_handle, run_script)
286
285
 
287
286
  style = colorama.Style
sky/serve/service.py CHANGED
@@ -21,7 +21,6 @@ from sky import task as task_lib
21
21
  from sky.backends import backend_utils
22
22
  from sky.backends import cloud_vm_ray_backend
23
23
  from sky.data import data_utils
24
- from sky.jobs import scheduler as jobs_scheduler
25
24
  from sky.serve import constants
26
25
  from sky.serve import controller
27
26
  from sky.serve import load_balancer
@@ -278,7 +277,6 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
278
277
  pool=service_spec.pool,
279
278
  controller_pid=os.getpid(),
280
279
  entrypoint=entrypoint)
281
- jobs_scheduler.maybe_schedule_next_jobs()
282
280
  # Directly throw an error here. See sky/serve/api.py::up
283
281
  # for more details.
284
282
  if not success:
sky/server/common.py CHANGED
@@ -538,12 +538,17 @@ def _start_api_server(deploy: bool = False,
538
538
 
539
539
  # Check available memory before starting the server.
540
540
  avail_mem_size_gb: float = common_utils.get_mem_size_gb()
541
- if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
541
+ # pylint: disable=import-outside-toplevel
542
+ import sky.jobs.utils as job_utils
543
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
544
+ if job_utils.is_consolidation_mode() else
545
+ server_constants.MIN_AVAIL_MEM_GB)
546
+ if avail_mem_size_gb <= max_memory:
542
547
  logger.warning(
543
548
  f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
544
549
  f'has {avail_mem_size_gb:.1f}GB memory available. '
545
- f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is '
546
- 'recommended to support higher load with better performance.'
550
+ f'At least {max_memory}GB is recommended to support higher '
551
+ 'load with better performance.'
547
552
  f'{colorama.Style.RESET_ALL}')
548
553
 
549
554
  args = [sys.executable, *API_SERVER_CMD.split()]
sky/server/config.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
6
6
 
7
7
  from sky import sky_logging
8
8
  from sky.server import constants as server_constants
9
+ from sky.server import daemons
9
10
  from sky.utils import common_utils
10
11
 
11
12
  # Constants based on profiling the peak memory usage while serving various
@@ -19,8 +20,9 @@ from sky.utils import common_utils
19
20
  # TODO(aylei): maintaining these constants is error-prone, we may need to
20
21
  # automatically tune parallelism at runtime according to system usage stats
21
22
  # in the future.
22
- _LONG_WORKER_MEM_GB = 0.4
23
- _SHORT_WORKER_MEM_GB = 0.25
23
+ # TODO(luca): The future is now! ^^^
24
+ LONG_WORKER_MEM_GB = 0.4
25
+ SHORT_WORKER_MEM_GB = 0.3
24
26
  # To control the number of long workers.
25
27
  _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
26
28
  # Limit the number of long workers of local API server, since local server is
@@ -35,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
35
37
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
36
38
  # Minimal number of long workers to ensure responsiveness.
37
39
  _MIN_LONG_WORKERS = 1
38
- # Minimal number of short workers, there is a daemon task running on short
39
- # workers so at least 2 workers are needed to ensure responsiveness.
40
- _MIN_SHORT_WORKERS = 2
40
+ # Minimal number of idle short workers to ensure responsiveness.
41
+ _MIN_IDLE_SHORT_WORKERS = 1
41
42
 
42
43
  # Default number of burstable workers for local API server. A heuristic number
43
44
  # that is large enough for most local cases.
@@ -75,8 +76,8 @@ class ServerConfig:
75
76
 
76
77
 
77
78
  def compute_server_config(deploy: bool,
78
- max_db_connections: Optional[int] = None
79
- ) -> ServerConfig:
79
+ max_db_connections: Optional[int] = None,
80
+ quiet: bool = False) -> ServerConfig:
80
81
  """Compute the server config based on environment.
81
82
 
82
83
  We have different assumptions for the resources in different deployment
@@ -140,7 +141,12 @@ def compute_server_config(deploy: bool,
140
141
  burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
141
142
  # Runs in low resource mode if the available memory is less than
142
143
  # server_constants.MIN_AVAIL_MEM_GB.
143
- if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
144
+ # pylint: disable=import-outside-toplevel
145
+ import sky.jobs.utils as job_utils
146
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
147
+ if job_utils.is_consolidation_mode() else
148
+ server_constants.MIN_AVAIL_MEM_GB)
149
+ if not deploy and mem_size_gb < max_memory:
144
150
  # Permanent worker process may have significant memory consumption
145
151
  # (~350MB per worker) after running commands like `sky check`, so we
146
152
  # don't start any permanent workers in low resource local mode. This
@@ -151,25 +157,29 @@ def compute_server_config(deploy: bool,
151
157
  # permanently because it never exits.
152
158
  max_parallel_for_long = 0
153
159
  max_parallel_for_short = 0
154
- logger.warning(
155
- 'SkyPilot API server will run in low resource mode because '
156
- 'the available memory is less than '
157
- f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
160
+ if not quiet:
161
+ logger.warning(
162
+ 'SkyPilot API server will run in low resource mode because '
163
+ 'the available memory is less than '
164
+ f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
158
165
  elif max_db_connections is not None:
159
166
  if max_parallel_all_workers > max_db_connections:
160
- logger.warning(
161
- f'Max parallel all workers ({max_parallel_all_workers}) '
162
- f'is greater than max db connections ({max_db_connections}). '
163
- 'Increase the number of max db connections to '
164
- f'at least {max_parallel_all_workers} for optimal performance.')
167
+ if not quiet:
168
+ logger.warning(
169
+ f'Max parallel all workers ({max_parallel_all_workers}) '
170
+ 'is greater than max db connections '
171
+ f'({max_db_connections}). Increase the number of max db '
172
+ f'connections to at least {max_parallel_all_workers} for '
173
+ 'optimal performance.')
165
174
  else:
166
175
  num_db_connections_per_worker = 1
167
176
 
168
- logger.info(
169
- f'SkyPilot API server will start {num_server_workers} server processes '
170
- f'with {max_parallel_for_long} background workers for long requests '
171
- f'and will allow at max {max_parallel_for_short} short requests in '
172
- f'parallel.')
177
+ if not quiet:
178
+ logger.info(
179
+ f'SkyPilot API server will start {num_server_workers} server '
180
+ f'processes with {max_parallel_for_long} background workers for '
181
+ f'long requests and will allow at max {max_parallel_for_short} '
182
+ 'short requests in parallel.')
173
183
  return ServerConfig(
174
184
  num_server_workers=num_server_workers,
175
185
  queue_backend=queue_backend,
@@ -190,10 +200,15 @@ def _max_long_worker_parallism(cpu_count: int,
190
200
  local=False) -> int:
191
201
  """Max parallelism for long workers."""
192
202
  # Reserve min available memory to avoid OOM.
193
- available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
203
+ # pylint: disable=import-outside-toplevel
204
+ import sky.jobs.utils as job_utils
205
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
206
+ if job_utils.is_consolidation_mode() else
207
+ server_constants.MIN_AVAIL_MEM_GB)
208
+ available_mem = max(0, mem_size_gb - max_memory)
194
209
  cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
195
210
  mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
196
- _LONG_WORKER_MEM_GB)
211
+ LONG_WORKER_MEM_GB)
197
212
  n = max(_MIN_LONG_WORKERS,
198
213
  min(cpu_based_max_parallel, mem_based_max_parallel))
199
214
  if local:
@@ -201,12 +216,25 @@ def _max_long_worker_parallism(cpu_count: int,
201
216
  return n
202
217
 
203
218
 
219
+ def _get_min_short_workers() -> int:
220
+ """Min number of short workers."""
221
+ daemon_count = 0
222
+ for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
223
+ if not daemon.should_skip():
224
+ daemon_count += 1
225
+ return _MIN_IDLE_SHORT_WORKERS + daemon_count
226
+
227
+
204
228
  def _max_short_worker_parallism(mem_size_gb: float,
205
229
  long_worker_parallism: int) -> int:
206
230
  """Max parallelism for short workers."""
207
231
  # Reserve memory for long workers and min available memory.
208
- reserved_mem = server_constants.MIN_AVAIL_MEM_GB + (long_worker_parallism *
209
- _LONG_WORKER_MEM_GB)
232
+ # pylint: disable=import-outside-toplevel
233
+ import sky.jobs.utils as job_utils
234
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
235
+ if job_utils.is_consolidation_mode() else
236
+ server_constants.MIN_AVAIL_MEM_GB)
237
+ reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
210
238
  available_mem = max(0, mem_size_gb - reserved_mem)
211
- n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
239
+ n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
212
240
  return n
sky/server/constants.py CHANGED
@@ -34,6 +34,7 @@ VERSION_HEADER = 'X-SkyPilot-Version'
34
34
  REQUEST_NAME_PREFIX = 'sky.'
35
35
  # The memory (GB) that SkyPilot tries to not use to prevent OOM.
36
36
  MIN_AVAIL_MEM_GB = 2
37
+ MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
37
38
  # Default encoder/decoder handler name.
38
39
  DEFAULT_HANDLER_NAME = 'default'
39
40
  # The path to the API request database.
sky/server/daemons.py CHANGED
@@ -11,6 +11,7 @@ from sky.utils import annotations
11
11
  from sky.utils import common
12
12
  from sky.utils import common_utils
13
13
  from sky.utils import env_options
14
+ from sky.utils import subprocess_utils
14
15
  from sky.utils import timeline
15
16
  from sky.utils import ux_utils
16
17
 
@@ -74,6 +75,10 @@ class InternalRequestDaemon:
74
75
  # using too much memory.
75
76
  annotations.clear_request_level_cache()
76
77
  timeline.save_timeline()
78
+ # Kill all children processes related to this request.
79
+ # Each executor handles a single request, so we can safely
80
+ # kill all children processes related to this request.
81
+ subprocess_utils.kill_children_processes()
77
82
  common_utils.release_memory()
78
83
  except Exception: # pylint: disable=broad-except
79
84
  # It is OK to fail to run the event, as the event is not
@@ -123,21 +128,16 @@ def managed_job_status_refresh_event():
123
128
  """Refresh the managed job status for controller consolidation mode."""
124
129
  # pylint: disable=import-outside-toplevel
125
130
  from sky.jobs import utils as managed_job_utils
126
- from sky.utils import controller_utils
127
131
 
128
132
  # We run the recovery logic before starting the event loop as those two are
129
133
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
130
- if controller_utils.high_availability_specified(
131
- controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
132
- managed_job_utils.ha_recovery_for_consolidation_mode()
134
+ managed_job_utils.ha_recovery_for_consolidation_mode()
133
135
 
134
136
  # After recovery, we start the event loop.
135
137
  from sky.skylet import events
136
138
  refresh_event = events.ManagedJobEvent()
137
- scheduling_event = events.ManagedJobSchedulingEvent()
138
139
  logger.info('=== Running managed job event ===')
139
140
  refresh_event.run()
140
- scheduling_event.run()
141
141
  time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
142
142
 
143
143
 
@@ -152,14 +152,10 @@ def _serve_status_refresh_event(pool: bool):
152
152
  """Refresh the sky serve status for controller consolidation mode."""
153
153
  # pylint: disable=import-outside-toplevel
154
154
  from sky.serve import serve_utils
155
- from sky.utils import controller_utils
156
155
 
157
156
  # We run the recovery logic before starting the event loop as those two are
158
157
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
159
- controller = controller_utils.get_controller_for_pool(pool)
160
- if controller_utils.high_availability_specified(
161
- controller.value.cluster_name):
162
- serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
158
+ serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
163
159
 
164
160
  # After recovery, we start the event loop.
165
161
  from sky.skylet import events
sky/server/metrics.py CHANGED
@@ -4,6 +4,7 @@ import contextlib
4
4
  import functools
5
5
  import multiprocessing
6
6
  import os
7
+ import threading
7
8
  import time
8
9
 
9
10
  import fastapi
@@ -21,6 +22,24 @@ from sky.skylet import constants
21
22
  METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
22
23
  'false').lower() == 'true'
23
24
 
25
+ _KB = 2**10
26
+ _MB = 2**20
27
+ _MEM_BUCKETS = [
28
+ _KB,
29
+ 256 * _KB,
30
+ 512 * _KB,
31
+ _MB,
32
+ 2 * _MB,
33
+ 4 * _MB,
34
+ 8 * _MB,
35
+ 16 * _MB,
36
+ 32 * _MB,
37
+ 64 * _MB,
38
+ 128 * _MB,
39
+ 256 * _MB,
40
+ float('inf'),
41
+ ]
42
+
24
43
  logger = sky_logging.init_logger(__name__)
25
44
 
26
45
  # Total number of API server requests, grouped by path, method, and status.
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
92
111
  ['pid', 'type', 'mode'],
93
112
  )
94
113
 
114
+ SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
115
+ 'sky_apiserver_request_memory_usage_bytes',
116
+ 'Peak memory usage of requests', ['name'],
117
+ buckets=_MEM_BUCKETS)
118
+
119
+ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
120
+ 'sky_apiserver_request_rss_incr_bytes',
121
+ 'RSS increment after requests', ['name'],
122
+ buckets=_MEM_BUCKETS)
123
+
95
124
  metrics_app = fastapi.FastAPI()
96
125
 
97
126
 
@@ -208,19 +237,23 @@ def time_me_async(func):
208
237
  return async_wrapper
209
238
 
210
239
 
211
- def process_monitor(process_type: str):
240
+ peak_rss_bytes = 0
241
+
242
+
243
+ def process_monitor(process_type: str, stop: threading.Event):
212
244
  pid = multiprocessing.current_process().pid
213
245
  proc = psutil.Process(pid)
214
- peak_rss = 0
215
246
  last_bucket_end = time.time()
216
- while True:
247
+ bucket_peak = 0
248
+ global peak_rss_bytes
249
+ while not stop.is_set():
217
250
  if time.time() - last_bucket_end >= 30:
218
- # Reset peak RSS every 30 seconds.
251
+ # Reset peak RSS for the next time bucket.
219
252
  last_bucket_end = time.time()
220
- peak_rss = 0
221
- peak_rss = max(peak_rss, proc.memory_info().rss)
222
- SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
223
- type=process_type).set(peak_rss)
253
+ bucket_peak = 0
254
+ peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
255
+ SKY_APISERVER_PROCESS_PEAK_RSS.labels(
256
+ pid=pid, type=process_type).set(peak_rss_bytes)
224
257
  ctimes = proc.cpu_times()
225
258
  SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
226
259
  type=process_type,
@@ -31,6 +31,7 @@ import time
31
31
  import typing
32
32
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
33
33
 
34
+ import psutil
34
35
  import setproctitle
35
36
 
36
37
  from sky import exceptions
@@ -130,8 +131,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
130
131
  def executor_initializer(proc_group: str):
131
132
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
132
133
  f'{multiprocessing.current_process().pid}')
134
+ # Executor never stops, unless the whole process is killed.
133
135
  threading.Thread(target=metrics_lib.process_monitor,
134
- args=(f'worker:{proc_group}',),
136
+ args=(f'worker:{proc_group}', threading.Event()),
135
137
  daemon=True).start()
136
138
 
137
139
 
@@ -373,11 +375,13 @@ def _request_execution_wrapper(request_id: str,
373
375
  4. Handle the SIGTERM signal to abort the request gracefully.
374
376
  5. Maintain the lifecycle of the temp dir used by the request.
375
377
  """
378
+ pid = multiprocessing.current_process().pid
379
+ proc = psutil.Process(pid)
380
+ rss_begin = proc.memory_info().rss
376
381
  db_utils.set_max_connections(num_db_connections_per_worker)
377
382
  # Handle the SIGTERM signal to abort the request processing gracefully.
378
383
  signal.signal(signal.SIGTERM, _sigterm_handler)
379
384
 
380
- pid = multiprocessing.current_process().pid
381
385
  logger.info(f'Running request {request_id} with pid {pid}')
382
386
  with api_requests.update_request(request_id) as request_task:
383
387
  assert request_task is not None, request_id
@@ -443,8 +447,41 @@ def _request_execution_wrapper(request_id: str,
443
447
  _restore_output(original_stdout, original_stderr)
444
448
  logger.info(f'Request {request_id} finished')
445
449
  finally:
446
- with metrics_lib.time_it(name='release_memory', group='internal'):
447
- common_utils.release_memory()
450
+ try:
451
+ # Capture the peak RSS before GC.
452
+ peak_rss = max(proc.memory_info().rss,
453
+ metrics_lib.peak_rss_bytes)
454
+ with metrics_lib.time_it(name='release_memory',
455
+ group='internal'):
456
+ common_utils.release_memory()
457
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
458
+ except Exception as e: # pylint: disable=broad-except
459
+ logger.error(f'Failed to record memory metrics: '
460
+ f'{common_utils.format_exception(e)}')
461
+
462
+
463
+ _first_request = True
464
+
465
+
466
+ def _record_memory_metrics(request_name: str, proc: psutil.Process,
467
+ rss_begin: int, peak_rss: int) -> None:
468
+ """Record the memory metrics for a request."""
469
+ # Do not record full memory delta for the first request as it
470
+ # will load the sky core modules and make the memory usage
471
+ # estimation inaccurate.
472
+ global _first_request
473
+ if _first_request:
474
+ _first_request = False
475
+ return
476
+ rss_end = proc.memory_info().rss
477
+
478
+ # Answer "how much RSS this request contributed?"
479
+ metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
480
+ name=request_name).observe(max(rss_end - rss_begin, 0))
481
+ # Estimate the memory usage by the request by capturing the
482
+ # peak memory delta during the request execution.
483
+ metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
484
+ name=request_name).observe(max(peak_rss - rss_begin, 0))
448
485
 
449
486
 
450
487
  async def execute_request_coroutine(request: api_requests.Request):
@@ -131,7 +131,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
131
131
  def encode_jobs_queue_v2(
132
132
  jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
133
133
  # Support returning either a plain jobs list or a (jobs, total) tuple
134
- status_counts = {}
134
+ status_counts: Dict[str, int] = {}
135
135
  if isinstance(jobs_or_tuple, tuple):
136
136
  if len(jobs_or_tuple) == 2:
137
137
  jobs, total = jobs_or_tuple
sky/server/server.py CHANGED
@@ -625,6 +625,9 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
625
625
  app.include_router(ssh_node_pools_rest.router,
626
626
  prefix='/ssh_node_pools',
627
627
  tags=['ssh_node_pools'])
628
+ # increase the resource limit for the server
629
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
630
+ resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
628
631
 
629
632
  # Increase the limit of files we can open to our hard limit. This fixes bugs
630
633
  # where we can not aquire file locks or open enough logs and the API server
@@ -1211,6 +1214,7 @@ async def logs(
1211
1214
  request_body=cluster_job_body,
1212
1215
  func=core.tail_logs,
1213
1216
  schedule_type=requests_lib.ScheduleType.SHORT,
1217
+ request_cluster_name=cluster_job_body.cluster_name,
1214
1218
  )
1215
1219
  task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1216
1220
 
@@ -1826,7 +1830,7 @@ async def all_contexts(request: fastapi.Request) -> None:
1826
1830
  async def gpu_metrics() -> fastapi.Response:
1827
1831
  """Gets the GPU metrics from multiple external k8s clusters"""
1828
1832
  contexts = core.get_all_contexts()
1829
- all_metrics = []
1833
+ all_metrics: List[str] = []
1830
1834
  successful_contexts = 0
1831
1835
 
1832
1836
  tasks = [
@@ -1841,6 +1845,10 @@ async def gpu_metrics() -> fastapi.Response:
1841
1845
  if isinstance(result, Exception):
1842
1846
  logger.error(
1843
1847
  f'Failed to get metrics for context {contexts[i]}: {result}')
1848
+ elif isinstance(result, BaseException):
1849
+ # Avoid changing behavior for non-Exception BaseExceptions
1850
+ # like KeyboardInterrupt/SystemExit: re-raise them.
1851
+ raise result
1844
1852
  else:
1845
1853
  metrics_text = result
1846
1854
  all_metrics.append(metrics_text)
sky/server/uvicorn.py CHANGED
@@ -213,11 +213,17 @@ class Server(uvicorn.Server):
213
213
  # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
214
214
  event_loop.set_debug(True)
215
215
  event_loop.slow_callback_duration = lag_threshold
216
- threading.Thread(target=metrics_lib.process_monitor,
217
- args=('server',),
218
- daemon=True).start()
219
- with self.capture_signals():
220
- asyncio.run(self.serve(*args, **kwargs))
216
+ stop_monitor = threading.Event()
217
+ monitor = threading.Thread(target=metrics_lib.process_monitor,
218
+ args=('server', stop_monitor),
219
+ daemon=True)
220
+ monitor.start()
221
+ try:
222
+ with self.capture_signals():
223
+ asyncio.run(self.serve(*args, **kwargs))
224
+ finally:
225
+ stop_monitor.set()
226
+ monitor.join()
221
227
 
222
228
 
223
229
  def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
@@ -63,6 +63,8 @@ install_requires = [
63
63
  'setproctitle',
64
64
  'sqlalchemy',
65
65
  'psycopg2-binary',
66
+ 'aiosqlite',
67
+ 'asyncpg',
66
68
  # TODO(hailong): These three dependencies should be removed after we make
67
69
  # the client-side actually not importing them.
68
70
  'casbin',
@@ -108,9 +110,9 @@ server_dependencies = [
108
110
  local_ray = [
109
111
  # Lower version of ray will cause dependency conflict for
110
112
  # click/grpcio/protobuf.
111
- # Excluded 2.6.0 as it has a bug in the cluster launcher:
113
+ # Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
112
114
  # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
113
- 'ray[default] >= 2.2.0, != 2.6.0',
115
+ 'ray[default] >= 2.6.1',
114
116
  ]
115
117
 
116
118
  remote = [
@@ -12,6 +12,7 @@ def restart_skylet():
12
12
  # Kills old skylet if it is running.
13
13
  # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
14
14
  # skylet to exit, instead of directly killing it.
15
+
15
16
  subprocess.run(
16
17
  # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
17
18
  # because need to handle the backward compatibility of the old skylet
sky/skylet/constants.py CHANGED
@@ -62,11 +62,14 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
62
62
  'curl -LsSf https://astral.sh/uv/install.sh '
63
63
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
64
64
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
65
- # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
66
- # environment. `deactivate` command does not work when conda is used.
65
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run')
66
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
67
+ # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
68
+ # not work when conda is used.
67
69
  DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
68
70
  'export PATH='
69
- f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
71
+ f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||") && '
72
+ 'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
70
73
 
71
74
  # Prefix for SkyPilot environment variables
72
75
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -91,14 +94,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
91
94
  # cluster yaml is updated.
92
95
  #
93
96
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
94
- SKYLET_VERSION = '17'
97
+ SKYLET_VERSION = '18'
95
98
  # The version of the lib files that skylet/jobs use. Whenever there is an API
96
99
  # change for the job_lib or log_lib, we need to bump this version, so that the
97
100
  # user can be notified to update their SkyPilot version on the remote cluster.
98
101
  SKYLET_LIB_VERSION = 4
99
102
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
100
103
  SKYLET_GRPC_PORT = 46590
101
- SKYLET_GRPC_TIMEOUT_SECONDS = 5
104
+ SKYLET_GRPC_TIMEOUT_SECONDS = 10
102
105
 
103
106
  # Docker default options
104
107
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -229,7 +232,7 @@ RAY_INSTALLATION_COMMANDS = (
229
232
  'export PATH=$PATH:$HOME/.local/bin; '
230
233
  # Writes ray path to file if it does not exist or the file is empty.
231
234
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
232
- f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
235
+ f'{{ {SKY_UV_RUN_CMD} '
233
236
  f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
234
237
 
235
238
  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
@@ -421,6 +424,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
421
424
  # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
422
425
  # Environment variable that is set to 'true' if this is a skypilot server.
423
426
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
427
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
424
428
 
425
429
  # Environment variable that is set to 'true' if metrics are enabled.
426
430
  ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
@@ -447,7 +451,7 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
447
451
  # BEGIN constants used for service catalog.
448
452
  HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
449
453
  HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
450
- CATALOG_SCHEMA_VERSION = 'v7'
454
+ CATALOG_SCHEMA_VERSION = 'v8'
451
455
  CATALOG_DIR = '~/.sky/catalogs'
452
456
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
453
457
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
@@ -508,3 +512,6 @@ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
508
512
 
509
513
  ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
510
514
  'DEBUG_LOOP_LAG_THRESHOLD_MS')
515
+
516
+ ARM64_ARCH = 'arm64'
517
+ X86_64_ARCH = 'x86_64'