skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +160 -23
- sky/backends/cloud_vm_ray_backend.py +226 -74
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +2 -71
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +23 -18
- sky/clouds/aws.py +26 -6
- sky/clouds/cloud.py +8 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/global_user_state.py +34 -0
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +734 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +709 -508
- sky/jobs/utils.py +90 -40
- sky/logs/agent.py +10 -2
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +55 -27
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +9 -1
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +14 -7
- sky/skylet/events.py +2 -10
- sky/skylet/log_lib.py +11 -0
- sky/skylet/log_lib.pyi +9 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +25 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py
CHANGED
|
@@ -22,7 +22,6 @@ from sky import global_user_state
|
|
|
22
22
|
from sky import sky_logging
|
|
23
23
|
from sky import task as task_lib
|
|
24
24
|
from sky.backends import backend_utils
|
|
25
|
-
from sky.jobs import scheduler as jobs_scheduler
|
|
26
25
|
from sky.serve import constants as serve_constants
|
|
27
26
|
from sky.serve import serve_state
|
|
28
27
|
from sky.serve import serve_utils
|
|
@@ -1052,7 +1051,6 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1052
1051
|
self._service_name, replica_id)
|
|
1053
1052
|
assert info is not None, replica_id
|
|
1054
1053
|
error_in_sky_launch = False
|
|
1055
|
-
schedule_next_jobs = False
|
|
1056
1054
|
if info.status == serve_state.ReplicaStatus.PENDING:
|
|
1057
1055
|
# sky.launch not started yet
|
|
1058
1056
|
if controller_utils.can_provision():
|
|
@@ -1080,7 +1078,6 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1080
1078
|
else:
|
|
1081
1079
|
info.status_property.sky_launch_status = (
|
|
1082
1080
|
common_utils.ProcessStatus.SUCCEEDED)
|
|
1083
|
-
schedule_next_jobs = True
|
|
1084
1081
|
if self._spot_placer is not None and info.is_spot:
|
|
1085
1082
|
# TODO(tian): Currently, we set the location to
|
|
1086
1083
|
# preemptive if the launch process failed. This is
|
|
@@ -1100,16 +1097,12 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1100
1097
|
self._spot_placer.set_active(location)
|
|
1101
1098
|
serve_state.add_or_update_replica(self._service_name,
|
|
1102
1099
|
replica_id, info)
|
|
1103
|
-
if schedule_next_jobs and self._is_pool:
|
|
1104
|
-
jobs_scheduler.maybe_schedule_next_jobs()
|
|
1105
1100
|
if error_in_sky_launch:
|
|
1106
1101
|
# Teardown after update replica info since
|
|
1107
1102
|
# _terminate_replica will update the replica info too.
|
|
1108
1103
|
self._terminate_replica(replica_id,
|
|
1109
1104
|
sync_down_logs=True,
|
|
1110
1105
|
replica_drain_delay_seconds=0)
|
|
1111
|
-
# Try schedule next job after acquiring the lock.
|
|
1112
|
-
jobs_scheduler.maybe_schedule_next_jobs()
|
|
1113
1106
|
down_process_pool_snapshot = list(self._down_process_pool.items())
|
|
1114
1107
|
for replica_id, p in down_process_pool_snapshot:
|
|
1115
1108
|
if p.is_alive():
|
sky/serve/serve_utils.py
CHANGED
|
@@ -294,6 +294,11 @@ def is_consolidation_mode(pool: bool = False) -> bool:
|
|
|
294
294
|
# We should only do this check on API server, as the controller will not
|
|
295
295
|
# have related config and will always seemingly be disabled for consolidation
|
|
296
296
|
# mode. Check #6611 for more details.
|
|
297
|
+
if (os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None
|
|
298
|
+
and controller.controller_type == 'jobs'):
|
|
299
|
+
# if we are in the job controller, we must always be in consolidation
|
|
300
|
+
# mode.
|
|
301
|
+
return True
|
|
297
302
|
if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
298
303
|
_validate_consolidation_mode_config(consolidation_mode, pool)
|
|
299
304
|
return consolidation_mode
|
sky/serve/server/impl.py
CHANGED
|
@@ -280,8 +280,7 @@ def up(
|
|
|
280
280
|
]
|
|
281
281
|
run_script = '\n'.join(env_cmds + [run_script])
|
|
282
282
|
# Dump script for high availability recovery.
|
|
283
|
-
|
|
284
|
-
serve_state.set_ha_recovery_script(service_name, run_script)
|
|
283
|
+
serve_state.set_ha_recovery_script(service_name, run_script)
|
|
285
284
|
backend.run_on_head(controller_handle, run_script)
|
|
286
285
|
|
|
287
286
|
style = colorama.Style
|
sky/serve/service.py
CHANGED
|
@@ -21,7 +21,6 @@ from sky import task as task_lib
|
|
|
21
21
|
from sky.backends import backend_utils
|
|
22
22
|
from sky.backends import cloud_vm_ray_backend
|
|
23
23
|
from sky.data import data_utils
|
|
24
|
-
from sky.jobs import scheduler as jobs_scheduler
|
|
25
24
|
from sky.serve import constants
|
|
26
25
|
from sky.serve import controller
|
|
27
26
|
from sky.serve import load_balancer
|
|
@@ -278,7 +277,6 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
|
|
|
278
277
|
pool=service_spec.pool,
|
|
279
278
|
controller_pid=os.getpid(),
|
|
280
279
|
entrypoint=entrypoint)
|
|
281
|
-
jobs_scheduler.maybe_schedule_next_jobs()
|
|
282
280
|
# Directly throw an error here. See sky/serve/api.py::up
|
|
283
281
|
# for more details.
|
|
284
282
|
if not success:
|
sky/server/common.py
CHANGED
|
@@ -538,12 +538,17 @@ def _start_api_server(deploy: bool = False,
|
|
|
538
538
|
|
|
539
539
|
# Check available memory before starting the server.
|
|
540
540
|
avail_mem_size_gb: float = common_utils.get_mem_size_gb()
|
|
541
|
-
|
|
541
|
+
# pylint: disable=import-outside-toplevel
|
|
542
|
+
import sky.jobs.utils as job_utils
|
|
543
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
544
|
+
if job_utils.is_consolidation_mode() else
|
|
545
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
546
|
+
if avail_mem_size_gb <= max_memory:
|
|
542
547
|
logger.warning(
|
|
543
548
|
f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
|
|
544
549
|
f'has {avail_mem_size_gb:.1f}GB memory available. '
|
|
545
|
-
f'At least {
|
|
546
|
-
'
|
|
550
|
+
f'At least {max_memory}GB is recommended to support higher '
|
|
551
|
+
'load with better performance.'
|
|
547
552
|
f'{colorama.Style.RESET_ALL}')
|
|
548
553
|
|
|
549
554
|
args = [sys.executable, *API_SERVER_CMD.split()]
|
sky/server/config.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import Optional
|
|
|
6
6
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky.server import constants as server_constants
|
|
9
|
+
from sky.server import daemons
|
|
9
10
|
from sky.utils import common_utils
|
|
10
11
|
|
|
11
12
|
# Constants based on profiling the peak memory usage while serving various
|
|
@@ -19,8 +20,9 @@ from sky.utils import common_utils
|
|
|
19
20
|
# TODO(aylei): maintaining these constants is error-prone, we may need to
|
|
20
21
|
# automatically tune parallelism at runtime according to system usage stats
|
|
21
22
|
# in the future.
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
# TODO(luca): The future is now! ^^^
|
|
24
|
+
LONG_WORKER_MEM_GB = 0.4
|
|
25
|
+
SHORT_WORKER_MEM_GB = 0.3
|
|
24
26
|
# To control the number of long workers.
|
|
25
27
|
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
|
|
26
28
|
# Limit the number of long workers of local API server, since local server is
|
|
@@ -35,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
|
|
|
35
37
|
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
|
|
36
38
|
# Minimal number of long workers to ensure responsiveness.
|
|
37
39
|
_MIN_LONG_WORKERS = 1
|
|
38
|
-
# Minimal number of short workers
|
|
39
|
-
|
|
40
|
-
_MIN_SHORT_WORKERS = 2
|
|
40
|
+
# Minimal number of idle short workers to ensure responsiveness.
|
|
41
|
+
_MIN_IDLE_SHORT_WORKERS = 1
|
|
41
42
|
|
|
42
43
|
# Default number of burstable workers for local API server. A heuristic number
|
|
43
44
|
# that is large enough for most local cases.
|
|
@@ -75,8 +76,8 @@ class ServerConfig:
|
|
|
75
76
|
|
|
76
77
|
|
|
77
78
|
def compute_server_config(deploy: bool,
|
|
78
|
-
max_db_connections: Optional[int] = None
|
|
79
|
-
|
|
79
|
+
max_db_connections: Optional[int] = None,
|
|
80
|
+
quiet: bool = False) -> ServerConfig:
|
|
80
81
|
"""Compute the server config based on environment.
|
|
81
82
|
|
|
82
83
|
We have different assumptions for the resources in different deployment
|
|
@@ -140,7 +141,12 @@ def compute_server_config(deploy: bool,
|
|
|
140
141
|
burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
|
|
141
142
|
# Runs in low resource mode if the available memory is less than
|
|
142
143
|
# server_constants.MIN_AVAIL_MEM_GB.
|
|
143
|
-
|
|
144
|
+
# pylint: disable=import-outside-toplevel
|
|
145
|
+
import sky.jobs.utils as job_utils
|
|
146
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
147
|
+
if job_utils.is_consolidation_mode() else
|
|
148
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
149
|
+
if not deploy and mem_size_gb < max_memory:
|
|
144
150
|
# Permanent worker process may have significant memory consumption
|
|
145
151
|
# (~350MB per worker) after running commands like `sky check`, so we
|
|
146
152
|
# don't start any permanent workers in low resource local mode. This
|
|
@@ -151,25 +157,29 @@ def compute_server_config(deploy: bool,
|
|
|
151
157
|
# permanently because it never exits.
|
|
152
158
|
max_parallel_for_long = 0
|
|
153
159
|
max_parallel_for_short = 0
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
160
|
+
if not quiet:
|
|
161
|
+
logger.warning(
|
|
162
|
+
'SkyPilot API server will run in low resource mode because '
|
|
163
|
+
'the available memory is less than '
|
|
164
|
+
f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
|
|
158
165
|
elif max_db_connections is not None:
|
|
159
166
|
if max_parallel_all_workers > max_db_connections:
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
167
|
+
if not quiet:
|
|
168
|
+
logger.warning(
|
|
169
|
+
f'Max parallel all workers ({max_parallel_all_workers}) '
|
|
170
|
+
'is greater than max db connections '
|
|
171
|
+
f'({max_db_connections}). Increase the number of max db '
|
|
172
|
+
f'connections to at least {max_parallel_all_workers} for '
|
|
173
|
+
'optimal performance.')
|
|
165
174
|
else:
|
|
166
175
|
num_db_connections_per_worker = 1
|
|
167
176
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
177
|
+
if not quiet:
|
|
178
|
+
logger.info(
|
|
179
|
+
f'SkyPilot API server will start {num_server_workers} server '
|
|
180
|
+
f'processes with {max_parallel_for_long} background workers for '
|
|
181
|
+
f'long requests and will allow at max {max_parallel_for_short} '
|
|
182
|
+
'short requests in parallel.')
|
|
173
183
|
return ServerConfig(
|
|
174
184
|
num_server_workers=num_server_workers,
|
|
175
185
|
queue_backend=queue_backend,
|
|
@@ -190,10 +200,15 @@ def _max_long_worker_parallism(cpu_count: int,
|
|
|
190
200
|
local=False) -> int:
|
|
191
201
|
"""Max parallelism for long workers."""
|
|
192
202
|
# Reserve min available memory to avoid OOM.
|
|
193
|
-
|
|
203
|
+
# pylint: disable=import-outside-toplevel
|
|
204
|
+
import sky.jobs.utils as job_utils
|
|
205
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
206
|
+
if job_utils.is_consolidation_mode() else
|
|
207
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
208
|
+
available_mem = max(0, mem_size_gb - max_memory)
|
|
194
209
|
cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
|
|
195
210
|
mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
|
|
196
|
-
|
|
211
|
+
LONG_WORKER_MEM_GB)
|
|
197
212
|
n = max(_MIN_LONG_WORKERS,
|
|
198
213
|
min(cpu_based_max_parallel, mem_based_max_parallel))
|
|
199
214
|
if local:
|
|
@@ -201,12 +216,25 @@ def _max_long_worker_parallism(cpu_count: int,
|
|
|
201
216
|
return n
|
|
202
217
|
|
|
203
218
|
|
|
219
|
+
def _get_min_short_workers() -> int:
|
|
220
|
+
"""Min number of short workers."""
|
|
221
|
+
daemon_count = 0
|
|
222
|
+
for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
|
|
223
|
+
if not daemon.should_skip():
|
|
224
|
+
daemon_count += 1
|
|
225
|
+
return _MIN_IDLE_SHORT_WORKERS + daemon_count
|
|
226
|
+
|
|
227
|
+
|
|
204
228
|
def _max_short_worker_parallism(mem_size_gb: float,
|
|
205
229
|
long_worker_parallism: int) -> int:
|
|
206
230
|
"""Max parallelism for short workers."""
|
|
207
231
|
# Reserve memory for long workers and min available memory.
|
|
208
|
-
|
|
209
|
-
|
|
232
|
+
# pylint: disable=import-outside-toplevel
|
|
233
|
+
import sky.jobs.utils as job_utils
|
|
234
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
235
|
+
if job_utils.is_consolidation_mode() else
|
|
236
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
237
|
+
reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
|
|
210
238
|
available_mem = max(0, mem_size_gb - reserved_mem)
|
|
211
|
-
n = max(
|
|
239
|
+
n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
|
|
212
240
|
return n
|
sky/server/constants.py
CHANGED
|
@@ -34,6 +34,7 @@ VERSION_HEADER = 'X-SkyPilot-Version'
|
|
|
34
34
|
REQUEST_NAME_PREFIX = 'sky.'
|
|
35
35
|
# The memory (GB) that SkyPilot tries to not use to prevent OOM.
|
|
36
36
|
MIN_AVAIL_MEM_GB = 2
|
|
37
|
+
MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
|
|
37
38
|
# Default encoder/decoder handler name.
|
|
38
39
|
DEFAULT_HANDLER_NAME = 'default'
|
|
39
40
|
# The path to the API request database.
|
sky/server/daemons.py
CHANGED
|
@@ -11,6 +11,7 @@ from sky.utils import annotations
|
|
|
11
11
|
from sky.utils import common
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
from sky.utils import env_options
|
|
14
|
+
from sky.utils import subprocess_utils
|
|
14
15
|
from sky.utils import timeline
|
|
15
16
|
from sky.utils import ux_utils
|
|
16
17
|
|
|
@@ -74,6 +75,10 @@ class InternalRequestDaemon:
|
|
|
74
75
|
# using too much memory.
|
|
75
76
|
annotations.clear_request_level_cache()
|
|
76
77
|
timeline.save_timeline()
|
|
78
|
+
# Kill all child processes related to this request.
|
|
79
|
+
# Each executor handles a single request, so we can safely
|
|
80
|
+
# kill all child processes related to this request.
|
|
81
|
+
subprocess_utils.kill_children_processes()
|
|
77
82
|
common_utils.release_memory()
|
|
78
83
|
except Exception: # pylint: disable=broad-except
|
|
79
84
|
# It is OK to fail to run the event, as the event is not
|
|
@@ -123,21 +128,16 @@ def managed_job_status_refresh_event():
|
|
|
123
128
|
"""Refresh the managed job status for controller consolidation mode."""
|
|
124
129
|
# pylint: disable=import-outside-toplevel
|
|
125
130
|
from sky.jobs import utils as managed_job_utils
|
|
126
|
-
from sky.utils import controller_utils
|
|
127
131
|
|
|
128
132
|
# We run the recovery logic before starting the event loop as those two are
|
|
129
133
|
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
|
130
|
-
|
|
131
|
-
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
|
|
132
|
-
managed_job_utils.ha_recovery_for_consolidation_mode()
|
|
134
|
+
managed_job_utils.ha_recovery_for_consolidation_mode()
|
|
133
135
|
|
|
134
136
|
# After recovery, we start the event loop.
|
|
135
137
|
from sky.skylet import events
|
|
136
138
|
refresh_event = events.ManagedJobEvent()
|
|
137
|
-
scheduling_event = events.ManagedJobSchedulingEvent()
|
|
138
139
|
logger.info('=== Running managed job event ===')
|
|
139
140
|
refresh_event.run()
|
|
140
|
-
scheduling_event.run()
|
|
141
141
|
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
142
142
|
|
|
143
143
|
|
|
@@ -152,14 +152,10 @@ def _serve_status_refresh_event(pool: bool):
|
|
|
152
152
|
"""Refresh the sky serve status for controller consolidation mode."""
|
|
153
153
|
# pylint: disable=import-outside-toplevel
|
|
154
154
|
from sky.serve import serve_utils
|
|
155
|
-
from sky.utils import controller_utils
|
|
156
155
|
|
|
157
156
|
# We run the recovery logic before starting the event loop as those two are
|
|
158
157
|
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
|
159
|
-
|
|
160
|
-
if controller_utils.high_availability_specified(
|
|
161
|
-
controller.value.cluster_name):
|
|
162
|
-
serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
|
|
158
|
+
serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
|
|
163
159
|
|
|
164
160
|
# After recovery, we start the event loop.
|
|
165
161
|
from sky.skylet import events
|
sky/server/metrics.py
CHANGED
|
@@ -4,6 +4,7 @@ import contextlib
|
|
|
4
4
|
import functools
|
|
5
5
|
import multiprocessing
|
|
6
6
|
import os
|
|
7
|
+
import threading
|
|
7
8
|
import time
|
|
8
9
|
|
|
9
10
|
import fastapi
|
|
@@ -21,6 +22,24 @@ from sky.skylet import constants
|
|
|
21
22
|
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
22
23
|
'false').lower() == 'true'
|
|
23
24
|
|
|
25
|
+
_KB = 2**10
|
|
26
|
+
_MB = 2**20
|
|
27
|
+
_MEM_BUCKETS = [
|
|
28
|
+
_KB,
|
|
29
|
+
256 * _KB,
|
|
30
|
+
512 * _KB,
|
|
31
|
+
_MB,
|
|
32
|
+
2 * _MB,
|
|
33
|
+
4 * _MB,
|
|
34
|
+
8 * _MB,
|
|
35
|
+
16 * _MB,
|
|
36
|
+
32 * _MB,
|
|
37
|
+
64 * _MB,
|
|
38
|
+
128 * _MB,
|
|
39
|
+
256 * _MB,
|
|
40
|
+
float('inf'),
|
|
41
|
+
]
|
|
42
|
+
|
|
24
43
|
logger = sky_logging.init_logger(__name__)
|
|
25
44
|
|
|
26
45
|
# Total number of API server requests, grouped by path, method, and status.
|
|
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
|
92
111
|
['pid', 'type', 'mode'],
|
|
93
112
|
)
|
|
94
113
|
|
|
114
|
+
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
115
|
+
'sky_apiserver_request_memory_usage_bytes',
|
|
116
|
+
'Peak memory usage of requests', ['name'],
|
|
117
|
+
buckets=_MEM_BUCKETS)
|
|
118
|
+
|
|
119
|
+
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
120
|
+
'sky_apiserver_request_rss_incr_bytes',
|
|
121
|
+
'RSS increment after requests', ['name'],
|
|
122
|
+
buckets=_MEM_BUCKETS)
|
|
123
|
+
|
|
95
124
|
metrics_app = fastapi.FastAPI()
|
|
96
125
|
|
|
97
126
|
|
|
@@ -208,19 +237,23 @@ def time_me_async(func):
|
|
|
208
237
|
return async_wrapper
|
|
209
238
|
|
|
210
239
|
|
|
211
|
-
|
|
240
|
+
peak_rss_bytes = 0
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def process_monitor(process_type: str, stop: threading.Event):
|
|
212
244
|
pid = multiprocessing.current_process().pid
|
|
213
245
|
proc = psutil.Process(pid)
|
|
214
|
-
peak_rss = 0
|
|
215
246
|
last_bucket_end = time.time()
|
|
216
|
-
|
|
247
|
+
bucket_peak = 0
|
|
248
|
+
global peak_rss_bytes
|
|
249
|
+
while not stop.is_set():
|
|
217
250
|
if time.time() - last_bucket_end >= 30:
|
|
218
|
-
# Reset peak RSS
|
|
251
|
+
# Reset peak RSS for the next time bucket.
|
|
219
252
|
last_bucket_end = time.time()
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
223
|
-
|
|
253
|
+
bucket_peak = 0
|
|
254
|
+
peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
|
|
255
|
+
SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
256
|
+
pid=pid, type=process_type).set(peak_rss_bytes)
|
|
224
257
|
ctimes = proc.cpu_times()
|
|
225
258
|
SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
226
259
|
type=process_type,
|
sky/server/requests/executor.py
CHANGED
|
@@ -31,6 +31,7 @@ import time
|
|
|
31
31
|
import typing
|
|
32
32
|
from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
|
|
33
33
|
|
|
34
|
+
import psutil
|
|
34
35
|
import setproctitle
|
|
35
36
|
|
|
36
37
|
from sky import exceptions
|
|
@@ -130,8 +131,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
|
|
|
130
131
|
def executor_initializer(proc_group: str):
|
|
131
132
|
setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
|
|
132
133
|
f'{multiprocessing.current_process().pid}')
|
|
134
|
+
# Executor never stops, unless the whole process is killed.
|
|
133
135
|
threading.Thread(target=metrics_lib.process_monitor,
|
|
134
|
-
args=(f'worker:{proc_group}',),
|
|
136
|
+
args=(f'worker:{proc_group}', threading.Event()),
|
|
135
137
|
daemon=True).start()
|
|
136
138
|
|
|
137
139
|
|
|
@@ -373,11 +375,13 @@ def _request_execution_wrapper(request_id: str,
|
|
|
373
375
|
4. Handle the SIGTERM signal to abort the request gracefully.
|
|
374
376
|
5. Maintain the lifecycle of the temp dir used by the request.
|
|
375
377
|
"""
|
|
378
|
+
pid = multiprocessing.current_process().pid
|
|
379
|
+
proc = psutil.Process(pid)
|
|
380
|
+
rss_begin = proc.memory_info().rss
|
|
376
381
|
db_utils.set_max_connections(num_db_connections_per_worker)
|
|
377
382
|
# Handle the SIGTERM signal to abort the request processing gracefully.
|
|
378
383
|
signal.signal(signal.SIGTERM, _sigterm_handler)
|
|
379
384
|
|
|
380
|
-
pid = multiprocessing.current_process().pid
|
|
381
385
|
logger.info(f'Running request {request_id} with pid {pid}')
|
|
382
386
|
with api_requests.update_request(request_id) as request_task:
|
|
383
387
|
assert request_task is not None, request_id
|
|
@@ -443,8 +447,41 @@ def _request_execution_wrapper(request_id: str,
|
|
|
443
447
|
_restore_output(original_stdout, original_stderr)
|
|
444
448
|
logger.info(f'Request {request_id} finished')
|
|
445
449
|
finally:
|
|
446
|
-
|
|
447
|
-
|
|
450
|
+
try:
|
|
451
|
+
# Capture the peak RSS before GC.
|
|
452
|
+
peak_rss = max(proc.memory_info().rss,
|
|
453
|
+
metrics_lib.peak_rss_bytes)
|
|
454
|
+
with metrics_lib.time_it(name='release_memory',
|
|
455
|
+
group='internal'):
|
|
456
|
+
common_utils.release_memory()
|
|
457
|
+
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
458
|
+
except Exception as e: # pylint: disable=broad-except
|
|
459
|
+
logger.error(f'Failed to record memory metrics: '
|
|
460
|
+
f'{common_utils.format_exception(e)}')
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
_first_request = True
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _record_memory_metrics(request_name: str, proc: psutil.Process,
|
|
467
|
+
rss_begin: int, peak_rss: int) -> None:
|
|
468
|
+
"""Record the memory metrics for a request."""
|
|
469
|
+
# Do not record full memory delta for the first request as it
|
|
470
|
+
# will load the sky core modules and make the memory usage
|
|
471
|
+
# estimation inaccurate.
|
|
472
|
+
global _first_request
|
|
473
|
+
if _first_request:
|
|
474
|
+
_first_request = False
|
|
475
|
+
return
|
|
476
|
+
rss_end = proc.memory_info().rss
|
|
477
|
+
|
|
478
|
+
# Answer "how much RSS this request contributed?"
|
|
479
|
+
metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
|
|
480
|
+
name=request_name).observe(max(rss_end - rss_begin, 0))
|
|
481
|
+
# Estimate the memory usage by the request by capturing the
|
|
482
|
+
# peak memory delta during the request execution.
|
|
483
|
+
metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
|
|
484
|
+
name=request_name).observe(max(peak_rss - rss_begin, 0))
|
|
448
485
|
|
|
449
486
|
|
|
450
487
|
async def execute_request_coroutine(request: api_requests.Request):
|
|
@@ -131,7 +131,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
|
|
|
131
131
|
def encode_jobs_queue_v2(
|
|
132
132
|
jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
|
|
133
133
|
# Support returning either a plain jobs list or a (jobs, total) tuple
|
|
134
|
-
status_counts = {}
|
|
134
|
+
status_counts: Dict[str, int] = {}
|
|
135
135
|
if isinstance(jobs_or_tuple, tuple):
|
|
136
136
|
if len(jobs_or_tuple) == 2:
|
|
137
137
|
jobs, total = jobs_or_tuple
|
sky/server/server.py
CHANGED
|
@@ -625,6 +625,9 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
|
|
|
625
625
|
app.include_router(ssh_node_pools_rest.router,
|
|
626
626
|
prefix='/ssh_node_pools',
|
|
627
627
|
tags=['ssh_node_pools'])
|
|
628
|
+
# Raise the soft limit on open files (RLIMIT_NOFILE) to the hard limit for the server.
|
|
629
|
+
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
|
|
630
|
+
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
|
|
628
631
|
|
|
629
632
|
# Increase the limit of files we can open to our hard limit. This fixes bugs
|
|
630
633
|
# where we cannot acquire file locks or open enough logs and the API server
|
|
@@ -1211,6 +1214,7 @@ async def logs(
|
|
|
1211
1214
|
request_body=cluster_job_body,
|
|
1212
1215
|
func=core.tail_logs,
|
|
1213
1216
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1217
|
+
request_cluster_name=cluster_job_body.cluster_name,
|
|
1214
1218
|
)
|
|
1215
1219
|
task = asyncio.create_task(executor.execute_request_coroutine(request_task))
|
|
1216
1220
|
|
|
@@ -1826,7 +1830,7 @@ async def all_contexts(request: fastapi.Request) -> None:
|
|
|
1826
1830
|
async def gpu_metrics() -> fastapi.Response:
|
|
1827
1831
|
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
1828
1832
|
contexts = core.get_all_contexts()
|
|
1829
|
-
all_metrics = []
|
|
1833
|
+
all_metrics: List[str] = []
|
|
1830
1834
|
successful_contexts = 0
|
|
1831
1835
|
|
|
1832
1836
|
tasks = [
|
|
@@ -1841,6 +1845,10 @@ async def gpu_metrics() -> fastapi.Response:
|
|
|
1841
1845
|
if isinstance(result, Exception):
|
|
1842
1846
|
logger.error(
|
|
1843
1847
|
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
1848
|
+
elif isinstance(result, BaseException):
|
|
1849
|
+
# Avoid changing behavior for non-Exception BaseExceptions
|
|
1850
|
+
# like KeyboardInterrupt/SystemExit: re-raise them.
|
|
1851
|
+
raise result
|
|
1844
1852
|
else:
|
|
1845
1853
|
metrics_text = result
|
|
1846
1854
|
all_metrics.append(metrics_text)
|
sky/server/uvicorn.py
CHANGED
|
@@ -213,11 +213,17 @@ class Server(uvicorn.Server):
|
|
|
213
213
|
# Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
|
|
214
214
|
event_loop.set_debug(True)
|
|
215
215
|
event_loop.slow_callback_duration = lag_threshold
|
|
216
|
-
threading.
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
216
|
+
stop_monitor = threading.Event()
|
|
217
|
+
monitor = threading.Thread(target=metrics_lib.process_monitor,
|
|
218
|
+
args=('server', stop_monitor),
|
|
219
|
+
daemon=True)
|
|
220
|
+
monitor.start()
|
|
221
|
+
try:
|
|
222
|
+
with self.capture_signals():
|
|
223
|
+
asyncio.run(self.serve(*args, **kwargs))
|
|
224
|
+
finally:
|
|
225
|
+
stop_monitor.set()
|
|
226
|
+
monitor.join()
|
|
221
227
|
|
|
222
228
|
|
|
223
229
|
def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -63,6 +63,8 @@ install_requires = [
|
|
|
63
63
|
'setproctitle',
|
|
64
64
|
'sqlalchemy',
|
|
65
65
|
'psycopg2-binary',
|
|
66
|
+
'aiosqlite',
|
|
67
|
+
'asyncpg',
|
|
66
68
|
# TODO(hailong): These three dependencies should be removed after we make
|
|
67
69
|
# the client-side actually not importing them.
|
|
68
70
|
'casbin',
|
|
@@ -108,9 +110,9 @@ server_dependencies = [
|
|
|
108
110
|
local_ray = [
|
|
109
111
|
# Lower version of ray will cause dependency conflict for
|
|
110
112
|
# click/grpcio/protobuf.
|
|
111
|
-
#
|
|
113
|
+
# Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
|
|
112
114
|
# https://github.com/ray-project/ray/releases/tag/ray-2.6.1
|
|
113
|
-
'ray[default] >= 2.
|
|
115
|
+
'ray[default] >= 2.6.1',
|
|
114
116
|
]
|
|
115
117
|
|
|
116
118
|
remote = [
|
sky/skylet/attempt_skylet.py
CHANGED
|
@@ -12,6 +12,7 @@ def restart_skylet():
|
|
|
12
12
|
# Kills old skylet if it is running.
|
|
13
13
|
# TODO(zhwu): make the killing graceful, e.g., use a signal to tell
|
|
14
14
|
# skylet to exit, instead of directly killing it.
|
|
15
|
+
|
|
15
16
|
subprocess.run(
|
|
16
17
|
# We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
|
|
17
18
|
# because we need to handle the backward compatibility of the old skylet
|
sky/skylet/constants.py
CHANGED
|
@@ -62,11 +62,14 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
|
|
62
62
|
'curl -LsSf https://astral.sh/uv/install.sh '
|
|
63
63
|
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
|
|
64
64
|
SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
|
|
65
|
-
|
|
66
|
-
#
|
|
65
|
+
SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run')
|
|
66
|
+
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
|
|
67
|
+
# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
|
|
68
|
+
# not work when conda is used.
|
|
67
69
|
DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
|
68
70
|
'export PATH='
|
|
69
|
-
f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")'
|
|
71
|
+
f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||") && '
|
|
72
|
+
'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
|
|
70
73
|
|
|
71
74
|
# Prefix for SkyPilot environment variables
|
|
72
75
|
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
|
|
@@ -91,14 +94,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
91
94
|
# cluster yaml is updated.
|
|
92
95
|
#
|
|
93
96
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
94
|
-
SKYLET_VERSION = '
|
|
97
|
+
SKYLET_VERSION = '18'
|
|
95
98
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
96
99
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
97
100
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
98
101
|
SKYLET_LIB_VERSION = 4
|
|
99
102
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
|
100
103
|
SKYLET_GRPC_PORT = 46590
|
|
101
|
-
SKYLET_GRPC_TIMEOUT_SECONDS =
|
|
104
|
+
SKYLET_GRPC_TIMEOUT_SECONDS = 10
|
|
102
105
|
|
|
103
106
|
# Docker default options
|
|
104
107
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
|
@@ -229,7 +232,7 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
229
232
|
'export PATH=$PATH:$HOME/.local/bin; '
|
|
230
233
|
# Writes ray path to file if it does not exist or the file is empty.
|
|
231
234
|
f'[ -s {SKY_RAY_PATH_FILE} ] || '
|
|
232
|
-
f'{{ {
|
|
235
|
+
f'{{ {SKY_UV_RUN_CMD} '
|
|
233
236
|
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
|
|
234
237
|
|
|
235
238
|
SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
|
|
@@ -421,6 +424,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
|
421
424
|
# TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
|
|
422
425
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
|
423
426
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
427
|
+
OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
|
|
424
428
|
|
|
425
429
|
# Environment variable that is set to 'true' if metrics are enabled.
|
|
426
430
|
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
|
@@ -447,7 +451,7 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
|
|
447
451
|
# BEGIN constants used for service catalog.
|
|
448
452
|
HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
|
|
449
453
|
HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
|
|
450
|
-
CATALOG_SCHEMA_VERSION = '
|
|
454
|
+
CATALOG_SCHEMA_VERSION = 'v8'
|
|
451
455
|
CATALOG_DIR = '~/.sky/catalogs'
|
|
452
456
|
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
|
|
453
457
|
'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
|
|
@@ -508,3 +512,6 @@ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
|
|
|
508
512
|
|
|
509
513
|
ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
|
|
510
514
|
'DEBUG_LOOP_LAG_THRESHOLD_MS')
|
|
515
|
+
|
|
516
|
+
ARM64_ARCH = 'arm64'
|
|
517
|
+
X86_64_ARCH = 'x86_64'
|