skypilot-nightly 1.0.0.dev20250219__py3-none-any.whl → 1.0.0.dev20250220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/server/common.py +5 -7
- sky/server/requests/executor.py +94 -87
- sky/server/server.py +10 -5
- sky/server/stream_utils.py +8 -11
- sky/utils/common_utils.py +38 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/RECORD +12 -12
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '6b2b31d8358f3ff8394a7a33ec49e9985ada230f'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250220'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/server/common.py
CHANGED
@@ -15,7 +15,6 @@ import uuid
|
|
15
15
|
|
16
16
|
import colorama
|
17
17
|
import filelock
|
18
|
-
import psutil
|
19
18
|
import pydantic
|
20
19
|
import requests
|
21
20
|
|
@@ -146,13 +145,14 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
146
145
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
|
147
146
|
|
148
147
|
|
149
|
-
def
|
148
|
+
def start_api_server_in_background(deploy: bool = False,
|
149
|
+
host: str = '127.0.0.1'):
|
150
150
|
if not is_api_server_local():
|
151
151
|
raise RuntimeError(
|
152
152
|
f'Cannot start API server: {get_server_url()} is not a local URL')
|
153
153
|
|
154
154
|
# Check available memory before starting the server.
|
155
|
-
avail_mem_size_gb: float =
|
155
|
+
avail_mem_size_gb: float = common_utils.get_mem_size_gb()
|
156
156
|
if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
|
157
157
|
logger.warning(
|
158
158
|
f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only has '
|
@@ -163,8 +163,6 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
|
|
163
163
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
164
164
|
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
165
165
|
|
166
|
-
# The command to run uvicorn. Adjust the app:app to your application's
|
167
|
-
# location.
|
168
166
|
api_server_cmd = API_SERVER_CMD
|
169
167
|
if deploy:
|
170
168
|
api_server_cmd += ' --deploy'
|
@@ -172,7 +170,7 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
|
|
172
170
|
api_server_cmd += f' --host {host}'
|
173
171
|
cmd = f'{sys.executable} {api_server_cmd} > {log_path} 2>&1'
|
174
172
|
|
175
|
-
# Start the
|
173
|
+
# Start the API server process in the background and don't wait for it.
|
176
174
|
# If this is called from a CLI invocation, we need start_new_session=True so
|
177
175
|
# that SIGINT on the CLI will not also kill the API server.
|
178
176
|
subprocess.Popen(cmd, shell=True, start_new_session=True)
|
@@ -232,7 +230,7 @@ def _start_api_server(deploy: bool = False, host: str = '127.0.0.1'):
|
|
232
230
|
f'SkyPilot API server at {server_url}. '
|
233
231
|
'Starting a local server.'
|
234
232
|
f'{colorama.Style.RESET_ALL}')
|
235
|
-
|
233
|
+
start_api_server_in_background(deploy=deploy, host=host)
|
236
234
|
logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
|
237
235
|
|
238
236
|
|
sky/server/requests/executor.py
CHANGED
@@ -32,7 +32,6 @@ import traceback
|
|
32
32
|
import typing
|
33
33
|
from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
|
34
34
|
|
35
|
-
import psutil
|
36
35
|
import setproctitle
|
37
36
|
|
38
37
|
from sky import global_user_state
|
@@ -70,18 +69,36 @@ logger = sky_logging.init_logger(__name__)
|
|
70
69
|
# platforms, including macOS.
|
71
70
|
multiprocessing.set_start_method('spawn', force=True)
|
72
71
|
|
73
|
-
# Constants based on profiling the peak memory usage
|
74
|
-
#
|
75
|
-
#
|
76
|
-
|
77
|
-
|
78
|
-
#
|
79
|
-
|
80
|
-
|
81
|
-
#
|
72
|
+
# Constants based on profiling the peak memory usage while serving various
|
73
|
+
# sky commands. These estimation are highly related to usage patterns
|
74
|
+
# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
|
75
|
+
# the profiling covers major clouds and common usage patterns. For user has
|
76
|
+
# deviated usage pattern, they can override the default estimation by
|
77
|
+
# environment variables.
|
78
|
+
# NOTE(dev): update these constants for each release according to the load
|
79
|
+
# test results.
|
80
|
+
# TODO(aylei): maintaining these constants is error-prone, we may need to
|
81
|
+
# automatically tune parallelism at runtime according to system usage stats
|
82
|
+
# in the future.
|
83
|
+
_LONG_WORKER_MEM_GB = 0.4
|
84
|
+
_SHORT_WORKER_MEM_GB = 0.25
|
85
|
+
# To control the number of long workers.
|
86
|
+
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
|
87
|
+
# Limit the number of long workers of local API server, since local server is
|
88
|
+
# typically:
|
89
|
+
# 1. launched automatically in an environment with high resource contention
|
90
|
+
# (e.g. Laptop)
|
91
|
+
# 2. used by a single user
|
92
|
+
_MAX_LONG_WORKERS_LOCAL = 4
|
93
|
+
# Percentage of memory for long requests
|
82
94
|
# from the memory reserved for SkyPilot.
|
83
|
-
# This is to reserve some memory for
|
95
|
+
# This is to reserve some memory for short requests.
|
84
96
|
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
|
97
|
+
# Minimal number of long workers to ensure responsiveness.
|
98
|
+
_MIN_LONG_WORKERS = 1
|
99
|
+
# Minimal number of short workers, there is a daemon task running on short
|
100
|
+
# workers so at least 2 workers are needed to ensure responsiveness.
|
101
|
+
_MIN_SHORT_WORKERS = 2
|
85
102
|
|
86
103
|
|
87
104
|
class QueueBackend(enum.Enum):
|
@@ -301,34 +318,32 @@ def schedule_request(request_id: str,
|
|
301
318
|
_get_queue(schedule_type).put(input_tuple)
|
302
319
|
|
303
320
|
|
321
|
+
def executor_initializer(proc_group: str):
|
322
|
+
setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
|
323
|
+
f'{multiprocessing.current_process().pid}')
|
324
|
+
|
325
|
+
|
304
326
|
def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
|
305
327
|
"""Worker for the requests.
|
306
328
|
|
307
329
|
Args:
|
308
330
|
max_parallel_size: Maximum number of parallel jobs this worker can run.
|
309
331
|
"""
|
310
|
-
|
311
|
-
|
312
|
-
setproctitle.setproctitle(
|
313
|
-
f'SkyPilot:worker:{worker.schedule_type.value}-{worker.id}')
|
332
|
+
proc_group = f'{worker.schedule_type.value}-{worker.id}'
|
333
|
+
setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
|
314
334
|
queue = _get_queue(worker.schedule_type)
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
# We use executor instead of individual multiprocessing.Process to avoid
|
319
|
-
# the overhead of forking a new process for each request, which can be about
|
320
|
-
# 1s delay.
|
321
|
-
with concurrent.futures.ProcessPoolExecutor(
|
322
|
-
max_workers=max_parallel_size) as executor:
|
323
|
-
while True:
|
335
|
+
|
336
|
+
def process_request(executor: concurrent.futures.ProcessPoolExecutor):
|
337
|
+
try:
|
324
338
|
request_element = queue.get()
|
325
339
|
if request_element is None:
|
326
340
|
time.sleep(0.1)
|
327
|
-
|
341
|
+
return
|
328
342
|
request_id, ignore_return_value = request_element
|
329
343
|
request = api_requests.get_request(request_id)
|
344
|
+
assert request is not None, f'Request with ID {request_id} is None'
|
330
345
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
331
|
-
|
346
|
+
return
|
332
347
|
logger.info(f'[{worker}] Submitting request: {request_id}')
|
333
348
|
# Start additional process to run the request, so that it can be
|
334
349
|
# cancelled when requested by a user.
|
@@ -347,60 +362,49 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
|
|
347
362
|
logger.info(f'[{worker}] Finished request: {request_id}')
|
348
363
|
else:
|
349
364
|
logger.info(f'[{worker}] Submitted request: {request_id}')
|
365
|
+
except KeyboardInterrupt:
|
366
|
+
# Interrupt the worker process will stop request execution, but
|
367
|
+
# the SIGTERM request should be respected anyway since it might
|
368
|
+
# be explicitly sent by user.
|
369
|
+
# TODO(aylei): crash the API server or recreate the worker process
|
370
|
+
# to avoid broken state.
|
371
|
+
logger.error(f'[{worker}] Worker process interrupted')
|
372
|
+
raise
|
373
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
374
|
+
# Catch any other exceptions to avoid crashing the worker process.
|
375
|
+
logger.error(
|
376
|
+
f'[{worker}] Error processing request {request_id}: '
|
377
|
+
f'{common_utils.format_exception(e, use_bracket=True)}')
|
350
378
|
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
with ux_utils.print_exception_no_traceback():
|
364
|
-
raise ValueError(
|
365
|
-
f'Failed to parse the number of CPUs from {cpu_count}'
|
366
|
-
) from e
|
367
|
-
return psutil.cpu_count()
|
368
|
-
|
369
|
-
|
370
|
-
def _get_mem_size_gb() -> float:
|
371
|
-
"""Get the memory size in GB.
|
372
|
-
|
373
|
-
If the API server is deployed as a pod in k8s cluster, we assume the
|
374
|
-
memory size is provided by the downward API.
|
375
|
-
"""
|
376
|
-
mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
377
|
-
if mem_size is not None:
|
378
|
-
try:
|
379
|
-
return float(mem_size)
|
380
|
-
except ValueError as e:
|
381
|
-
with ux_utils.print_exception_no_traceback():
|
382
|
-
raise ValueError(
|
383
|
-
f'Failed to parse the memory size from {mem_size}') from e
|
384
|
-
return psutil.virtual_memory().total / (1024**3)
|
379
|
+
# Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
|
380
|
+
# because the former is more efficient with the support of lazy creation of
|
381
|
+
# worker processes.
|
382
|
+
# We use executor instead of individual multiprocessing.Process to avoid
|
383
|
+
# the overhead of forking a new process for each request, which can be about
|
384
|
+
# 1s delay.
|
385
|
+
with concurrent.futures.ProcessPoolExecutor(
|
386
|
+
max_workers=max_parallel_size,
|
387
|
+
initializer=executor_initializer,
|
388
|
+
initargs=(proc_group,)) as executor:
|
389
|
+
while True:
|
390
|
+
process_request(executor)
|
385
391
|
|
386
392
|
|
387
393
|
def start(deploy: bool) -> List[multiprocessing.Process]:
|
388
394
|
"""Start the request workers."""
|
389
395
|
# Determine the job capacity of the workers based on the system resources.
|
390
|
-
cpu_count =
|
391
|
-
mem_size_gb =
|
396
|
+
cpu_count = common_utils.get_cpu_count()
|
397
|
+
mem_size_gb = common_utils.get_mem_size_gb()
|
392
398
|
mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
max_parallel_for_non_blocking = _max_parallel_size_for_non_blocking(
|
399
|
-
mem_size_gb, parallel_for_blocking)
|
399
|
+
max_parallel_for_long = _max_long_worker_parallism(cpu_count,
|
400
|
+
mem_size_gb,
|
401
|
+
local=not deploy)
|
402
|
+
max_parallel_for_short = _max_short_worker_parallism(
|
403
|
+
mem_size_gb, max_parallel_for_long)
|
400
404
|
logger.info(
|
401
|
-
f'SkyPilot API server will start {
|
402
|
-
f'
|
403
|
-
f'{
|
405
|
+
f'SkyPilot API server will start {max_parallel_for_long} workers for '
|
406
|
+
f'long requests and will allow at max '
|
407
|
+
f'{max_parallel_for_short} short requests in parallel.')
|
404
408
|
|
405
409
|
# Setup the queues.
|
406
410
|
if queue_backend == QueueBackend.MULTIPROCESSING:
|
@@ -424,7 +428,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
|
|
424
428
|
logger.info('Request queues created')
|
425
429
|
|
426
430
|
worker_procs = []
|
427
|
-
for worker_id in range(
|
431
|
+
for worker_id in range(max_parallel_for_long):
|
428
432
|
worker = RequestWorker(id=worker_id,
|
429
433
|
schedule_type=api_requests.ScheduleType.LONG)
|
430
434
|
worker_proc = multiprocessing.Process(target=request_worker,
|
@@ -432,31 +436,34 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
|
|
432
436
|
worker_proc.start()
|
433
437
|
worker_procs.append(worker_proc)
|
434
438
|
|
435
|
-
# Start a
|
439
|
+
# Start a worker for short requests.
|
436
440
|
worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
|
437
441
|
worker_proc = multiprocessing.Process(target=request_worker,
|
438
|
-
args=(worker,
|
439
|
-
max_parallel_for_non_blocking))
|
442
|
+
args=(worker, max_parallel_for_short))
|
440
443
|
worker_proc.start()
|
441
444
|
worker_procs.append(worker_proc)
|
442
445
|
return worker_procs
|
443
446
|
|
444
447
|
|
445
448
|
@annotations.lru_cache(scope='global', maxsize=1)
|
446
|
-
def
|
447
|
-
|
448
|
-
|
449
|
+
def _max_long_worker_parallism(cpu_count: int,
|
450
|
+
mem_size_gb: float,
|
451
|
+
local=False) -> int:
|
452
|
+
"""Max parallelism for long workers."""
|
453
|
+
cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
|
449
454
|
mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
|
450
|
-
|
451
|
-
n = max(
|
455
|
+
_LONG_WORKER_MEM_GB)
|
456
|
+
n = max(_MIN_LONG_WORKERS,
|
457
|
+
min(cpu_based_max_parallel, mem_based_max_parallel))
|
458
|
+
if local:
|
459
|
+
return min(n, _MAX_LONG_WORKERS_LOCAL)
|
452
460
|
return n
|
453
461
|
|
454
462
|
|
455
463
|
@annotations.lru_cache(scope='global', maxsize=1)
|
456
|
-
def
|
457
|
-
|
458
|
-
"""Max parallelism for
|
459
|
-
available_mem = mem_size_gb - (
|
460
|
-
|
461
|
-
n = max(1, int(available_mem / _PER_NON_BLOCKING_REQUEST_MEM_GB))
|
464
|
+
def _max_short_worker_parallism(mem_size_gb: float,
|
465
|
+
long_worker_parallism: int) -> int:
|
466
|
+
"""Max parallelism for short workers."""
|
467
|
+
available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
|
468
|
+
n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
|
462
469
|
return n
|
sky/server/server.py
CHANGED
@@ -57,7 +57,9 @@ P = ParamSpec('P')
|
|
57
57
|
|
58
58
|
def _add_timestamp_prefix_for_server_logs() -> None:
|
59
59
|
server_logger = sky_logging.init_logger('sky.server')
|
60
|
-
#
|
60
|
+
# Clear existing handlers first to prevent duplicates
|
61
|
+
server_logger.handlers.clear()
|
62
|
+
# Disable propagation to avoid the root logger of SkyPilot being affected
|
61
63
|
server_logger.propagate = False
|
62
64
|
# Add date prefix to the log message printed by loggers under
|
63
65
|
# server.
|
@@ -460,6 +462,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
460
462
|
request: fastapi.Request) -> None:
|
461
463
|
"""Launches a cluster or task."""
|
462
464
|
request_id = request.state.request_id
|
465
|
+
logger.info(f'Launching request: {request_id}')
|
463
466
|
executor.schedule_request(
|
464
467
|
request_id,
|
465
468
|
request_name='launch',
|
@@ -627,6 +630,9 @@ async def logs(
|
|
627
630
|
request_name='logs',
|
628
631
|
request_body=cluster_job_body,
|
629
632
|
func=core.tail_logs,
|
633
|
+
# TODO(aylei): We have tail logs scheduled as SHORT request, because it
|
634
|
+
# should be responsive. However, it can be long running if the user's
|
635
|
+
# job keeps running, and we should avoid it taking the SHORT worker.
|
630
636
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
631
637
|
request_cluster_name=cluster_job_body.cluster_name,
|
632
638
|
)
|
@@ -794,10 +800,9 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
|
|
794
800
|
detail=dataclasses.asdict(
|
795
801
|
request_task.encode()))
|
796
802
|
return request_task.encode()
|
797
|
-
#
|
798
|
-
#
|
799
|
-
|
800
|
-
await asyncio.sleep(0)
|
803
|
+
# yield control to allow other coroutines to run, sleep shortly
|
804
|
+
# to avoid storming the DB and CPU in the meantime
|
805
|
+
await asyncio.sleep(0.1)
|
801
806
|
|
802
807
|
|
803
808
|
@app.get('/api/stream')
|
sky/server/stream_utils.py
CHANGED
@@ -68,7 +68,7 @@ async def log_streamer(request_id: Optional[str],
|
|
68
68
|
# Sleep 0 to yield, so other coroutines can run. This busy waiting
|
69
69
|
# loop is performance critical for short-running requests, so we do
|
70
70
|
# not want to yield too long.
|
71
|
-
await asyncio.sleep(0)
|
71
|
+
await asyncio.sleep(0.1)
|
72
72
|
request_task = requests_lib.get_request(request_id)
|
73
73
|
if not follow:
|
74
74
|
break
|
@@ -88,6 +88,9 @@ async def log_streamer(request_id: Optional[str],
|
|
88
88
|
yield line_str
|
89
89
|
|
90
90
|
while True:
|
91
|
+
# Sleep 0 to yield control to allow other coroutines to run,
|
92
|
+
# while keeps the loop tight to make log stream responsive.
|
93
|
+
await asyncio.sleep(0)
|
91
94
|
line: Optional[bytes] = await f.readline()
|
92
95
|
if not line:
|
93
96
|
if request_id is not None:
|
@@ -100,24 +103,18 @@ async def log_streamer(request_id: Optional[str],
|
|
100
103
|
break
|
101
104
|
if not follow:
|
102
105
|
break
|
103
|
-
|
104
|
-
#
|
105
|
-
#
|
106
|
-
|
107
|
-
await asyncio.sleep(0)
|
106
|
+
# Sleep shortly to avoid storming the DB and CPU, this has
|
107
|
+
# little impact on the responsivness here since we are waiting
|
108
|
+
# for a new line to come in.
|
109
|
+
await asyncio.sleep(0.1)
|
108
110
|
continue
|
109
111
|
line_str = line.decode('utf-8')
|
110
112
|
if plain_logs:
|
111
113
|
is_payload, line_str = message_utils.decode_payload(
|
112
114
|
line_str, raise_for_mismatch=False)
|
113
115
|
if is_payload:
|
114
|
-
# Sleep 0 to yield, so other coroutines can run. This busy
|
115
|
-
# waiting loop is performance critical for short-running
|
116
|
-
# requests, so we do not want to yield too long.
|
117
|
-
await asyncio.sleep(0)
|
118
116
|
continue
|
119
117
|
yield line_str
|
120
|
-
await asyncio.sleep(0) # Allow other tasks to run
|
121
118
|
|
122
119
|
|
123
120
|
def stream_response(
|
sky/utils/common_utils.py
CHANGED
@@ -18,6 +18,7 @@ import uuid
|
|
18
18
|
|
19
19
|
import jinja2
|
20
20
|
import jsonschema
|
21
|
+
import psutil
|
21
22
|
import yaml
|
22
23
|
|
23
24
|
from sky import exceptions
|
@@ -755,3 +756,40 @@ def is_port_available(port: int, reuse_addr: bool = True) -> bool:
|
|
755
756
|
return True
|
756
757
|
except OSError:
|
757
758
|
return False
|
759
|
+
|
760
|
+
|
761
|
+
# TODO(aylei): should be aware of cgroups
|
762
|
+
def get_cpu_count() -> int:
|
763
|
+
"""Get the number of CPUs.
|
764
|
+
|
765
|
+
If the API server is deployed as a pod in k8s cluster, we assume the
|
766
|
+
number of CPUs is provided by the downward API.
|
767
|
+
"""
|
768
|
+
cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
|
769
|
+
if cpu_count is not None:
|
770
|
+
try:
|
771
|
+
return int(float(cpu_count))
|
772
|
+
except ValueError as e:
|
773
|
+
with ux_utils.print_exception_no_traceback():
|
774
|
+
raise ValueError(
|
775
|
+
f'Failed to parse the number of CPUs from {cpu_count}'
|
776
|
+
) from e
|
777
|
+
return psutil.cpu_count()
|
778
|
+
|
779
|
+
|
780
|
+
# TODO(aylei): should be aware of cgroups
|
781
|
+
def get_mem_size_gb() -> float:
|
782
|
+
"""Get the memory size in GB.
|
783
|
+
|
784
|
+
If the API server is deployed as a pod in k8s cluster, we assume the
|
785
|
+
memory size is provided by the downward API.
|
786
|
+
"""
|
787
|
+
mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
788
|
+
if mem_size is not None:
|
789
|
+
try:
|
790
|
+
return float(mem_size)
|
791
|
+
except ValueError as e:
|
792
|
+
with ux_utils.print_exception_no_traceback():
|
793
|
+
raise ValueError(
|
794
|
+
f'Failed to parse the memory size from {mem_size}') from e
|
795
|
+
return psutil.virtual_memory().total / (1024**3)
|
{skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=2WOLIr_y7h-Dzd_2cUqq56HiHaF6TBVULtoUaAeb-5c,6391
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
|
4
4
|
sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
|
@@ -221,13 +221,13 @@ sky/serve/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
221
221
|
sky/serve/server/core.py,sha256=pRvFadEIH_WTUkTtSmuFoPBP4JFq8Obt68ifi9DWuog,36865
|
222
222
|
sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,3283
|
223
223
|
sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
|
224
|
-
sky/server/common.py,sha256=
|
224
|
+
sky/server/common.py,sha256=uBshF4a-U8NGgm8XOHTW2YNSq0CsByfdIFgiybU5PEg,17321
|
225
225
|
sky/server/constants.py,sha256=SqhWJMassFyvWAJn2UJHvuA_0_C6f5vngMzZ2KYLsKw,770
|
226
|
-
sky/server/server.py,sha256=
|
227
|
-
sky/server/stream_utils.py,sha256
|
226
|
+
sky/server/server.py,sha256=0gcIn3jr_4DkHpBJYdNq--uPo9Im8bn2ftxgd8mBMcU,42225
|
227
|
+
sky/server/stream_utils.py,sha256=-3IX1YCgxAFfcvQIV0TCvOn1wbRLWovAx3ckCrsExWU,5651
|
228
228
|
sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
|
229
229
|
sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
230
|
-
sky/server/requests/executor.py,sha256=
|
230
|
+
sky/server/requests/executor.py,sha256=NxVB0aFA05GddXDdt89wEwEYyJcIIrsQxE2wowklhUI,19597
|
231
231
|
sky/server/requests/payloads.py,sha256=PeEkqQoTO3ellelkFX5yzPKbPkDV-NfVXkxHndYlrjE,15769
|
232
232
|
sky/server/requests/requests.py,sha256=aMdjiK5kjSYP36pxdXFU6qgKOXcOmtViHbFm3V8Dvf8,19590
|
233
233
|
sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -302,7 +302,7 @@ sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14
|
|
302
302
|
sky/utils/command_runner.py,sha256=-7vxLvwZnTvYMQ_nScmuQWY6ZvQYv69yvvIp2uOaOqU,39063
|
303
303
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
304
304
|
sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
|
305
|
-
sky/utils/common_utils.py,sha256
|
305
|
+
sky/utils/common_utils.py,sha256=-O0GthIockeJy8LlA4heVYYtaUdQwNA-5mFMqHajRf8,27457
|
306
306
|
sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
|
307
307
|
sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
|
308
308
|
sky/utils/controller_utils.py,sha256=4Nck10XV6gNJKjBl7y_CIxIGqP3bbISuZSVTHbBumgs,45725
|
@@ -336,9 +336,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
|
|
336
336
|
sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=iAjfyPclOs8qlALACcfxLpRAO9CZ-h16leFqXZ6tNaY,10096
|
337
337
|
sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
|
338
338
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
|
339
|
-
skypilot_nightly-1.0.0.
|
340
|
-
skypilot_nightly-1.0.0.
|
341
|
-
skypilot_nightly-1.0.0.
|
342
|
-
skypilot_nightly-1.0.0.
|
343
|
-
skypilot_nightly-1.0.0.
|
344
|
-
skypilot_nightly-1.0.0.
|
339
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
340
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/METADATA,sha256=uYtMxJQSUuL9hPmfqny_uQvuqWy65W5mHUHv7HvJb-o,18916
|
341
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
342
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
343
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
344
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|