skypilot-nightly 1.0.0.dev20251004__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +17 -21
- sky/backends/backend.py +1 -3
- sky/backends/cloud_vm_ray_backend.py +8 -20
- sky/backends/local_docker_backend.py +0 -5
- sky/client/sdk.py +24 -23
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +1 -11
- sky/global_user_state.py +16 -5
- sky/jobs/constants.py +1 -7
- sky/jobs/controller.py +9 -1
- sky/jobs/scheduler.py +30 -15
- sky/jobs/server/core.py +8 -3
- sky/jobs/utils.py +30 -2
- sky/metrics/utils.py +62 -45
- sky/provision/instance_setup.py +32 -10
- sky/provision/kubernetes/utils.py +4 -1
- sky/provision/provisioner.py +10 -7
- sky/schemas/api/responses.py +2 -2
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/server/common.py +1 -0
- sky/server/config.py +2 -0
- sky/server/metrics.py +3 -1
- sky/server/requests/executor.py +103 -77
- sky/server/requests/requests.py +26 -11
- sky/server/server.py +16 -0
- sky/skylet/constants.py +9 -1
- sky/skylet/events.py +17 -0
- sky/skylet/skylet.py +3 -0
- sky/skypilot_config.py +2 -1
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/utils/context_utils.py +5 -1
- sky/utils/controller_utils.py +14 -0
- sky/utils/db/db_utils.py +2 -0
- sky/utils/db/migration_utils.py +11 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/RECORD +59 -58
- /sky/dashboard/out/_next/static/{KL03GEega4QqDqTOMtA_w → MnvNdzHHpiZG1_oKSpbxF}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{KL03GEega4QqDqTOMtA_w → MnvNdzHHpiZG1_oKSpbxF}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251004.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/top_level.txt +0 -0
sky/server/config.py
CHANGED
|
@@ -111,7 +111,9 @@ def compute_server_config(deploy: bool,
|
|
|
111
111
|
process after API server was introduced.
|
|
112
112
|
"""
|
|
113
113
|
cpu_count = common_utils.get_cpu_count()
|
|
114
|
+
logger.debug(f'CPU count: {cpu_count}')
|
|
114
115
|
mem_size_gb = common_utils.get_mem_size_gb()
|
|
116
|
+
logger.debug(f'Memory size: {mem_size_gb}GB')
|
|
115
117
|
max_parallel_for_long = _max_long_worker_parallism(cpu_count,
|
|
116
118
|
mem_size_gb,
|
|
117
119
|
local=not deploy)
|
sky/server/metrics.py
CHANGED
|
@@ -24,8 +24,10 @@ logger = sky_logging.init_logger(__name__)
|
|
|
24
24
|
metrics_app = fastapi.FastAPI()
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
# Serve /metrics in a dedicated thread to avoid blocking the event loop
|
|
28
|
+
# of metrics server.
|
|
27
29
|
@metrics_app.get('/metrics')
|
|
28
|
-
|
|
30
|
+
def metrics() -> fastapi.Response:
|
|
29
31
|
"""Expose aggregated Prometheus metrics from all worker processes."""
|
|
30
32
|
if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
|
|
31
33
|
# In multiprocess mode, we need to collect metrics from all processes.
|
sky/server/requests/executor.py
CHANGED
|
@@ -349,27 +349,30 @@ def override_request_env_and_config(
|
|
|
349
349
|
os.environ.update(original_env)
|
|
350
350
|
|
|
351
351
|
|
|
352
|
-
def
|
|
353
|
-
"""
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
original_stdout = os.dup(sys.stdout.fileno())
|
|
357
|
-
original_stderr = os.dup(sys.stderr.fileno())
|
|
352
|
+
def _get_current_output() -> Tuple[int, int]:
|
|
353
|
+
"""Get the current stdout and stderr file descriptors."""
|
|
354
|
+
return os.dup(sys.stdout.fileno()), os.dup(sys.stderr.fileno())
|
|
355
|
+
|
|
358
356
|
|
|
357
|
+
def _redirect_output(file: TextIO) -> None:
|
|
358
|
+
"""Redirect stdout and stderr to the log file."""
|
|
359
|
+
# Get the file descriptor from the file object
|
|
360
|
+
fd = file.fileno()
|
|
359
361
|
# Copy this fd to stdout and stderr
|
|
360
362
|
os.dup2(fd, sys.stdout.fileno())
|
|
361
363
|
os.dup2(fd, sys.stderr.fileno())
|
|
362
|
-
return original_stdout, original_stderr
|
|
363
364
|
|
|
364
365
|
|
|
365
|
-
def _restore_output(original_stdout: int,
|
|
366
|
+
def _restore_output(original_stdout: Optional[int],
|
|
367
|
+
original_stderr: Optional[int]) -> None:
|
|
366
368
|
"""Restore stdout and stderr to their original file descriptors."""
|
|
367
|
-
|
|
368
|
-
|
|
369
|
+
if original_stdout is not None:
|
|
370
|
+
os.dup2(original_stdout, sys.stdout.fileno())
|
|
371
|
+
os.close(original_stdout)
|
|
369
372
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
+
if original_stderr is not None:
|
|
374
|
+
os.dup2(original_stderr, sys.stderr.fileno())
|
|
375
|
+
os.close(original_stderr)
|
|
373
376
|
|
|
374
377
|
|
|
375
378
|
def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
|
|
@@ -397,24 +400,38 @@ def _request_execution_wrapper(request_id: str,
|
|
|
397
400
|
signal.signal(signal.SIGTERM, _sigterm_handler)
|
|
398
401
|
|
|
399
402
|
logger.info(f'Running request {request_id} with pid {pid}')
|
|
400
|
-
with api_requests.update_request(request_id) as request_task:
|
|
401
|
-
assert request_task is not None, request_id
|
|
402
|
-
log_path = request_task.log_path
|
|
403
|
-
request_task.pid = pid
|
|
404
|
-
request_task.status = api_requests.RequestStatus.RUNNING
|
|
405
|
-
func = request_task.entrypoint
|
|
406
|
-
request_body = request_task.request_body
|
|
407
|
-
request_name = request_task.name
|
|
408
403
|
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
404
|
+
original_stdout = original_stderr = None
|
|
405
|
+
try:
|
|
406
|
+
# As soon as the request is updated with the executor PID, we can
|
|
407
|
+
# receive SIGTERM from cancellation. So, we update the request inside
|
|
408
|
+
# the try block to ensure we have the KeyboardInterrupt handling.
|
|
409
|
+
with api_requests.update_request(request_id) as request_task:
|
|
410
|
+
assert request_task is not None, request_id
|
|
411
|
+
if request_task.status != api_requests.RequestStatus.PENDING:
|
|
412
|
+
logger.debug(f'Request is already {request_task.status.value}, '
|
|
413
|
+
f'skipping execution')
|
|
414
|
+
return
|
|
415
|
+
log_path = request_task.log_path
|
|
416
|
+
request_task.pid = pid
|
|
417
|
+
request_task.status = api_requests.RequestStatus.RUNNING
|
|
418
|
+
func = request_task.entrypoint
|
|
419
|
+
request_body = request_task.request_body
|
|
420
|
+
request_name = request_task.name
|
|
421
|
+
|
|
412
422
|
# Store copies of the original stdout and stderr file descriptors
|
|
413
|
-
|
|
414
|
-
#
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
423
|
+
# We do this in two steps because we should make sure to restore the
|
|
424
|
+
# original values even if we are cancelled or fail during the redirect.
|
|
425
|
+
original_stdout, original_stderr = _get_current_output()
|
|
426
|
+
|
|
427
|
+
# Append to the log file instead of overwriting it since there might be
|
|
428
|
+
# logs from previous retries.
|
|
429
|
+
with log_path.open('a', encoding='utf-8') as f:
|
|
430
|
+
# Redirect the stdout/stderr before overriding the environment and
|
|
431
|
+
# config, as there can be some logs during override that need to be
|
|
432
|
+
# captured in the log file.
|
|
433
|
+
_redirect_output(f)
|
|
434
|
+
|
|
418
435
|
with sky_logging.add_debug_log_handler(request_id), \
|
|
419
436
|
override_request_env_and_config(
|
|
420
437
|
request_body, request_id, request_name), \
|
|
@@ -429,53 +446,59 @@ def _request_execution_wrapper(request_id: str,
|
|
|
429
446
|
group='request_execution'):
|
|
430
447
|
return_value = func(**request_body.to_kwargs())
|
|
431
448
|
f.flush()
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
449
|
+
except KeyboardInterrupt:
|
|
450
|
+
logger.info(f'Request {request_id} cancelled by user')
|
|
451
|
+
# Kill all children processes related to this request.
|
|
452
|
+
# Each executor handles a single request, so we can safely kill all
|
|
453
|
+
# children processes related to this request.
|
|
454
|
+
# This is required as python does not pass the KeyboardInterrupt to the
|
|
455
|
+
# threads that are not main thread.
|
|
456
|
+
subprocess_utils.kill_children_processes()
|
|
457
|
+
return
|
|
458
|
+
except exceptions.ExecutionRetryableError as e:
|
|
459
|
+
logger.error(e)
|
|
460
|
+
logger.info(e.hint)
|
|
461
|
+
with api_requests.update_request(request_id) as request_task:
|
|
462
|
+
assert request_task is not None, request_id
|
|
463
|
+
# Retried request will undergo rescheduling and a new execution,
|
|
464
|
+
# clear the pid of the request.
|
|
465
|
+
request_task.pid = None
|
|
466
|
+
# Yield control to the scheduler for uniform handling of retries.
|
|
467
|
+
_restore_output(original_stdout, original_stderr)
|
|
468
|
+
raise
|
|
469
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
470
|
+
api_requests.set_request_failed(request_id, e)
|
|
471
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
472
|
+
# so that the "Request xxxx failed due to ..." log message will be
|
|
473
|
+
# written to the original stdout and stderr file descriptors.
|
|
474
|
+
_restore_output(original_stdout, original_stderr)
|
|
475
|
+
original_stdout = original_stderr = None
|
|
476
|
+
logger.info(f'Request {request_id} failed due to '
|
|
477
|
+
f'{common_utils.format_exception(e)}')
|
|
478
|
+
return
|
|
479
|
+
else:
|
|
480
|
+
api_requests.set_request_succeeded(
|
|
481
|
+
request_id, return_value if not ignore_return_value else None)
|
|
482
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
483
|
+
# so that the "Request xxxx finished" log message will be
|
|
484
|
+
# written to the original stdout and stderr file descriptors.
|
|
485
|
+
_restore_output(original_stdout, original_stderr)
|
|
486
|
+
original_stdout = original_stderr = None
|
|
487
|
+
logger.info(f'Request {request_id} finished')
|
|
488
|
+
finally:
|
|
489
|
+
_restore_output(original_stdout, original_stderr)
|
|
490
|
+
try:
|
|
491
|
+
# Capture the peak RSS before GC.
|
|
492
|
+
peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
|
|
493
|
+
# Clear request level cache to release all memory used by the
|
|
494
|
+
# request.
|
|
495
|
+
annotations.clear_request_level_cache()
|
|
496
|
+
with metrics_utils.time_it(name='release_memory', group='internal'):
|
|
497
|
+
common_utils.release_memory()
|
|
498
|
+
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
499
|
+
except Exception as e: # pylint: disable=broad-except
|
|
500
|
+
logger.error(f'Failed to record memory metrics: '
|
|
501
|
+
f'{common_utils.format_exception(e)}')
|
|
479
502
|
|
|
480
503
|
|
|
481
504
|
_first_request = True
|
|
@@ -596,11 +619,14 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
596
619
|
except (Exception, KeyboardInterrupt, SystemExit) as e:
|
|
597
620
|
# Handle any other error
|
|
598
621
|
ctx.redirect_log(original_output)
|
|
599
|
-
ctx.cancel()
|
|
600
622
|
api_requests.set_request_failed(request.request_id, e)
|
|
601
623
|
logger.error(f'Request {request.request_id} interrupted due to '
|
|
602
624
|
f'unhandled exception: {common_utils.format_exception(e)}')
|
|
603
625
|
raise
|
|
626
|
+
finally:
|
|
627
|
+
# Always cancel the context to kill potentially running background
|
|
628
|
+
# routine.
|
|
629
|
+
ctx.cancel()
|
|
604
630
|
|
|
605
631
|
|
|
606
632
|
def prepare_request(
|
sky/server/requests/requests.py
CHANGED
|
@@ -449,9 +449,15 @@ def init_db_async(func):
|
|
|
449
449
|
|
|
450
450
|
def reset_db_and_logs():
|
|
451
451
|
"""Create the database."""
|
|
452
|
+
logger.debug('clearing local API server database')
|
|
452
453
|
server_common.clear_local_api_server_database()
|
|
454
|
+
logger.debug(
|
|
455
|
+
f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
|
|
456
|
+
)
|
|
453
457
|
shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
|
|
454
458
|
ignore_errors=True)
|
|
459
|
+
logger.debug('clearing local API server client directory at '
|
|
460
|
+
f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
|
|
455
461
|
shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
|
|
456
462
|
ignore_errors=True)
|
|
457
463
|
|
|
@@ -467,10 +473,13 @@ def request_lock_path(request_id: str) -> str:
|
|
|
467
473
|
@metrics_lib.time_me
|
|
468
474
|
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
469
475
|
"""Get and update a SkyPilot API request."""
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
476
|
+
# Acquire the lock to avoid race conditions between multiple request
|
|
477
|
+
# operations, e.g. execute and cancel.
|
|
478
|
+
with filelock.FileLock(request_lock_path(request_id)):
|
|
479
|
+
request = _get_request_no_lock(request_id)
|
|
480
|
+
yield request
|
|
481
|
+
if request is not None:
|
|
482
|
+
_add_or_update_request_no_lock(request)
|
|
474
483
|
|
|
475
484
|
|
|
476
485
|
@init_db
|
|
@@ -485,12 +494,15 @@ def update_request_async(
|
|
|
485
494
|
|
|
486
495
|
@contextlib.asynccontextmanager
|
|
487
496
|
async def _cm():
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
497
|
+
# Acquire the lock to avoid race conditions between multiple request
|
|
498
|
+
# operations, e.g. execute and cancel.
|
|
499
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
500
|
+
request = await _get_request_no_lock_async(request_id)
|
|
501
|
+
try:
|
|
502
|
+
yield request
|
|
503
|
+
finally:
|
|
504
|
+
if request is not None:
|
|
505
|
+
await _add_or_update_request_no_lock_async(request)
|
|
494
506
|
|
|
495
507
|
return _cm()
|
|
496
508
|
|
|
@@ -775,9 +787,12 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
|
|
775
787
|
|
|
776
788
|
|
|
777
789
|
def set_request_cancelled(request_id: str) -> None:
|
|
778
|
-
"""Set a request to cancelled."""
|
|
790
|
+
"""Set a pending or running request to cancelled."""
|
|
779
791
|
with update_request(request_id) as request_task:
|
|
780
792
|
assert request_task is not None, request_id
|
|
793
|
+
# Already finished or cancelled.
|
|
794
|
+
if request_task.status > RequestStatus.RUNNING:
|
|
795
|
+
return
|
|
781
796
|
request_task.finished_at = time.time()
|
|
782
797
|
request_task.status = RequestStatus.CANCELLED
|
|
783
798
|
|
sky/server/server.py
CHANGED
|
@@ -1943,6 +1943,7 @@ if __name__ == '__main__':
|
|
|
1943
1943
|
|
|
1944
1944
|
from sky.server import uvicorn as skyuvicorn
|
|
1945
1945
|
|
|
1946
|
+
logger.info('Initializing SkyPilot API server')
|
|
1946
1947
|
skyuvicorn.add_timestamp_prefix_for_server_logs()
|
|
1947
1948
|
|
|
1948
1949
|
parser = argparse.ArgumentParser()
|
|
@@ -1954,20 +1955,35 @@ if __name__ == '__main__':
|
|
|
1954
1955
|
parser.add_argument('--metrics-port', default=9090, type=int)
|
|
1955
1956
|
cmd_args = parser.parse_args()
|
|
1956
1957
|
if cmd_args.port == cmd_args.metrics_port:
|
|
1958
|
+
logger.error('port and metrics-port cannot be the same, exiting.')
|
|
1957
1959
|
raise ValueError('port and metrics-port cannot be the same')
|
|
1958
1960
|
|
|
1961
|
+
# Fail fast if the port is not available to avoid corrupting the state
|
|
1962
|
+
# of a potentially running server instance.
|
|
1963
|
+
# We might reach here because the running server is currently not
|
|
1964
|
+
# responding, thus the healthz check fails and `sky api start` thinks
|
|
1965
|
+
# we should start a new server instance.
|
|
1966
|
+
if not common_utils.is_port_available(cmd_args.port):
|
|
1967
|
+
logger.error(f'Port {cmd_args.port} is not available, exiting.')
|
|
1968
|
+
raise RuntimeError(f'Port {cmd_args.port} is not available')
|
|
1969
|
+
|
|
1959
1970
|
# Show the privacy policy if it is not already shown. We place it here so
|
|
1960
1971
|
# that it is shown only when the API server is started.
|
|
1961
1972
|
usage_lib.maybe_show_privacy_policy()
|
|
1962
1973
|
|
|
1963
1974
|
# Initialize global user state db
|
|
1964
1975
|
db_utils.set_max_connections(1)
|
|
1976
|
+
logger.info('Initializing database engine')
|
|
1965
1977
|
global_user_state.initialize_and_get_db()
|
|
1978
|
+
logger.info('Database engine initialized')
|
|
1966
1979
|
# Initialize request db
|
|
1967
1980
|
requests_lib.reset_db_and_logs()
|
|
1968
1981
|
# Restore the server user hash
|
|
1982
|
+
logger.info('Initializing server user hash')
|
|
1969
1983
|
_init_or_restore_server_user_hash()
|
|
1984
|
+
|
|
1970
1985
|
max_db_connections = global_user_state.get_max_db_connections()
|
|
1986
|
+
logger.info(f'Max db connections: {max_db_connections}')
|
|
1971
1987
|
config = server_config.compute_server_config(cmd_args.deploy,
|
|
1972
1988
|
max_db_connections)
|
|
1973
1989
|
|
sky/skylet/constants.py
CHANGED
|
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
100
100
|
# cluster yaml is updated.
|
|
101
101
|
#
|
|
102
102
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
103
|
-
SKYLET_VERSION = '
|
|
103
|
+
SKYLET_VERSION = '22'
|
|
104
104
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
105
105
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
106
106
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
@@ -331,6 +331,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
|
|
|
331
331
|
# controller_utils.translate_local_file_mounts_to_two_hop().
|
|
332
332
|
FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
|
|
333
333
|
|
|
334
|
+
# For passing in CPU and memory limits to the controller pod when running
|
|
335
|
+
# in k8s. Right now, we only use this for the jobs controller, but we may
|
|
336
|
+
# use this for the serve controller as well in the future.
|
|
337
|
+
# These files are written to disk by the skylet, which reads the values from env vars
|
|
338
|
+
# passed by the backend when starting the skylet (start_skylet_on_head_node).
|
|
339
|
+
CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
|
|
340
|
+
CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
|
|
341
|
+
|
|
334
342
|
# Used when a managed job is created and
|
|
335
343
|
# files are synced up to the cloud.
|
|
336
344
|
FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
|
sky/skylet/events.py
CHANGED
|
@@ -47,6 +47,9 @@ class SkyletEvent:
|
|
|
47
47
|
EVENT_CHECKING_INTERVAL_SECONDS))
|
|
48
48
|
self._n = 0
|
|
49
49
|
|
|
50
|
+
def start(self):
|
|
51
|
+
pass
|
|
52
|
+
|
|
50
53
|
def run(self):
|
|
51
54
|
self._n = (self._n + 1) % self._event_interval
|
|
52
55
|
if self._n % self._event_interval == 0:
|
|
@@ -75,6 +78,20 @@ class ManagedJobEvent(SkyletEvent):
|
|
|
75
78
|
"""Skylet event for updating and scheduling managed jobs."""
|
|
76
79
|
EVENT_INTERVAL_SECONDS = 300
|
|
77
80
|
|
|
81
|
+
def start(self):
|
|
82
|
+
cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
|
|
83
|
+
if cpus_env_var is not None:
|
|
84
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
|
|
85
|
+
'w',
|
|
86
|
+
encoding='utf-8') as f:
|
|
87
|
+
f.write(cpus_env_var)
|
|
88
|
+
memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
|
89
|
+
if memory_env_var is not None:
|
|
90
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
|
|
91
|
+
'w',
|
|
92
|
+
encoding='utf-8') as f:
|
|
93
|
+
f.write(memory_env_var)
|
|
94
|
+
|
|
78
95
|
def _run(self):
|
|
79
96
|
if not os.path.exists(
|
|
80
97
|
os.path.expanduser(
|
sky/skylet/skylet.py
CHANGED
|
@@ -71,6 +71,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
|
|
|
71
71
|
def run_event_loop():
|
|
72
72
|
"""Run the existing event loop."""
|
|
73
73
|
|
|
74
|
+
for event in EVENTS:
|
|
75
|
+
event.start()
|
|
76
|
+
|
|
74
77
|
while True:
|
|
75
78
|
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
76
79
|
for event in EVENTS:
|
sky/skypilot_config.py
CHANGED
|
@@ -825,7 +825,8 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
|
|
|
825
825
|
except ValueError as e:
|
|
826
826
|
raise ValueError(f'Invalid config override: {cli_config}. '
|
|
827
827
|
f'Check if config file exists or if the dotlist '
|
|
828
|
-
f'is formatted as: key1=value1,key2=value2'
|
|
828
|
+
f'is formatted as: key1=value1,key2=value2.\n'
|
|
829
|
+
f'Details: {e}') from e
|
|
829
830
|
logger.debug('CLI overrides config syntax check passed.')
|
|
830
831
|
|
|
831
832
|
return parsed_config
|
|
@@ -632,6 +632,9 @@ available_node_types:
|
|
|
632
632
|
command: ["/bin/bash", "-c", "--"]
|
|
633
633
|
args:
|
|
634
634
|
- |
|
|
635
|
+
# Set -x to print the commands and their arguments as they are executed.
|
|
636
|
+
# Useful for debugging.
|
|
637
|
+
set -x
|
|
635
638
|
# Helper function to conditionally use sudo
|
|
636
639
|
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
|
637
640
|
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
|
@@ -1086,6 +1089,8 @@ available_node_types:
|
|
|
1086
1089
|
|
|
1087
1090
|
touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
|
|
1088
1091
|
{% endif %}
|
|
1092
|
+
# Set +x to stop printing the commands and their arguments as they are executed.
|
|
1093
|
+
set +x
|
|
1089
1094
|
|
|
1090
1095
|
trap : TERM INT; log_tail || sleep infinity & wait
|
|
1091
1096
|
|
sky/utils/context_utils.py
CHANGED
|
@@ -130,7 +130,11 @@ def wait_process(ctx: context.Context,
|
|
|
130
130
|
# Kill the process despite the caller's callback, the utility
|
|
131
131
|
# function gracefully handles the case where the process is
|
|
132
132
|
# already terminated.
|
|
133
|
-
|
|
133
|
+
# Bash scripts typically do not forward SIGTERM to children, thus
|
|
134
|
+
# cannot be killed gracefully, shorten the grace period for faster
|
|
135
|
+
# termination.
|
|
136
|
+
subprocess_utils.kill_process_with_grace_period(proc,
|
|
137
|
+
grace_period=1)
|
|
134
138
|
raise asyncio.CancelledError()
|
|
135
139
|
try:
|
|
136
140
|
proc.wait(poll_interval)
|
sky/utils/controller_utils.py
CHANGED
|
@@ -506,6 +506,9 @@ def shared_controller_vars_to_fill(
|
|
|
506
506
|
# before popping allowed_contexts. If it is not on Kubernetes,
|
|
507
507
|
# we may be able to use allowed_contexts.
|
|
508
508
|
local_user_config.pop('allowed_contexts', None)
|
|
509
|
+
# Remove api_server config so that the controller does not try to use
|
|
510
|
+
# a remote API server.
|
|
511
|
+
local_user_config.pop('api_server', None)
|
|
509
512
|
with tempfile.NamedTemporaryFile(
|
|
510
513
|
delete=False,
|
|
511
514
|
suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
|
|
@@ -726,6 +729,17 @@ def get_controller_resources(
|
|
|
726
729
|
return result
|
|
727
730
|
|
|
728
731
|
|
|
732
|
+
def get_controller_mem_size_gb() -> float:
|
|
733
|
+
try:
|
|
734
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
|
|
735
|
+
'r',
|
|
736
|
+
encoding='utf-8') as f:
|
|
737
|
+
return float(f.read())
|
|
738
|
+
except FileNotFoundError:
|
|
739
|
+
pass
|
|
740
|
+
return common_utils.get_mem_size_gb()
|
|
741
|
+
|
|
742
|
+
|
|
729
743
|
def _setup_proxy_command_on_controller(
|
|
730
744
|
controller_launched_cloud: 'clouds.Cloud',
|
|
731
745
|
user_config: Dict[str, Any]) -> config_utils.Config:
|
sky/utils/db/db_utils.py
CHANGED
|
@@ -410,6 +410,8 @@ def get_engine(
|
|
|
410
410
|
conn_string, poolclass=sqlalchemy.NullPool)
|
|
411
411
|
with _db_creation_lock:
|
|
412
412
|
if conn_string not in _postgres_engine_cache:
|
|
413
|
+
logger.debug('Creating a new postgres engine with '
|
|
414
|
+
f'maximum {_max_connections} connections')
|
|
413
415
|
if _max_connections == 0:
|
|
414
416
|
_postgres_engine_cache[conn_string] = (
|
|
415
417
|
sqlalchemy.create_engine(
|
sky/utils/db/migration_utils.py
CHANGED
|
@@ -11,13 +11,14 @@ import filelock
|
|
|
11
11
|
import sqlalchemy
|
|
12
12
|
|
|
13
13
|
from sky import sky_logging
|
|
14
|
+
from sky.skylet import constants
|
|
14
15
|
|
|
15
16
|
logger = sky_logging.init_logger(__name__)
|
|
16
17
|
|
|
17
18
|
DB_INIT_LOCK_TIMEOUT_SECONDS = 10
|
|
18
19
|
|
|
19
20
|
GLOBAL_USER_STATE_DB_NAME = 'state_db'
|
|
20
|
-
GLOBAL_USER_STATE_VERSION = '
|
|
21
|
+
GLOBAL_USER_STATE_VERSION = '010'
|
|
21
22
|
GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
|
|
22
23
|
|
|
23
24
|
SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|
|
@@ -85,12 +86,20 @@ def needs_upgrade(engine: sqlalchemy.engine.Engine, section: str,
|
|
|
85
86
|
connection, opts={'version_table': version_table})
|
|
86
87
|
current_rev = context.get_current_revision()
|
|
87
88
|
|
|
89
|
+
target_rev_num = int(target_revision)
|
|
88
90
|
if current_rev is None:
|
|
91
|
+
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
92
|
+
logger.debug(f'{section} database currently uninitialized, '
|
|
93
|
+
f'targeting revision {target_rev_num}')
|
|
89
94
|
return True
|
|
90
95
|
|
|
91
96
|
# Compare revisions - assuming they are numeric strings like '001', '002'
|
|
92
97
|
current_rev_num = int(current_rev)
|
|
93
|
-
target_rev_num
|
|
98
|
+
if (current_rev_num < target_rev_num and
|
|
99
|
+
os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None):
|
|
100
|
+
logger.debug(
|
|
101
|
+
f'{section} database currently at revision {current_rev_num}, '
|
|
102
|
+
f'targeting revision {target_rev_num}')
|
|
94
103
|
|
|
95
104
|
return current_rev_num < target_rev_num
|
|
96
105
|
|
sky/volumes/server/server.py
CHANGED
|
@@ -24,11 +24,11 @@ async def volume_list(request: fastapi.Request) -> None:
|
|
|
24
24
|
auth_user_env_vars_kwargs = {
|
|
25
25
|
'env_vars': auth_user.to_env_vars()
|
|
26
26
|
} if auth_user else {}
|
|
27
|
-
|
|
27
|
+
request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
|
|
28
28
|
executor.schedule_request(
|
|
29
29
|
request_id=request.state.request_id,
|
|
30
30
|
request_name='volume_list',
|
|
31
|
-
request_body=
|
|
31
|
+
request_body=request_body,
|
|
32
32
|
func=core.volume_list,
|
|
33
33
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
34
34
|
)
|