skypilot-nightly 1.0.0.dev20251005__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (57)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +17 -21
  3. sky/backends/backend.py +1 -3
  4. sky/backends/cloud_vm_ray_backend.py +8 -20
  5. sky/backends/local_docker_backend.py +0 -5
  6. sky/client/sdk.py +24 -23
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/execution.py +1 -11
  24. sky/global_user_state.py +16 -5
  25. sky/jobs/constants.py +1 -7
  26. sky/jobs/controller.py +9 -1
  27. sky/jobs/scheduler.py +30 -15
  28. sky/jobs/server/core.py +8 -3
  29. sky/jobs/utils.py +30 -2
  30. sky/metrics/utils.py +62 -45
  31. sky/provision/instance_setup.py +32 -10
  32. sky/provision/kubernetes/utils.py +4 -1
  33. sky/provision/provisioner.py +10 -7
  34. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  35. sky/server/common.py +1 -0
  36. sky/server/config.py +2 -0
  37. sky/server/metrics.py +3 -1
  38. sky/server/requests/executor.py +103 -77
  39. sky/server/requests/requests.py +26 -11
  40. sky/server/server.py +16 -0
  41. sky/skylet/constants.py +9 -1
  42. sky/skylet/events.py +17 -0
  43. sky/skylet/skylet.py +3 -0
  44. sky/templates/kubernetes-ray.yml.j2 +5 -0
  45. sky/utils/context_utils.py +5 -1
  46. sky/utils/controller_utils.py +14 -0
  47. sky/utils/db/db_utils.py +2 -0
  48. sky/utils/db/migration_utils.py +11 -2
  49. sky/volumes/server/server.py +2 -2
  50. {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/METADATA +35 -35
  51. {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/RECORD +57 -56
  52. /sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → MnvNdzHHpiZG1_oKSpbxF}/_buildManifest.js +0 -0
  53. /sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → MnvNdzHHpiZG1_oKSpbxF}/_ssgManifest.js +0 -0
  54. {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/WHEEL +0 -0
  55. {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/entry_points.txt +0 -0
  56. {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/licenses/LICENSE +0 -0
  57. {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/top_level.txt +0 -0
sky/server/metrics.py CHANGED
@@ -24,8 +24,10 @@ logger = sky_logging.init_logger(__name__)
 metrics_app = fastapi.FastAPI()
 
 
+# Serve /metrics in dedicated thread to avoid blocking the event loop
+# of metrics server.
 @metrics_app.get('/metrics')
-async def metrics() -> fastapi.Response:
+def metrics() -> fastapi.Response:
     """Expose aggregated Prometheus metrics from all worker processes."""
     if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
         # In multiprocess mode, we need to collect metrics from all processes.
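Note on this change: FastAPI runs a plain `def` endpoint in a worker thread pool, while an `async def` endpoint runs directly on the event loop, so a slow multiprocess metrics collection inside `async def metrics()` would stall every other request on that loop. A minimal sketch of the distinction (routes are illustrative, not part of SkyPilot):

    import time

    import fastapi

    app = fastapi.FastAPI()

    @app.get('/slow-async')
    async def slow_async():
        # Runs on the event loop itself: this blocking sleep stalls every
        # other request handled by the loop for the full 5 seconds.
        time.sleep(5)
        return {'ok': True}

    @app.get('/slow-sync')
    def slow_sync():
        # Runs in a worker thread pool: the loop stays free to serve
        # other requests while this handler blocks.
        time.sleep(5)
        return {'ok': True}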
sky/server/requests/executor.py CHANGED
@@ -349,27 +349,30 @@ def override_request_env_and_config(
     os.environ.update(original_env)
 
 
-def _redirect_output(file: TextIO) -> Tuple[int, int]:
-    """Redirect stdout and stderr to the log file."""
-    fd = file.fileno()  # Get the file descriptor from the file object
-    # Store copies of the original stdout and stderr file descriptors
-    original_stdout = os.dup(sys.stdout.fileno())
-    original_stderr = os.dup(sys.stderr.fileno())
+def _get_current_output() -> Tuple[int, int]:
+    """Get the current stdout and stderr file descriptors."""
+    return os.dup(sys.stdout.fileno()), os.dup(sys.stderr.fileno())
+
 
+def _redirect_output(file: TextIO) -> None:
+    """Redirect stdout and stderr to the log file."""
+    # Get the file descriptor from the file object
+    fd = file.fileno()
     # Copy this fd to stdout and stderr
     os.dup2(fd, sys.stdout.fileno())
     os.dup2(fd, sys.stderr.fileno())
-    return original_stdout, original_stderr
 
 
-def _restore_output(original_stdout: int, original_stderr: int) -> None:
+def _restore_output(original_stdout: Optional[int],
+                    original_stderr: Optional[int]) -> None:
     """Restore stdout and stderr to their original file descriptors."""
-    os.dup2(original_stdout, sys.stdout.fileno())
-    os.dup2(original_stderr, sys.stderr.fileno())
+    if original_stdout is not None:
+        os.dup2(original_stdout, sys.stdout.fileno())
+        os.close(original_stdout)
 
-    # Close the duplicate file descriptors
-    os.close(original_stdout)
-    os.close(original_stderr)
+    if original_stderr is not None:
+        os.dup2(original_stderr, sys.stderr.fileno())
+        os.close(original_stderr)
 
 
 def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
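The refactor separates saving the original descriptors from redirecting them, so a cancellation or failure between the two steps can no longer lose the originals. A self-contained sketch of the underlying os.dup/os.dup2 pattern (file names illustrative):

    import os
    import sys

    # Save duplicates of the current stdout/stderr file descriptors.
    saved_out = os.dup(sys.stdout.fileno())
    saved_err = os.dup(sys.stderr.fileno())

    with open('run.log', 'a', encoding='utf-8') as log:
        # Point fd 1 and fd 2 at the log file. Unlike reassigning
        # sys.stdout at the Python level, this also captures output
        # from child processes.
        os.dup2(log.fileno(), sys.stdout.fileno())
        os.dup2(log.fileno(), sys.stderr.fileno())
        print('captured in run.log')
        sys.stdout.flush()  # push buffered output to the redirected fd

    # Restore the originals and close the duplicates.
    os.dup2(saved_out, sys.stdout.fileno())
    os.dup2(saved_err, sys.stderr.fileno())
    os.close(saved_out)
    os.close(saved_err)
    print('back on the original stdout')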
@@ -397,24 +400,38 @@ def _request_execution_wrapper(request_id: str,
     signal.signal(signal.SIGTERM, _sigterm_handler)
 
     logger.info(f'Running request {request_id} with pid {pid}')
-    with api_requests.update_request(request_id) as request_task:
-        assert request_task is not None, request_id
-        log_path = request_task.log_path
-        request_task.pid = pid
-        request_task.status = api_requests.RequestStatus.RUNNING
-        func = request_task.entrypoint
-        request_body = request_task.request_body
-        request_name = request_task.name
 
-    # Append to the log file instead of overwriting it since there might be
-    # logs from previous retries.
-    with log_path.open('a', encoding='utf-8') as f:
+    original_stdout = original_stderr = None
+    try:
+        # As soon as the request is updated with the executor PID, we can
+        # receive SIGTERM from cancellation. So, we update the request inside
+        # the try block to ensure we have the KeyboardInterrupt handling.
+        with api_requests.update_request(request_id) as request_task:
+            assert request_task is not None, request_id
+            if request_task.status != api_requests.RequestStatus.PENDING:
+                logger.debug(f'Request is already {request_task.status.value}, '
+                             f'skipping execution')
+                return
+            log_path = request_task.log_path
+            request_task.pid = pid
+            request_task.status = api_requests.RequestStatus.RUNNING
+            func = request_task.entrypoint
+            request_body = request_task.request_body
+            request_name = request_task.name
+
         # Store copies of the original stdout and stderr file descriptors
-        original_stdout, original_stderr = _redirect_output(f)
-        # Redirect the stdout/stderr before overriding the environment and
-        # config, as there can be some logs during override that needs to be
-        # captured in the log file.
-        try:
+        # We do this in two steps because we should make sure to restore the
+        # original values even if we are cancelled or fail during the redirect.
+        original_stdout, original_stderr = _get_current_output()
+
+        # Append to the log file instead of overwriting it since there might be
+        # logs from previous retries.
+        with log_path.open('a', encoding='utf-8') as f:
+            # Redirect the stdout/stderr before overriding the environment and
+            # config, as there can be some logs during override that needs to be
+            # captured in the log file.
+            _redirect_output(f)
+
             with sky_logging.add_debug_log_handler(request_id), \
                 override_request_env_and_config(
                     request_body, request_id, request_name), \
@@ -429,53 +446,59 @@
                     group='request_execution'):
                 return_value = func(**request_body.to_kwargs())
             f.flush()
-        except KeyboardInterrupt:
-            logger.info(f'Request {request_id} cancelled by user')
-            # Kill all children processes related to this request.
-            # Each executor handles a single request, so we can safely kill all
-            # children processes related to this request.
-            # This is required as python does not pass the KeyboardInterrupt
-            # to the threads that are not main thread.
-            subprocess_utils.kill_children_processes()
-            _restore_output(original_stdout, original_stderr)
-            return
-        except exceptions.ExecutionRetryableError as e:
-            logger.error(e)
-            logger.info(e.hint)
-            with api_requests.update_request(request_id) as request_task:
-                assert request_task is not None, request_id
-                # Retried request will undergo rescheduling and a new execution,
-                # clear the pid of the request.
-                request_task.pid = None
-            # Yield control to the scheduler for uniform handling of retries.
-            _restore_output(original_stdout, original_stderr)
-            raise
-        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            api_requests.set_request_failed(request_id, e)
-            _restore_output(original_stdout, original_stderr)
-            logger.info(f'Request {request_id} failed due to '
-                        f'{common_utils.format_exception(e)}')
-            return
-        else:
-            api_requests.set_request_succeeded(
-                request_id, return_value if not ignore_return_value else None)
-            _restore_output(original_stdout, original_stderr)
-            logger.info(f'Request {request_id} finished')
-        finally:
-            try:
-                # Capture the peak RSS before GC.
-                peak_rss = max(proc.memory_info().rss,
-                               metrics_lib.peak_rss_bytes)
-                # Clear request level cache to release all memory used by
-                # the request.
-                annotations.clear_request_level_cache()
-                with metrics_utils.time_it(name='release_memory',
-                                           group='internal'):
-                    common_utils.release_memory()
-                _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
-            except Exception as e:  # pylint: disable=broad-except
-                logger.error(f'Failed to record memory metrics: '
-                             f'{common_utils.format_exception(e)}')
+    except KeyboardInterrupt:
+        logger.info(f'Request {request_id} cancelled by user')
+        # Kill all children processes related to this request.
+        # Each executor handles a single request, so we can safely kill all
+        # children processes related to this request.
+        # This is required as python does not pass the KeyboardInterrupt to the
+        # threads that are not main thread.
+        subprocess_utils.kill_children_processes()
+        return
+    except exceptions.ExecutionRetryableError as e:
+        logger.error(e)
+        logger.info(e.hint)
+        with api_requests.update_request(request_id) as request_task:
+            assert request_task is not None, request_id
+            # Retried request will undergo rescheduling and a new execution,
+            # clear the pid of the request.
+            request_task.pid = None
+        # Yield control to the scheduler for uniform handling of retries.
+        _restore_output(original_stdout, original_stderr)
+        raise
+    except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+        api_requests.set_request_failed(request_id, e)
+        # Manually reset the original stdout and stderr file descriptors early
+        # so that the "Request xxxx failed due to ..." log message will be
+        # written to the original stdout and stderr file descriptors.
+        _restore_output(original_stdout, original_stderr)
+        original_stdout = original_stderr = None
+        logger.info(f'Request {request_id} failed due to '
+                    f'{common_utils.format_exception(e)}')
+        return
+    else:
+        api_requests.set_request_succeeded(
+            request_id, return_value if not ignore_return_value else None)
+        # Manually reset the original stdout and stderr file descriptors early
+        # so that the "Request xxxx failed due to ..." log message will be
+        # written to the original stdout and stderr file descriptors.
+        _restore_output(original_stdout, original_stderr)
+        original_stdout = original_stderr = None
+        logger.info(f'Request {request_id} finished')
+    finally:
+        _restore_output(original_stdout, original_stderr)
+        try:
+            # Capture the peak RSS before GC.
+            peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
+            # Clear request level cache to release all memory used by the
+            # request.
+            annotations.clear_request_level_cache()
+            with metrics_utils.time_it(name='release_memory', group='internal'):
+                common_utils.release_memory()
+            _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Failed to record memory metrics: '
+                         f'{common_utils.format_exception(e)}')
 
 
 _first_request = True
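The early `_restore_output(...)` followed by `original_stdout = original_stderr = None` is what keeps the unconditional restore in `finally` from double-closing descriptors: `_restore_output` treats None as "already restored". A minimal reduction of that pattern (names illustrative):

    import os
    import sys

    def restore(saved_out, saved_err):
        # None means "already restored"; tolerating it makes the call
        # safe to repeat from a finally block.
        if saved_out is not None:
            os.dup2(saved_out, sys.stdout.fileno())
            os.close(saved_out)
        if saved_err is not None:
            os.dup2(saved_err, sys.stderr.fileno())
            os.close(saved_err)

    saved_out = saved_err = None
    try:
        saved_out = os.dup(sys.stdout.fileno())
        saved_err = os.dup(sys.stderr.fileno())
        # ... redirected work would happen here ...
        restore(saved_out, saved_err)  # restore early to log to the console
        saved_out = saved_err = None   # mark as restored
        print('request finished')
    finally:
        restore(saved_out, saved_err)  # no-op if already restored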
@@ -596,11 +619,14 @@ async def _execute_request_coroutine(request: api_requests.Request):
     except (Exception, KeyboardInterrupt, SystemExit) as e:
         # Handle any other error
         ctx.redirect_log(original_output)
-        ctx.cancel()
         api_requests.set_request_failed(request.request_id, e)
         logger.error(f'Request {request.request_id} interrupted due to '
                      f'unhandled exception: {common_utils.format_exception(e)}')
         raise
+    finally:
+        # Always cancel the context to kill potentially running background
+        # routine.
+        ctx.cancel()
 
 
 def prepare_request(
sky/server/requests/requests.py CHANGED
@@ -449,9 +449,15 @@ def init_db_async(func):
 
 def reset_db_and_logs():
     """Create the database."""
+    logger.debug('clearing local API server database')
     server_common.clear_local_api_server_database()
+    logger.debug(
+        f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
+    )
     shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
                   ignore_errors=True)
+    logger.debug('clearing local API server client directory at '
+                 f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
     shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
                   ignore_errors=True)
 
@@ -467,10 +473,13 @@ def request_lock_path(request_id: str) -> str:
 @metrics_lib.time_me
 def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
     """Get and update a SkyPilot API request."""
-    request = _get_request_no_lock(request_id)
-    yield request
-    if request is not None:
-        _add_or_update_request_no_lock(request)
+    # Acquire the lock to avoid race conditions between multiple request
+    # operations, e.g. execute and cancel.
+    with filelock.FileLock(request_lock_path(request_id)):
+        request = _get_request_no_lock(request_id)
+        yield request
+        if request is not None:
+            _add_or_update_request_no_lock(request)
 
 
 @init_db
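Wrapping the read-yield-write sequence in `filelock.FileLock` makes the whole update atomic across processes, which is what closes the execute-vs-cancel race. A minimal sketch of the same lock-guarded read-modify-write shape (paths and fields illustrative):

    import contextlib
    import json
    import pathlib

    import filelock

    STATE = pathlib.Path('/tmp/request-demo.json')

    @contextlib.contextmanager
    def update_record(lock_path: str = '/tmp/request-demo.lock'):
        # Hold the lock across read -> modify -> write so no other
        # process can interleave its own update.
        with filelock.FileLock(lock_path):
            record = json.loads(STATE.read_text()) if STATE.exists() else {}
            yield record
            STATE.write_text(json.dumps(record))

    with update_record() as rec:
        rec['status'] = 'RUNNING'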
@@ -485,12 +494,15 @@ def update_request_async(
 
     @contextlib.asynccontextmanager
     async def _cm():
-        request = await _get_request_no_lock_async(request_id)
-        try:
-            yield request
-        finally:
-            if request is not None:
-                await _add_or_update_request_no_lock_async(request)
+        # Acquire the lock to avoid race conditions between multiple request
+        # operations, e.g. execute and cancel.
+        async with filelock.AsyncFileLock(request_lock_path(request_id)):
+            request = await _get_request_no_lock_async(request_id)
+            try:
+                yield request
+            finally:
+                if request is not None:
+                    await _add_or_update_request_no_lock_async(request)
 
     return _cm()
 
@@ -775,9 +787,12 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
 
 
 def set_request_cancelled(request_id: str) -> None:
-    """Set a request to cancelled."""
+    """Set a pending or running request to cancelled."""
     with update_request(request_id) as request_task:
         assert request_task is not None, request_id
+        # Already finished or cancelled.
+        if request_task.status > RequestStatus.RUNNING:
+            return
        request_task.finished_at = time.time()
        request_task.status = RequestStatus.CANCELLED
 
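The `status > RequestStatus.RUNNING` guard assumes request statuses are ordered so that every terminal state compares greater than RUNNING. A sketch of one way to get that ordering with an IntEnum (the actual RequestStatus implementation may differ):

    import enum

    class Status(enum.IntEnum):
        # Lifecycle order: terminal states compare greater than RUNNING.
        PENDING = 0
        RUNNING = 1
        SUCCEEDED = 2
        FAILED = 3
        CANCELLED = 4

    def cancel(status: Status) -> Status:
        if status > Status.RUNNING:
            return status  # already terminal: cancelling is a no-op
        return Status.CANCELLED

    assert cancel(Status.SUCCEEDED) is Status.SUCCEEDED
    assert cancel(Status.PENDING) is Status.CANCELLED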
sky/server/server.py CHANGED
@@ -1943,6 +1943,7 @@ if __name__ == '__main__':
 
     from sky.server import uvicorn as skyuvicorn
 
+    logger.info('Initializing SkyPilot API server')
     skyuvicorn.add_timestamp_prefix_for_server_logs()
 
     parser = argparse.ArgumentParser()
@@ -1954,20 +1955,35 @@
     parser.add_argument('--metrics-port', default=9090, type=int)
     cmd_args = parser.parse_args()
     if cmd_args.port == cmd_args.metrics_port:
+        logger.error('port and metrics-port cannot be the same, exiting.')
         raise ValueError('port and metrics-port cannot be the same')
 
+    # Fail fast if the port is not available to avoid corrupt the state
+    # of potential running server instance.
+    # We might reach here because the running server is currently not
+    # responding, thus the healthz check fails and `sky api start` think
+    # we should start a new server instance.
+    if not common_utils.is_port_available(cmd_args.port):
+        logger.error(f'Port {cmd_args.port} is not available, exiting.')
+        raise RuntimeError(f'Port {cmd_args.port} is not available')
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
 
     # Initialize global user state db
     db_utils.set_max_connections(1)
+    logger.info('Initializing database engine')
     global_user_state.initialize_and_get_db()
+    logger.info('Database engine initialized')
     # Initialize request db
     requests_lib.reset_db_and_logs()
     # Restore the server user hash
+    logger.info('Initializing server user hash')
     _init_or_restore_server_user_hash()
+
     max_db_connections = global_user_state.get_max_db_connections()
+    logger.info(f'Max db connections: {max_db_connections}')
     config = server_config.compute_server_config(cmd_args.deploy,
                                                  max_db_connections)
 
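`common_utils.is_port_available` is SkyPilot's helper; checks like it are commonly implemented by attempting to bind the port and treating failure as "in use". A hypothetical sketch of that approach (not the actual implementation; 46580 is the default API server port):

    import socket

    def is_port_available(port: int, host: str = '127.0.0.1') -> bool:
        # Binding succeeds only when no other process holds the port.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind((host, port))
                return True
            except OSError:
                return False

    if not is_port_available(46580):
        raise RuntimeError('Port 46580 is not available')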
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '21'
+SKYLET_VERSION = '22'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
@@ -331,6 +331,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
 # controller_utils.translate_local_file_mounts_to_two_hop().
 FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
 
+# For passing in CPU and memory limits to the controller pod when running
+# in k8s. Right now, we only use this for the jobs controller, but we may
+# use this for the serve controller as well in the future.
+# These files are written to disk by the skylet, who reads it from env vars
+# passed by the backend when starting the skylet (start_skylet_on_head_node).
+CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
+CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
+
 # Used when an managed jobs are created and
 # files are synced up to the cloud.
 FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
sky/skylet/events.py CHANGED
@@ -47,6 +47,9 @@ class SkyletEvent:
                                              EVENT_CHECKING_INTERVAL_SECONDS))
         self._n = 0
 
+    def start(self):
+        pass
+
     def run(self):
         self._n = (self._n + 1) % self._event_interval
         if self._n % self._event_interval == 0:
@@ -75,6 +78,20 @@ class ManagedJobEvent(SkyletEvent):
     """Skylet event for updating and scheduling managed jobs."""
     EVENT_INTERVAL_SECONDS = 300
 
+    def start(self):
+        cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
+        if cpus_env_var is not None:
+            with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
+                      'w',
+                      encoding='utf-8') as f:
+                f.write(cpus_env_var)
+        memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
+        if memory_env_var is not None:
+            with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
+                      'w',
+                      encoding='utf-8') as f:
+                f.write(memory_env_var)
+
     def _run(self):
         if not os.path.exists(
             os.path.expanduser(
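The overall handoff is: pod env vars are persisted to files under ~/.sky once in start(), and later read back by controller code with a host-introspection fallback (see get_controller_mem_size_gb in the controller_utils diff below). A miniature of the round trip (env var and path names illustrative, not SkyPilot's):

    import os

    LIMIT_FILE = '/tmp/_demo_pod_memory'  # stand-in for the ~/.sky file

    def persist_limit_from_env(env_var: str = 'DEMO_POD_MEMORY_GB') -> None:
        # Env vars die with the process; the file survives for later readers.
        value = os.environ.get(env_var)
        if value is not None:
            with open(LIMIT_FILE, 'w', encoding='utf-8') as f:
                f.write(value)

    def read_limit(default: float = 0.0) -> float:
        try:
            with open(LIMIT_FILE, 'r', encoding='utf-8') as f:
                return float(f.read())
        except FileNotFoundError:
            return default  # not on k8s, or the limit was never set

    os.environ['DEMO_POD_MEMORY_GB'] = '8'
    persist_limit_from_env()
    assert read_limit() == 8.0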
sky/skylet/skylet.py CHANGED
@@ -71,6 +71,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
 def run_event_loop():
     """Run the existing event loop."""
 
+    for event in EVENTS:
+        event.start()
+
     while True:
         time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
         for event in EVENTS:
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -632,6 +632,9 @@ available_node_types:
           command: ["/bin/bash", "-c", "--"]
           args:
             - |
+              # Set -x to print the commands and their arguments as they are executed.
+              # Useful for debugging.
+              set -x
               # Helper function to conditionally use sudo
               # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
               prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -1086,6 +1089,8 @@ available_node_types:
 
               touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
               {% endif %}
+              # Set +x to stop printing the commands and their arguments as they are executed.
+              set +x
 
               trap : TERM INT; log_tail || sleep infinity & wait
 
sky/utils/context_utils.py CHANGED
@@ -130,7 +130,11 @@ def wait_process(ctx: context.Context,
             # Kill the process despite the caller's callback, the utility
             # function gracefully handles the case where the process is
             # already terminated.
-            subprocess_utils.kill_process_with_grace_period(proc)
+            # Bash script typically does not forward SIGTERM to childs, thus
+            # cannot be killed gracefully, shorten the grace period for faster
+            # termination.
+            subprocess_utils.kill_process_with_grace_period(proc,
+                                                            grace_period=1)
             raise asyncio.CancelledError()
         try:
             proc.wait(poll_interval)
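`kill_process_with_grace_period` is SkyPilot's utility; helpers like it usually send SIGTERM, wait up to the grace period, then fall back to SIGKILL, which is why a child that ignores SIGTERM makes a short grace period attractive. A hypothetical sketch of the pattern (not the actual implementation):

    import signal
    import subprocess

    def kill_with_grace_period(proc: subprocess.Popen,
                               grace_period: float = 10.0) -> None:
        if proc.poll() is not None:
            return  # already exited
        proc.send_signal(signal.SIGTERM)  # ask politely first
        try:
            proc.wait(timeout=grace_period)
        except subprocess.TimeoutExpired:
            proc.kill()  # SIGKILL cannot be caught or ignored
            proc.wait()

    p = subprocess.Popen(['sleep', '1000'])
    kill_with_grace_period(p, grace_period=1)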
sky/utils/controller_utils.py CHANGED
@@ -506,6 +506,9 @@ def shared_controller_vars_to_fill(
         # before popping allowed_contexts. If it is not on Kubernetes,
         # we may be able to use allowed_contexts.
         local_user_config.pop('allowed_contexts', None)
+        # Remove api_server config so that the controller does not try to use
+        # a remote API server.
+        local_user_config.pop('api_server', None)
         with tempfile.NamedTemporaryFile(
                 delete=False,
                 suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
@@ -726,6 +729,17 @@ def get_controller_resources(
     return result
 
 
+def get_controller_mem_size_gb() -> float:
+    try:
+        with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
+                  'r',
+                  encoding='utf-8') as f:
+            return float(f.read())
+    except FileNotFoundError:
+        pass
+    return common_utils.get_mem_size_gb()
+
+
 def _setup_proxy_command_on_controller(
         controller_launched_cloud: 'clouds.Cloud',
         user_config: Dict[str, Any]) -> config_utils.Config:
sky/utils/db/db_utils.py CHANGED
@@ -410,6 +410,8 @@ def get_engine(
                 conn_string, poolclass=sqlalchemy.NullPool)
     with _db_creation_lock:
         if conn_string not in _postgres_engine_cache:
+            logger.debug('Creating a new postgres engine with '
+                         f'maximum {_max_connections} connections')
             if _max_connections == 0:
                 _postgres_engine_cache[conn_string] = (
                     sqlalchemy.create_engine(
sky/utils/db/migration_utils.py CHANGED
@@ -11,13 +11,14 @@ import filelock
 import sqlalchemy
 
 from sky import sky_logging
+from sky.skylet import constants
 
 logger = sky_logging.init_logger(__name__)
 
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '009'
+GLOBAL_USER_STATE_VERSION = '010'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
@@ -85,12 +86,20 @@ def needs_upgrade(engine: sqlalchemy.engine.Engine, section: str,
             connection, opts={'version_table': version_table})
         current_rev = context.get_current_revision()
 
+    target_rev_num = int(target_revision)
     if current_rev is None:
+        if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+            logger.debug(f'{section} database currently uninitialized, '
+                         f'targeting revision {target_rev_num}')
         return True
 
     # Compare revisions - assuming they are numeric strings like '001', '002'
     current_rev_num = int(current_rev)
-    target_rev_num = int(target_revision)
+    if (current_rev_num < target_rev_num and
+            os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None):
+        logger.debug(
+            f'{section} database currently at revision {current_rev_num}, '
+            f'targeting revision {target_rev_num}')
 
     return current_rev_num < target_rev_num
 
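`needs_upgrade` compares alembic revisions as zero-padded numeric strings. A minimal sketch of the same check against a fresh database, using alembic's MigrationContext (table and revision values illustrative):

    import sqlalchemy
    from alembic.runtime.migration import MigrationContext

    def needs_upgrade(engine: sqlalchemy.engine.Engine,
                      target_revision: str, version_table: str) -> bool:
        with engine.connect() as connection:
            context = MigrationContext.configure(
                connection, opts={'version_table': version_table})
            # None means the version table does not exist yet.
            current_rev = context.get_current_revision()
        if current_rev is None:
            return True
        # Revisions are numeric strings such as '009', '010'.
        return int(current_rev) < int(target_revision)

    engine = sqlalchemy.create_engine('sqlite://')
    assert needs_upgrade(engine, '010', 'state_db_version')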
sky/volumes/server/server.py CHANGED
@@ -24,11 +24,11 @@ async def volume_list(request: fastapi.Request) -> None:
     auth_user_env_vars_kwargs = {
         'env_vars': auth_user.to_env_vars()
     } if auth_user else {}
-    volume_list_body = payloads.VolumeListBody(**auth_user_env_vars_kwargs)
+    request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='volume_list',
-        request_body=volume_list_body,
+        request_body=request_body,
         func=core.volume_list,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20251005
+Version: 1.0.0.dev20251008
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -155,51 +155,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
 Requires-Dist: aiosqlite; extra == "server"
 Requires-Dist: greenlet; extra == "server"
 Provides-Extra: all
-Requires-Dist: azure-identity>=1.19.0; extra == "all"
+Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
+Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
+Requires-Dist: anyio; extra == "all"
+Requires-Dist: google-cloud-storage; extra == "all"
 Requires-Dist: passlib; extra == "all"
-Requires-Dist: tomli; python_version < "3.11" and extra == "all"
+Requires-Dist: azure-core>=1.24.0; extra == "all"
+Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
 Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
-Requires-Dist: python-dateutil; extra == "all"
-Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
-Requires-Dist: nebius>=0.2.47; extra == "all"
-Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: azure-common; extra == "all"
+Requires-Dist: casbin; extra == "all"
+Requires-Dist: docker; extra == "all"
+Requires-Dist: azure-core>=1.31.0; extra == "all"
+Requires-Dist: azure-identity>=1.19.0; extra == "all"
+Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
-Requires-Dist: websockets; extra == "all"
-Requires-Dist: sqlalchemy_adapter; extra == "all"
-Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
-Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
+Requires-Dist: grpcio>=1.63.0; extra == "all"
+Requires-Dist: msrestazure; extra == "all"
 Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
 Requires-Dist: colorama<0.4.5; extra == "all"
-Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
-Requires-Dist: azure-core>=1.24.0; extra == "all"
-Requires-Dist: msrestazure; extra == "all"
-Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
-Requires-Dist: boto3>=1.26.1; extra == "all"
-Requires-Dist: ibm-cloud-sdk-core; extra == "all"
+Requires-Dist: botocore>=1.29.10; extra == "all"
 Requires-Dist: pyjwt; extra == "all"
-Requires-Dist: ibm-vpc; extra == "all"
+Requires-Dist: aiosqlite; extra == "all"
+Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: cudo-compute>=0.1.10; extra == "all"
-Requires-Dist: azure-core>=1.31.0; extra == "all"
+Requires-Dist: ibm-vpc; extra == "all"
+Requires-Dist: awscli>=1.27.10; extra == "all"
+Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: ibm-cos-sdk; extra == "all"
+Requires-Dist: greenlet; extra == "all"
+Requires-Dist: tomli; python_version < "3.11" and extra == "all"
+Requires-Dist: python-dateutil; extra == "all"
+Requires-Dist: boto3>=1.26.1; extra == "all"
+Requires-Dist: sqlalchemy_adapter; extra == "all"
+Requires-Dist: ray[default]>=2.6.1; extra == "all"
+Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
 Requires-Dist: oci; extra == "all"
-Requires-Dist: grpcio>=1.63.0; extra == "all"
-Requires-Dist: msgraph-sdk; extra == "all"
-Requires-Dist: botocore>=1.29.10; extra == "all"
 Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
-Requires-Dist: docker; extra == "all"
-Requires-Dist: greenlet; extra == "all"
+Requires-Dist: ibm-cloud-sdk-core; extra == "all"
 Requires-Dist: runpod>=1.6.1; extra == "all"
-Requires-Dist: ibm-cos-sdk; extra == "all"
-Requires-Dist: pydo>=0.3.0; extra == "all"
-Requires-Dist: google-cloud-storage; extra == "all"
-Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
-Requires-Dist: ecsapi>=0.2.0; extra == "all"
-Requires-Dist: casbin; extra == "all"
-Requires-Dist: aiosqlite; extra == "all"
-Requires-Dist: azure-common; extra == "all"
-Requires-Dist: anyio; extra == "all"
-Requires-Dist: ray[default]>=2.6.1; extra == "all"
-Requires-Dist: awscli>=1.27.10; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
+Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
+Requires-Dist: websockets; extra == "all"
+Requires-Dist: nebius>=0.2.47; extra == "all"
+Requires-Dist: ecsapi>=0.2.0; extra == "all"
+Requires-Dist: msgraph-sdk; extra == "all"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description