skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/server/uvicorn.py CHANGED
@@ -3,17 +3,165 @@
3
3
  This module is a wrapper around uvicorn to customize the behavior of the
4
4
  server.
5
5
  """
6
- import functools
6
+ import asyncio
7
7
  import os
8
+ import signal
8
9
  import threading
9
- from typing import Optional
10
+ import time
11
+ from types import FrameType
12
+ from typing import Optional, Union
10
13
 
14
+ import filelock
11
15
  import uvicorn
12
16
  from uvicorn.supervisors import multiprocess
13
17
 
18
+ from sky import sky_logging
19
+ from sky.server import state
20
+ from sky.server.requests import requests as requests_lib
21
+ from sky.skylet import constants
14
22
  from sky.utils import context_utils
15
23
  from sky.utils import subprocess_utils
16
24
 
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+ # File lock path for coordinating graceful shutdown across processes
28
+ _GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'
29
+
30
+ # Interval to check for on-going requests.
31
+ _WAIT_REQUESTS_INTERVAL_SECONDS = 5
32
+
33
+ # Timeout for waiting for on-going requests to finish.
34
+ try:
35
+ _WAIT_REQUESTS_TIMEOUT_SECONDS = int(
36
+ os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60'))
37
+ except ValueError:
38
+ _WAIT_REQUESTS_TIMEOUT_SECONDS = 60
39
+
40
+ # TODO(aylei): use decorator to register requests that need to be proactively
41
+ # cancelled instead of hardcoding here.
42
+ _RETRIABLE_REQUEST_NAMES = [
43
+ 'sky.logs',
44
+ 'sky.jobs.logs',
45
+ 'sky.serve.logs',
46
+ ]
47
+
48
+
49
+ class Server(uvicorn.Server):
50
+ """Server wrapper for uvicorn.
51
+
52
+ Extended functionalities:
53
+ - Handle exit signal and perform custom graceful shutdown.
54
+ Run the server process with contextual awareness.
55
+ """
56
+
57
+ def __init__(self, config: uvicorn.Config):
58
+ super().__init__(config=config)
59
+ self.exiting: bool = False
60
+
61
+ def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
62
+ """Handle exit signal.
63
+
64
+ When a server process receives a SIGTERM or SIGINT signal, a graceful
65
+ shutdown will be initiated. If a SIGINT signal is received again, the
66
+ server will be forcefully shutdown.
67
+ """
68
+ if self.exiting and sig == signal.SIGINT:
69
+ # The server has been signaled to exit and received a SIGINT again,
70
+ # do force shutdown.
71
+ logger.info('Force shutdown.')
72
+ self.should_exit = True
73
+ super().handle_exit(sig, frame)
74
+ return
75
+ if not self.exiting:
76
+ self.exiting = True
77
+ # Perform graceful shutdown in a separate thread to avoid blocking
78
+ # the main thread.
79
+ threading.Thread(target=self._graceful_shutdown,
80
+ args=(sig, frame),
81
+ daemon=True).start()
82
+
83
+ def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
84
+ None]) -> None:
85
+ """Perform graceful shutdown."""
86
+ # Block new requests so that we can wait until all on-going requests
87
+ # are finished. Note that /api/$verb operations are still allowed in
88
+ # this stage to ensure the client can still operate the on-going
89
+ # requests, e.g. /api/logs, /api/cancel, etc.
90
+ logger.info('Block new requests being submitted in worker '
91
+ f'{os.getpid()}.')
92
+ state.set_block_requests(True)
93
+ # Ensure the shutting_down flag is set on all workers before the next step.
94
+ # TODO(aylei): hacky, need a reliable solution.
95
+ time.sleep(1)
96
+
97
+ lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
98
+ # Elect a coordinator process to handle on-going requests check
99
+ with lock.acquire():
100
+ logger.info(f'Worker {os.getpid()} elected as shutdown coordinator')
101
+ self._wait_requests()
102
+
103
+ logger.info('Shutting down server...')
104
+ self.should_exit = True
105
+ super().handle_exit(sig, frame)
106
+
107
+ def _wait_requests(self) -> None:
108
+ """Wait until all on-going requests are finished or cancelled."""
109
+ start_time = time.time()
110
+ while True:
111
+ statuses = [
112
+ requests_lib.RequestStatus.PENDING,
113
+ requests_lib.RequestStatus.RUNNING,
114
+ ]
115
+ reqs = requests_lib.get_request_tasks(status=statuses)
116
+ if not reqs:
117
+ break
118
+ logger.info(f'{len(reqs)} on-going requests '
119
+ 'found, waiting for them to finish...')
120
+ # Proactively cancel internal requests and logs requests since
121
+ # they can run for infinite time.
122
+ internal_request_ids = [
123
+ d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
124
+ ]
125
+ if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
126
+ logger.warning('Timeout waiting for on-going requests to '
127
+ 'finish, cancelling all on-going requests.')
128
+ for req in reqs:
129
+ self.interrupt_request_for_retry(req.request_id)
130
+ break
131
+ interrupted = 0
132
+ for req in reqs:
133
+ if req.request_id in internal_request_ids:
134
+ self.interrupt_request_for_retry(req.request_id)
135
+ interrupted += 1
136
+ elif req.name in _RETRIABLE_REQUEST_NAMES:
137
+ self.interrupt_request_for_retry(req.request_id)
138
+ interrupted += 1
139
+ # TODO(aylei): interrupt pending requests to accelerate the
140
+ # shutdown.
141
+ # If some requests are not interrupted, wait for them to finish,
142
+ # otherwise we just check again immediately to accelerate the
143
+ # shutdown process.
144
+ if interrupted < len(reqs):
145
+ time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
146
+
147
+ def interrupt_request_for_retry(self, request_id: str) -> None:
148
+ """Interrupt a request for retry."""
149
+ with requests_lib.update_request(request_id) as req:
150
+ if req is None:
151
+ return
152
+ if req.pid is not None:
153
+ os.kill(req.pid, signal.SIGTERM)
154
+ req.status = requests_lib.RequestStatus.CANCELLED
155
+ req.should_retry = True
156
+ logger.info(
157
+ f'Request {request_id} interrupted and will be retried by client.')
158
+
159
+ def run(self, *args, **kwargs):
160
+ """Run the server process."""
161
+ context_utils.hijack_sys_attrs()
162
+ with self.capture_signals():
163
+ asyncio.run(self.serve(*args, **kwargs))
164
+
17
165
 
18
166
  def run(config: uvicorn.Config):
19
167
  """Run unvicorn server."""
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
22
170
  # in uvicorn. Since we do not use reload now, simply
23
171
  # guard by an exception.
24
172
  raise ValueError('Reload is not supported yet.')
25
- server = uvicorn.Server(config=config)
26
- run_server_process = functools.partial(_run_server_process, server)
173
+ server = Server(config=config)
27
174
  try:
28
175
  if config.workers is not None and config.workers > 1:
29
176
  sock = config.bind_socket()
30
- SlowStartMultiprocess(config,
31
- target=run_server_process,
177
+ SlowStartMultiprocess(config, target=server.run,
32
178
  sockets=[sock]).run()
33
179
  else:
34
- run_server_process()
180
+ server.run()
35
181
  finally:
36
182
  # Copied from uvicorn.run()
37
183
  if config.uds and os.path.exists(config.uds):
38
184
  os.remove(config.uds)
39
185
 
40
186
 
41
- def _run_server_process(server: uvicorn.Server, *args, **kwargs):
42
- """Run the server process with contextually aware."""
43
- context_utils.hijack_sys_attrs()
44
- server.run(*args, **kwargs)
45
-
46
-
47
187
  class SlowStartMultiprocess(multiprocess.Multiprocess):
48
188
  """Uvicorn Multiprocess wrapper with slow start.
49
189
 
@@ -62,6 +62,8 @@ install_requires = [
62
62
  # the client-side actually not importing them.
63
63
  'casbin',
64
64
  'sqlalchemy_adapter',
65
+ # Required for API server metrics
66
+ 'prometheus_client>=0.8.0',
65
67
  'passlib',
66
68
  ]
67
69
 
sky/skylet/constants.py CHANGED
@@ -413,6 +413,13 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
413
413
  # Environment variable that is set to 'true' if this is a skypilot server.
414
414
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
415
415
 
416
+ # Environment variable that is set to 'true' if metrics are enabled.
417
+ ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
418
+
419
+ # Environment variable that is used as the DB connection string for the
420
+ # skypilot server.
421
+ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
422
+
416
423
  # Environment variable that is set to 'true' if basic
417
424
  # authentication is enabled in the API server.
418
425
  ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
@@ -449,20 +456,29 @@ TIME_PATTERN: str = (
449
456
 
450
457
  MEMORY_SIZE_UNITS = {
451
458
  'kb': 2**10,
459
+ 'ki': 2**10,
452
460
  'mb': 2**20,
461
+ 'mi': 2**20,
453
462
  'gb': 2**30,
463
+ 'gi': 2**30,
454
464
  'tb': 2**40,
465
+ 'ti': 2**40,
455
466
  'pb': 2**50,
467
+ 'pi': 2**50,
456
468
  }
457
469
 
458
470
  MEMORY_SIZE_PATTERN = (
459
471
  '^[0-9]+('
460
- f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
461
- ')?$/i')
462
- MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
472
+ f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
473
+ f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
474
+ f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
475
+ ')?$')
476
+
477
+ LAST_USE_TRUNC_LENGTH = 25
463
478
 
464
479
  MIN_PRIORITY = -1000
465
480
  MAX_PRIORITY = 1000
466
481
  DEFAULT_PRIORITY = 0
467
482
 
483
+ GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
468
484
  COST_REPORT_DEFAULT_DAYS = 30
sky/skypilot_config.py CHANGED
@@ -564,7 +564,10 @@ def _reload_config_as_server() -> None:
564
564
  _set_loaded_config_path(None)
565
565
 
566
566
  server_config_path = _resolve_server_config_path()
567
+ db_url_from_env = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
567
568
  server_config = _get_config_from_path(server_config_path)
569
+ if db_url_from_env:
570
+ server_config.set_nested(('db',), db_url_from_env)
568
571
 
569
572
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
570
573
  logger.debug(f'server config: \n'
@@ -0,0 +1 @@
1
+ """SSH Node Pool management package."""
@@ -0,0 +1,133 @@
1
+ """SSH Node Pool management core functionality."""
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List
5
+
6
+ import yaml
7
+
8
+
9
+ class SSHNodePoolManager:
10
+ """Manager for SSH Node Pool configurations."""
11
+
12
+ def __init__(self):
13
+ self.config_path = Path.home() / '.sky' / 'ssh_node_pools.yaml'
14
+ self.keys_dir = Path.home() / '.sky' / 'ssh_keys'
15
+ self.keys_dir.mkdir(parents=True, exist_ok=True)
16
+
17
+ def get_all_pools(self) -> Dict[str, Any]:
18
+ """Read all SSH Node Pool configurations from YAML file."""
19
+ if not self.config_path.exists():
20
+ return {}
21
+
22
+ try:
23
+ with open(self.config_path, 'r', encoding='utf-8') as f:
24
+ return yaml.safe_load(f) or {}
25
+ except Exception as e:
26
+ raise RuntimeError(
27
+ f'Failed to read SSH Node Pool config: {e}') from e
28
+
29
+ def save_all_pools(self, pools_config: Dict[str, Any]) -> None:
30
+ """Write SSH Node Pool configurations to YAML file."""
31
+ try:
32
+ self.config_path.parent.mkdir(parents=True, exist_ok=True)
33
+ with open(self.config_path, 'w', encoding='utf-8') as f:
34
+ yaml.dump(pools_config, f, default_flow_style=False)
35
+ except Exception as e:
36
+ raise RuntimeError(
37
+ f'Failed to save SSH Node Pool config: {e}') from e
38
+
39
+ def update_pools(self, pools_config: Dict[str, Any]) -> None:
40
+ """Update SSH Node Pool configurations."""
41
+ all_pools = self.get_all_pools()
42
+ all_pools.update(pools_config)
43
+ self.save_all_pools(all_pools)
44
+
45
+ def add_or_update_pool(self, pool_name: str,
46
+ pool_config: Dict[str, Any]) -> None:
47
+ """Add or update a single SSH Node Pool configuration."""
48
+ # Validate pool configuration
49
+ self._validate_pool_config(pool_config)
50
+
51
+ all_pools = self.get_all_pools()
52
+ all_pools[pool_name] = pool_config
53
+ self.save_all_pools(all_pools)
54
+
55
+ def delete_pool(self, pool_name: str) -> bool:
56
+ """Delete a SSH Node Pool configuration."""
57
+ all_pools = self.get_all_pools()
58
+ if pool_name in all_pools:
59
+ del all_pools[pool_name]
60
+ self.save_all_pools(all_pools)
61
+ return True
62
+ return False
63
+
64
+ def save_ssh_key(self, key_name: str, key_content: str) -> str:
65
+ """Save SSH private key to ~/.sky/ssh_keys/ directory."""
66
+ # Validate key name
67
+ if not key_name or '/' in key_name or key_name.startswith('.'):
68
+ raise ValueError('Invalid key name')
69
+
70
+ key_path = self.keys_dir / key_name
71
+ try:
72
+ with open(key_path, 'w', encoding='utf-8') as f:
73
+ f.write(key_content)
74
+ os.chmod(key_path, 0o600) # Set secure permissions
75
+ return str(key_path)
76
+ except Exception as e:
77
+ raise RuntimeError(f'Failed to save SSH key: {e}') from e
78
+
79
+ def list_ssh_keys(self) -> List[str]:
80
+ """List available SSH key files."""
81
+ if not self.keys_dir.exists():
82
+ return []
83
+ try:
84
+ return [f.name for f in self.keys_dir.iterdir() if f.is_file()]
85
+ except Exception: # pylint: disable=broad-except
86
+ return []
87
+
88
+ def _validate_pool_config(self, config: Dict[str, Any]) -> None:
89
+ """Validate SSH Node Pool configuration."""
90
+ if 'hosts' not in config:
91
+ raise ValueError('Pool configuration must include `hosts`')
92
+
93
+ if not isinstance(config['hosts'], list) or not config['hosts']:
94
+ raise ValueError('`hosts` must be a non-empty list')
95
+
96
+ # Validate user field
97
+ if not config.get('user', '').strip():
98
+ raise ValueError('Pool configuration must include `user`')
99
+
100
+ # Validate authentication - must have either identity_file or password
101
+ if not config.get('identity_file') and not config.get('password'):
102
+ raise ValueError('Pool configuration must include '
103
+ 'either `identity_file` or `password`')
104
+
105
+
106
+ def get_all_pools() -> Dict[str, Any]:
107
+ """Get all SSH Node Pool configurations."""
108
+ manager = SSHNodePoolManager()
109
+ return manager.get_all_pools()
110
+
111
+
112
+ def update_pools(pools_config: Dict[str, Any]) -> None:
113
+ """Update SSH Node Pool configurations."""
114
+ manager = SSHNodePoolManager()
115
+ manager.update_pools(pools_config)
116
+
117
+
118
+ def delete_pool(pool_name: str) -> bool:
119
+ """Delete a SSH Node Pool configuration."""
120
+ manager = SSHNodePoolManager()
121
+ return manager.delete_pool(pool_name)
122
+
123
+
124
+ def upload_ssh_key(key_name: str, key_content: str) -> str:
125
+ """Upload SSH private key."""
126
+ manager = SSHNodePoolManager()
127
+ return manager.save_ssh_key(key_name, key_content)
128
+
129
+
130
+ def list_ssh_keys() -> List[str]:
131
+ """List available SSH keys."""
132
+ manager = SSHNodePoolManager()
133
+ return manager.list_ssh_keys()
@@ -0,0 +1,232 @@
1
+ """SSH Node Pool management API endpoints."""
2
+ import re
3
+ from typing import Any, Dict, List
4
+
5
+ import fastapi
6
+
7
+ from sky import core as sky_core
8
+ from sky.server.requests import executor
9
+ from sky.server.requests import payloads
10
+ from sky.server.requests import requests as requests_lib
11
+ from sky.ssh_node_pools import core as ssh_node_pools_core
12
+ from sky.utils import common_utils
13
+
14
+ router = fastapi.APIRouter()
15
+
16
+
17
+ @router.get('')
18
+ async def get_ssh_node_pools() -> Dict[str, Any]:
19
+ """Get all SSH Node Pool configurations."""
20
+ try:
21
+ return ssh_node_pools_core.get_all_pools()
22
+ except Exception as e:
23
+ raise fastapi.HTTPException(
24
+ status_code=500,
25
+ detail=
26
+ f'Failed to get SSH Node Pools: {common_utils.format_exception(e)}')
27
+
28
+
29
+ @router.post('')
30
+ async def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
31
+ """Update SSH Node Pool configurations."""
32
+ try:
33
+ ssh_node_pools_core.update_pools(pools_config)
34
+ return {'status': 'success'}
35
+ except Exception as e:
36
+ raise fastapi.HTTPException(status_code=400,
37
+ detail=f'Failed to update SSH Node Pools:'
38
+ f' {common_utils.format_exception(e)}')
39
+
40
+
41
+ @router.delete('/{pool_name}')
42
+ async def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
43
+ """Delete a SSH Node Pool configuration."""
44
+ try:
45
+ if ssh_node_pools_core.delete_pool(pool_name):
46
+ return {'status': 'success'}
47
+ else:
48
+ raise fastapi.HTTPException(
49
+ status_code=404,
50
+ detail=f'SSH Node Pool `{pool_name}` not found')
51
+ except fastapi.HTTPException:
52
+ raise
53
+ except Exception as e:
54
+ raise fastapi.HTTPException(status_code=500,
55
+ detail='Failed to delete SSH Node Pool: '
56
+ f'{common_utils.format_exception(e)}')
57
+
58
+
59
+ @router.post('/keys')
60
+ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
61
+ """Upload SSH private key."""
62
+ try:
63
+ form = await request.form()
64
+ key_name = form.get('key_name')
65
+ key_file = form.get('key_file')
66
+
67
+ if not key_name or not key_file:
68
+ raise fastapi.HTTPException(status_code=400,
69
+ detail='Missing key_name or key_file')
70
+
71
+ key_content = await key_file.read()
72
+ key_path = ssh_node_pools_core.upload_ssh_key(key_name,
73
+ key_content.decode())
74
+
75
+ return {'status': 'success', 'key_path': key_path}
76
+ except fastapi.HTTPException:
77
+ raise
78
+ except Exception as e:
79
+ raise fastapi.HTTPException(
80
+ status_code=500,
81
+ detail=
82
+ f'Failed to upload SSH key: {common_utils.format_exception(e)}')
83
+
84
+
85
+ @router.get('/keys')
86
+ async def list_ssh_keys() -> List[str]:
87
+ """List available SSH keys."""
88
+ try:
89
+ return ssh_node_pools_core.list_ssh_keys()
90
+ except Exception as e:
91
+ exception_msg = common_utils.format_exception(e)
92
+ raise fastapi.HTTPException(
93
+ status_code=500, detail=f'Failed to list SSH keys: {exception_msg}')
94
+
95
+
96
+ @router.post('/{pool_name}/deploy')
97
+ async def deploy_ssh_node_pool(request: fastapi.Request,
98
+ pool_name: str) -> Dict[str, str]:
99
+ """Deploy SSH Node Pool using existing ssh_up functionality."""
100
+ try:
101
+ ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
102
+ executor.schedule_request(
103
+ request_id=request.state.request_id,
104
+ request_name='ssh_up',
105
+ request_body=ssh_up_body,
106
+ func=sky_core.ssh_up,
107
+ schedule_type=requests_lib.ScheduleType.LONG,
108
+ )
109
+
110
+ return {
111
+ 'status': 'success',
112
+ 'request_id': request.state.request_id,
113
+ 'message': f'SSH Node Pool `{pool_name}` deployment started'
114
+ }
115
+ except Exception as e:
116
+ raise fastapi.HTTPException(status_code=500,
117
+ detail=f'Failed to deploy SSH Node Pool: '
118
+ f'{common_utils.format_exception(e)}')
119
+
120
+
121
+ @router.post('/deploy')
122
+ async def deploy_ssh_node_pool_general(
123
+ request: fastapi.Request,
124
+ ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
125
+ """Deploys all SSH Node Pools."""
126
+ try:
127
+ executor.schedule_request(
128
+ request_id=request.state.request_id,
129
+ request_name='ssh_up',
130
+ request_body=ssh_up_body,
131
+ func=sky_core.ssh_up,
132
+ schedule_type=requests_lib.ScheduleType.LONG,
133
+ )
134
+
135
+ pool_name = ssh_up_body.infra or 'default'
136
+ return {
137
+ 'status': 'success',
138
+ 'request_id': request.state.request_id,
139
+ 'message': f'SSH Node Pool `{pool_name}` deployment started'
140
+ }
141
+ except Exception as e:
142
+ raise fastapi.HTTPException(status_code=500,
143
+ detail=f'Failed to deploy SSH Node Pool: '
144
+ f'{common_utils.format_exception(e)}')
145
+
146
+
147
+ @router.post('/{pool_name}/down')
148
+ async def down_ssh_node_pool(request: fastapi.Request,
149
+ pool_name: str) -> Dict[str, str]:
150
+ """Cleans up a SSH Node Pools."""
151
+ try:
152
+ ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
153
+ executor.schedule_request(
154
+ request_id=request.state.request_id,
155
+ request_name='ssh_down',
156
+ request_body=ssh_up_body,
157
+ func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
158
+ schedule_type=requests_lib.ScheduleType.LONG,
159
+ )
160
+
161
+ return {
162
+ 'status': 'success',
163
+ 'request_id': request.state.request_id,
164
+ 'message': f'SSH Node Pool `{pool_name}` teardown started'
165
+ }
166
+ except Exception as e:
167
+ raise fastapi.HTTPException(
168
+ status_code=500,
169
+ detail=f'Failed to tear down SSH Node Pool: '
170
+ f'{common_utils.format_exception(e)}')
171
+
172
+
173
+ @router.post('/down')
174
+ async def down_ssh_node_pool_general(
175
+ request: fastapi.Request,
176
+ ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
177
+ """Cleans up all SSH Node Pools."""
178
+ try:
179
+ # Set cleanup=True for down operation
180
+ ssh_up_body.cleanup = True
181
+ executor.schedule_request(
182
+ request_id=request.state.request_id,
183
+ request_name='ssh_down',
184
+ request_body=ssh_up_body,
185
+ func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
186
+ schedule_type=requests_lib.ScheduleType.LONG,
187
+ )
188
+
189
+ pool_name = ssh_up_body.infra or 'default'
190
+ return {
191
+ 'status': 'success',
192
+ 'request_id': request.state.request_id,
193
+ 'message': f'SSH Node Pool `{pool_name}` teardown started'
194
+ }
195
+ except Exception as e:
196
+ raise fastapi.HTTPException(
197
+ status_code=500,
198
+ detail=f'Failed to tear down SSH Node Pool: '
199
+ f'{common_utils.format_exception(e)}')
200
+
201
+
202
+ @router.get('/{pool_name}/status')
203
+ async def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
204
+ """Get the status of a specific SSH Node Pool."""
205
+ try:
206
+ # Call ssh_status to check the context
207
+ context_name = f'ssh-{pool_name}'
208
+ is_ready, reason = sky_core.ssh_status(context_name)
209
+
210
+ # Strip ANSI escape codes from the reason text
211
+ def strip_ansi_codes(text):
212
+ if not text:
213
+ return text
214
+ # Remove ANSI escape sequences (color codes, formatting, etc.)
215
+ text = re.sub(r'\x1b\[[0-9;]*m', '', text)
216
+ # Remove 'disabled. Reason: ' prefix if present
217
+ text = text.replace('disabled. Reason: ', '')
218
+ return text
219
+
220
+ cleaned_reason = strip_ansi_codes(reason) if reason else reason
221
+
222
+ return {
223
+ 'pool_name': pool_name,
224
+ 'context_name': context_name,
225
+ 'status': 'Ready' if is_ready else 'Not Ready',
226
+ 'reason': cleaned_reason
227
+ }
228
+ except Exception as e:
229
+ raise fastapi.HTTPException(
230
+ status_code=500,
231
+ detail=f'Failed to get SSH Node Pool status: '
232
+ f'{common_utils.format_exception(e)}')