skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +26 -11
  3. sky/backends/cloud_vm_ray_backend.py +16 -5
  4. sky/client/cli/command.py +222 -4
  5. sky/client/sdk.py +110 -82
  6. sky/clouds/aws.py +10 -7
  7. sky/clouds/azure.py +10 -7
  8. sky/clouds/cloud.py +2 -0
  9. sky/clouds/cudo.py +2 -0
  10. sky/clouds/do.py +10 -7
  11. sky/clouds/fluidstack.py +2 -0
  12. sky/clouds/gcp.py +10 -7
  13. sky/clouds/hyperbolic.py +10 -7
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +26 -9
  16. sky/clouds/lambda_cloud.py +10 -7
  17. sky/clouds/nebius.py +10 -7
  18. sky/clouds/oci.py +10 -7
  19. sky/clouds/paperspace.py +10 -7
  20. sky/clouds/runpod.py +10 -7
  21. sky/clouds/scp.py +10 -7
  22. sky/clouds/vast.py +10 -7
  23. sky/clouds/vsphere.py +2 -0
  24. sky/core.py +1 -0
  25. sky/dag.py +14 -0
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  30. sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  32. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  37. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  38. sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  54. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  55. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  56. sky/dashboard/out/clusters/[cluster].html +1 -1
  57. sky/dashboard/out/clusters.html +1 -1
  58. sky/dashboard/out/config.html +1 -1
  59. sky/dashboard/out/index.html +1 -1
  60. sky/dashboard/out/infra/[context].html +1 -1
  61. sky/dashboard/out/infra.html +1 -1
  62. sky/dashboard/out/jobs/[job].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -0
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage_utils.py +2 -4
  70. sky/exceptions.py +15 -0
  71. sky/execution.py +5 -0
  72. sky/global_user_state.py +129 -0
  73. sky/jobs/client/sdk.py +13 -11
  74. sky/jobs/server/core.py +4 -0
  75. sky/models.py +16 -0
  76. sky/provision/__init__.py +26 -0
  77. sky/provision/kubernetes/__init__.py +3 -0
  78. sky/provision/kubernetes/instance.py +38 -77
  79. sky/provision/kubernetes/utils.py +52 -2
  80. sky/provision/kubernetes/volume.py +147 -0
  81. sky/resources.py +20 -76
  82. sky/serve/client/sdk.py +13 -13
  83. sky/serve/server/core.py +5 -1
  84. sky/server/common.py +40 -5
  85. sky/server/constants.py +5 -1
  86. sky/server/metrics.py +105 -0
  87. sky/server/requests/executor.py +30 -14
  88. sky/server/requests/payloads.py +16 -0
  89. sky/server/requests/requests.py +35 -1
  90. sky/server/rest.py +152 -0
  91. sky/server/server.py +66 -16
  92. sky/server/state.py +20 -0
  93. sky/server/stream_utils.py +8 -3
  94. sky/server/uvicorn.py +153 -13
  95. sky/setup_files/dependencies.py +2 -0
  96. sky/skylet/constants.py +14 -3
  97. sky/task.py +141 -18
  98. sky/templates/kubernetes-ray.yml.j2 +30 -1
  99. sky/users/permission.py +2 -0
  100. sky/utils/context.py +3 -1
  101. sky/utils/resources_utils.py +66 -0
  102. sky/utils/rich_utils.py +6 -0
  103. sky/utils/schemas.py +146 -3
  104. sky/utils/status_lib.py +10 -0
  105. sky/utils/validator.py +11 -1
  106. sky/volumes/__init__.py +0 -0
  107. sky/volumes/client/__init__.py +0 -0
  108. sky/volumes/client/sdk.py +64 -0
  109. sky/volumes/server/__init__.py +0 -0
  110. sky/volumes/server/core.py +199 -0
  111. sky/volumes/server/server.py +85 -0
  112. sky/volumes/utils.py +158 -0
  113. sky/volumes/volume.py +198 -0
  114. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  115. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
  116. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  124. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  125. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  126. sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  131. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  136. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  137. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  138. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  139. /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
  140. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  141. /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
  142. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  143. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  144. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  145. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
155
155
  if request_task.status > requests_lib.RequestStatus.RUNNING:
156
156
  if (request_task.status ==
157
157
  requests_lib.RequestStatus.CANCELLED):
158
- buffer.append(
159
- f'{request_task.name!r} request {request_id}'
160
- ' cancelled\n')
158
+ if request_task.should_retry:
159
+ buffer.append(
160
+ message_utils.encode_payload(
161
+ rich_utils.Control.RETRY.encode('')))
162
+ else:
163
+ buffer.append(
164
+ f'{request_task.name!r} request {request_id}'
165
+ ' cancelled\n')
161
166
  break
162
167
  if not follow:
163
168
  break
sky/server/uvicorn.py CHANGED
@@ -3,17 +3,165 @@
3
3
  This module is a wrapper around uvicorn to customize the behavior of the
4
4
  server.
5
5
  """
6
- import functools
6
+ import asyncio
7
7
  import os
8
+ import signal
8
9
  import threading
9
- from typing import Optional
10
+ import time
11
+ from types import FrameType
12
+ from typing import Optional, Union
10
13
 
14
+ import filelock
11
15
  import uvicorn
12
16
  from uvicorn.supervisors import multiprocess
13
17
 
18
+ from sky import sky_logging
19
+ from sky.server import state
20
+ from sky.server.requests import requests as requests_lib
21
+ from sky.skylet import constants
14
22
  from sky.utils import context_utils
15
23
  from sky.utils import subprocess_utils
16
24
 
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+ # File lock path for coordinating graceful shutdown across processes
28
+ _GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'
29
+
30
+ # Interval to check for on-going requests.
31
+ _WAIT_REQUESTS_INTERVAL_SECONDS = 5
32
+
33
+ # Timeout for waiting for on-going requests to finish.
34
+ try:
35
+ _WAIT_REQUESTS_TIMEOUT_SECONDS = int(
36
+ os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60'))
37
+ except ValueError:
38
+ _WAIT_REQUESTS_TIMEOUT_SECONDS = 60
39
+
40
+ # TODO(aylei): use decorator to register requests that need to be proactively
41
+ # cancelled instead of hardcoding here.
42
+ _RETRIABLE_REQUEST_NAMES = [
43
+ 'sky.logs',
44
+ 'sky.jobs.logs',
45
+ 'sky.serve.logs',
46
+ ]
47
+
48
+
49
+ class Server(uvicorn.Server):
50
+ """Server wrapper for uvicorn.
51
+
52
+ Extended functionalities:
53
+ - Handle exit signal and perform custom graceful shutdown.
54
+ - Run the server process in a context-aware manner.
55
+ """
56
+
57
+ def __init__(self, config: uvicorn.Config):
58
+ super().__init__(config=config)
59
+ self.exiting: bool = False
60
+
61
+ def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
62
+ """Handle exit signal.
63
+
64
+ When a server process receives a SIGTERM or SIGINT signal, a graceful
65
+ shutdown will be initiated. If a SIGINT signal is received again, the
66
+ server will be forcefully shutdown.
67
+ """
68
+ if self.exiting and sig == signal.SIGINT:
69
+ # The server has been signaled to exit and received a SIGINT again,
70
+ # do force shutdown.
71
+ logger.info('Force shutdown.')
72
+ self.should_exit = True
73
+ super().handle_exit(sig, frame)
74
+ return
75
+ if not self.exiting:
76
+ self.exiting = True
77
+ # Perform graceful shutdown in a separate thread to avoid blocking
78
+ # the main thread.
79
+ threading.Thread(target=self._graceful_shutdown,
80
+ args=(sig, frame),
81
+ daemon=True).start()
82
+
83
+ def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
84
+ None]) -> None:
85
+ """Perform graceful shutdown."""
86
+ # Block new requests so that we can wait until all on-going requests
87
+ # are finished. Note that /api/$verb operations are still allowed in
88
+ # this stage to ensure the client can still operate the on-going
89
+ # requests, e.g. /api/logs, /api/cancel, etc.
90
+ logger.info('Block new requests being submitted in worker '
91
+ f'{os.getpid()}.')
92
+ state.set_block_requests(True)
93
+ # Ensure the shutting_down state is set on all workers before the next step.
94
+ # TODO(aylei): hacky, need a reliable solution.
95
+ time.sleep(1)
96
+
97
+ lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
98
+ # Elect a coordinator process to handle on-going requests check
99
+ with lock.acquire():
100
+ logger.info(f'Worker {os.getpid()} elected as shutdown coordinator')
101
+ self._wait_requests()
102
+
103
+ logger.info('Shutting down server...')
104
+ self.should_exit = True
105
+ super().handle_exit(sig, frame)
106
+
107
+ def _wait_requests(self) -> None:
108
+ """Wait until all on-going requests are finished or cancelled."""
109
+ start_time = time.time()
110
+ while True:
111
+ statuses = [
112
+ requests_lib.RequestStatus.PENDING,
113
+ requests_lib.RequestStatus.RUNNING,
114
+ ]
115
+ reqs = requests_lib.get_request_tasks(status=statuses)
116
+ if not reqs:
117
+ break
118
+ logger.info(f'{len(reqs)} on-going requests '
119
+ 'found, waiting for them to finish...')
120
+ # Proactively cancel internal requests and logs requests since
121
+ # they can run for infinite time.
122
+ internal_request_ids = [
123
+ d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
124
+ ]
125
+ if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
126
+ logger.warning('Timeout waiting for on-going requests to '
127
+ 'finish, cancelling all on-going requests.')
128
+ for req in reqs:
129
+ self.interrupt_request_for_retry(req.request_id)
130
+ break
131
+ interrupted = 0
132
+ for req in reqs:
133
+ if req.request_id in internal_request_ids:
134
+ self.interrupt_request_for_retry(req.request_id)
135
+ interrupted += 1
136
+ elif req.name in _RETRIABLE_REQUEST_NAMES:
137
+ self.interrupt_request_for_retry(req.request_id)
138
+ interrupted += 1
139
+ # TODO(aylei): interrupt pending requests to accelerate the
140
+ # shutdown.
141
+ # If some requests are not interrupted, wait for them to finish,
142
+ # otherwise we just check again immediately to accelerate the
143
+ # shutdown process.
144
+ if interrupted < len(reqs):
145
+ time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
146
+
147
+ def interrupt_request_for_retry(self, request_id: str) -> None:
148
+ """Interrupt a request for retry."""
149
+ with requests_lib.update_request(request_id) as req:
150
+ if req is None:
151
+ return
152
+ if req.pid is not None:
153
+ os.kill(req.pid, signal.SIGTERM)
154
+ req.status = requests_lib.RequestStatus.CANCELLED
155
+ req.should_retry = True
156
+ logger.info(
157
+ f'Request {request_id} interrupted and will be retried by client.')
158
+
159
+ def run(self, *args, **kwargs):
160
+ """Run the server process."""
161
+ context_utils.hijack_sys_attrs()
162
+ with self.capture_signals():
163
+ asyncio.run(self.serve(*args, **kwargs))
164
+
17
165
 
18
166
  def run(config: uvicorn.Config):
19
167
  """Run uvicorn server."""
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
22
170
  # in uvicorn. Since we do not use reload now, simply
23
171
  # guard by an exception.
24
172
  raise ValueError('Reload is not supported yet.')
25
- server = uvicorn.Server(config=config)
26
- run_server_process = functools.partial(_run_server_process, server)
173
+ server = Server(config=config)
27
174
  try:
28
175
  if config.workers is not None and config.workers > 1:
29
176
  sock = config.bind_socket()
30
- SlowStartMultiprocess(config,
31
- target=run_server_process,
177
+ SlowStartMultiprocess(config, target=server.run,
32
178
  sockets=[sock]).run()
33
179
  else:
34
- run_server_process()
180
+ server.run()
35
181
  finally:
36
182
  # Copied from uvicorn.run()
37
183
  if config.uds and os.path.exists(config.uds):
38
184
  os.remove(config.uds)
39
185
 
40
186
 
41
- def _run_server_process(server: uvicorn.Server, *args, **kwargs):
42
- """Run the server process with contextually aware."""
43
- context_utils.hijack_sys_attrs()
44
- server.run(*args, **kwargs)
45
-
46
-
47
187
  class SlowStartMultiprocess(multiprocess.Multiprocess):
48
188
  """Uvicorn Multiprocess wrapper with slow start.
49
189
 
@@ -62,6 +62,8 @@ install_requires = [
62
62
  # the client-side actually not importing them.
63
63
  'casbin',
64
64
  'sqlalchemy_adapter',
65
+ # Required for API server metrics
66
+ 'prometheus_client>=0.8.0',
65
67
  'passlib',
66
68
  ]
67
69
 
sky/skylet/constants.py CHANGED
@@ -413,6 +413,8 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
413
413
  # Environment variable that is set to 'true' if this is a skypilot server.
414
414
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
415
415
 
416
+ # Environment variable that is set to 'true' if metrics are enabled.
417
+ ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
416
418
  # Environment variable that is set to 'true' if basic
417
419
  # authentication is enabled in the API server.
418
420
  ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
@@ -449,20 +451,29 @@ TIME_PATTERN: str = (
449
451
 
450
452
  MEMORY_SIZE_UNITS = {
451
453
  'kb': 2**10,
454
+ 'ki': 2**10,
452
455
  'mb': 2**20,
456
+ 'mi': 2**20,
453
457
  'gb': 2**30,
458
+ 'gi': 2**30,
454
459
  'tb': 2**40,
460
+ 'ti': 2**40,
455
461
  'pb': 2**50,
462
+ 'pi': 2**50,
456
463
  }
457
464
 
458
465
  MEMORY_SIZE_PATTERN = (
459
466
  '^[0-9]+('
460
- f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
461
- ')?$/i')
462
- MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
467
+ f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
468
+ f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
469
+ f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
470
+ ')?$')
471
+
472
+ LAST_USE_TRUNC_LENGTH = 25
463
473
 
464
474
  MIN_PRIORITY = -1000
465
475
  MAX_PRIORITY = 1000
466
476
  DEFAULT_PRIORITY = 0
467
477
 
478
+ GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
468
479
  COST_REPORT_DEFAULT_DAYS = 30
sky/task.py CHANGED
@@ -24,6 +24,7 @@ from sky.skylet import constants
24
24
  from sky.utils import common_utils
25
25
  from sky.utils import schemas
26
26
  from sky.utils import ux_utils
27
+ from sky.volumes import volume as volume_lib
27
28
 
28
29
  if typing.TYPE_CHECKING:
29
30
  import yaml
@@ -246,12 +247,14 @@ class Task:
246
247
  secrets: Optional[Dict[str, str]] = None,
247
248
  workdir: Optional[str] = None,
248
249
  num_nodes: Optional[int] = None,
250
+ volumes: Optional[Dict[str, str]] = None,
249
251
  # Advanced:
250
252
  docker_image: Optional[str] = None,
251
253
  event_callback: Optional[str] = None,
252
254
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
253
255
  # Internal use only.
254
256
  file_mounts_mapping: Optional[Dict[str, str]] = None,
257
+ volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
255
258
  ):
256
259
  """Initializes a Task.
257
260
 
@@ -319,6 +322,7 @@ class Task:
319
322
  self.setup = setup
320
323
  self._envs = envs or {}
321
324
  self._secrets = secrets or {}
325
+ self._volumes = volumes or {}
322
326
 
323
327
  # Validate Docker login configuration early if both envs and secrets
324
328
  # contain Docker variables
@@ -361,7 +365,9 @@ class Task:
361
365
  self.best_resources: Optional[sky.Resources] = None
362
366
 
363
367
  # For internal use only.
364
- self.file_mounts_mapping = file_mounts_mapping
368
+ self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
369
+ self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
370
+ volume_mounts)
365
371
 
366
372
  dag = sky.dag.get_current_dag()
367
373
  if dag is not None:
@@ -442,12 +448,9 @@ class Task:
442
448
  if self.file_mounts is None:
443
449
  return
444
450
  for target, source in self.file_mounts.items():
445
- if target.endswith('/') or source.endswith('/'):
446
- with ux_utils.print_exception_no_traceback():
447
- raise ValueError(
448
- 'File mount paths cannot end with a slash '
449
- '(try "/mydir: /mydir" or "/myfile: /myfile"). '
450
- f'Found: target={target} source={source}')
451
+ location = f'file_mounts.{target}: {source}'
452
+ self._validate_mount_path(target, location)
453
+ self._validate_path(source, location)
451
454
  if data_utils.is_cloud_store_url(target):
452
455
  with ux_utils.print_exception_no_traceback():
453
456
  raise ValueError(
@@ -462,17 +465,25 @@ class Task:
462
465
  f'File mount source {source!r} does not exist '
463
466
  'locally. To fix: check if it exists, and correct '
464
467
  'the path.')
465
- # TODO(zhwu): /home/username/sky_workdir as the target path need
466
- # to be filtered out as well.
467
- if (target == constants.SKY_REMOTE_WORKDIR and
468
- self.workdir is not None):
469
- with ux_utils.print_exception_no_traceback():
470
- raise ValueError(
471
- f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
472
- 'destination path of a file mount, as it will be used '
473
- 'by the workdir. If uploading a file/folder to the '
474
- 'workdir is needed, please specify the full path to '
475
- 'the file/folder.')
468
+
469
+ def _validate_mount_path(self, path: str, location: str):
470
+ self._validate_path(path, location)
471
+ # TODO(zhwu): /home/username/sky_workdir as the target path need
472
+ # to be filtered out as well.
473
+ if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
474
+ with ux_utils.print_exception_no_traceback():
475
+ raise ValueError(
476
+ f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
477
+ 'destination path of a file mount, as it will be used '
478
+ 'by the workdir. If uploading a file/folder to the '
479
+ 'workdir is needed, please specify the full path to '
480
+ 'the file/folder.')
481
+
482
+ def _validate_path(self, path: str, location: str):
483
+ if path.endswith('/'):
484
+ with ux_utils.print_exception_no_traceback():
485
+ raise ValueError('Mount paths cannot end with a slash '
486
+ f'Found: {path} in {location}')
476
487
 
477
488
  def expand_and_validate_workdir(self):
478
489
  """Expand workdir to absolute path and validate it.
@@ -587,6 +598,7 @@ class Task:
587
598
  secrets=config.pop('secrets', None),
588
599
  event_callback=config.pop('event_callback', None),
589
600
  file_mounts_mapping=config.pop('file_mounts_mapping', None),
601
+ volumes=config.pop('volumes', None),
590
602
  )
591
603
 
592
604
  # Create lists to store storage objects inlined in file_mounts.
@@ -711,6 +723,16 @@ class Task:
711
723
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
712
724
  task.set_service(service)
713
725
 
726
+ volume_mounts = config.pop('volume_mounts', None)
727
+ if volume_mounts is not None:
728
+ task.volume_mounts = []
729
+ for vol in volume_mounts:
730
+ common_utils.validate_schema(vol,
731
+ schemas.get_volume_mount_schema(),
732
+ 'Invalid volume mount config: ')
733
+ volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
734
+ task.volume_mounts.append(volume_mount)
735
+
714
736
  assert not config, f'Invalid task args: {config.keys()}'
715
737
  return task
716
738
 
@@ -745,6 +767,97 @@ class Task:
745
767
  config = {}
746
768
  return Task.from_yaml_config(config)
747
769
 
770
+ def resolve_and_validate_volumes(self) -> None:
771
+ """Resolve volumes config to volume mounts and validate them.
772
+
773
+ Raises:
774
+ exceptions.VolumeNotFoundError: if any volume is not found.
775
+ exceptions.VolumeTopologyConflictError: if there is conflict in the
776
+ volumes and compute topology.
777
+ """
778
+ # Volumes have already been resolved; a typical case is that the API server
779
+ # has resolved the volumes and the dag was then submitted to
780
+ # controllers.
781
+ if self.volume_mounts is not None:
782
+ return None
783
+ if not self._volumes:
784
+ return None
785
+ volume_mounts: List[volume_lib.VolumeMount] = []
786
+ for dst_path, vol in self._volumes.items():
787
+ self._validate_mount_path(dst_path, location='volumes')
788
+ # Shortcut for `dst_path: volume_name`
789
+ if isinstance(vol, str):
790
+ volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
791
+ elif isinstance(vol, dict):
792
+ assert 'name' in vol, 'Volume name must be set.'
793
+ volume_mount = volume_lib.VolumeMount.resolve(
794
+ dst_path, vol['name'])
795
+ else:
796
+ raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
797
+ volume_mounts.append(volume_mount)
798
+ # Disable certain access modes
799
+ disabled_modes = {}
800
+ if self.num_nodes > 1:
801
+ disabled_modes[
802
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
803
+ 'access mode ReadWriteOnce is not supported for '
804
+ 'multi-node tasks.')
805
+ disabled_modes[
806
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
807
+ 'access mode ReadWriteOncePod is not supported for '
808
+ 'multi-node tasks.')
809
+ # TODO(aylei): generalize access mode to all volume types
810
+ # Record the required topology and the volume that requires it, e.g.
811
+ # {'cloud': ('volume_name', 'aws')}
812
+ topology: Dict[str, Tuple[str, Optional[str]]] = {
813
+ 'cloud': ('', None),
814
+ 'region': ('', None),
815
+ 'zone': ('', None),
816
+ }
817
+ for vol in volume_mounts:
818
+ # Check access mode
819
+ access_mode = vol.volume_config.config.get('access_mode', '')
820
+ if access_mode in disabled_modes:
821
+ raise ValueError(f'Volume {vol.volume_name} with '
822
+ f'{disabled_modes[access_mode]}')
823
+ # Check topology
824
+ for key, (vol_name, previous_req) in topology.items():
825
+ req = getattr(vol.volume_config, key)
826
+ if req is not None:
827
+ if previous_req is not None and req != previous_req:
828
+ raise exceptions.VolumeTopologyConflictError(
829
+ f'Volume {vol.volume_name} can only be attached on '
830
+ f'{key}:{req}, which conflicts with another volume '
831
+ f'{vol_name} that requires {key}:{previous_req}.'
832
+ f'Please use different volumes and retry.')
833
+ topology[key] = (vol_name, req)
834
+ # Now we have the topology requirements from the intersection of all
835
+ # volumes. Check if there is topology conflict with the resources.
836
+ # Volume must have no conflict with ALL resources even if user
837
+ # specifies 'any_of' resources to ensure no resources will conflict
838
+ # with the volumes during failover.
839
+
840
+ for res in self.resources:
841
+ for key, (vol_name, vol_req) in topology.items():
842
+ req = getattr(res, key)
843
+ if (req is not None and vol_req is not None and
844
+ str(req) != vol_req):
845
+ raise exceptions.VolumeTopologyConflictError(
846
+ f'The task requires {key}:{req}, which conflicts with '
847
+ f'the volume constraint {key}:{vol_req}. Please '
848
+ f'use different volumes and retry.')
849
+ # No topology conflict, we safely override the topology of resources to
850
+ # satisfy the volume constraints.
851
+ override_params = {}
852
+ for key, (vol_name, vol_req) in topology.items():
853
+ if vol_req is not None:
854
+ if key == 'cloud':
855
+ override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
856
+ else:
857
+ override_params[key] = vol_req
858
+ self.set_resources_override(override_params)
859
+ self.volume_mounts = volume_mounts
860
+
748
861
  @property
749
862
  def num_nodes(self) -> int:
750
863
  return self._num_nodes
@@ -767,6 +880,10 @@ class Task:
767
880
  def secrets(self) -> Dict[str, str]:
768
881
  return self._secrets
769
882
 
883
+ @property
884
+ def volumes(self) -> Dict[str, str]:
885
+ return self._volumes
886
+
770
887
  def update_envs(
771
888
  self, envs: Union[None, List[Tuple[str, str]],
772
889
  Dict[str, str]]) -> 'Task':
@@ -1453,6 +1570,12 @@ class Task:
1453
1570
  })
1454
1571
 
1455
1572
  add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
1573
+ add_if_not_none('volumes', self.volumes)
1574
+ if self.volume_mounts is not None:
1575
+ config['volume_mounts'] = [
1576
+ volume_mount.to_yaml_config()
1577
+ for volume_mount in self.volume_mounts
1578
+ ]
1456
1579
  return config
1457
1580
 
1458
1581
  def get_required_cloud_features(
@@ -243,6 +243,22 @@ provider:
243
243
  # This selector must match the head node pod's selector below.
244
244
  selector:
245
245
  component: {{cluster_name_on_cloud}}-head
246
+ # Headless service mapping hostnames to the rest of the worker nodes
247
+ {% for worker_id in range(1, num_nodes) %}
248
+ - apiVersion: v1
249
+ kind: Service
250
+ metadata:
251
+ labels:
252
+ parent: skypilot
253
+ skypilot-cluster: {{cluster_name_on_cloud}}
254
+ skypilot-user: {{ user }}
255
+ name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
256
+ spec:
257
+ selector:
258
+ component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
259
+ clusterIP: None
260
+ {% endfor %}
261
+
246
262
 
247
263
  # Specify the pod type for the ray head node (as configured below).
248
264
  head_node_type: ray_head_default
@@ -255,7 +271,7 @@ available_node_types:
255
271
  metadata:
256
272
  # name will be filled in the provisioner
257
273
  # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
258
- # service is required.
274
+ # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
259
275
  labels:
260
276
  parent: skypilot
261
277
  # component will be set for the head node pod to be the same as the head node service selector above if a
@@ -287,6 +303,10 @@ available_node_types:
287
303
  serviceAccountName: {{k8s_service_account_name}}
288
304
  automountServiceAccountToken: {{k8s_automount_sa_token}}
289
305
  restartPolicy: {{ "Always" if high_availability else "Never" }}
306
+ {% if volume_mounts %}
307
+ securityContext:
308
+ fsGroup: 1000
309
+ {% endif %}
290
310
 
291
311
  # Add node selector if GPU/TPUs are requested:
292
312
  {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -365,6 +385,11 @@ available_node_types:
365
385
  persistentVolumeClaim:
366
386
  claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
367
387
  {% endif %}
388
+ {% for volume_mount in volume_mounts %}
389
+ - name: {{volume_mount.name}}
390
+ persistentVolumeClaim:
391
+ claimName: {{volume_mount.volume_name_on_cloud}}
392
+ {% endfor %}
368
393
  containers:
369
394
  - name: ray-node
370
395
  imagePullPolicy: IfNotPresent
@@ -734,6 +759,10 @@ available_node_types:
734
759
  - name: fusermount-shared-dir
735
760
  mountPath: {{k8s_fusermount_shared_dir}}
736
761
  {% endif %}
762
+ {% for volume_mount in volume_mounts %}
763
+ - name: {{volume_mount.name}}
764
+ mountPath: {{volume_mount.path}}
765
+ {% endfor %}
737
766
  resources:
738
767
  requests:
739
768
  cpu: {{cpus}}
sky/users/permission.py CHANGED
@@ -18,6 +18,8 @@ from sky.utils import common_utils
18
18
 
19
19
  logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
20
20
  logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
21
+ logging.getLogger('casbin.model').setLevel(sky_logging.ERROR)
22
+ logging.getLogger('casbin.rbac').setLevel(sky_logging.ERROR)
21
23
  logger = sky_logging.init_logger(__name__)
22
24
 
23
25
  # Filelocks for the policy update.
sky/utils/context.py CHANGED
@@ -254,7 +254,9 @@ class Popen(subprocess.Popen):
254
254
  def __init__(self, *args, **kwargs):
255
255
  env = kwargs.pop('env', None)
256
256
  if env is None:
257
- env = os.environ
257
+ # Pass a copy of the current context.environ to avoid a race condition
258
+ # when the context is updated after the Popen is created.
259
+ env = os.environ.copy()
258
260
  super().__init__(*args, env=env, **kwargs)
259
261
 
260
262
 
@@ -8,6 +8,7 @@ import typing
8
8
  from typing import Dict, List, Optional, Set, Union
9
9
 
10
10
  from sky import skypilot_config
11
+ from sky.skylet import constants
11
12
  from sky.utils import common_utils
12
13
  from sky.utils import registry
13
14
  from sky.utils import ux_utils
@@ -331,3 +332,68 @@ def make_launchables_for_valid_region_zones(
331
332
  # Batch the requests at the granularity of a single region.
332
333
  launchables.append(launchable_resources.copy(region=region.name))
333
334
  return launchables
335
+
336
+
337
+ def parse_memory_resource(resource_qty_str: Union[str, int, float],
338
+ field_name: str,
339
+ ret_type: type = int,
340
+ unit: str = 'gb',
341
+ allow_plus: bool = False,
342
+ allow_x: bool = False,
343
+ allow_rounding: bool = False) -> str:
344
+ """Returns memory size in chosen units given a resource quantity string.
345
+
346
+ Args:
347
+ resource_qty_str: Resource quantity string
348
+ unit: Unit to convert to
349
+ allow_plus: Whether to allow '+' suffix
350
+ allow_x: Whether to allow 'x' suffix
351
+ """
352
+ assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
353
+
354
+ error_msg = (f'"{field_name}" field should be a '
355
+ f'{constants.MEMORY_SIZE_PATTERN}+?,'
356
+ f' got {resource_qty_str}')
357
+
358
+ resource_str = str(resource_qty_str)
359
+
360
+ # Handle plus and x suffixes, x is only used internally for jobs controller
361
+ plus = ''
362
+ if resource_str.endswith('+'):
363
+ if allow_plus:
364
+ resource_str = resource_str[:-1]
365
+ plus = '+'
366
+ else:
367
+ raise ValueError(error_msg)
368
+
369
+ x = ''
370
+ if resource_str.endswith('x'):
371
+ if allow_x:
372
+ resource_str = resource_str[:-1]
373
+ x = 'x'
374
+ else:
375
+ raise ValueError(error_msg)
376
+
377
+ try:
378
+ # We assume it is already in the wanted units to maintain backwards
379
+ # compatibility
380
+ ret_type(resource_str)
381
+ return f'{resource_str}{plus}{x}'
382
+ except ValueError:
383
+ pass
384
+
385
+ resource_str = resource_str.lower()
386
+ for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
387
+ if resource_str.endswith(mem_unit):
388
+ try:
389
+ value = ret_type(resource_str[:-len(mem_unit)])
390
+ converted = (value * multiplier /
391
+ constants.MEMORY_SIZE_UNITS[unit])
392
+ if not allow_rounding and ret_type(converted) != converted:
393
+ raise ValueError(error_msg)
394
+ converted = ret_type(converted)
395
+ return f'{converted}{plus}{x}'
396
+ except ValueError:
397
+ continue
398
+
399
+ raise ValueError(error_msg)