skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (120)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/serve/server/impl.py CHANGED
@@ -11,7 +11,6 @@ import uuid
11
11
  import colorama
12
12
  import filelock
13
13
 
14
- import sky
15
14
  from sky import backends
16
15
  from sky import exceptions
17
16
  from sky import execution
@@ -25,6 +24,7 @@ from sky.serve import constants as serve_constants
25
24
  from sky.serve import serve_state
26
25
  from sky.serve import serve_utils
27
26
  from sky.skylet import constants
27
+ from sky.skylet import job_lib
28
28
  from sky.utils import admin_policy_utils
29
29
  from sky.utils import command_runner
30
30
  from sky.utils import common
@@ -39,7 +39,7 @@ logger = sky_logging.init_logger(__name__)
39
39
 
40
40
 
41
41
  def _rewrite_tls_credential_paths_and_get_tls_env_vars(
42
- service_name: str, task: 'sky.Task') -> Dict[str, Any]:
42
+ service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
43
43
  """Rewrite the paths of TLS credentials in the task.
44
44
 
45
45
  Args:
@@ -103,15 +103,11 @@ def _get_service_record(
103
103
 
104
104
 
105
105
  def up(
106
- task: 'sky.Task',
106
+ task: 'task_lib.Task',
107
107
  service_name: Optional[str] = None,
108
108
  pool: bool = False,
109
109
  ) -> Tuple[str, str]:
110
110
  """Spins up a service or a pool."""
111
- if pool and not serve_utils.is_consolidation_mode(pool):
112
- raise ValueError(
113
- 'Pool is only supported in consolidation mode. To fix, set '
114
- '`jobs.controller.consolidation_mode: true` in SkyPilot config.')
115
111
  task.validate()
116
112
  serve_utils.validate_service_task(task, pool=pool)
117
113
  assert task.service is not None
@@ -191,8 +187,7 @@ def up(
191
187
  controller_log_file = (
192
188
  serve_utils.generate_remote_controller_log_file_name(service_name))
193
189
  controller_resources = controller_utils.get_controller_resources(
194
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
195
- task_resources=task.resources)
190
+ controller=controller, task_resources=task.resources)
196
191
  controller_job_id = None
197
192
  if serve_utils.is_consolidation_mode(pool):
198
193
  # We need a unique integer per sky.serve.up call to avoid name
@@ -228,10 +223,11 @@ def up(
228
223
  # balancer port from the controller? So we don't need to open so many
229
224
  # ports here. Or, we should have a nginx traffic control to refuse
230
225
  # any connection to the unregistered ports.
231
- controller_resources = {
232
- r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
233
- for r in controller_resources
234
- }
226
+ if not pool:
227
+ controller_resources = {
228
+ r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
229
+ for r in controller_resources
230
+ }
235
231
  controller_task.set_resources(controller_resources)
236
232
 
237
233
  # # Set service_name so the backend will know to modify default ray
@@ -325,7 +321,7 @@ def up(
325
321
  [controller_job_id],
326
322
  stream_logs=False)
327
323
  controller_job_status = list(statuses.values())[0]
328
- if controller_job_status == sky.JobStatus.PENDING:
324
+ if controller_job_status == job_lib.JobStatus.PENDING:
329
325
  # Max number of services reached due to vCPU constraint.
330
326
  # The controller job is pending due to ray job scheduling.
331
327
  # We manually cancel the job here.
@@ -350,7 +346,7 @@ def up(
350
346
  else:
351
347
  lb_port = serve_utils.load_service_initialization_result(
352
348
  lb_port_payload)
353
- if not serve_utils.is_consolidation_mode(pool):
349
+ if not serve_utils.is_consolidation_mode(pool) and not pool:
354
350
  socket_endpoint = backend_utils.get_endpoints(
355
351
  controller_handle.cluster_name,
356
352
  lb_port,
@@ -374,10 +370,10 @@ def up(
374
370
  f'\n📋 Useful Commands'
375
371
  f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
376
372
  f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
377
- f'<run-command>{ux_utils.RESET_BOLD}'
373
+ f'<yaml_file>{ux_utils.RESET_BOLD}'
378
374
  f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
379
375
  f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
380
- f'--num-jobs 10 <run-command>{ux_utils.RESET_BOLD}'
376
+ f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
381
377
  f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
382
378
  f'{ux_utils.BOLD}sky jobs pool status {service_name}'
383
379
  f'{ux_utils.RESET_BOLD}'
@@ -421,7 +417,7 @@ def up(
421
417
 
422
418
 
423
419
  def update(
424
- task: 'sky.Task',
420
+ task: 'task_lib.Task',
425
421
  service_name: str,
426
422
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
427
423
  pool: bool = False,
@@ -576,7 +572,7 @@ def update(
576
572
 
577
573
 
578
574
  def apply(
579
- task: 'sky.Task',
575
+ task: 'task_lib.Task',
580
576
  service_name: str,
581
577
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
582
578
  pool: bool = False,
sky/serve/service.py CHANGED
@@ -15,11 +15,13 @@ import filelock
15
15
 
16
16
  from sky import authentication
17
17
  from sky import exceptions
18
+ from sky import global_user_state
18
19
  from sky import sky_logging
19
20
  from sky import task as task_lib
20
21
  from sky.backends import backend_utils
21
22
  from sky.backends import cloud_vm_ray_backend
22
23
  from sky.data import data_utils
24
+ from sky.jobs import scheduler as jobs_scheduler
23
25
  from sky.serve import constants
24
26
  from sky.serve import controller
25
27
  from sky.serve import load_balancer
@@ -28,6 +30,7 @@ from sky.serve import serve_state
28
30
  from sky.serve import serve_utils
29
31
  from sky.skylet import constants as skylet_constants
30
32
  from sky.utils import common_utils
33
+ from sky.utils import controller_utils
31
34
  from sky.utils import subprocess_utils
32
35
  from sky.utils import ux_utils
33
36
 
@@ -120,7 +123,16 @@ def _cleanup(service_name: str) -> bool:
120
123
  replica_infos = serve_state.get_replica_infos(service_name)
121
124
  info2proc: Dict[replica_managers.ReplicaInfo,
122
125
  multiprocessing.Process] = dict()
126
+ # NOTE(dev): This relies on `sky/serve/serve_utils.py::
127
+ # generate_replica_cluster_name`. Change it if you change the function.
128
+ existing_cluster_names = global_user_state.get_cluster_names_start_with(
129
+ service_name)
123
130
  for info in replica_infos:
131
+ if info.cluster_name not in existing_cluster_names:
132
+ logger.info(f'Cluster {info.cluster_name} for replica '
133
+ f'{info.replica_id} not found. Might be a failed '
134
+ 'cluster. Skipping.')
135
+ continue
124
136
  p = multiprocessing.Process(target=replica_managers.terminate_cluster,
125
137
  args=(info.cluster_name,))
126
138
  p.start()
@@ -214,22 +226,25 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
214
226
  service_name, version)
215
227
 
216
228
  if not is_recovery:
217
- if (len(serve_state.get_services()) >=
218
- serve_utils.get_num_service_threshold()):
219
- cleanup_storage(tmp_task_yaml)
220
- with ux_utils.print_exception_no_traceback():
221
- raise RuntimeError('Max number of services reached.')
222
- success = serve_state.add_service(
223
- service_name,
224
- controller_job_id=job_id,
225
- policy=service_spec.autoscaling_policy_str(),
226
- requested_resources_str=backend_utils.get_task_resources_str(task),
227
- load_balancing_policy=service_spec.load_balancing_policy,
228
- status=serve_state.ServiceStatus.CONTROLLER_INIT,
229
- tls_encrypted=service_spec.tls_credential is not None,
230
- pool=service_spec.pool,
231
- controller_pid=os.getpid(),
232
- entrypoint=entrypoint)
229
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
230
+ if not controller_utils.can_start_new_process():
231
+ cleanup_storage(tmp_task_yaml)
232
+ with ux_utils.print_exception_no_traceback():
233
+ raise RuntimeError(
234
+ constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
235
+ success = serve_state.add_service(
236
+ service_name,
237
+ controller_job_id=job_id,
238
+ policy=service_spec.autoscaling_policy_str(),
239
+ requested_resources_str=backend_utils.get_task_resources_str(
240
+ task),
241
+ load_balancing_policy=service_spec.load_balancing_policy,
242
+ status=serve_state.ServiceStatus.CONTROLLER_INIT,
243
+ tls_encrypted=service_spec.tls_credential is not None,
244
+ pool=service_spec.pool,
245
+ controller_pid=os.getpid(),
246
+ entrypoint=entrypoint)
247
+ jobs_scheduler.maybe_schedule_next_jobs()
233
248
  # Directly throw an error here. See sky/serve/api.py::up
234
249
  # for more details.
235
250
  if not success:
sky/server/server.py CHANGED
@@ -17,7 +17,7 @@ import resource
17
17
  import shutil
18
18
  import sys
19
19
  import threading
20
- from typing import Any, Dict, List, Literal, Optional, Set, Tuple
20
+ from typing import Dict, List, Literal, Optional, Set, Tuple
21
21
  import uuid
22
22
  import zipfile
23
23
 
@@ -42,6 +42,7 @@ from sky.data import storage_utils
42
42
  from sky.jobs.server import server as jobs_rest
43
43
  from sky.metrics import utils as metrics_utils
44
44
  from sky.provision.kubernetes import utils as kubernetes_utils
45
+ from sky.schemas.api import responses
45
46
  from sky.serve.server import server as serve_rest
46
47
  from sky.server import common
47
48
  from sky.server import config as server_config
@@ -1531,8 +1532,12 @@ async def api_status(
1531
1532
  return encoded_request_tasks
1532
1533
 
1533
1534
 
1534
- @app.get('/api/health')
1535
- async def health(request: fastapi.Request) -> Dict[str, Any]:
1535
+ @app.get(
1536
+ '/api/health',
1537
+ # response_model_exclude_unset omits unset fields
1538
+ # in the response JSON.
1539
+ response_model_exclude_unset=True)
1540
+ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
1536
1541
  """Checks the health of the API server.
1537
1542
 
1538
1543
  Returns:
@@ -1570,7 +1575,8 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
1570
1575
  # - There is no harm when an malicious client calls /api/health
1571
1576
  # without authentication since no sensitive information is
1572
1577
  # returned.
1573
- return {'status': common.ApiServerStatus.HEALTHY}
1578
+ return responses.APIHealthResponse(
1579
+ status=common.ApiServerStatus.HEALTHY,)
1574
1580
  # TODO(aylei): remove this after min_compatible_api_version >= 14.
1575
1581
  if client_version < 14:
1576
1582
  # For Client with API version < 14, the NEEDS_AUTH status is not
@@ -1579,19 +1585,19 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
1579
1585
  detail='Authentication required')
1580
1586
 
1581
1587
  logger.debug(f'Health endpoint: request.state.auth_user = {user}')
1582
- return {
1583
- 'status': server_status,
1588
+ return responses.APIHealthResponse(
1589
+ status=server_status,
1584
1590
  # Kept for backward compatibility, clients before 0.11.0 will read this
1585
1591
  # field to check compatibility and hint the user to upgrade the CLI.
1586
1592
  # TODO(aylei): remove this field after 0.13.0
1587
- 'api_version': str(server_constants.API_VERSION),
1588
- 'version': sky.__version__,
1589
- 'version_on_disk': common.get_skypilot_version_on_disk(),
1590
- 'commit': sky.__commit__,
1591
- 'user': user.to_dict() if user is not None else None,
1592
- 'basic_auth_enabled': os.environ.get(
1593
- constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false').lower() == 'true',
1594
- }
1593
+ api_version=str(server_constants.API_VERSION),
1594
+ version=sky.__version__,
1595
+ version_on_disk=common.get_skypilot_version_on_disk(),
1596
+ commit=sky.__commit__,
1597
+ basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
1598
+ 'false').lower() == 'true',
1599
+ user=user if user is not None else None,
1600
+ )
1595
1601
 
1596
1602
 
1597
1603
  @app.websocket('/kubernetes-pod-ssh-proxy')
sky/setup_files/dependencies.py CHANGED
@@ -88,17 +88,18 @@ local_ray = [
88
88
  'ray[default] >= 2.2.0, != 2.6.0',
89
89
  ]
90
90
 
91
+ # See requirements-dev.txt for the version of grpc and protobuf
92
+ # used to generate the code during development.
91
93
  remote = [
92
- # Adopted from ray's setup.py:
93
- # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L251-L252
94
- # SkyPilot: != 1.48.0 is required to avoid the error where ray dashboard
95
- # fails to start when ray start is called (#2054).
96
- # Tracking issue: https://github.com/ray-project/ray/issues/30984
97
- 'grpcio >= 1.32.0, != 1.48.0; python_version < \'3.10\'',
98
- 'grpcio >= 1.42.0, != 1.48.0; python_version >= \'3.10\'',
99
- # Adopted from ray's setup.py:
100
- # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L343
101
- 'protobuf >= 3.15.3, != 3.19.5',
94
+ # The grpc version at runtime has to be newer than the version
95
+ # used to generate the code.
96
+ 'grpcio>=1.63.0',
97
+ # >= 5.26.1 because the runtime version can't be older than the version
98
+ # used to generate the code.
99
+ # < 7.0.0 because code generated for a major version V will be supported by
100
+ # protobuf runtimes of version V and V+1.
101
+ # https://protobuf.dev/support/cross-version-runtime-guarantee
102
+ 'protobuf >= 5.26.1, < 7.0.0',
102
103
  ]
103
104
 
104
105
  # NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
sky/skylet/autostop_lib.py CHANGED
@@ -16,8 +16,13 @@ from sky.utils import ux_utils
16
16
 
17
17
  if typing.TYPE_CHECKING:
18
18
  import psutil
19
+
20
+ from sky.schemas.generated import autostopv1_pb2
19
21
  else:
20
22
  psutil = adaptors_common.LazyImport('psutil')
23
+ # To avoid requiring protobuf to be installed on the client side.
24
+ autostopv1_pb2 = adaptors_common.LazyImport(
25
+ 'sky.schemas.generated.autostopv1_pb2')
21
26
 
22
27
  logger = sky_logging.init_logger(__name__)
23
28
 
@@ -55,11 +60,9 @@ Determines the condition for resetting the idleness timer.
55
60
  This option works in conjunction with ``--{pair}``. Options:
56
61
 
57
62
  \b
58
- 1. ``jobs_and_ssh`` (default): Wait for all jobs to complete AND all SSH
59
- sessions to disconnect.
60
- 2. ``jobs``: Wait for all jobs to complete.
61
- 3. ``none``: Stop immediately after idle time expires, regardless of running
62
- jobs or SSH connections."""
63
+ 1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
64
+ 2. ``jobs``: Only wait for in-progress jobs.
65
+ 3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
63
66
 
64
67
  @classmethod
65
68
  def from_str(cls, mode: str) -> 'AutostopWaitFor':
@@ -78,6 +81,36 @@ jobs or SSH connections."""
78
81
  f'\'{cls.JOBS.value}\', or '
79
82
  f'\'{cls.NONE.value}\'. ')
80
83
 
84
+ @classmethod
85
+ def from_protobuf(
86
+ cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
87
+ ) -> Optional['AutostopWaitFor']:
88
+ """Convert protobuf AutostopWaitFor enum to Python enum value."""
89
+ protobuf_to_enum = {
90
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
91
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
92
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
93
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
94
+ }
95
+ if protobuf_value not in protobuf_to_enum:
96
+ with ux_utils.print_exception_no_traceback():
97
+ raise ValueError(
98
+ f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')
99
+ return protobuf_to_enum[protobuf_value]
100
+
101
+ def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
102
+ """Convert this Python enum value to protobuf enum value."""
103
+ enum_to_protobuf = {
104
+ AutostopWaitFor.JOBS_AND_SSH:
105
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
106
+ AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
107
+ AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
108
+ }
109
+ if self not in enum_to_protobuf:
110
+ with ux_utils.print_exception_no_traceback():
111
+ raise ValueError(f'Unknown AutostopWaitFor value: {self}')
112
+ return enum_to_protobuf[self]
113
+
81
114
 
82
115
  DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
83
116
 
sky/skylet/constants.py CHANGED
@@ -90,12 +90,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
90
90
  # cluster yaml is updated.
91
91
  #
92
92
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
93
- SKYLET_VERSION = '16'
93
+ SKYLET_VERSION = '17'
94
94
  # The version of the lib files that skylet/jobs use. Whenever there is an API
95
95
  # change for the job_lib or log_lib, we need to bump this version, so that the
96
96
  # user can be notified to update their SkyPilot version on the remote cluster.
97
97
  SKYLET_LIB_VERSION = 4
98
98
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
99
+ SKYLET_GRPC_PORT = 46590
100
+ SKYLET_GRPC_TIMEOUT_SECONDS = 5
99
101
 
100
102
  # Docker default options
101
103
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
sky/skylet/services.py ADDED
@@ -0,0 +1,44 @@
1
+ """gRPC service implementations for skylet."""
2
+
3
+ import grpc
4
+
5
+ from sky import sky_logging
6
+ from sky.schemas.generated import autostopv1_pb2
7
+ from sky.schemas.generated import autostopv1_pb2_grpc
8
+ from sky.skylet import autostop_lib
9
+
10
+ logger = sky_logging.init_logger(__name__)
11
+
12
+
13
+ class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
14
+ """Implementation of the AutostopService gRPC service."""
15
+
16
+ def SetAutostop( # type: ignore[return]
17
+ self, request: autostopv1_pb2.SetAutostopRequest,
18
+ context: grpc.ServicerContext
19
+ ) -> autostopv1_pb2.SetAutostopResponse:
20
+ """Sets autostop configuration for the cluster."""
21
+ try:
22
+ wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
23
+ request.wait_for)
24
+ autostop_lib.set_autostop(
25
+ idle_minutes=request.idle_minutes,
26
+ backend=request.backend,
27
+ wait_for=wait_for if wait_for is not None else
28
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
29
+ down=request.down)
30
+ return autostopv1_pb2.SetAutostopResponse()
31
+ except Exception as e: # pylint: disable=broad-except
32
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
33
+
34
+ def IsAutostopping( # type: ignore[return]
35
+ self, request: autostopv1_pb2.IsAutostoppingRequest,
36
+ context: grpc.ServicerContext
37
+ ) -> autostopv1_pb2.IsAutostoppingResponse:
38
+ """Checks if the cluster is currently autostopping."""
39
+ try:
40
+ is_autostopping = autostop_lib.get_is_autostopping()
41
+ return autostopv1_pb2.IsAutostoppingResponse(
42
+ is_autostopping=is_autostopping)
43
+ except Exception as e: # pylint: disable=broad-except
44
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
sky/skylet/skylet.py CHANGED
@@ -1,11 +1,17 @@
1
1
  """skylet: a daemon running on the head node of a cluster."""
2
2
 
3
+ import concurrent.futures
4
+ import os
3
5
  import time
4
6
 
7
+ import grpc
8
+
5
9
  import sky
6
10
  from sky import sky_logging
11
+ from sky.schemas.generated import autostopv1_pb2_grpc
7
12
  from sky.skylet import constants
8
13
  from sky.skylet import events
14
+ from sky.skylet import services
9
15
 
10
16
  # Use the explicit logger name so that the logger is under the
11
17
  # `sky.skylet.skylet` namespace when executed directly, so as
@@ -31,7 +37,46 @@ EVENTS = [
31
37
  events.UsageHeartbeatReportEvent(),
32
38
  ]
33
39
 
34
- while True:
35
- time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
36
- for event in EVENTS:
37
- event.run()
40
+
41
+ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
42
+ """Start the gRPC server."""
43
+ # This is the default value in Python 3.8 - 3.12,
44
+ # putting it here for visibility.
45
+ # TODO(kevin): Determine the optimal max number of threads.
46
+ max_workers = min(32, (os.cpu_count() or 1) + 4)
47
+ server = grpc.server(
48
+ concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
49
+
50
+ autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
51
+ services.AutostopServiceImpl(), server)
52
+
53
+ listen_addr = f'127.0.0.1:{port}'
54
+ server.add_insecure_port(listen_addr)
55
+
56
+ server.start()
57
+ logger.info(f'gRPC server started on {listen_addr}')
58
+
59
+ return server
60
+
61
+
62
+ def run_event_loop():
63
+ """Run the existing event loop."""
64
+
65
+ while True:
66
+ time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
67
+ for event in EVENTS:
68
+ event.run()
69
+
70
+
71
+ def main():
72
+ grpc_server = start_grpc_server()
73
+ try:
74
+ run_event_loop()
75
+ except KeyboardInterrupt:
76
+ logger.info('Shutting down skylet...')
77
+ finally:
78
+ grpc_server.stop(grace=5)
79
+
80
+
81
+ if __name__ == '__main__':
82
+ main()
sky/skypilot_config.py CHANGED
@@ -575,8 +575,8 @@ def _reload_config_as_server() -> None:
575
575
  with _DB_USE_LOCK:
576
576
  sqlalchemy_engine = sqlalchemy.create_engine(db_url,
577
577
  poolclass=NullPool)
578
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
579
- sqlalchemy_engine)
578
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
579
+ sqlalchemy_engine)
580
580
 
581
581
  def _get_config_yaml_from_db(
582
582
  key: str) -> Optional[config_utils.Config]:
@@ -867,8 +867,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
867
867
  with _DB_USE_LOCK:
868
868
  sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
869
869
  poolclass=NullPool)
870
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
871
- sqlalchemy_engine)
870
+ db_utils.add_all_tables_to_db_sqlalchemy(
871
+ Base.metadata, sqlalchemy_engine)
872
872
 
873
873
  def _set_config_yaml_to_db(key: str,
874
874
  config: config_utils.Config):
sky/task.py CHANGED
@@ -10,26 +10,25 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
10
10
 
11
11
  import colorama
12
12
 
13
- import sky
14
13
  from sky import clouds
14
+ from sky import dag as dag_lib
15
15
  from sky import exceptions
16
+ from sky import resources as resources_lib
16
17
  from sky import sky_logging
17
18
  from sky.adaptors import common as adaptors_common
18
- import sky.dag
19
19
  from sky.data import data_utils
20
20
  from sky.data import storage as storage_lib
21
21
  from sky.provision import docker_utils
22
22
  from sky.serve import service_spec
23
23
  from sky.skylet import constants
24
24
  from sky.utils import common_utils
25
+ from sky.utils import registry
25
26
  from sky.utils import schemas
26
27
  from sky.utils import ux_utils
27
28
  from sky.utils import volume as volume_lib
28
29
 
29
30
  if typing.TYPE_CHECKING:
30
31
  import yaml
31
-
32
- from sky import resources as resources_lib
33
32
  else:
34
33
  yaml = adaptors_common.LazyImport('yaml')
35
34
 
@@ -382,26 +381,28 @@ class Task:
382
381
  self.estimated_inputs_size_gigabytes: Optional[float] = None
383
382
  self.estimated_outputs_size_gigabytes: Optional[float] = None
384
383
  # Default to CPU VM
385
- self.resources: Union[List[sky.Resources],
386
- Set[sky.Resources]] = {sky.Resources()}
384
+ self.resources: Union[List['resources_lib.Resources'],
385
+ Set['resources_lib.Resources']] = {
386
+ resources_lib.Resources()
387
+ }
387
388
  self._service: Optional[service_spec.SkyServiceSpec] = None
388
389
 
389
390
  # Resources that this task cannot run on.
390
391
  self.blocked_resources = blocked_resources
391
392
 
392
- self.time_estimator_func: Optional[Callable[['sky.Resources'],
393
+ self.time_estimator_func: Optional[Callable[['resources_lib.Resources'],
393
394
  int]] = None
394
395
  self.file_mounts: Optional[Dict[str, str]] = None
395
396
 
396
397
  # Only set when 'self' is a jobs controller task: 'self.managed_job_dag'
397
398
  # is the underlying managed job dag (sky.Dag object).
398
- self.managed_job_dag: Optional['sky.Dag'] = None
399
+ self.managed_job_dag: Optional['dag_lib.Dag'] = None
399
400
 
400
401
  # Only set when 'self' is a sky serve controller task.
401
402
  self.service_name: Optional[str] = None
402
403
 
403
404
  # Filled in by the optimizer. If None, this Task is not planned.
404
- self.best_resources: Optional[sky.Resources] = None
405
+ self.best_resources: Optional['resources_lib.Resources'] = None
405
406
 
406
407
  # For internal use only.
407
408
  self.file_mounts_mapping: Optional[Dict[str,
@@ -418,7 +419,7 @@ class Task:
418
419
  if file_mounts is not None:
419
420
  self.set_file_mounts(file_mounts)
420
421
 
421
- dag = sky.dag.get_current_dag()
422
+ dag = dag_lib.get_current_dag()
422
423
  if dag is not None:
423
424
  dag.add(self)
424
425
 
@@ -783,7 +784,8 @@ class Task:
783
784
  '_cluster_config_overrides'] = cluster_config_override
784
785
  if volumes:
785
786
  resources_config['volumes'] = volumes
786
- task.set_resources(sky.Resources.from_yaml_config(resources_config))
787
+ task.set_resources(
788
+ resources_lib.Resources.from_yaml_config(resources_config))
787
789
 
788
790
  service = config.pop('service', None)
789
791
  pool = config.pop('pool', None)
@@ -931,7 +933,8 @@ class Task:
931
933
  for key, (vol_name, vol_req) in topology.items():
932
934
  if vol_req is not None:
933
935
  if key == 'cloud':
934
- override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
936
+ override_params[key] = registry.CLOUD_REGISTRY.from_str(
937
+ vol_req)
935
938
  else:
936
939
  override_params[key] = vol_req
937
940
  self.set_resources_override(override_params)
@@ -1142,7 +1145,7 @@ class Task:
1142
1145
  Returns:
1143
1146
  self: The current task, with resources set.
1144
1147
  """
1145
- if isinstance(resources, sky.Resources):
1148
+ if isinstance(resources, resources_lib.Resources):
1146
1149
  resources = {resources}
1147
1150
  # TODO(woosuk): Check if the resources are None.
1148
1151
  self.resources = _with_docker_login_config(resources, self.envs,
@@ -1187,8 +1190,8 @@ class Task:
1187
1190
  self._service = service
1188
1191
  return self
1189
1192
 
1190
- def set_time_estimator(self, func: Callable[['sky.Resources'],
1191
- int]) -> 'Task':
1193
+ def set_time_estimator(
1194
+ self, func: Callable[['resources_lib.Resources'], int]) -> 'Task':
1192
1195
  """Sets a func mapping resources to estimated time (secs).
1193
1196
 
1194
1197
  This is EXPERIMENTAL.
@@ -1712,7 +1715,7 @@ class Task:
1712
1715
  return required_features
1713
1716
 
1714
1717
  def __rshift__(self, b):
1715
- sky.dag.get_current_dag().add_edge(self, b)
1718
+ dag_lib.get_current_dag().add_edge(self, b)
1716
1719
 
1717
1720
  def __repr__(self):
1718
1721
  if isinstance(self.run, str):
sky/templates/aws-ray.yml.j2 CHANGED
@@ -50,7 +50,7 @@ provider:
50
50
  disable_launch_config_check: true
51
51
 
52
52
  auth:
53
- ssh_user: ubuntu
53
+ ssh_user: {{ssh_user}}
54
54
  ssh_private_key: {{ssh_private_key}}
55
55
  {% if ssh_proxy_command is not none %}
56
56
  ssh_proxy_command: {{ssh_proxy_command}}
@@ -68,7 +68,7 @@ available_node_types:
68
68
  ImageId: {{image_id}} # Deep Learning AMI (Ubuntu 18.04); see aws.py.
69
69
  # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
70
70
  BlockDeviceMappings:
71
- - DeviceName: /dev/sda1
71
+ - DeviceName: {{root_device_name}}
72
72
  Ebs:
73
73
  VolumeSize: {{disk_size}}
74
74
  VolumeType: {{disk_tier}}