skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +102 -8
- sky/backends/cloud_vm_ray_backend.py +197 -31
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +60 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/core.py +5 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +15 -0
- sky/global_user_state.py +160 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +6 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +22 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +17 -2
- sky/provision/__init__.py +4 -2
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +41 -17
- sky/provision/azure/instance.py +7 -4
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +7 -4
- sky/provision/do/instance.py +7 -4
- sky/provision/fluidstack/instance.py +7 -4
- sky/provision/gcp/instance.py +7 -4
- sky/provision/hyperbolic/instance.py +7 -5
- sky/provision/kubernetes/instance.py +169 -6
- sky/provision/lambda_cloud/instance.py +7 -4
- sky/provision/nebius/instance.py +7 -4
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +7 -5
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +7 -4
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +7 -5
- sky/provision/vast/instance.py +7 -5
- sky/provision/vsphere/instance.py +7 -4
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +1 -1
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +58 -23
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/skypilot_config.py +4 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +9 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +39 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/serve/server/impl.py
CHANGED
|
@@ -11,7 +11,6 @@ import uuid
|
|
|
11
11
|
import colorama
|
|
12
12
|
import filelock
|
|
13
13
|
|
|
14
|
-
import sky
|
|
15
14
|
from sky import backends
|
|
16
15
|
from sky import exceptions
|
|
17
16
|
from sky import execution
|
|
@@ -25,6 +24,7 @@ from sky.serve import constants as serve_constants
|
|
|
25
24
|
from sky.serve import serve_state
|
|
26
25
|
from sky.serve import serve_utils
|
|
27
26
|
from sky.skylet import constants
|
|
27
|
+
from sky.skylet import job_lib
|
|
28
28
|
from sky.utils import admin_policy_utils
|
|
29
29
|
from sky.utils import command_runner
|
|
30
30
|
from sky.utils import common
|
|
@@ -39,7 +39,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def _rewrite_tls_credential_paths_and_get_tls_env_vars(
|
|
42
|
-
service_name: str, task: '
|
|
42
|
+
service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
|
|
43
43
|
"""Rewrite the paths of TLS credentials in the task.
|
|
44
44
|
|
|
45
45
|
Args:
|
|
@@ -103,15 +103,11 @@ def _get_service_record(
|
|
|
103
103
|
|
|
104
104
|
|
|
105
105
|
def up(
|
|
106
|
-
task: '
|
|
106
|
+
task: 'task_lib.Task',
|
|
107
107
|
service_name: Optional[str] = None,
|
|
108
108
|
pool: bool = False,
|
|
109
109
|
) -> Tuple[str, str]:
|
|
110
110
|
"""Spins up a service or a pool."""
|
|
111
|
-
if pool and not serve_utils.is_consolidation_mode(pool):
|
|
112
|
-
raise ValueError(
|
|
113
|
-
'Pool is only supported in consolidation mode. To fix, set '
|
|
114
|
-
'`jobs.controller.consolidation_mode: true` in SkyPilot config.')
|
|
115
111
|
task.validate()
|
|
116
112
|
serve_utils.validate_service_task(task, pool=pool)
|
|
117
113
|
assert task.service is not None
|
|
@@ -191,8 +187,7 @@ def up(
|
|
|
191
187
|
controller_log_file = (
|
|
192
188
|
serve_utils.generate_remote_controller_log_file_name(service_name))
|
|
193
189
|
controller_resources = controller_utils.get_controller_resources(
|
|
194
|
-
controller=
|
|
195
|
-
task_resources=task.resources)
|
|
190
|
+
controller=controller, task_resources=task.resources)
|
|
196
191
|
controller_job_id = None
|
|
197
192
|
if serve_utils.is_consolidation_mode(pool):
|
|
198
193
|
# We need a unique integer per sky.serve.up call to avoid name
|
|
@@ -228,10 +223,11 @@ def up(
|
|
|
228
223
|
# balancer port from the controller? So we don't need to open so many
|
|
229
224
|
# ports here. Or, we should have a nginx traffic control to refuse
|
|
230
225
|
# any connection to the unregistered ports.
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
226
|
+
if not pool:
|
|
227
|
+
controller_resources = {
|
|
228
|
+
r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
|
|
229
|
+
for r in controller_resources
|
|
230
|
+
}
|
|
235
231
|
controller_task.set_resources(controller_resources)
|
|
236
232
|
|
|
237
233
|
# # Set service_name so the backend will know to modify default ray
|
|
@@ -325,7 +321,7 @@ def up(
|
|
|
325
321
|
[controller_job_id],
|
|
326
322
|
stream_logs=False)
|
|
327
323
|
controller_job_status = list(statuses.values())[0]
|
|
328
|
-
if controller_job_status ==
|
|
324
|
+
if controller_job_status == job_lib.JobStatus.PENDING:
|
|
329
325
|
# Max number of services reached due to vCPU constraint.
|
|
330
326
|
# The controller job is pending due to ray job scheduling.
|
|
331
327
|
# We manually cancel the job here.
|
|
@@ -350,7 +346,7 @@ def up(
|
|
|
350
346
|
else:
|
|
351
347
|
lb_port = serve_utils.load_service_initialization_result(
|
|
352
348
|
lb_port_payload)
|
|
353
|
-
if not serve_utils.is_consolidation_mode(pool):
|
|
349
|
+
if not serve_utils.is_consolidation_mode(pool) and not pool:
|
|
354
350
|
socket_endpoint = backend_utils.get_endpoints(
|
|
355
351
|
controller_handle.cluster_name,
|
|
356
352
|
lb_port,
|
|
@@ -374,10 +370,10 @@ def up(
|
|
|
374
370
|
f'\n📋 Useful Commands'
|
|
375
371
|
f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
|
|
376
372
|
f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
|
|
377
|
-
f'<
|
|
373
|
+
f'<yaml_file>{ux_utils.RESET_BOLD}'
|
|
378
374
|
f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
|
|
379
375
|
f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
|
|
380
|
-
f'--num-jobs 10 <
|
|
376
|
+
f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
|
|
381
377
|
f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
|
|
382
378
|
f'{ux_utils.BOLD}sky jobs pool status {service_name}'
|
|
383
379
|
f'{ux_utils.RESET_BOLD}'
|
|
@@ -421,7 +417,7 @@ def up(
|
|
|
421
417
|
|
|
422
418
|
|
|
423
419
|
def update(
|
|
424
|
-
task: '
|
|
420
|
+
task: 'task_lib.Task',
|
|
425
421
|
service_name: str,
|
|
426
422
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
427
423
|
pool: bool = False,
|
|
@@ -576,7 +572,7 @@ def update(
|
|
|
576
572
|
|
|
577
573
|
|
|
578
574
|
def apply(
|
|
579
|
-
task: '
|
|
575
|
+
task: 'task_lib.Task',
|
|
580
576
|
service_name: str,
|
|
581
577
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
582
578
|
pool: bool = False,
|
sky/serve/service.py
CHANGED
|
@@ -15,11 +15,13 @@ import filelock
|
|
|
15
15
|
|
|
16
16
|
from sky import authentication
|
|
17
17
|
from sky import exceptions
|
|
18
|
+
from sky import global_user_state
|
|
18
19
|
from sky import sky_logging
|
|
19
20
|
from sky import task as task_lib
|
|
20
21
|
from sky.backends import backend_utils
|
|
21
22
|
from sky.backends import cloud_vm_ray_backend
|
|
22
23
|
from sky.data import data_utils
|
|
24
|
+
from sky.jobs import scheduler as jobs_scheduler
|
|
23
25
|
from sky.serve import constants
|
|
24
26
|
from sky.serve import controller
|
|
25
27
|
from sky.serve import load_balancer
|
|
@@ -28,6 +30,7 @@ from sky.serve import serve_state
|
|
|
28
30
|
from sky.serve import serve_utils
|
|
29
31
|
from sky.skylet import constants as skylet_constants
|
|
30
32
|
from sky.utils import common_utils
|
|
33
|
+
from sky.utils import controller_utils
|
|
31
34
|
from sky.utils import subprocess_utils
|
|
32
35
|
from sky.utils import ux_utils
|
|
33
36
|
|
|
@@ -120,7 +123,16 @@ def _cleanup(service_name: str) -> bool:
|
|
|
120
123
|
replica_infos = serve_state.get_replica_infos(service_name)
|
|
121
124
|
info2proc: Dict[replica_managers.ReplicaInfo,
|
|
122
125
|
multiprocessing.Process] = dict()
|
|
126
|
+
# NOTE(dev): This relies on `sky/serve/serve_utils.py::
|
|
127
|
+
# generate_replica_cluster_name`. Change it if you change the function.
|
|
128
|
+
existing_cluster_names = global_user_state.get_cluster_names_start_with(
|
|
129
|
+
service_name)
|
|
123
130
|
for info in replica_infos:
|
|
131
|
+
if info.cluster_name not in existing_cluster_names:
|
|
132
|
+
logger.info(f'Cluster {info.cluster_name} for replica '
|
|
133
|
+
f'{info.replica_id} not found. Might be a failed '
|
|
134
|
+
'cluster. Skipping.')
|
|
135
|
+
continue
|
|
124
136
|
p = multiprocessing.Process(target=replica_managers.terminate_cluster,
|
|
125
137
|
args=(info.cluster_name,))
|
|
126
138
|
p.start()
|
|
@@ -214,22 +226,25 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
|
|
|
214
226
|
service_name, version)
|
|
215
227
|
|
|
216
228
|
if not is_recovery:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
229
|
+
with filelock.FileLock(controller_utils.get_resources_lock_path()):
|
|
230
|
+
if not controller_utils.can_start_new_process():
|
|
231
|
+
cleanup_storage(tmp_task_yaml)
|
|
232
|
+
with ux_utils.print_exception_no_traceback():
|
|
233
|
+
raise RuntimeError(
|
|
234
|
+
constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
|
|
235
|
+
success = serve_state.add_service(
|
|
236
|
+
service_name,
|
|
237
|
+
controller_job_id=job_id,
|
|
238
|
+
policy=service_spec.autoscaling_policy_str(),
|
|
239
|
+
requested_resources_str=backend_utils.get_task_resources_str(
|
|
240
|
+
task),
|
|
241
|
+
load_balancing_policy=service_spec.load_balancing_policy,
|
|
242
|
+
status=serve_state.ServiceStatus.CONTROLLER_INIT,
|
|
243
|
+
tls_encrypted=service_spec.tls_credential is not None,
|
|
244
|
+
pool=service_spec.pool,
|
|
245
|
+
controller_pid=os.getpid(),
|
|
246
|
+
entrypoint=entrypoint)
|
|
247
|
+
jobs_scheduler.maybe_schedule_next_jobs()
|
|
233
248
|
# Directly throw an error here. See sky/serve/api.py::up
|
|
234
249
|
# for more details.
|
|
235
250
|
if not success:
|
sky/server/server.py
CHANGED
|
@@ -17,7 +17,7 @@ import resource
|
|
|
17
17
|
import shutil
|
|
18
18
|
import sys
|
|
19
19
|
import threading
|
|
20
|
-
from typing import
|
|
20
|
+
from typing import Dict, List, Literal, Optional, Set, Tuple
|
|
21
21
|
import uuid
|
|
22
22
|
import zipfile
|
|
23
23
|
|
|
@@ -42,6 +42,7 @@ from sky.data import storage_utils
|
|
|
42
42
|
from sky.jobs.server import server as jobs_rest
|
|
43
43
|
from sky.metrics import utils as metrics_utils
|
|
44
44
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
45
|
+
from sky.schemas.api import responses
|
|
45
46
|
from sky.serve.server import server as serve_rest
|
|
46
47
|
from sky.server import common
|
|
47
48
|
from sky.server import config as server_config
|
|
@@ -1531,8 +1532,12 @@ async def api_status(
|
|
|
1531
1532
|
return encoded_request_tasks
|
|
1532
1533
|
|
|
1533
1534
|
|
|
1534
|
-
@app.get('/api/health')
|
|
1535
|
-
async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
1535
|
+
@app.get(
|
|
1536
|
+
'/api/health',
|
|
1537
|
+
# response_model_exclude_unset omits unset fields
|
|
1538
|
+
# in the response JSON.
|
|
1539
|
+
response_model_exclude_unset=True)
|
|
1540
|
+
async def health(request: fastapi.Request) -> responses.APIHealthResponse:
|
|
1536
1541
|
"""Checks the health of the API server.
|
|
1537
1542
|
|
|
1538
1543
|
Returns:
|
|
@@ -1570,7 +1575,8 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
|
1570
1575
|
# - There is no harm when a malicious client calls /api/health
|
|
1571
1576
|
# without authentication since no sensitive information is
|
|
1572
1577
|
# returned.
|
|
1573
|
-
return
|
|
1578
|
+
return responses.APIHealthResponse(
|
|
1579
|
+
status=common.ApiServerStatus.HEALTHY,)
|
|
1574
1580
|
# TODO(aylei): remove this after min_compatible_api_version >= 14.
|
|
1575
1581
|
if client_version < 14:
|
|
1576
1582
|
# For Client with API version < 14, the NEEDS_AUTH status is not
|
|
@@ -1579,19 +1585,19 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
|
1579
1585
|
detail='Authentication required')
|
|
1580
1586
|
|
|
1581
1587
|
logger.debug(f'Health endpoint: request.state.auth_user = {user}')
|
|
1582
|
-
return
|
|
1583
|
-
|
|
1588
|
+
return responses.APIHealthResponse(
|
|
1589
|
+
status=server_status,
|
|
1584
1590
|
# Kept for backward compatibility, clients before 0.11.0 will read this
|
|
1585
1591
|
# field to check compatibility and hint the user to upgrade the CLI.
|
|
1586
1592
|
# TODO(aylei): remove this field after 0.13.0
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1593
|
+
api_version=str(server_constants.API_VERSION),
|
|
1594
|
+
version=sky.__version__,
|
|
1595
|
+
version_on_disk=common.get_skypilot_version_on_disk(),
|
|
1596
|
+
commit=sky.__commit__,
|
|
1597
|
+
basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
|
|
1598
|
+
'false').lower() == 'true',
|
|
1599
|
+
user=user if user is not None else None,
|
|
1600
|
+
)
|
|
1595
1601
|
|
|
1596
1602
|
|
|
1597
1603
|
@app.websocket('/kubernetes-pod-ssh-proxy')
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -88,17 +88,18 @@ local_ray = [
|
|
|
88
88
|
'ray[default] >= 2.2.0, != 2.6.0',
|
|
89
89
|
]
|
|
90
90
|
|
|
91
|
+
# See requirements-dev.txt for the version of grpc and protobuf
|
|
92
|
+
# used to generate the code during development.
|
|
91
93
|
remote = [
|
|
92
|
-
#
|
|
93
|
-
#
|
|
94
|
-
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
#
|
|
100
|
-
|
|
101
|
-
'protobuf >= 3.15.3, != 3.19.5',
|
|
94
|
+
# The grpc version at runtime must not be older than the version
|
|
95
|
+
# used to generate the code.
|
|
96
|
+
'grpcio>=1.63.0',
|
|
97
|
+
# >= 5.26.1 because the runtime version can't be older than the version
|
|
98
|
+
# used to generate the code.
|
|
99
|
+
# < 7.0.0 because code generated for a major version V will be supported by
|
|
100
|
+
# protobuf runtimes of version V and V+1.
|
|
101
|
+
# https://protobuf.dev/support/cross-version-runtime-guarantee
|
|
102
|
+
'protobuf >= 5.26.1, < 7.0.0',
|
|
102
103
|
]
|
|
103
104
|
|
|
104
105
|
# NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
|
sky/skylet/autostop_lib.py
CHANGED
|
@@ -16,8 +16,13 @@ from sky.utils import ux_utils
|
|
|
16
16
|
|
|
17
17
|
if typing.TYPE_CHECKING:
|
|
18
18
|
import psutil
|
|
19
|
+
|
|
20
|
+
from sky.schemas.generated import autostopv1_pb2
|
|
19
21
|
else:
|
|
20
22
|
psutil = adaptors_common.LazyImport('psutil')
|
|
23
|
+
# To avoid requiring protobuf to be installed on the client side.
|
|
24
|
+
autostopv1_pb2 = adaptors_common.LazyImport(
|
|
25
|
+
'sky.schemas.generated.autostopv1_pb2')
|
|
21
26
|
|
|
22
27
|
logger = sky_logging.init_logger(__name__)
|
|
23
28
|
|
|
@@ -55,11 +60,9 @@ Determines the condition for resetting the idleness timer.
|
|
|
55
60
|
This option works in conjunction with ``--{pair}``. Options:
|
|
56
61
|
|
|
57
62
|
\b
|
|
58
|
-
1. ``jobs_and_ssh`` (default): Wait for
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
3. ``none``: Stop immediately after idle time expires, regardless of running
|
|
62
|
-
jobs or SSH connections."""
|
|
63
|
+
1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
|
|
64
|
+
2. ``jobs``: Only wait for in-progress jobs.
|
|
65
|
+
3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
|
|
63
66
|
|
|
64
67
|
@classmethod
|
|
65
68
|
def from_str(cls, mode: str) -> 'AutostopWaitFor':
|
|
@@ -78,6 +81,36 @@ jobs or SSH connections."""
|
|
|
78
81
|
f'\'{cls.JOBS.value}\', or '
|
|
79
82
|
f'\'{cls.NONE.value}\'. ')
|
|
80
83
|
|
|
84
|
+
@classmethod
|
|
85
|
+
def from_protobuf(
|
|
86
|
+
cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
|
|
87
|
+
) -> Optional['AutostopWaitFor']:
|
|
88
|
+
"""Convert protobuf AutostopWaitFor enum to Python enum value."""
|
|
89
|
+
protobuf_to_enum = {
|
|
90
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
|
|
91
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
|
|
92
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
|
|
93
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
|
|
94
|
+
}
|
|
95
|
+
if protobuf_value not in protobuf_to_enum:
|
|
96
|
+
with ux_utils.print_exception_no_traceback():
|
|
97
|
+
raise ValueError(
|
|
98
|
+
f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')
|
|
99
|
+
return protobuf_to_enum[protobuf_value]
|
|
100
|
+
|
|
101
|
+
def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
|
|
102
|
+
"""Convert this Python enum value to protobuf enum value."""
|
|
103
|
+
enum_to_protobuf = {
|
|
104
|
+
AutostopWaitFor.JOBS_AND_SSH:
|
|
105
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
|
|
106
|
+
AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
|
|
107
|
+
AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
|
|
108
|
+
}
|
|
109
|
+
if self not in enum_to_protobuf:
|
|
110
|
+
with ux_utils.print_exception_no_traceback():
|
|
111
|
+
raise ValueError(f'Unknown AutostopWaitFor value: {self}')
|
|
112
|
+
return enum_to_protobuf[self]
|
|
113
|
+
|
|
81
114
|
|
|
82
115
|
DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
|
|
83
116
|
|
sky/skylet/constants.py
CHANGED
|
@@ -90,12 +90,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
90
90
|
# cluster yaml is updated.
|
|
91
91
|
#
|
|
92
92
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
93
|
-
SKYLET_VERSION = '
|
|
93
|
+
SKYLET_VERSION = '17'
|
|
94
94
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
95
95
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
96
96
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
97
97
|
SKYLET_LIB_VERSION = 4
|
|
98
98
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
|
99
|
+
SKYLET_GRPC_PORT = 46590
|
|
100
|
+
SKYLET_GRPC_TIMEOUT_SECONDS = 5
|
|
99
101
|
|
|
100
102
|
# Docker default options
|
|
101
103
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
sky/skylet/services.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""gRPC service implementations for skylet."""
|
|
2
|
+
|
|
3
|
+
import grpc
|
|
4
|
+
|
|
5
|
+
from sky import sky_logging
|
|
6
|
+
from sky.schemas.generated import autostopv1_pb2
|
|
7
|
+
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
8
|
+
from sky.skylet import autostop_lib
|
|
9
|
+
|
|
10
|
+
logger = sky_logging.init_logger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
|
|
14
|
+
"""Implementation of the AutostopService gRPC service."""
|
|
15
|
+
|
|
16
|
+
def SetAutostop( # type: ignore[return]
|
|
17
|
+
self, request: autostopv1_pb2.SetAutostopRequest,
|
|
18
|
+
context: grpc.ServicerContext
|
|
19
|
+
) -> autostopv1_pb2.SetAutostopResponse:
|
|
20
|
+
"""Sets autostop configuration for the cluster."""
|
|
21
|
+
try:
|
|
22
|
+
wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
|
|
23
|
+
request.wait_for)
|
|
24
|
+
autostop_lib.set_autostop(
|
|
25
|
+
idle_minutes=request.idle_minutes,
|
|
26
|
+
backend=request.backend,
|
|
27
|
+
wait_for=wait_for if wait_for is not None else
|
|
28
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
|
|
29
|
+
down=request.down)
|
|
30
|
+
return autostopv1_pb2.SetAutostopResponse()
|
|
31
|
+
except Exception as e: # pylint: disable=broad-except
|
|
32
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
33
|
+
|
|
34
|
+
def IsAutostopping( # type: ignore[return]
|
|
35
|
+
self, request: autostopv1_pb2.IsAutostoppingRequest,
|
|
36
|
+
context: grpc.ServicerContext
|
|
37
|
+
) -> autostopv1_pb2.IsAutostoppingResponse:
|
|
38
|
+
"""Checks if the cluster is currently autostopping."""
|
|
39
|
+
try:
|
|
40
|
+
is_autostopping = autostop_lib.get_is_autostopping()
|
|
41
|
+
return autostopv1_pb2.IsAutostoppingResponse(
|
|
42
|
+
is_autostopping=is_autostopping)
|
|
43
|
+
except Exception as e: # pylint: disable=broad-except
|
|
44
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
sky/skylet/skylet.py
CHANGED
|
@@ -1,11 +1,17 @@
|
|
|
1
1
|
"""skylet: a daemon running on the head node of a cluster."""
|
|
2
2
|
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
import os
|
|
3
5
|
import time
|
|
4
6
|
|
|
7
|
+
import grpc
|
|
8
|
+
|
|
5
9
|
import sky
|
|
6
10
|
from sky import sky_logging
|
|
11
|
+
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
7
12
|
from sky.skylet import constants
|
|
8
13
|
from sky.skylet import events
|
|
14
|
+
from sky.skylet import services
|
|
9
15
|
|
|
10
16
|
# Use the explicit logger name so that the logger is under the
|
|
11
17
|
# `sky.skylet.skylet` namespace when executed directly, so as
|
|
@@ -31,7 +37,46 @@ EVENTS = [
|
|
|
31
37
|
events.UsageHeartbeatReportEvent(),
|
|
32
38
|
]
|
|
33
39
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
40
|
+
|
|
41
|
+
def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
|
|
42
|
+
"""Start the gRPC server."""
|
|
43
|
+
# This is the default value in Python 3.8 - 3.12,
|
|
44
|
+
# putting it here for visibility.
|
|
45
|
+
# TODO(kevin): Determine the optimal max number of threads.
|
|
46
|
+
max_workers = min(32, (os.cpu_count() or 1) + 4)
|
|
47
|
+
server = grpc.server(
|
|
48
|
+
concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
|
|
49
|
+
|
|
50
|
+
autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
|
|
51
|
+
services.AutostopServiceImpl(), server)
|
|
52
|
+
|
|
53
|
+
listen_addr = f'127.0.0.1:{port}'
|
|
54
|
+
server.add_insecure_port(listen_addr)
|
|
55
|
+
|
|
56
|
+
server.start()
|
|
57
|
+
logger.info(f'gRPC server started on {listen_addr}')
|
|
58
|
+
|
|
59
|
+
return server
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def run_event_loop():
|
|
63
|
+
"""Run the existing event loop."""
|
|
64
|
+
|
|
65
|
+
while True:
|
|
66
|
+
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
67
|
+
for event in EVENTS:
|
|
68
|
+
event.run()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def main():
|
|
72
|
+
grpc_server = start_grpc_server()
|
|
73
|
+
try:
|
|
74
|
+
run_event_loop()
|
|
75
|
+
except KeyboardInterrupt:
|
|
76
|
+
logger.info('Shutting down skylet...')
|
|
77
|
+
finally:
|
|
78
|
+
grpc_server.stop(grace=5)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == '__main__':
|
|
82
|
+
main()
|
sky/skypilot_config.py
CHANGED
|
@@ -575,8 +575,8 @@ def _reload_config_as_server() -> None:
|
|
|
575
575
|
with _DB_USE_LOCK:
|
|
576
576
|
sqlalchemy_engine = sqlalchemy.create_engine(db_url,
|
|
577
577
|
poolclass=NullPool)
|
|
578
|
-
db_utils.
|
|
579
|
-
|
|
578
|
+
db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
|
|
579
|
+
sqlalchemy_engine)
|
|
580
580
|
|
|
581
581
|
def _get_config_yaml_from_db(
|
|
582
582
|
key: str) -> Optional[config_utils.Config]:
|
|
@@ -867,8 +867,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
867
867
|
with _DB_USE_LOCK:
|
|
868
868
|
sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
|
|
869
869
|
poolclass=NullPool)
|
|
870
|
-
db_utils.
|
|
871
|
-
|
|
870
|
+
db_utils.add_all_tables_to_db_sqlalchemy(
|
|
871
|
+
Base.metadata, sqlalchemy_engine)
|
|
872
872
|
|
|
873
873
|
def _set_config_yaml_to_db(key: str,
|
|
874
874
|
config: config_utils.Config):
|
sky/task.py
CHANGED
|
@@ -10,26 +10,25 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
|
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
12
12
|
|
|
13
|
-
import sky
|
|
14
13
|
from sky import clouds
|
|
14
|
+
from sky import dag as dag_lib
|
|
15
15
|
from sky import exceptions
|
|
16
|
+
from sky import resources as resources_lib
|
|
16
17
|
from sky import sky_logging
|
|
17
18
|
from sky.adaptors import common as adaptors_common
|
|
18
|
-
import sky.dag
|
|
19
19
|
from sky.data import data_utils
|
|
20
20
|
from sky.data import storage as storage_lib
|
|
21
21
|
from sky.provision import docker_utils
|
|
22
22
|
from sky.serve import service_spec
|
|
23
23
|
from sky.skylet import constants
|
|
24
24
|
from sky.utils import common_utils
|
|
25
|
+
from sky.utils import registry
|
|
25
26
|
from sky.utils import schemas
|
|
26
27
|
from sky.utils import ux_utils
|
|
27
28
|
from sky.utils import volume as volume_lib
|
|
28
29
|
|
|
29
30
|
if typing.TYPE_CHECKING:
|
|
30
31
|
import yaml
|
|
31
|
-
|
|
32
|
-
from sky import resources as resources_lib
|
|
33
32
|
else:
|
|
34
33
|
yaml = adaptors_common.LazyImport('yaml')
|
|
35
34
|
|
|
@@ -382,26 +381,28 @@ class Task:
|
|
|
382
381
|
self.estimated_inputs_size_gigabytes: Optional[float] = None
|
|
383
382
|
self.estimated_outputs_size_gigabytes: Optional[float] = None
|
|
384
383
|
# Default to CPU VM
|
|
385
|
-
self.resources: Union[List[
|
|
386
|
-
Set[
|
|
384
|
+
self.resources: Union[List['resources_lib.Resources'],
|
|
385
|
+
Set['resources_lib.Resources']] = {
|
|
386
|
+
resources_lib.Resources()
|
|
387
|
+
}
|
|
387
388
|
self._service: Optional[service_spec.SkyServiceSpec] = None
|
|
388
389
|
|
|
389
390
|
# Resources that this task cannot run on.
|
|
390
391
|
self.blocked_resources = blocked_resources
|
|
391
392
|
|
|
392
|
-
self.time_estimator_func: Optional[Callable[['
|
|
393
|
+
self.time_estimator_func: Optional[Callable[['resources_lib.Resources'],
|
|
393
394
|
int]] = None
|
|
394
395
|
self.file_mounts: Optional[Dict[str, str]] = None
|
|
395
396
|
|
|
396
397
|
# Only set when 'self' is a jobs controller task: 'self.managed_job_dag'
|
|
397
398
|
# is the underlying managed job dag (sky.Dag object).
|
|
398
|
-
self.managed_job_dag: Optional['
|
|
399
|
+
self.managed_job_dag: Optional['dag_lib.Dag'] = None
|
|
399
400
|
|
|
400
401
|
# Only set when 'self' is a sky serve controller task.
|
|
401
402
|
self.service_name: Optional[str] = None
|
|
402
403
|
|
|
403
404
|
# Filled in by the optimizer. If None, this Task is not planned.
|
|
404
|
-
self.best_resources: Optional[
|
|
405
|
+
self.best_resources: Optional['resources_lib.Resources'] = None
|
|
405
406
|
|
|
406
407
|
# For internal use only.
|
|
407
408
|
self.file_mounts_mapping: Optional[Dict[str,
|
|
@@ -418,7 +419,7 @@ class Task:
|
|
|
418
419
|
if file_mounts is not None:
|
|
419
420
|
self.set_file_mounts(file_mounts)
|
|
420
421
|
|
|
421
|
-
dag =
|
|
422
|
+
dag = dag_lib.get_current_dag()
|
|
422
423
|
if dag is not None:
|
|
423
424
|
dag.add(self)
|
|
424
425
|
|
|
@@ -783,7 +784,8 @@ class Task:
|
|
|
783
784
|
'_cluster_config_overrides'] = cluster_config_override
|
|
784
785
|
if volumes:
|
|
785
786
|
resources_config['volumes'] = volumes
|
|
786
|
-
task.set_resources(
|
|
787
|
+
task.set_resources(
|
|
788
|
+
resources_lib.Resources.from_yaml_config(resources_config))
|
|
787
789
|
|
|
788
790
|
service = config.pop('service', None)
|
|
789
791
|
pool = config.pop('pool', None)
|
|
@@ -931,7 +933,8 @@ class Task:
|
|
|
931
933
|
for key, (vol_name, vol_req) in topology.items():
|
|
932
934
|
if vol_req is not None:
|
|
933
935
|
if key == 'cloud':
|
|
934
|
-
override_params[key] =
|
|
936
|
+
override_params[key] = registry.CLOUD_REGISTRY.from_str(
|
|
937
|
+
vol_req)
|
|
935
938
|
else:
|
|
936
939
|
override_params[key] = vol_req
|
|
937
940
|
self.set_resources_override(override_params)
|
|
@@ -1142,7 +1145,7 @@ class Task:
|
|
|
1142
1145
|
Returns:
|
|
1143
1146
|
self: The current task, with resources set.
|
|
1144
1147
|
"""
|
|
1145
|
-
if isinstance(resources,
|
|
1148
|
+
if isinstance(resources, resources_lib.Resources):
|
|
1146
1149
|
resources = {resources}
|
|
1147
1150
|
# TODO(woosuk): Check if the resources are None.
|
|
1148
1151
|
self.resources = _with_docker_login_config(resources, self.envs,
|
|
@@ -1187,8 +1190,8 @@ class Task:
|
|
|
1187
1190
|
self._service = service
|
|
1188
1191
|
return self
|
|
1189
1192
|
|
|
1190
|
-
def set_time_estimator(
|
|
1191
|
-
|
|
1193
|
+
def set_time_estimator(
|
|
1194
|
+
self, func: Callable[['resources_lib.Resources'], int]) -> 'Task':
|
|
1192
1195
|
"""Sets a func mapping resources to estimated time (secs).
|
|
1193
1196
|
|
|
1194
1197
|
This is EXPERIMENTAL.
|
|
@@ -1712,7 +1715,7 @@ class Task:
|
|
|
1712
1715
|
return required_features
|
|
1713
1716
|
|
|
1714
1717
|
def __rshift__(self, b):
|
|
1715
|
-
|
|
1718
|
+
dag_lib.get_current_dag().add_edge(self, b)
|
|
1716
1719
|
|
|
1717
1720
|
def __repr__(self):
|
|
1718
1721
|
if isinstance(self.run, str):
|
sky/templates/aws-ray.yml.j2
CHANGED
|
@@ -50,7 +50,7 @@ provider:
|
|
|
50
50
|
disable_launch_config_check: true
|
|
51
51
|
|
|
52
52
|
auth:
|
|
53
|
-
ssh_user:
|
|
53
|
+
ssh_user: {{ssh_user}}
|
|
54
54
|
ssh_private_key: {{ssh_private_key}}
|
|
55
55
|
{% if ssh_proxy_command is not none %}
|
|
56
56
|
ssh_proxy_command: {{ssh_proxy_command}}
|
|
@@ -68,7 +68,7 @@ available_node_types:
|
|
|
68
68
|
ImageId: {{image_id}} # Deep Learning AMI (Ubuntu 18.04); see aws.py.
|
|
69
69
|
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
|
70
70
|
BlockDeviceMappings:
|
|
71
|
-
- DeviceName:
|
|
71
|
+
- DeviceName: {{root_device_name}}
|
|
72
72
|
Ebs:
|
|
73
73
|
VolumeSize: {{disk_size}}
|
|
74
74
|
VolumeType: {{disk_tier}}
|