skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +69 -6
- sky/backends/cloud_vm_ray_backend.py +156 -25
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +40 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +63 -7
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +18 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +8 -0
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +36 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +21 -20
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/setup_files/dependencies.py
CHANGED
|
@@ -88,17 +88,18 @@ local_ray = [
|
|
|
88
88
|
'ray[default] >= 2.2.0, != 2.6.0',
|
|
89
89
|
]
|
|
90
90
|
|
|
91
|
+
# See requirements-dev.txt for the version of grpc and protobuf
|
|
92
|
+
# used to generate the code during development.
|
|
91
93
|
remote = [
|
|
92
|
-
#
|
|
93
|
-
#
|
|
94
|
-
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
#
|
|
100
|
-
|
|
101
|
-
'protobuf >= 3.15.3, != 3.19.5',
|
|
94
|
+
# The grpc version at runtime has to be newer than the version
|
|
95
|
+
# used to generate the code.
|
|
96
|
+
'grpcio>=1.63.0',
|
|
97
|
+
# >= 5.26.1 because the runtime version can't be older than the version
|
|
98
|
+
# used to generate the code.
|
|
99
|
+
# < 7.0.0 because code generated for a major version V will be supported by
|
|
100
|
+
# protobuf runtimes of version V and V+1.
|
|
101
|
+
# https://protobuf.dev/support/cross-version-runtime-guarantee
|
|
102
|
+
'protobuf >= 5.26.1, < 7.0.0',
|
|
102
103
|
]
|
|
103
104
|
|
|
104
105
|
# NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
|
sky/skylet/autostop_lib.py
CHANGED
|
@@ -16,8 +16,13 @@ from sky.utils import ux_utils
|
|
|
16
16
|
|
|
17
17
|
if typing.TYPE_CHECKING:
|
|
18
18
|
import psutil
|
|
19
|
+
|
|
20
|
+
from sky.schemas.generated import autostopv1_pb2
|
|
19
21
|
else:
|
|
20
22
|
psutil = adaptors_common.LazyImport('psutil')
|
|
23
|
+
# To avoid requiring protobuf to be installed on the client side.
|
|
24
|
+
autostopv1_pb2 = adaptors_common.LazyImport(
|
|
25
|
+
'sky.schemas.generated.autostopv1_pb2')
|
|
21
26
|
|
|
22
27
|
logger = sky_logging.init_logger(__name__)
|
|
23
28
|
|
|
@@ -55,11 +60,9 @@ Determines the condition for resetting the idleness timer.
|
|
|
55
60
|
This option works in conjunction with ``--{pair}``. Options:
|
|
56
61
|
|
|
57
62
|
\b
|
|
58
|
-
1. ``jobs_and_ssh`` (default): Wait for
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
3. ``none``: Stop immediately after idle time expires, regardless of running
|
|
62
|
-
jobs or SSH connections."""
|
|
63
|
+
1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
|
|
64
|
+
2. ``jobs``: Only wait for in-progress jobs.
|
|
65
|
+
3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
|
|
63
66
|
|
|
64
67
|
@classmethod
|
|
65
68
|
def from_str(cls, mode: str) -> 'AutostopWaitFor':
|
|
@@ -78,6 +81,36 @@ jobs or SSH connections."""
|
|
|
78
81
|
f'\'{cls.JOBS.value}\', or '
|
|
79
82
|
f'\'{cls.NONE.value}\'. ')
|
|
80
83
|
|
|
84
|
+
@classmethod
|
|
85
|
+
def from_protobuf(
|
|
86
|
+
cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
|
|
87
|
+
) -> Optional['AutostopWaitFor']:
|
|
88
|
+
"""Convert protobuf AutostopWaitFor enum to Python enum value."""
|
|
89
|
+
protobuf_to_enum = {
|
|
90
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
|
|
91
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
|
|
92
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
|
|
93
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
|
|
94
|
+
}
|
|
95
|
+
if protobuf_value not in protobuf_to_enum:
|
|
96
|
+
with ux_utils.print_exception_no_traceback():
|
|
97
|
+
raise ValueError(
|
|
98
|
+
f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')
|
|
99
|
+
return protobuf_to_enum[protobuf_value]
|
|
100
|
+
|
|
101
|
+
def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
|
|
102
|
+
"""Convert this Python enum value to protobuf enum value."""
|
|
103
|
+
enum_to_protobuf = {
|
|
104
|
+
AutostopWaitFor.JOBS_AND_SSH:
|
|
105
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
|
|
106
|
+
AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
|
|
107
|
+
AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
|
|
108
|
+
}
|
|
109
|
+
if self not in enum_to_protobuf:
|
|
110
|
+
with ux_utils.print_exception_no_traceback():
|
|
111
|
+
raise ValueError(f'Unknown AutostopWaitFor value: {self}')
|
|
112
|
+
return enum_to_protobuf[self]
|
|
113
|
+
|
|
81
114
|
|
|
82
115
|
DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
|
|
83
116
|
|
sky/skylet/constants.py
CHANGED
|
@@ -90,12 +90,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
90
90
|
# cluster yaml is updated.
|
|
91
91
|
#
|
|
92
92
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
93
|
-
SKYLET_VERSION = '
|
|
93
|
+
SKYLET_VERSION = '17'
|
|
94
94
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
95
95
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
96
96
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
97
97
|
SKYLET_LIB_VERSION = 4
|
|
98
98
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
|
99
|
+
SKYLET_GRPC_PORT = 46590
|
|
100
|
+
SKYLET_GRPC_TIMEOUT_SECONDS = 5
|
|
99
101
|
|
|
100
102
|
# Docker default options
|
|
101
103
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
sky/skylet/services.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""gRPC service implementations for skylet."""
|
|
2
|
+
|
|
3
|
+
import grpc
|
|
4
|
+
|
|
5
|
+
from sky import sky_logging
|
|
6
|
+
from sky.schemas.generated import autostopv1_pb2
|
|
7
|
+
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
8
|
+
from sky.skylet import autostop_lib
|
|
9
|
+
|
|
10
|
+
logger = sky_logging.init_logger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
|
|
14
|
+
"""Implementation of the AutostopService gRPC service."""
|
|
15
|
+
|
|
16
|
+
def SetAutostop( # type: ignore[return]
|
|
17
|
+
self, request: autostopv1_pb2.SetAutostopRequest,
|
|
18
|
+
context: grpc.ServicerContext
|
|
19
|
+
) -> autostopv1_pb2.SetAutostopResponse:
|
|
20
|
+
"""Sets autostop configuration for the cluster."""
|
|
21
|
+
try:
|
|
22
|
+
wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
|
|
23
|
+
request.wait_for)
|
|
24
|
+
autostop_lib.set_autostop(
|
|
25
|
+
idle_minutes=request.idle_minutes,
|
|
26
|
+
backend=request.backend,
|
|
27
|
+
wait_for=wait_for if wait_for is not None else
|
|
28
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
|
|
29
|
+
down=request.down)
|
|
30
|
+
return autostopv1_pb2.SetAutostopResponse()
|
|
31
|
+
except Exception as e: # pylint: disable=broad-except
|
|
32
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
33
|
+
|
|
34
|
+
def IsAutostopping( # type: ignore[return]
|
|
35
|
+
self, request: autostopv1_pb2.IsAutostoppingRequest,
|
|
36
|
+
context: grpc.ServicerContext
|
|
37
|
+
) -> autostopv1_pb2.IsAutostoppingResponse:
|
|
38
|
+
"""Checks if the cluster is currently autostopping."""
|
|
39
|
+
try:
|
|
40
|
+
is_autostopping = autostop_lib.get_is_autostopping()
|
|
41
|
+
return autostopv1_pb2.IsAutostoppingResponse(
|
|
42
|
+
is_autostopping=is_autostopping)
|
|
43
|
+
except Exception as e: # pylint: disable=broad-except
|
|
44
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
sky/skylet/skylet.py
CHANGED
|
@@ -1,11 +1,17 @@
|
|
|
1
1
|
"""skylet: a daemon running on the head node of a cluster."""
|
|
2
2
|
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
import os
|
|
3
5
|
import time
|
|
4
6
|
|
|
7
|
+
import grpc
|
|
8
|
+
|
|
5
9
|
import sky
|
|
6
10
|
from sky import sky_logging
|
|
11
|
+
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
7
12
|
from sky.skylet import constants
|
|
8
13
|
from sky.skylet import events
|
|
14
|
+
from sky.skylet import services
|
|
9
15
|
|
|
10
16
|
# Use the explicit logger name so that the logger is under the
|
|
11
17
|
# `sky.skylet.skylet` namespace when executed directly, so as
|
|
@@ -31,7 +37,46 @@ EVENTS = [
|
|
|
31
37
|
events.UsageHeartbeatReportEvent(),
|
|
32
38
|
]
|
|
33
39
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
40
|
+
|
|
41
|
+
def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
|
|
42
|
+
"""Start the gRPC server."""
|
|
43
|
+
# This is the default value in Python 3.8 - 3.12,
|
|
44
|
+
# putting it here for visibility.
|
|
45
|
+
# TODO(kevin): Determine the optimal max number of threads.
|
|
46
|
+
max_workers = min(32, (os.cpu_count() or 1) + 4)
|
|
47
|
+
server = grpc.server(
|
|
48
|
+
concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
|
|
49
|
+
|
|
50
|
+
autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
|
|
51
|
+
services.AutostopServiceImpl(), server)
|
|
52
|
+
|
|
53
|
+
listen_addr = f'127.0.0.1:{port}'
|
|
54
|
+
server.add_insecure_port(listen_addr)
|
|
55
|
+
|
|
56
|
+
server.start()
|
|
57
|
+
logger.info(f'gRPC server started on {listen_addr}')
|
|
58
|
+
|
|
59
|
+
return server
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def run_event_loop():
|
|
63
|
+
"""Run the existing event loop."""
|
|
64
|
+
|
|
65
|
+
while True:
|
|
66
|
+
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
67
|
+
for event in EVENTS:
|
|
68
|
+
event.run()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def main():
|
|
72
|
+
grpc_server = start_grpc_server()
|
|
73
|
+
try:
|
|
74
|
+
run_event_loop()
|
|
75
|
+
except KeyboardInterrupt:
|
|
76
|
+
logger.info('Shutting down skylet...')
|
|
77
|
+
finally:
|
|
78
|
+
grpc_server.stop(grace=5)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == '__main__':
|
|
82
|
+
main()
|
sky/task.py
CHANGED
|
@@ -10,26 +10,25 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
|
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
12
12
|
|
|
13
|
-
import sky
|
|
14
13
|
from sky import clouds
|
|
14
|
+
from sky import dag as dag_lib
|
|
15
15
|
from sky import exceptions
|
|
16
|
+
from sky import resources as resources_lib
|
|
16
17
|
from sky import sky_logging
|
|
17
18
|
from sky.adaptors import common as adaptors_common
|
|
18
|
-
import sky.dag
|
|
19
19
|
from sky.data import data_utils
|
|
20
20
|
from sky.data import storage as storage_lib
|
|
21
21
|
from sky.provision import docker_utils
|
|
22
22
|
from sky.serve import service_spec
|
|
23
23
|
from sky.skylet import constants
|
|
24
24
|
from sky.utils import common_utils
|
|
25
|
+
from sky.utils import registry
|
|
25
26
|
from sky.utils import schemas
|
|
26
27
|
from sky.utils import ux_utils
|
|
27
28
|
from sky.utils import volume as volume_lib
|
|
28
29
|
|
|
29
30
|
if typing.TYPE_CHECKING:
|
|
30
31
|
import yaml
|
|
31
|
-
|
|
32
|
-
from sky import resources as resources_lib
|
|
33
32
|
else:
|
|
34
33
|
yaml = adaptors_common.LazyImport('yaml')
|
|
35
34
|
|
|
@@ -382,26 +381,28 @@ class Task:
|
|
|
382
381
|
self.estimated_inputs_size_gigabytes: Optional[float] = None
|
|
383
382
|
self.estimated_outputs_size_gigabytes: Optional[float] = None
|
|
384
383
|
# Default to CPU VM
|
|
385
|
-
self.resources: Union[List[
|
|
386
|
-
Set[
|
|
384
|
+
self.resources: Union[List['resources_lib.Resources'],
|
|
385
|
+
Set['resources_lib.Resources']] = {
|
|
386
|
+
resources_lib.Resources()
|
|
387
|
+
}
|
|
387
388
|
self._service: Optional[service_spec.SkyServiceSpec] = None
|
|
388
389
|
|
|
389
390
|
# Resources that this task cannot run on.
|
|
390
391
|
self.blocked_resources = blocked_resources
|
|
391
392
|
|
|
392
|
-
self.time_estimator_func: Optional[Callable[['
|
|
393
|
+
self.time_estimator_func: Optional[Callable[['resources_lib.Resources'],
|
|
393
394
|
int]] = None
|
|
394
395
|
self.file_mounts: Optional[Dict[str, str]] = None
|
|
395
396
|
|
|
396
397
|
# Only set when 'self' is a jobs controller task: 'self.managed_job_dag'
|
|
397
398
|
# is the underlying managed job dag (sky.Dag object).
|
|
398
|
-
self.managed_job_dag: Optional['
|
|
399
|
+
self.managed_job_dag: Optional['dag_lib.Dag'] = None
|
|
399
400
|
|
|
400
401
|
# Only set when 'self' is a sky serve controller task.
|
|
401
402
|
self.service_name: Optional[str] = None
|
|
402
403
|
|
|
403
404
|
# Filled in by the optimizer. If None, this Task is not planned.
|
|
404
|
-
self.best_resources: Optional[
|
|
405
|
+
self.best_resources: Optional['resources_lib.Resources'] = None
|
|
405
406
|
|
|
406
407
|
# For internal use only.
|
|
407
408
|
self.file_mounts_mapping: Optional[Dict[str,
|
|
@@ -418,7 +419,7 @@ class Task:
|
|
|
418
419
|
if file_mounts is not None:
|
|
419
420
|
self.set_file_mounts(file_mounts)
|
|
420
421
|
|
|
421
|
-
dag =
|
|
422
|
+
dag = dag_lib.get_current_dag()
|
|
422
423
|
if dag is not None:
|
|
423
424
|
dag.add(self)
|
|
424
425
|
|
|
@@ -783,7 +784,8 @@ class Task:
|
|
|
783
784
|
'_cluster_config_overrides'] = cluster_config_override
|
|
784
785
|
if volumes:
|
|
785
786
|
resources_config['volumes'] = volumes
|
|
786
|
-
task.set_resources(
|
|
787
|
+
task.set_resources(
|
|
788
|
+
resources_lib.Resources.from_yaml_config(resources_config))
|
|
787
789
|
|
|
788
790
|
service = config.pop('service', None)
|
|
789
791
|
pool = config.pop('pool', None)
|
|
@@ -931,7 +933,8 @@ class Task:
|
|
|
931
933
|
for key, (vol_name, vol_req) in topology.items():
|
|
932
934
|
if vol_req is not None:
|
|
933
935
|
if key == 'cloud':
|
|
934
|
-
override_params[key] =
|
|
936
|
+
override_params[key] = registry.CLOUD_REGISTRY.from_str(
|
|
937
|
+
vol_req)
|
|
935
938
|
else:
|
|
936
939
|
override_params[key] = vol_req
|
|
937
940
|
self.set_resources_override(override_params)
|
|
@@ -1142,7 +1145,7 @@ class Task:
|
|
|
1142
1145
|
Returns:
|
|
1143
1146
|
self: The current task, with resources set.
|
|
1144
1147
|
"""
|
|
1145
|
-
if isinstance(resources,
|
|
1148
|
+
if isinstance(resources, resources_lib.Resources):
|
|
1146
1149
|
resources = {resources}
|
|
1147
1150
|
# TODO(woosuk): Check if the resources are None.
|
|
1148
1151
|
self.resources = _with_docker_login_config(resources, self.envs,
|
|
@@ -1187,8 +1190,8 @@ class Task:
|
|
|
1187
1190
|
self._service = service
|
|
1188
1191
|
return self
|
|
1189
1192
|
|
|
1190
|
-
def set_time_estimator(
|
|
1191
|
-
|
|
1193
|
+
def set_time_estimator(
|
|
1194
|
+
self, func: Callable[['resources_lib.Resources'], int]) -> 'Task':
|
|
1192
1195
|
"""Sets a func mapping resources to estimated time (secs).
|
|
1193
1196
|
|
|
1194
1197
|
This is EXPERIMENTAL.
|
|
@@ -1712,7 +1715,7 @@ class Task:
|
|
|
1712
1715
|
return required_features
|
|
1713
1716
|
|
|
1714
1717
|
def __rshift__(self, b):
|
|
1715
|
-
|
|
1718
|
+
dag_lib.get_current_dag().add_edge(self, b)
|
|
1716
1719
|
|
|
1717
1720
|
def __repr__(self):
|
|
1718
1721
|
if isinstance(self.run, str):
|
sky/templates/aws-ray.yml.j2
CHANGED
|
@@ -50,7 +50,7 @@ provider:
|
|
|
50
50
|
disable_launch_config_check: true
|
|
51
51
|
|
|
52
52
|
auth:
|
|
53
|
-
ssh_user:
|
|
53
|
+
ssh_user: {{ssh_user}}
|
|
54
54
|
ssh_private_key: {{ssh_private_key}}
|
|
55
55
|
{% if ssh_proxy_command is not none %}
|
|
56
56
|
ssh_proxy_command: {{ssh_proxy_command}}
|
|
@@ -68,7 +68,7 @@ available_node_types:
|
|
|
68
68
|
ImageId: {{image_id}} # Deep Learning AMI (Ubuntu 18.04); see aws.py.
|
|
69
69
|
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
|
70
70
|
BlockDeviceMappings:
|
|
71
|
-
- DeviceName:
|
|
71
|
+
- DeviceName: {{root_device_name}}
|
|
72
72
|
Ebs:
|
|
73
73
|
VolumeSize: {{disk_size}}
|
|
74
74
|
VolumeType: {{disk_tier}}
|
|
@@ -15,6 +15,12 @@ file_mounts:
|
|
|
15
15
|
{{controller_file_mount_path}}: {{local_file_mount_path}}
|
|
16
16
|
{%- endfor %}
|
|
17
17
|
|
|
18
|
+
# NOTE(dev): This needs to be a subset of sky/templates/sky-serve-controller.yaml.j2.
|
|
19
|
+
# This is because we use the --fast flag to submit jobs but not to launch pools.
|
|
20
|
+
# So when we launch a new pool, it will install the required dependencies.
|
|
21
|
+
# TODO(tian): Add --fast to launch pools as well, and figure out the dependency installation.
|
|
22
|
+
# Maybe in the --fast implementation, we can store the hash of the setup commands that were previously
|
|
23
|
+
# run and don't skip setup phase if the hash is different.
|
|
18
24
|
setup: |
|
|
19
25
|
{{ sky_activate_python_env }}
|
|
20
26
|
# Disable the pip version check to avoid the warning message, which makes the
|
sky/utils/command_runner.py
CHANGED
|
@@ -674,7 +674,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
674
674
|
ssh += ['-tt']
|
|
675
675
|
if port_forward is not None:
|
|
676
676
|
for local, remote in port_forward:
|
|
677
|
-
logger.
|
|
677
|
+
logger.debug(
|
|
678
678
|
f'Forwarding local port {local} to remote port {remote}.')
|
|
679
679
|
ssh += ['-NL', f'{local}:localhost:{remote}']
|
|
680
680
|
if self._docker_ssh_proxy_command is not None:
|
sky/utils/config_utils.py
CHANGED
|
@@ -8,6 +8,26 @@ logger = sky_logging.init_logger(__name__)
|
|
|
8
8
|
|
|
9
9
|
_REGION_CONFIG_CLOUDS = ['nebius', 'oci']
|
|
10
10
|
|
|
11
|
+
# The Kubernetes API uses lists to represent dictionary fields with patch strategy
|
|
12
|
+
# merge and each item is indexed by the patch merge key. The following map
|
|
13
|
+
# maps the field name to the patch merge key.
|
|
14
|
+
# pylint: disable=line-too-long
|
|
15
|
+
# Ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#podspec-v1-core
|
|
16
|
+
# NOTE: field containers and imagePullSecrets are not included deliberately for
|
|
17
|
+
# backward compatibility (we only support one container per pod now).
|
|
18
|
+
_PATCH_MERGE_KEYS = {
|
|
19
|
+
'initContainers': 'name',
|
|
20
|
+
'ephemeralContainers': 'name',
|
|
21
|
+
'volumes': 'name',
|
|
22
|
+
'volumeMounts': 'name',
|
|
23
|
+
'resourceClaims': 'name',
|
|
24
|
+
'env': 'name',
|
|
25
|
+
'hostAliases': 'ip',
|
|
26
|
+
'topologySpreadConstraints': 'topologyKey',
|
|
27
|
+
'ports': 'containerPort',
|
|
28
|
+
'volumeDevices': 'devicePath',
|
|
29
|
+
}
|
|
30
|
+
|
|
11
31
|
|
|
12
32
|
class Config(Dict[str, Any]):
|
|
13
33
|
"""SkyPilot config that supports setting/getting values with nested keys."""
|
|
@@ -211,19 +231,23 @@ def merge_k8s_configs(
|
|
|
211
231
|
merge_k8s_configs(base_config[key][0], value[0],
|
|
212
232
|
next_allowed_override_keys,
|
|
213
233
|
next_disallowed_override_keys)
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
234
|
+
# For list fields with patch strategy "merge", we merge the list
|
|
235
|
+
# by the patch merge key.
|
|
236
|
+
elif key in _PATCH_MERGE_KEYS:
|
|
237
|
+
patch_merge_key = _PATCH_MERGE_KEYS[key]
|
|
217
238
|
for override_item in value:
|
|
218
|
-
override_item_name = override_item.get(
|
|
239
|
+
override_item_name = override_item.get(patch_merge_key)
|
|
219
240
|
if override_item_name is not None:
|
|
220
241
|
existing_base_item = next(
|
|
221
242
|
(v for v in base_config[key]
|
|
222
|
-
if v.get(
|
|
243
|
+
if v.get(patch_merge_key) == override_item_name),
|
|
244
|
+
None)
|
|
223
245
|
if existing_base_item is not None:
|
|
224
246
|
merge_k8s_configs(existing_base_item, override_item)
|
|
225
247
|
else:
|
|
226
248
|
base_config[key].append(override_item)
|
|
249
|
+
else:
|
|
250
|
+
base_config[key].append(override_item)
|
|
227
251
|
else:
|
|
228
252
|
base_config[key].extend(value)
|
|
229
253
|
else:
|
sky/utils/controller_utils.py
CHANGED
|
@@ -23,11 +23,14 @@ from sky.clouds import gcp
|
|
|
23
23
|
from sky.data import data_utils
|
|
24
24
|
from sky.data import storage as storage_lib
|
|
25
25
|
from sky.jobs import constants as managed_job_constants
|
|
26
|
+
from sky.jobs import state as managed_job_state
|
|
26
27
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
27
28
|
from sky.serve import constants as serve_constants
|
|
29
|
+
from sky.serve import serve_state
|
|
28
30
|
from sky.setup_files import dependencies
|
|
29
31
|
from sky.skylet import constants
|
|
30
32
|
from sky.skylet import log_lib
|
|
33
|
+
from sky.utils import annotations
|
|
31
34
|
from sky.utils import common
|
|
32
35
|
from sky.utils import common_utils
|
|
33
36
|
from sky.utils import config_utils
|
|
@@ -37,8 +40,13 @@ from sky.utils import rich_utils
|
|
|
37
40
|
from sky.utils import ux_utils
|
|
38
41
|
|
|
39
42
|
if typing.TYPE_CHECKING:
|
|
43
|
+
import psutil
|
|
44
|
+
|
|
40
45
|
from sky import task as task_lib
|
|
41
46
|
from sky.backends import cloud_vm_ray_backend
|
|
47
|
+
else:
|
|
48
|
+
from sky.adaptors import common as adaptors_common
|
|
49
|
+
psutil = adaptors_common.LazyImport('psutil')
|
|
42
50
|
|
|
43
51
|
logger = sky_logging.init_logger(__name__)
|
|
44
52
|
|
|
@@ -1161,3 +1169,68 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
1161
1169
|
task.update_storage_mounts(updated_mount_storages)
|
|
1162
1170
|
if msg:
|
|
1163
1171
|
logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
|
|
1172
|
+
|
|
1173
|
+
|
|
1174
|
+
# ======================= Resources Management Functions =======================
|
|
1175
|
+
|
|
1176
|
+
# Based on testing, assume a running job process uses 350MB memory. We use the
|
|
1177
|
+
# same estimation for service controller process.
|
|
1178
|
+
JOB_MEMORY_MB = 350
|
|
1179
|
+
# Monitoring process for service is 1GB. This is based on an old estimation but
|
|
1180
|
+
# we keep it here for now.
|
|
1181
|
+
# TODO(tian): Remeasure this.
|
|
1182
|
+
SERVE_MONITORING_MEMORY_MB = 1024
|
|
1183
|
+
# The ratio of service controller process to job process. We will treat each
|
|
1184
|
+
# service as SERVE_PROC_RATIO job processes.
|
|
1185
|
+
SERVE_PROC_RATIO = SERVE_MONITORING_MEMORY_MB / JOB_MEMORY_MB
|
|
1186
|
+
# Past 2000 simultaneous jobs, we become unstable.
|
|
1187
|
+
# See https://github.com/skypilot-org/skypilot/issues/4649.
|
|
1188
|
+
MAX_JOB_LIMIT = 2000
|
|
1189
|
+
# Number of ongoing launches allowed per CPU, for managed jobs.
|
|
1190
|
+
JOB_LAUNCHES_PER_CPU = 4
|
|
1191
|
+
# Number of ongoing launches allowed per CPU, for services. This is
|
|
1192
|
+
# also based on an old estimation, but SkyServe indeed spawns a new process
|
|
1193
|
+
# for each launch operation, so it should be slightly more resource-demanding
|
|
1194
|
+
# than managed jobs.
|
|
1195
|
+
SERVE_LAUNCHES_PER_CPU = 2
|
|
1196
|
+
# The ratio of service launch to job launch. This is inverted as the parallelism
|
|
1197
|
+
# is determined by 1 / LAUNCHES_PER_CPU.
|
|
1198
|
+
SERVE_LAUNCH_RATIO = JOB_LAUNCHES_PER_CPU / SERVE_LAUNCHES_PER_CPU
|
|
1199
|
+
|
|
1200
|
+
# The _RESOURCES_LOCK should be held whenever we are checking the parallelism
|
|
1201
|
+
# control or updating the schedule_state of any job or service. Any code that
|
|
1202
|
+
# takes this lock must conclude by calling maybe_schedule_next_jobs.
|
|
1203
|
+
_RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
|
|
1204
|
+
|
|
1205
|
+
|
|
1206
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
|
1207
|
+
def get_resources_lock_path() -> str:
|
|
1208
|
+
path = os.path.expanduser(_RESOURCES_LOCK)
|
|
1209
|
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
1210
|
+
return path
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
@annotations.lru_cache(scope='request')
|
|
1214
|
+
def _get_job_parallelism() -> int:
|
|
1215
|
+
job_memory = JOB_MEMORY_MB * 1024 * 1024
|
|
1216
|
+
job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
|
|
1217
|
+
return max(job_limit, 1)
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
@annotations.lru_cache(scope='request')
|
|
1221
|
+
def _get_launch_parallelism() -> int:
|
|
1222
|
+
cpus = os.cpu_count()
|
|
1223
|
+
return cpus * JOB_LAUNCHES_PER_CPU if cpus is not None else 1
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def can_provision() -> bool:
|
|
1227
|
+
num_provision = (
|
|
1228
|
+
serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
|
|
1229
|
+
managed_job_state.get_num_launching_jobs())
|
|
1230
|
+
return num_provision < _get_launch_parallelism()
|
|
1231
|
+
|
|
1232
|
+
|
|
1233
|
+
def can_start_new_process() -> bool:
|
|
1234
|
+
num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
|
|
1235
|
+
managed_job_state.get_num_alive_jobs())
|
|
1236
|
+
return num_procs < _get_job_parallelism()
|
sky/utils/db/db_utils.py
CHANGED
|
@@ -32,6 +32,23 @@ if typing.TYPE_CHECKING:
|
|
|
32
32
|
_DB_TIMEOUT_S = 60
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
class UniqueConstraintViolationError(Exception):
|
|
36
|
+
"""Exception raised for unique constraint violation.
|
|
37
|
+
Attributes:
|
|
38
|
+
value -- the input value that caused the error
|
|
39
|
+
message -- explanation of the error
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
def __init__(self, value, message='Unique constraint violation'):
|
|
43
|
+
self.value = value
|
|
44
|
+
self.message = message
|
|
45
|
+
super().__init__(self.message)
|
|
46
|
+
|
|
47
|
+
def __str__(self):
|
|
48
|
+
return (f'UniqueConstraintViolationError: {self.message} '
|
|
49
|
+
f'(Value: {self.value})')
|
|
50
|
+
|
|
51
|
+
|
|
35
52
|
class SQLAlchemyDialect(enum.Enum):
|
|
36
53
|
SQLITE = 'sqlite'
|
|
37
54
|
POSTGRESQL = 'postgresql'
|
sky/utils/schemas.py
CHANGED
sky/volumes/server/core.py
CHANGED
|
@@ -7,12 +7,12 @@ import uuid
|
|
|
7
7
|
|
|
8
8
|
import filelock
|
|
9
9
|
|
|
10
|
-
import sky
|
|
11
10
|
from sky import global_user_state
|
|
12
11
|
from sky import models
|
|
13
12
|
from sky import provision
|
|
14
13
|
from sky import sky_logging
|
|
15
14
|
from sky.utils import common_utils
|
|
15
|
+
from sky.utils import registry
|
|
16
16
|
from sky.utils import rich_utils
|
|
17
17
|
from sky.utils import status_lib
|
|
18
18
|
from sky.utils import ux_utils
|
|
@@ -180,7 +180,7 @@ def volume_apply(name: str, volume_type: str, cloud: str, region: Optional[str],
|
|
|
180
180
|
with rich_utils.safe_status(ux_utils.spinner_message('Creating volume')):
|
|
181
181
|
# Reuse the method for cluster name on cloud to
|
|
182
182
|
# generate the storage name on cloud.
|
|
183
|
-
cloud_obj =
|
|
183
|
+
cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
|
|
184
184
|
assert cloud_obj is not None
|
|
185
185
|
name_uuid = str(uuid.uuid4())[:6]
|
|
186
186
|
name_on_cloud = common_utils.make_cluster_name_on_cloud(
|
sky/volumes/server/server.py
CHANGED
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
import fastapi
|
|
4
4
|
|
|
5
|
-
import sky
|
|
6
5
|
from sky import clouds
|
|
7
6
|
from sky import sky_logging
|
|
8
7
|
from sky.server.requests import executor
|
|
9
8
|
from sky.server.requests import payloads
|
|
10
9
|
from sky.server.requests import requests as requests_lib
|
|
10
|
+
from sky.utils import registry
|
|
11
11
|
from sky.utils import volume
|
|
12
12
|
from sky.volumes.server import core
|
|
13
13
|
|
|
@@ -55,7 +55,7 @@ async def volume_apply(request: fastapi.Request,
|
|
|
55
55
|
if volume_type not in supported_volume_types:
|
|
56
56
|
raise fastapi.HTTPException(
|
|
57
57
|
status_code=400, detail=f'Invalid volume type: {volume_type}')
|
|
58
|
-
cloud =
|
|
58
|
+
cloud = registry.CLOUD_REGISTRY.from_str(volume_cloud)
|
|
59
59
|
if cloud is None:
|
|
60
60
|
raise fastapi.HTTPException(status_code=400,
|
|
61
61
|
detail=f'Invalid cloud: {volume_cloud}')
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.
|
|
3
|
+
Version: 1.0.0.dev20250814
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -104,9 +104,8 @@ Provides-Extra: ssh
|
|
|
104
104
|
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "ssh"
|
|
105
105
|
Requires-Dist: websockets; extra == "ssh"
|
|
106
106
|
Provides-Extra: remote
|
|
107
|
-
Requires-Dist: grpcio
|
|
108
|
-
Requires-Dist:
|
|
109
|
-
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "remote"
|
|
107
|
+
Requires-Dist: grpcio>=1.63.0; extra == "remote"
|
|
108
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "remote"
|
|
110
109
|
Provides-Extra: runpod
|
|
111
110
|
Requires-Dist: runpod>=1.6.1; extra == "runpod"
|
|
112
111
|
Provides-Extra: fluidstack
|
|
@@ -169,9 +168,8 @@ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
|
169
168
|
Requires-Dist: websockets; extra == "all"
|
|
170
169
|
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
171
170
|
Requires-Dist: websockets; extra == "all"
|
|
172
|
-
Requires-Dist: grpcio
|
|
173
|
-
Requires-Dist:
|
|
174
|
-
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"
|
|
171
|
+
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
172
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
175
173
|
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
176
174
|
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
|
177
175
|
Requires-Dist: pydo>=0.3.0; extra == "all"
|