skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/models.py
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
"""Data Models for SkyPilot."""
|
2
|
+
|
3
|
+
import collections
|
4
|
+
import dataclasses
|
5
|
+
from typing import Dict, Optional
|
6
|
+
|
7
|
+
|
8
|
+
@dataclasses.dataclass
|
9
|
+
class User:
|
10
|
+
# User hash
|
11
|
+
id: str
|
12
|
+
# Display name of the user
|
13
|
+
name: Optional[str] = None
|
14
|
+
|
15
|
+
|
16
|
+
RealtimeGpuAvailability = collections.namedtuple(
|
17
|
+
'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
|
18
|
+
|
19
|
+
|
20
|
+
@dataclasses.dataclass
|
21
|
+
class KubernetesNodeInfo:
|
22
|
+
"""Dataclass to store Kubernetes node information."""
|
23
|
+
name: str
|
24
|
+
accelerator_type: Optional[str]
|
25
|
+
# Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
|
26
|
+
total: Dict[str, int]
|
27
|
+
free: Dict[str, int]
|
sky/optimizer.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
"""Optimizer: assigns best resources to user tasks."""
|
2
2
|
import collections
|
3
3
|
import copy
|
4
|
-
import enum
|
5
4
|
import json
|
6
5
|
import typing
|
7
6
|
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
@@ -17,8 +16,14 @@ from sky import resources as resources_lib
|
|
17
16
|
from sky import sky_logging
|
18
17
|
from sky import task as task_lib
|
19
18
|
from sky.adaptors import common as adaptors_common
|
19
|
+
from sky.usage import usage_lib
|
20
|
+
from sky.utils import common
|
20
21
|
from sky.utils import env_options
|
21
22
|
from sky.utils import log_utils
|
23
|
+
from sky.utils import resources_utils
|
24
|
+
from sky.utils import rich_utils
|
25
|
+
from sky.utils import subprocess_utils
|
26
|
+
from sky.utils import timeline
|
22
27
|
from sky.utils import ux_utils
|
23
28
|
|
24
29
|
if typing.TYPE_CHECKING:
|
@@ -41,12 +46,6 @@ _PerCloudCandidates = Dict[clouds.Cloud, List[resources_lib.Resources]]
|
|
41
46
|
_TaskToPerCloudCandidates = Dict[task_lib.Task, _PerCloudCandidates]
|
42
47
|
|
43
48
|
|
44
|
-
# Constants: minimize what target?
|
45
|
-
class OptimizeTarget(enum.Enum):
|
46
|
-
COST = 0
|
47
|
-
TIME = 1
|
48
|
-
|
49
|
-
|
50
49
|
# For logging purposes.
|
51
50
|
def _create_table(field_names: List[str]) -> prettytable.PrettyTable:
|
52
51
|
table_kwargs = {
|
@@ -102,11 +101,13 @@ class Optimizer:
|
|
102
101
|
return egress_time
|
103
102
|
|
104
103
|
@staticmethod
|
104
|
+
@timeline.event
|
105
|
+
@usage_lib.entrypoint('sky.optimizer.optimize')
|
105
106
|
def optimize(dag: 'dag_lib.Dag',
|
106
|
-
minimize: OptimizeTarget = OptimizeTarget.COST,
|
107
|
+
minimize: common.OptimizeTarget = common.OptimizeTarget.COST,
|
107
108
|
blocked_resources: Optional[Iterable[
|
108
109
|
resources_lib.Resources]] = None,
|
109
|
-
quiet: bool = False):
|
110
|
+
quiet: bool = False) -> 'dag_lib.Dag':
|
110
111
|
"""Find the best execution plan for the given DAG.
|
111
112
|
|
112
113
|
Args:
|
@@ -120,22 +121,22 @@ class Optimizer:
|
|
120
121
|
for a task.
|
121
122
|
exceptions.NoCloudAccessError: if no public clouds are enabled.
|
122
123
|
"""
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
124
|
+
with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
|
125
|
+
_check_specified_clouds(dag)
|
126
|
+
# This function is effectful: mutates every node in 'dag' by setting
|
127
|
+
# node.best_resources if it is None.
|
128
|
+
Optimizer._add_dummy_source_sink_nodes(dag)
|
129
|
+
try:
|
130
|
+
unused_best_plan = Optimizer._optimize_dag(
|
131
|
+
dag=dag,
|
132
|
+
minimize_cost=minimize == common.OptimizeTarget.COST,
|
133
|
+
blocked_resources=blocked_resources,
|
134
|
+
quiet=quiet)
|
135
|
+
finally:
|
136
|
+
# Make sure to remove the dummy source/sink nodes, even if the
|
137
|
+
# optimization fails.
|
138
|
+
Optimizer._remove_dummy_source_sink_nodes(dag)
|
139
|
+
return dag
|
139
140
|
|
140
141
|
@staticmethod
|
141
142
|
def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
|
@@ -182,7 +183,7 @@ class Optimizer:
|
|
182
183
|
"""Removes special Source and Sink nodes."""
|
183
184
|
source = [t for t in dag.tasks if t.name == _DUMMY_SOURCE_NAME]
|
184
185
|
sink = [t for t in dag.tasks if t.name == _DUMMY_SINK_NAME]
|
185
|
-
if
|
186
|
+
if not source and not sink:
|
186
187
|
return
|
187
188
|
assert len(source) == len(sink) == 1, dag.tasks
|
188
189
|
dag.remove(source[0])
|
@@ -252,6 +253,29 @@ class Optimizer:
|
|
252
253
|
# node -> cloud -> list of resources that satisfy user's requirements.
|
253
254
|
node_to_candidate_map: _TaskToPerCloudCandidates = {}
|
254
255
|
|
256
|
+
def get_available_reservations(
|
257
|
+
launchable_resources: Dict[resources_lib.Resources,
|
258
|
+
List[resources_lib.Resources]]
|
259
|
+
) -> Dict[resources_lib.Resources, int]:
|
260
|
+
if not resources_utils.need_to_query_reservations():
|
261
|
+
return {}
|
262
|
+
|
263
|
+
num_available_reserved_nodes_per_resource = {}
|
264
|
+
|
265
|
+
def get_reservations_available_resources(
|
266
|
+
resources: resources_lib.Resources):
|
267
|
+
num_available_reserved_nodes_per_resource[resources] = sum(
|
268
|
+
resources.get_reservations_available_resources().values())
|
269
|
+
|
270
|
+
launchable_resources_list: List[resources_lib.Resources] = sum(
|
271
|
+
launchable_resources.values(), [])
|
272
|
+
with rich_utils.safe_status(
|
273
|
+
ux_utils.spinner_message('Checking reserved resources')):
|
274
|
+
subprocess_utils.run_in_parallel(
|
275
|
+
get_reservations_available_resources,
|
276
|
+
launchable_resources_list)
|
277
|
+
return num_available_reserved_nodes_per_resource
|
278
|
+
|
255
279
|
# Compute the estimated cost/time for each node.
|
256
280
|
for node_i, node in enumerate(topo_order):
|
257
281
|
if node_i == 0:
|
@@ -261,8 +285,6 @@ class Optimizer:
|
|
261
285
|
|
262
286
|
# Don't print for the last node, Sink.
|
263
287
|
do_print = node_i != len(topo_order) - 1
|
264
|
-
if do_print:
|
265
|
-
logger.debug('#### {} ####'.format(node))
|
266
288
|
|
267
289
|
fuzzy_candidates: List[str] = []
|
268
290
|
if node_i < len(topo_order) - 1:
|
@@ -273,13 +295,21 @@ class Optimizer:
|
|
273
295
|
blocked_resources=blocked_resources,
|
274
296
|
quiet=quiet))
|
275
297
|
node_to_candidate_map[node] = cloud_candidates
|
298
|
+
# Has to call the printing after the launchable resources are
|
299
|
+
# computed, because the missing fields of the resources are
|
300
|
+
# inferred in the _fill_in_launchable_resources function.
|
301
|
+
logger.debug('#### {} ####'.format(node))
|
276
302
|
else:
|
277
303
|
# Dummy sink node.
|
278
304
|
launchable_resources = {
|
279
305
|
list(node.resources)[0]: list(node.resources)
|
280
306
|
}
|
281
307
|
|
308
|
+
# Fetch reservations in advance and in parallel to speed up the
|
309
|
+
# reservation info fetching.
|
282
310
|
num_resources = len(list(node.resources))
|
311
|
+
num_available_reserved_nodes_per_resource = (
|
312
|
+
get_available_reservations(launchable_resources))
|
283
313
|
|
284
314
|
for orig_resources, launchable_list in launchable_resources.items():
|
285
315
|
if num_resources == 1 and node.time_estimator_func is None:
|
@@ -302,15 +332,16 @@ class Optimizer:
|
|
302
332
|
else:
|
303
333
|
estimated_runtime = node.estimate_runtime(
|
304
334
|
orig_resources)
|
335
|
+
|
305
336
|
for resources in launchable_list:
|
306
337
|
if do_print:
|
307
338
|
logger.debug(f'resources: {resources}')
|
308
339
|
|
309
340
|
if minimize_cost:
|
310
341
|
cost_per_node = resources.get_cost(estimated_runtime)
|
311
|
-
num_available_reserved_nodes =
|
312
|
-
|
313
|
-
|
342
|
+
num_available_reserved_nodes = (
|
343
|
+
num_available_reserved_nodes_per_resource.get(
|
344
|
+
resources, 0))
|
314
345
|
|
315
346
|
# We consider the cost of the unused reservation
|
316
347
|
# resources to be 0 since we are already paying for
|
@@ -348,10 +379,6 @@ class Optimizer:
|
|
348
379
|
for orig_resources in node.resources):
|
349
380
|
source_hint = 'kubernetes cluster'
|
350
381
|
|
351
|
-
# TODO(romilb): When `sky show-gpus` supports Kubernetes,
|
352
|
-
# add a hint to run `sky show-gpus --kubernetes` to list
|
353
|
-
# available accelerators on Kubernetes.
|
354
|
-
|
355
382
|
bold = colorama.Style.BRIGHT
|
356
383
|
cyan = colorama.Fore.CYAN
|
357
384
|
reset = colorama.Style.RESET_ALL
|
@@ -360,10 +387,14 @@ class Optimizer:
|
|
360
387
|
fuzzy_candidates_str = (
|
361
388
|
f'\nTry one of these offered accelerators: {cyan}'
|
362
389
|
f'{fuzzy_candidates}{reset}')
|
390
|
+
node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
|
391
|
+
r.repr_with_region_zone
|
392
|
+
for r in node.resources)
|
363
393
|
error_msg = (
|
364
394
|
f'{source_hint.capitalize()} does not contain any '
|
365
|
-
f'instances satisfying the request
|
366
|
-
f'
|
395
|
+
f'instances satisfying the request: '
|
396
|
+
f'{node_resources_reprs}.'
|
397
|
+
f'\nTo fix: relax or change the '
|
367
398
|
f'resource requirements.{fuzzy_candidates_str}\n\n'
|
368
399
|
f'Hint: {bold}sky show-gpus{reset} '
|
369
400
|
'to list available accelerators.\n'
|
@@ -692,7 +723,6 @@ class Optimizer:
|
|
692
723
|
node_to_cost_map: _TaskToCostMap,
|
693
724
|
minimize_cost: bool,
|
694
725
|
):
|
695
|
-
logger.info('== Optimizer ==')
|
696
726
|
ordered_node_to_cost_map = collections.OrderedDict()
|
697
727
|
ordered_best_plan = collections.OrderedDict()
|
698
728
|
for node in topo_order:
|
@@ -714,15 +744,18 @@ class Optimizer:
|
|
714
744
|
node.get_inputs() is None and node.get_outputs() is None):
|
715
745
|
print_hourly_cost = True
|
716
746
|
|
717
|
-
if
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
747
|
+
if not env_options.Options.MINIMIZE_LOGGING.get():
|
748
|
+
if print_hourly_cost:
|
749
|
+
logger.info(
|
750
|
+
f'{colorama.Style.BRIGHT}Estimated cost: '
|
751
|
+
f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
|
752
|
+
else:
|
753
|
+
logger.info(
|
754
|
+
f'{colorama.Style.BRIGHT}Estimated total runtime: '
|
755
|
+
f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
|
756
|
+
'hours\n'
|
757
|
+
f'{colorama.Style.BRIGHT}Estimated total cost: '
|
758
|
+
f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
|
726
759
|
|
727
760
|
def _get_resources_element_list(
|
728
761
|
resources: 'resources_lib.Resources') -> List[str]:
|
@@ -797,13 +830,17 @@ class Optimizer:
|
|
797
830
|
return row
|
798
831
|
|
799
832
|
def _get_resource_group_hash(resources: 'resources_lib.Resources'):
|
800
|
-
|
801
|
-
{
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
833
|
+
resource_key_dict = {
|
834
|
+
'cloud': f'{resources.cloud}',
|
835
|
+
'accelerators': f'{resources.accelerators}',
|
836
|
+
'use_spot': resources.use_spot
|
837
|
+
}
|
838
|
+
if isinstance(resources.cloud, clouds.Kubernetes):
|
839
|
+
# Region for Kubernetes is the context name, i.e. different
|
840
|
+
# Kubernetes clusters. We add region to the key to show all the
|
841
|
+
# Kubernetes clusters in the optimizer table for better UX.
|
842
|
+
resource_key_dict['region'] = resources.region
|
843
|
+
return json.dumps(resource_key_dict, sort_keys=True)
|
807
844
|
|
808
845
|
# Print the list of resouces that the optimizer considered.
|
809
846
|
resource_fields = [
|
@@ -821,7 +858,7 @@ class Optimizer:
|
|
821
858
|
best_plan_table = _create_table(['TASK', '#NODES'] +
|
822
859
|
resource_fields)
|
823
860
|
best_plan_table.add_rows(best_plan_rows)
|
824
|
-
logger.info(f'{best_plan_table}
|
861
|
+
logger.info(f'{best_plan_table}')
|
825
862
|
|
826
863
|
# Print the egress plan if any data egress is scheduled.
|
827
864
|
Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
|
@@ -840,10 +877,12 @@ class Optimizer:
|
|
840
877
|
}
|
841
878
|
task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
|
842
879
|
plural = 's' if task.num_nodes > 1 else ''
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
880
|
+
if num_tasks > 1:
|
881
|
+
# Add a new line for better readability, when there are multiple
|
882
|
+
# tasks.
|
883
|
+
logger.info('')
|
884
|
+
logger.info(f'Considered resources {task_str}'
|
885
|
+
f'({task.num_nodes} node{plural}):')
|
847
886
|
|
848
887
|
# Only print 1 row per cloud.
|
849
888
|
# The following code is to generate the table
|
@@ -910,7 +949,16 @@ class Optimizer:
|
|
910
949
|
|
911
950
|
table = _create_table(field_names)
|
912
951
|
table.add_rows(rows)
|
913
|
-
logger.info(f'{table}
|
952
|
+
logger.info(f'{table}')
|
953
|
+
|
954
|
+
# Warning message for using disk_tier=ultra
|
955
|
+
# TODO(yi): Consider price of disks in optimizer and
|
956
|
+
# move this warning there.
|
957
|
+
if chosen_resources.disk_tier == resources_utils.DiskTier.ULTRA:
|
958
|
+
logger.warning(
|
959
|
+
'Using disk_tier=ultra will utilize more advanced disks '
|
960
|
+
'(io2 Block Express on AWS and extreme persistent disk on '
|
961
|
+
'GCP), which can lead to significant higher costs (~$2/h).')
|
914
962
|
|
915
963
|
@staticmethod
|
916
964
|
def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
|
@@ -932,10 +980,10 @@ class Optimizer:
|
|
932
980
|
f'Multiple {cloud} instances satisfy '
|
933
981
|
f'{acc_name}:{int(acc_count)}. '
|
934
982
|
f'The cheapest {candidate_list[0]!r} is considered '
|
935
|
-
f'among:\n{instance_list}
|
983
|
+
f'among:\n{instance_list}.')
|
936
984
|
if is_multi_instances:
|
937
985
|
logger.info(
|
938
|
-
f'To list more details, run
|
986
|
+
f'To list more details, run: sky show-gpus {acc_name}\n')
|
939
987
|
|
940
988
|
@staticmethod
|
941
989
|
def _optimize_dag(
|
@@ -1068,8 +1116,7 @@ class Optimizer:
|
|
1068
1116
|
Optimizer.print_optimized_plan(graph, topo_order, best_plan,
|
1069
1117
|
total_time, total_cost,
|
1070
1118
|
node_to_cost_map, minimize_cost)
|
1071
|
-
|
1072
|
-
Optimizer._print_candidates(local_node_to_candidate_map)
|
1119
|
+
Optimizer._print_candidates(local_node_to_candidate_map)
|
1073
1120
|
return best_plan
|
1074
1121
|
|
1075
1122
|
|
@@ -1120,7 +1167,7 @@ def _make_launchables_for_valid_region_zones(
|
|
1120
1167
|
regions = launchable_resources.get_valid_regions_for_launchable()
|
1121
1168
|
for region in regions:
|
1122
1169
|
if (launchable_resources.use_spot and region.zones is not None or
|
1123
|
-
|
1170
|
+
launchable_resources.cloud.optimize_by_zone()):
|
1124
1171
|
# Spot instances.
|
1125
1172
|
# Do not batch the per-zone requests.
|
1126
1173
|
for zone in region.zones:
|
@@ -1231,6 +1278,9 @@ def _fill_in_launchable_resources(
|
|
1231
1278
|
if blocked_resources is None:
|
1232
1279
|
blocked_resources = []
|
1233
1280
|
for resources in task.resources:
|
1281
|
+
# Validate the resources first which may fill in missing fields
|
1282
|
+
# automatically for the resources.
|
1283
|
+
resources.validate()
|
1234
1284
|
if (resources.cloud is not None and
|
1235
1285
|
not clouds.cloud_in_iterable(resources.cloud, enabled_clouds)):
|
1236
1286
|
# Skip the resources that are on a cloud that is not enabled. The
|
@@ -1239,22 +1289,29 @@ def _fill_in_launchable_resources(
|
|
1239
1289
|
continue
|
1240
1290
|
clouds_list = ([resources.cloud]
|
1241
1291
|
if resources.cloud is not None else enabled_clouds)
|
1242
|
-
for
|
1243
|
-
|
1244
|
-
|
1245
|
-
|
1246
|
-
|
1292
|
+
# If clouds provide hints, store them for later printing.
|
1293
|
+
hints: Dict[clouds.Cloud, str] = {}
|
1294
|
+
|
1295
|
+
feasible_list = subprocess_utils.run_in_parallel(
|
1296
|
+
lambda cloud, r=resources, n=task.num_nodes:
|
1297
|
+
(cloud, cloud.get_feasible_launchable_resources(r, n)),
|
1298
|
+
clouds_list)
|
1299
|
+
for cloud, feasible_resources in feasible_list:
|
1300
|
+
if feasible_resources.hint is not None:
|
1301
|
+
hints[cloud] = feasible_resources.hint
|
1302
|
+
if feasible_resources.resources_list:
|
1247
1303
|
# Assume feasible_resources is sorted by prices. Guaranteed by
|
1248
1304
|
# the implementation of get_feasible_launchable_resources and
|
1249
1305
|
# the underlying service_catalog filtering
|
1250
|
-
cheapest = feasible_resources[0]
|
1306
|
+
cheapest = feasible_resources.resources_list[0]
|
1251
1307
|
# Generate region/zone-specified resources.
|
1252
1308
|
launchable[resources].extend(
|
1253
1309
|
_make_launchables_for_valid_region_zones(cheapest))
|
1254
|
-
cloud_candidates[cloud] = feasible_resources
|
1310
|
+
cloud_candidates[cloud] = feasible_resources.resources_list
|
1255
1311
|
else:
|
1256
|
-
all_fuzzy_candidates.update(
|
1257
|
-
|
1312
|
+
all_fuzzy_candidates.update(
|
1313
|
+
feasible_resources.fuzzy_candidate_list)
|
1314
|
+
if not launchable[resources]:
|
1258
1315
|
clouds_str = str(clouds_list) if len(clouds_list) > 1 else str(
|
1259
1316
|
clouds_list[0])
|
1260
1317
|
num_node_str = ''
|
@@ -1269,15 +1326,17 @@ def _fill_in_launchable_resources(
|
|
1269
1326
|
f'{colorama.Fore.CYAN}'
|
1270
1327
|
f'{sorted(all_fuzzy_candidates)}'
|
1271
1328
|
f'{colorama.Style.RESET_ALL}')
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1275
|
-
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1329
|
+
else:
|
1330
|
+
if resources.cpus is not None:
|
1331
|
+
logger.info('Try specifying a different CPU count, '
|
1332
|
+
'or add "+" to the end of the CPU count '
|
1333
|
+
'to allow for larger instances.')
|
1334
|
+
if resources.memory is not None:
|
1335
|
+
logger.info('Try specifying a different memory size, '
|
1336
|
+
'or add "+" to the end of the memory size '
|
1337
|
+
'to allow for larger instances.')
|
1338
|
+
for cloud, hint in hints.items():
|
1339
|
+
logger.info(f'{repr(cloud)}: {hint}')
|
1281
1340
|
|
1282
1341
|
launchable[resources] = _filter_out_blocked_launchable_resources(
|
1283
1342
|
launchable[resources], blocked_resources)
|
sky/provision/__init__.py
CHANGED
@@ -5,10 +5,10 @@ providers supported by SkyPilot need to follow.
|
|
5
5
|
"""
|
6
6
|
import functools
|
7
7
|
import inspect
|
8
|
+
import typing
|
8
9
|
from typing import Any, Dict, List, Optional, Type
|
9
10
|
|
10
11
|
from sky import sky_logging
|
11
|
-
from sky import status_lib
|
12
12
|
# These provision.<cloud> modules should never fail even if underlying cloud SDK
|
13
13
|
# dependencies are not installed. This is ensured by using sky.adaptors inside
|
14
14
|
# these modules, for lazy loading of cloud SDKs.
|
@@ -19,9 +19,17 @@ from sky.provision import cudo
|
|
19
19
|
from sky.provision import fluidstack
|
20
20
|
from sky.provision import gcp
|
21
21
|
from sky.provision import kubernetes
|
22
|
+
from sky.provision import lambda_cloud
|
23
|
+
from sky.provision import nebius
|
24
|
+
from sky.provision import oci
|
22
25
|
from sky.provision import runpod
|
26
|
+
from sky.provision import vast
|
23
27
|
from sky.provision import vsphere
|
24
28
|
from sky.utils import command_runner
|
29
|
+
from sky.utils import timeline
|
30
|
+
|
31
|
+
if typing.TYPE_CHECKING:
|
32
|
+
from sky.utils import status_lib
|
25
33
|
|
26
34
|
logger = sky_logging.init_logger(__name__)
|
27
35
|
|
@@ -39,6 +47,8 @@ def _route_to_cloud_impl(func):
|
|
39
47
|
provider_name = kwargs.pop('provider_name')
|
40
48
|
|
41
49
|
module_name = provider_name.lower()
|
50
|
+
if module_name == 'lambda':
|
51
|
+
module_name = 'lambda_cloud'
|
42
52
|
module = globals().get(module_name)
|
43
53
|
assert module is not None, f'Unknown provider: {module_name}'
|
44
54
|
|
@@ -55,13 +65,14 @@ def _route_to_cloud_impl(func):
|
|
55
65
|
# pylint: disable=unused-argument
|
56
66
|
|
57
67
|
|
68
|
+
@timeline.event
|
58
69
|
@_route_to_cloud_impl
|
59
70
|
def query_instances(
|
60
71
|
provider_name: str,
|
61
72
|
cluster_name_on_cloud: str,
|
62
73
|
provider_config: Optional[Dict[str, Any]] = None,
|
63
74
|
non_terminated_only: bool = True,
|
64
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
75
|
+
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
|
65
76
|
"""Query instances.
|
66
77
|
|
67
78
|
Returns a dictionary of instance IDs and status.
|
@@ -155,6 +166,10 @@ def query_ports(
|
|
155
166
|
return the endpoint without querying the cloud provider. If head_ip is not
|
156
167
|
provided, the cloud provider will be queried to get the endpoint info.
|
157
168
|
|
169
|
+
The underlying implementation is responsible for retries and timeout, e.g.
|
170
|
+
kubernetes will wait for the service that expose the ports to be ready
|
171
|
+
before returning the endpoint info.
|
172
|
+
|
158
173
|
Returns a dict with port as the key and a list of common.Endpoint.
|
159
174
|
"""
|
160
175
|
del provider_name, provider_config, cluster_name_on_cloud # unused
|
@@ -163,7 +178,7 @@ def query_ports(
|
|
163
178
|
|
164
179
|
@_route_to_cloud_impl
|
165
180
|
def wait_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
|
166
|
-
state: Optional[status_lib.ClusterStatus]) -> None:
|
181
|
+
state: Optional['status_lib.ClusterStatus']) -> None:
|
167
182
|
"""Wait instances until they ends up in the given state."""
|
168
183
|
raise NotImplementedError
|
169
184
|
|
@@ -182,12 +197,12 @@ def get_cluster_info(
|
|
182
197
|
def get_command_runners(
|
183
198
|
provider_name: str,
|
184
199
|
cluster_info: common.ClusterInfo,
|
185
|
-
**
|
200
|
+
**credentials: Dict[str, Any],
|
186
201
|
) -> List[command_runner.CommandRunner]:
|
187
202
|
"""Get a command runner for the given cluster."""
|
188
203
|
ip_list = cluster_info.get_feasible_ips()
|
189
204
|
port_list = cluster_info.get_ssh_ports()
|
190
205
|
return command_runner.SSHCommandRunner.make_runner_list(
|
191
206
|
node_list=zip(ip_list, port_list),
|
192
|
-
**
|
207
|
+
**credentials,
|
193
208
|
)
|