skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/models.py ADDED
@@ -0,0 +1,27 @@
1
+ """Data Models for SkyPilot."""
2
+
3
+ import collections
4
+ import dataclasses
5
+ from typing import Dict, Optional
6
+
7
+
8
+ @dataclasses.dataclass
9
+ class User:
10
+ # User hash
11
+ id: str
12
+ # Display name of the user
13
+ name: Optional[str] = None
14
+
15
+
16
+ RealtimeGpuAvailability = collections.namedtuple(
17
+ 'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
18
+
19
+
20
+ @dataclasses.dataclass
21
+ class KubernetesNodeInfo:
22
+ """Dataclass to store Kubernetes node information."""
23
+ name: str
24
+ accelerator_type: Optional[str]
25
+ # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
26
+ total: Dict[str, int]
27
+ free: Dict[str, int]
sky/optimizer.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """Optimizer: assigns best resources to user tasks."""
2
2
  import collections
3
3
  import copy
4
- import enum
5
4
  import json
6
5
  import typing
7
6
  from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
@@ -17,8 +16,14 @@ from sky import resources as resources_lib
17
16
  from sky import sky_logging
18
17
  from sky import task as task_lib
19
18
  from sky.adaptors import common as adaptors_common
19
+ from sky.usage import usage_lib
20
+ from sky.utils import common
20
21
  from sky.utils import env_options
21
22
  from sky.utils import log_utils
23
+ from sky.utils import resources_utils
24
+ from sky.utils import rich_utils
25
+ from sky.utils import subprocess_utils
26
+ from sky.utils import timeline
22
27
  from sky.utils import ux_utils
23
28
 
24
29
  if typing.TYPE_CHECKING:
@@ -41,12 +46,6 @@ _PerCloudCandidates = Dict[clouds.Cloud, List[resources_lib.Resources]]
41
46
  _TaskToPerCloudCandidates = Dict[task_lib.Task, _PerCloudCandidates]
42
47
 
43
48
 
44
- # Constants: minimize what target?
45
- class OptimizeTarget(enum.Enum):
46
- COST = 0
47
- TIME = 1
48
-
49
-
50
49
  # For logging purposes.
51
50
  def _create_table(field_names: List[str]) -> prettytable.PrettyTable:
52
51
  table_kwargs = {
@@ -102,11 +101,13 @@ class Optimizer:
102
101
  return egress_time
103
102
 
104
103
  @staticmethod
104
+ @timeline.event
105
+ @usage_lib.entrypoint('sky.optimizer.optimize')
105
106
  def optimize(dag: 'dag_lib.Dag',
106
- minimize: OptimizeTarget = OptimizeTarget.COST,
107
+ minimize: common.OptimizeTarget = common.OptimizeTarget.COST,
107
108
  blocked_resources: Optional[Iterable[
108
109
  resources_lib.Resources]] = None,
109
- quiet: bool = False):
110
+ quiet: bool = False) -> 'dag_lib.Dag':
110
111
  """Find the best execution plan for the given DAG.
111
112
 
112
113
  Args:
@@ -120,22 +121,22 @@ class Optimizer:
120
121
  for a task.
121
122
  exceptions.NoCloudAccessError: if no public clouds are enabled.
122
123
  """
123
- _check_specified_clouds(dag)
124
-
125
- # This function is effectful: mutates every node in 'dag' by setting
126
- # node.best_resources if it is None.
127
- Optimizer._add_dummy_source_sink_nodes(dag)
128
- try:
129
- unused_best_plan = Optimizer._optimize_dag(
130
- dag=dag,
131
- minimize_cost=minimize == OptimizeTarget.COST,
132
- blocked_resources=blocked_resources,
133
- quiet=quiet)
134
- finally:
135
- # Make sure to remove the dummy source/sink nodes, even if the
136
- # optimization fails.
137
- Optimizer._remove_dummy_source_sink_nodes(dag)
138
- return dag
124
+ with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
125
+ _check_specified_clouds(dag)
126
+ # This function is effectful: mutates every node in 'dag' by setting
127
+ # node.best_resources if it is None.
128
+ Optimizer._add_dummy_source_sink_nodes(dag)
129
+ try:
130
+ unused_best_plan = Optimizer._optimize_dag(
131
+ dag=dag,
132
+ minimize_cost=minimize == common.OptimizeTarget.COST,
133
+ blocked_resources=blocked_resources,
134
+ quiet=quiet)
135
+ finally:
136
+ # Make sure to remove the dummy source/sink nodes, even if the
137
+ # optimization fails.
138
+ Optimizer._remove_dummy_source_sink_nodes(dag)
139
+ return dag
139
140
 
140
141
  @staticmethod
141
142
  def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
@@ -182,7 +183,7 @@ class Optimizer:
182
183
  """Removes special Source and Sink nodes."""
183
184
  source = [t for t in dag.tasks if t.name == _DUMMY_SOURCE_NAME]
184
185
  sink = [t for t in dag.tasks if t.name == _DUMMY_SINK_NAME]
185
- if len(source) == len(sink) == 0:
186
+ if not source and not sink:
186
187
  return
187
188
  assert len(source) == len(sink) == 1, dag.tasks
188
189
  dag.remove(source[0])
@@ -252,6 +253,29 @@ class Optimizer:
252
253
  # node -> cloud -> list of resources that satisfy user's requirements.
253
254
  node_to_candidate_map: _TaskToPerCloudCandidates = {}
254
255
 
256
+ def get_available_reservations(
257
+ launchable_resources: Dict[resources_lib.Resources,
258
+ List[resources_lib.Resources]]
259
+ ) -> Dict[resources_lib.Resources, int]:
260
+ if not resources_utils.need_to_query_reservations():
261
+ return {}
262
+
263
+ num_available_reserved_nodes_per_resource = {}
264
+
265
+ def get_reservations_available_resources(
266
+ resources: resources_lib.Resources):
267
+ num_available_reserved_nodes_per_resource[resources] = sum(
268
+ resources.get_reservations_available_resources().values())
269
+
270
+ launchable_resources_list: List[resources_lib.Resources] = sum(
271
+ launchable_resources.values(), [])
272
+ with rich_utils.safe_status(
273
+ ux_utils.spinner_message('Checking reserved resources')):
274
+ subprocess_utils.run_in_parallel(
275
+ get_reservations_available_resources,
276
+ launchable_resources_list)
277
+ return num_available_reserved_nodes_per_resource
278
+
255
279
  # Compute the estimated cost/time for each node.
256
280
  for node_i, node in enumerate(topo_order):
257
281
  if node_i == 0:
@@ -261,8 +285,6 @@ class Optimizer:
261
285
 
262
286
  # Don't print for the last node, Sink.
263
287
  do_print = node_i != len(topo_order) - 1
264
- if do_print:
265
- logger.debug('#### {} ####'.format(node))
266
288
 
267
289
  fuzzy_candidates: List[str] = []
268
290
  if node_i < len(topo_order) - 1:
@@ -273,13 +295,21 @@ class Optimizer:
273
295
  blocked_resources=blocked_resources,
274
296
  quiet=quiet))
275
297
  node_to_candidate_map[node] = cloud_candidates
298
+ # Has to call the printing after the launchable resources are
299
+ # computed, because the missing fields of the resources are
300
+ # inferred in the _fill_in_launchable_resources function.
301
+ logger.debug('#### {} ####'.format(node))
276
302
  else:
277
303
  # Dummy sink node.
278
304
  launchable_resources = {
279
305
  list(node.resources)[0]: list(node.resources)
280
306
  }
281
307
 
308
+ # Fetch reservations in advance and in parallel to speed up the
309
+ # reservation info fetching.
282
310
  num_resources = len(list(node.resources))
311
+ num_available_reserved_nodes_per_resource = (
312
+ get_available_reservations(launchable_resources))
283
313
 
284
314
  for orig_resources, launchable_list in launchable_resources.items():
285
315
  if num_resources == 1 and node.time_estimator_func is None:
@@ -302,15 +332,16 @@ class Optimizer:
302
332
  else:
303
333
  estimated_runtime = node.estimate_runtime(
304
334
  orig_resources)
335
+
305
336
  for resources in launchable_list:
306
337
  if do_print:
307
338
  logger.debug(f'resources: {resources}')
308
339
 
309
340
  if minimize_cost:
310
341
  cost_per_node = resources.get_cost(estimated_runtime)
311
- num_available_reserved_nodes = sum(
312
- resources.get_reservations_available_resources(
313
- ).values())
342
+ num_available_reserved_nodes = (
343
+ num_available_reserved_nodes_per_resource.get(
344
+ resources, 0))
314
345
 
315
346
  # We consider the cost of the unused reservation
316
347
  # resources to be 0 since we are already paying for
@@ -348,10 +379,6 @@ class Optimizer:
348
379
  for orig_resources in node.resources):
349
380
  source_hint = 'kubernetes cluster'
350
381
 
351
- # TODO(romilb): When `sky show-gpus` supports Kubernetes,
352
- # add a hint to run `sky show-gpus --kubernetes` to list
353
- # available accelerators on Kubernetes.
354
-
355
382
  bold = colorama.Style.BRIGHT
356
383
  cyan = colorama.Fore.CYAN
357
384
  reset = colorama.Style.RESET_ALL
@@ -360,10 +387,14 @@ class Optimizer:
360
387
  fuzzy_candidates_str = (
361
388
  f'\nTry one of these offered accelerators: {cyan}'
362
389
  f'{fuzzy_candidates}{reset}')
390
+ node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
391
+ r.repr_with_region_zone
392
+ for r in node.resources)
363
393
  error_msg = (
364
394
  f'{source_hint.capitalize()} does not contain any '
365
- f'instances satisfying the request:\n{node}.'
366
- f'\n\nTo fix: relax or change the '
395
+ f'instances satisfying the request: '
396
+ f'{node_resources_reprs}.'
397
+ f'\nTo fix: relax or change the '
367
398
  f'resource requirements.{fuzzy_candidates_str}\n\n'
368
399
  f'Hint: {bold}sky show-gpus{reset} '
369
400
  'to list available accelerators.\n'
@@ -692,7 +723,6 @@ class Optimizer:
692
723
  node_to_cost_map: _TaskToCostMap,
693
724
  minimize_cost: bool,
694
725
  ):
695
- logger.info('== Optimizer ==')
696
726
  ordered_node_to_cost_map = collections.OrderedDict()
697
727
  ordered_best_plan = collections.OrderedDict()
698
728
  for node in topo_order:
@@ -714,15 +744,18 @@ class Optimizer:
714
744
  node.get_inputs() is None and node.get_outputs() is None):
715
745
  print_hourly_cost = True
716
746
 
717
- if print_hourly_cost:
718
- logger.info(f'{colorama.Style.BRIGHT}Estimated cost: '
719
- f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
720
- else:
721
- logger.info(f'{colorama.Style.BRIGHT}Estimated total runtime: '
722
- f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
723
- 'hours\n'
724
- f'{colorama.Style.BRIGHT}Estimated total cost: '
725
- f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
747
+ if not env_options.Options.MINIMIZE_LOGGING.get():
748
+ if print_hourly_cost:
749
+ logger.info(
750
+ f'{colorama.Style.BRIGHT}Estimated cost: '
751
+ f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
752
+ else:
753
+ logger.info(
754
+ f'{colorama.Style.BRIGHT}Estimated total runtime: '
755
+ f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
756
+ 'hours\n'
757
+ f'{colorama.Style.BRIGHT}Estimated total cost: '
758
+ f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
726
759
 
727
760
  def _get_resources_element_list(
728
761
  resources: 'resources_lib.Resources') -> List[str]:
@@ -797,13 +830,17 @@ class Optimizer:
797
830
  return row
798
831
 
799
832
  def _get_resource_group_hash(resources: 'resources_lib.Resources'):
800
- return json.dumps(
801
- {
802
- 'cloud': f'{resources.cloud}',
803
- 'accelerators': f'{resources.accelerators}',
804
- 'use_spot': resources.use_spot
805
- },
806
- sort_keys=True)
833
+ resource_key_dict = {
834
+ 'cloud': f'{resources.cloud}',
835
+ 'accelerators': f'{resources.accelerators}',
836
+ 'use_spot': resources.use_spot
837
+ }
838
+ if isinstance(resources.cloud, clouds.Kubernetes):
839
+ # Region for Kubernetes is the context name, i.e. different
840
+ # Kubernetes clusters. We add region to the key to show all the
841
+ # Kubernetes clusters in the optimizer table for better UX.
842
+ resource_key_dict['region'] = resources.region
843
+ return json.dumps(resource_key_dict, sort_keys=True)
807
844
 
808
845
  # Print the list of resouces that the optimizer considered.
809
846
  resource_fields = [
@@ -821,7 +858,7 @@ class Optimizer:
821
858
  best_plan_table = _create_table(['TASK', '#NODES'] +
822
859
  resource_fields)
823
860
  best_plan_table.add_rows(best_plan_rows)
824
- logger.info(f'{best_plan_table}\n')
861
+ logger.info(f'{best_plan_table}')
825
862
 
826
863
  # Print the egress plan if any data egress is scheduled.
827
864
  Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
@@ -840,10 +877,12 @@ class Optimizer:
840
877
  }
841
878
  task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
842
879
  plural = 's' if task.num_nodes > 1 else ''
843
- logger.info(
844
- f'{colorama.Style.BRIGHT}Considered resources {task_str}'
845
- f'({task.num_nodes} node{plural}):'
846
- f'{colorama.Style.RESET_ALL}')
880
+ if num_tasks > 1:
881
+ # Add a new line for better readability, when there are multiple
882
+ # tasks.
883
+ logger.info('')
884
+ logger.info(f'Considered resources {task_str}'
885
+ f'({task.num_nodes} node{plural}):')
847
886
 
848
887
  # Only print 1 row per cloud.
849
888
  # The following code is to generate the table
@@ -910,7 +949,16 @@ class Optimizer:
910
949
 
911
950
  table = _create_table(field_names)
912
951
  table.add_rows(rows)
913
- logger.info(f'{table}\n')
952
+ logger.info(f'{table}')
953
+
954
+ # Warning message for using disk_tier=ultra
955
+ # TODO(yi): Consider price of disks in optimizer and
956
+ # move this warning there.
957
+ if chosen_resources.disk_tier == resources_utils.DiskTier.ULTRA:
958
+ logger.warning(
959
+ 'Using disk_tier=ultra will utilize more advanced disks '
960
+ '(io2 Block Express on AWS and extreme persistent disk on '
961
+ 'GCP), which can lead to significant higher costs (~$2/h).')
914
962
 
915
963
  @staticmethod
916
964
  def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
@@ -932,10 +980,10 @@ class Optimizer:
932
980
  f'Multiple {cloud} instances satisfy '
933
981
  f'{acc_name}:{int(acc_count)}. '
934
982
  f'The cheapest {candidate_list[0]!r} is considered '
935
- f'among:\n{instance_list}.\n')
983
+ f'among:\n{instance_list}.')
936
984
  if is_multi_instances:
937
985
  logger.info(
938
- f'To list more details, run \'sky show-gpus {acc_name}\'.')
986
+ f'To list more details, run: sky show-gpus {acc_name}\n')
939
987
 
940
988
  @staticmethod
941
989
  def _optimize_dag(
@@ -1068,8 +1116,7 @@ class Optimizer:
1068
1116
  Optimizer.print_optimized_plan(graph, topo_order, best_plan,
1069
1117
  total_time, total_cost,
1070
1118
  node_to_cost_map, minimize_cost)
1071
- if not env_options.Options.MINIMIZE_LOGGING.get():
1072
- Optimizer._print_candidates(local_node_to_candidate_map)
1119
+ Optimizer._print_candidates(local_node_to_candidate_map)
1073
1120
  return best_plan
1074
1121
 
1075
1122
 
@@ -1120,7 +1167,7 @@ def _make_launchables_for_valid_region_zones(
1120
1167
  regions = launchable_resources.get_valid_regions_for_launchable()
1121
1168
  for region in regions:
1122
1169
  if (launchable_resources.use_spot and region.zones is not None or
1123
- isinstance(launchable_resources.cloud, clouds.GCP)):
1170
+ launchable_resources.cloud.optimize_by_zone()):
1124
1171
  # Spot instances.
1125
1172
  # Do not batch the per-zone requests.
1126
1173
  for zone in region.zones:
@@ -1231,6 +1278,9 @@ def _fill_in_launchable_resources(
1231
1278
  if blocked_resources is None:
1232
1279
  blocked_resources = []
1233
1280
  for resources in task.resources:
1281
+ # Validate the resources first which may fill in missing fields
1282
+ # automatically for the resources.
1283
+ resources.validate()
1234
1284
  if (resources.cloud is not None and
1235
1285
  not clouds.cloud_in_iterable(resources.cloud, enabled_clouds)):
1236
1286
  # Skip the resources that are on a cloud that is not enabled. The
@@ -1239,22 +1289,29 @@ def _fill_in_launchable_resources(
1239
1289
  continue
1240
1290
  clouds_list = ([resources.cloud]
1241
1291
  if resources.cloud is not None else enabled_clouds)
1242
- for cloud in clouds_list:
1243
- (feasible_resources,
1244
- fuzzy_candidate_list) = cloud.get_feasible_launchable_resources(
1245
- resources, num_nodes=task.num_nodes)
1246
- if len(feasible_resources) > 0:
1292
+ # If clouds provide hints, store them for later printing.
1293
+ hints: Dict[clouds.Cloud, str] = {}
1294
+
1295
+ feasible_list = subprocess_utils.run_in_parallel(
1296
+ lambda cloud, r=resources, n=task.num_nodes:
1297
+ (cloud, cloud.get_feasible_launchable_resources(r, n)),
1298
+ clouds_list)
1299
+ for cloud, feasible_resources in feasible_list:
1300
+ if feasible_resources.hint is not None:
1301
+ hints[cloud] = feasible_resources.hint
1302
+ if feasible_resources.resources_list:
1247
1303
  # Assume feasible_resources is sorted by prices. Guaranteed by
1248
1304
  # the implementation of get_feasible_launchable_resources and
1249
1305
  # the underlying service_catalog filtering
1250
- cheapest = feasible_resources[0]
1306
+ cheapest = feasible_resources.resources_list[0]
1251
1307
  # Generate region/zone-specified resources.
1252
1308
  launchable[resources].extend(
1253
1309
  _make_launchables_for_valid_region_zones(cheapest))
1254
- cloud_candidates[cloud] = feasible_resources
1310
+ cloud_candidates[cloud] = feasible_resources.resources_list
1255
1311
  else:
1256
- all_fuzzy_candidates.update(fuzzy_candidate_list)
1257
- if len(launchable[resources]) == 0:
1312
+ all_fuzzy_candidates.update(
1313
+ feasible_resources.fuzzy_candidate_list)
1314
+ if not launchable[resources]:
1258
1315
  clouds_str = str(clouds_list) if len(clouds_list) > 1 else str(
1259
1316
  clouds_list[0])
1260
1317
  num_node_str = ''
@@ -1269,15 +1326,17 @@ def _fill_in_launchable_resources(
1269
1326
  f'{colorama.Fore.CYAN}'
1270
1327
  f'{sorted(all_fuzzy_candidates)}'
1271
1328
  f'{colorama.Style.RESET_ALL}')
1272
- else:
1273
- if resources.cpus is not None:
1274
- logger.info('Try specifying a different CPU count, '
1275
- 'or add "+" to the end of the CPU count '
1276
- 'to allow for larger instances.')
1277
- if resources.memory is not None:
1278
- logger.info('Try specifying a different memory size, '
1279
- 'or add "+" to the end of the memory size '
1280
- 'to allow for larger instances.')
1329
+ else:
1330
+ if resources.cpus is not None:
1331
+ logger.info('Try specifying a different CPU count, '
1332
+ 'or add "+" to the end of the CPU count '
1333
+ 'to allow for larger instances.')
1334
+ if resources.memory is not None:
1335
+ logger.info('Try specifying a different memory size, '
1336
+ 'or add "+" to the end of the memory size '
1337
+ 'to allow for larger instances.')
1338
+ for cloud, hint in hints.items():
1339
+ logger.info(f'{repr(cloud)}: {hint}')
1281
1340
 
1282
1341
  launchable[resources] = _filter_out_blocked_launchable_resources(
1283
1342
  launchable[resources], blocked_resources)
sky/provision/__init__.py CHANGED
@@ -5,10 +5,10 @@ providers supported by SkyPilot need to follow.
5
5
  """
6
6
  import functools
7
7
  import inspect
8
+ import typing
8
9
  from typing import Any, Dict, List, Optional, Type
9
10
 
10
11
  from sky import sky_logging
11
- from sky import status_lib
12
12
  # These provision.<cloud> modules should never fail even if underlying cloud SDK
13
13
  # dependencies are not installed. This is ensured by using sky.adaptors inside
14
14
  # these modules, for lazy loading of cloud SDKs.
@@ -19,9 +19,17 @@ from sky.provision import cudo
19
19
  from sky.provision import fluidstack
20
20
  from sky.provision import gcp
21
21
  from sky.provision import kubernetes
22
+ from sky.provision import lambda_cloud
23
+ from sky.provision import nebius
24
+ from sky.provision import oci
22
25
  from sky.provision import runpod
26
+ from sky.provision import vast
23
27
  from sky.provision import vsphere
24
28
  from sky.utils import command_runner
29
+ from sky.utils import timeline
30
+
31
+ if typing.TYPE_CHECKING:
32
+ from sky.utils import status_lib
25
33
 
26
34
  logger = sky_logging.init_logger(__name__)
27
35
 
@@ -39,6 +47,8 @@ def _route_to_cloud_impl(func):
39
47
  provider_name = kwargs.pop('provider_name')
40
48
 
41
49
  module_name = provider_name.lower()
50
+ if module_name == 'lambda':
51
+ module_name = 'lambda_cloud'
42
52
  module = globals().get(module_name)
43
53
  assert module is not None, f'Unknown provider: {module_name}'
44
54
 
@@ -55,13 +65,14 @@ def _route_to_cloud_impl(func):
55
65
  # pylint: disable=unused-argument
56
66
 
57
67
 
68
+ @timeline.event
58
69
  @_route_to_cloud_impl
59
70
  def query_instances(
60
71
  provider_name: str,
61
72
  cluster_name_on_cloud: str,
62
73
  provider_config: Optional[Dict[str, Any]] = None,
63
74
  non_terminated_only: bool = True,
64
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
75
+ ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
65
76
  """Query instances.
66
77
 
67
78
  Returns a dictionary of instance IDs and status.
@@ -155,6 +166,10 @@ def query_ports(
155
166
  return the endpoint without querying the cloud provider. If head_ip is not
156
167
  provided, the cloud provider will be queried to get the endpoint info.
157
168
 
169
+ The underlying implementation is responsible for retries and timeout, e.g.
170
+ kubernetes will wait for the service that exposes the ports to be ready
171
+ before returning the endpoint info.
172
+
158
173
  Returns a dict with port as the key and a list of common.Endpoint.
159
174
  """
160
175
  del provider_name, provider_config, cluster_name_on_cloud # unused
@@ -163,7 +178,7 @@ def query_ports(
163
178
 
164
179
  @_route_to_cloud_impl
165
180
  def wait_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
166
- state: Optional[status_lib.ClusterStatus]) -> None:
181
+ state: Optional['status_lib.ClusterStatus']) -> None:
167
182
  """Wait for instances until they end up in the given state."""
168
183
  raise NotImplementedError
169
184
 
@@ -182,12 +197,12 @@ def get_cluster_info(
182
197
  def get_command_runners(
183
198
  provider_name: str,
184
199
  cluster_info: common.ClusterInfo,
185
- **crednetials: Dict[str, Any],
200
+ **credentials: Dict[str, Any],
186
201
  ) -> List[command_runner.CommandRunner]:
187
202
  """Get a command runner for the given cluster."""
188
203
  ip_list = cluster_info.get_feasible_ips()
189
204
  port_list = cluster_info.get_ssh_ports()
190
205
  return command_runner.SSHCommandRunner.make_runner_list(
191
206
  node_list=zip(ip_list, port_list),
192
- **crednetials,
207
+ **credentials,
193
208
  )