skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,111 @@
1
+ """Digital ocean service catalog.
2
+
3
+ This module loads the service catalog file and can be used to
4
+ query instance types and pricing information for digital ocean.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ from sky.clouds.service_catalog import common
11
+ from sky.utils import ux_utils
12
+
13
+ if typing.TYPE_CHECKING:
14
+ from sky.clouds import cloud
15
+
16
+ _df = common.read_catalog('do/vms.csv')
17
+
18
+
19
+ def instance_type_exists(instance_type: str) -> bool:
20
+ return common.instance_type_exists_impl(_df, instance_type)
21
+
22
+
23
+ def validate_region_zone(
24
+ region: Optional[str],
25
+ zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
26
+ if zone is not None:
27
+ with ux_utils.print_exception_no_traceback():
28
+ raise ValueError('DO does not support zones.')
29
+ return common.validate_region_zone_impl('DO', _df, region, zone)
30
+
31
+
32
+ def get_hourly_cost(
33
+ instance_type: str,
34
+ use_spot: bool = False,
35
+ region: Optional[str] = None,
36
+ zone: Optional[str] = None,
37
+ ) -> float:
38
+ """Returns the cost, or the cheapest cost among all zones for spot."""
39
+ if zone is not None:
40
+ with ux_utils.print_exception_no_traceback():
41
+ raise ValueError('DO does not support zones.')
42
+ return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
43
+ zone)
44
+
45
+
46
+ def get_vcpus_mem_from_instance_type(
47
+ instance_type: str,) -> Tuple[Optional[float], Optional[float]]:
48
+ return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
49
+
50
+
51
+ def get_default_instance_type(
52
+ cpus: Optional[str] = None,
53
+ memory: Optional[str] = None,
54
+ disk_tier: Optional[str] = None,
55
+ ) -> Optional[str]:
56
+ # NOTE: After expanding catalog to multiple entries, you may
57
+ # want to specify a default instance type or family.
58
+ del disk_tier # unused
59
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
60
+
61
+
62
+ def get_accelerators_from_instance_type(
63
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
64
+ return common.get_accelerators_from_instance_type_impl(_df, instance_type)
65
+
66
+
67
+ def get_instance_type_for_accelerator(
68
+ acc_name: str,
69
+ acc_count: int,
70
+ cpus: Optional[str] = None,
71
+ memory: Optional[str] = None,
72
+ use_spot: bool = False,
73
+ region: Optional[str] = None,
74
+ zone: Optional[str] = None,
75
+ ) -> Tuple[Optional[List[str]], List[str]]:
76
+ """Returns a list of instance types that have the given accelerator."""
77
+ if zone is not None:
78
+ with ux_utils.print_exception_no_traceback():
79
+ raise ValueError('DO does not support zones.')
80
+ return common.get_instance_type_for_accelerator_impl(
81
+ df=_df,
82
+ acc_name=acc_name,
83
+ acc_count=acc_count,
84
+ cpus=cpus,
85
+ memory=memory,
86
+ use_spot=use_spot,
87
+ region=region,
88
+ zone=zone,
89
+ )
90
+
91
+
92
+ def get_region_zones_for_instance_type(instance_type: str,
93
+ use_spot: bool) -> List['cloud.Region']:
94
+ df = _df[_df['InstanceType'] == instance_type]
95
+ return common.get_region_zones(df, use_spot)
96
+
97
+
98
+ def list_accelerators(
99
+ gpus_only: bool,
100
+ name_filter: Optional[str],
101
+ region_filter: Optional[str],
102
+ quantity_filter: Optional[int],
103
+ case_sensitive: bool = True,
104
+ all_regions: bool = False,
105
+ require_price: bool = True,
106
+ ) -> Dict[str, List[common.InstanceTypeInfo]]:
107
+ """Returns all instance types in DO offering GPUs."""
108
+ del require_price # unused
109
+ return common.list_accelerators_impl('DO', _df, gpus_only, name_filter,
110
+ region_filter, quantity_filter,
111
+ case_sensitive, all_regions)
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
4
4
  instance types and pricing information for FluidStack.
5
5
  """
6
6
  import typing
7
- from typing import Dict, List, Optional, Tuple
7
+ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  from sky.clouds.service_catalog import common
10
10
  from sky.utils import ux_utils
@@ -65,7 +65,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
65
65
 
66
66
 
67
67
  def get_accelerators_from_instance_type(
68
- instance_type: str) -> Optional[Dict[str, int]]:
68
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
69
69
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
70
70
 
71
71
 
@@ -9,6 +9,7 @@ from typing import Dict, List, Optional, Tuple
9
9
  from sky import exceptions
10
10
  from sky import sky_logging
11
11
  from sky.adaptors import common as adaptors_common
12
+ from sky.clouds import GCP
12
13
  from sky.clouds.service_catalog import common
13
14
  from sky.utils import resources_utils
14
15
  from sky.utils import ux_utils
@@ -96,7 +97,13 @@ _ACC_INSTANCE_TYPE_DICTS = {
96
97
  8: ['g2-standard-96'],
97
98
  },
98
99
  'H100': {
100
+ 1: ['a3-highgpu-1g'],
101
+ 2: ['a3-highgpu-2g'],
102
+ 4: ['a3-highgpu-4g'],
99
103
  8: ['a3-highgpu-8g'],
104
+ },
105
+ 'H100-MEGA': {
106
+ 8: ['a3-megagpu-8g'],
100
107
  }
101
108
  }
102
109
 
@@ -243,7 +250,6 @@ def get_default_instance_type(
243
250
  cpus: Optional[str] = None,
244
251
  memory: Optional[str] = None,
245
252
  disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
246
- del disk_tier # unused
247
253
  if cpus is None and memory is None:
248
254
  cpus = f'{_DEFAULT_NUM_VCPUS}+'
249
255
  if memory is None:
@@ -254,6 +260,12 @@ def get_default_instance_type(
254
260
  f'{family}-' for family in _DEFAULT_INSTANCE_FAMILY)
255
261
  df = _df[_df['InstanceType'].notna()]
256
262
  df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
263
+
264
+ def _filter_disk_type(instance_type: str) -> bool:
265
+ valid, _ = GCP.check_disk_tier(instance_type, disk_tier)
266
+ return valid
267
+
268
+ df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
257
269
  return common.get_instance_type_for_cpus_mem_impl(df, cpus,
258
270
  memory_gb_or_ratio)
259
271
 
@@ -280,7 +292,9 @@ def get_instance_type_for_accelerator(
280
292
 
281
293
  if acc_name in _ACC_INSTANCE_TYPE_DICTS:
282
294
  df = _df[_df['InstanceType'].notna()]
283
- instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name][acc_count]
295
+ instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name].get(acc_count, None)
296
+ if instance_types is None:
297
+ return None, []
284
298
  df = df[df['InstanceType'].isin(instance_types)]
285
299
 
286
300
  # Check the cpus and memory specified by the user.
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
4
4
  instance types and pricing information for IBM.
5
5
  """
6
6
 
7
- from typing import Dict, List, Optional, Tuple
7
+ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import ibm
@@ -43,7 +43,7 @@ def get_vcpus_mem_from_instance_type(
43
43
 
44
44
 
45
45
  def get_accelerators_from_instance_type(
46
- instance_type: str) -> Optional[Dict[str, int]]:
46
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
47
47
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
48
48
 
49
49
 
@@ -8,17 +8,23 @@ import typing
8
8
  from typing import Dict, List, Optional, Set, Tuple
9
9
 
10
10
  from sky import check as sky_check
11
+ from sky import clouds as sky_clouds
12
+ from sky import sky_logging
11
13
  from sky.adaptors import common as adaptors_common
12
- from sky.clouds import Kubernetes
14
+ from sky.adaptors import kubernetes
13
15
  from sky.clouds.service_catalog import CloudFilter
14
16
  from sky.clouds.service_catalog import common
15
17
  from sky.provision.kubernetes import utils as kubernetes_utils
16
18
 
19
+ logger = sky_logging.init_logger(__name__)
20
+
17
21
  if typing.TYPE_CHECKING:
18
22
  import pandas as pd
19
23
  else:
20
24
  pd = adaptors_common.LazyImport('pandas')
21
25
 
26
+ logger = sky_logging.init_logger(__name__)
27
+
22
28
  _PULL_FREQUENCY_HOURS = 7
23
29
 
24
30
  # We keep pull_frequency_hours so we can remotely update the default image paths
@@ -31,7 +37,16 @@ _image_df = common.read_catalog('kubernetes/images.csv',
31
37
 
32
38
  def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
33
39
  """Returns the image id from the tag."""
34
- return common.get_image_id_from_tag_impl(_image_df, tag, region)
40
+ global _image_df
41
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
42
+ if image_id is None:
43
+ # Refresh the image catalog and try again, if the image tag is not
44
+ # found.
45
+ logger.debug('Refreshing the image catalog and trying again.')
46
+ _image_df = common.read_catalog('kubernetes/images.csv',
47
+ pull_frequency_hours=0)
48
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
49
+ return image_id
35
50
 
36
51
 
37
52
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
@@ -50,9 +65,14 @@ def list_accelerators(
50
65
  # TODO(romilb): We should consider putting a lru_cache() with TTL to
51
66
  # avoid multiple calls to kubernetes API in a short period of time (e.g.,
52
67
  # from the optimizer).
53
- return list_accelerators_realtime(gpus_only, name_filter, region_filter,
54
- quantity_filter, case_sensitive,
55
- all_regions, require_price)[0]
68
+ return _list_accelerators(gpus_only,
69
+ name_filter,
70
+ region_filter,
71
+ quantity_filter,
72
+ case_sensitive,
73
+ all_regions,
74
+ require_price,
75
+ realtime=False)[0]
56
76
 
57
77
 
58
78
  def list_accelerators_realtime(
@@ -65,27 +85,100 @@ def list_accelerators_realtime(
65
85
  require_price: bool = True
66
86
  ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
67
87
  int]]:
88
+ return _list_accelerators(gpus_only,
89
+ name_filter,
90
+ region_filter,
91
+ quantity_filter,
92
+ case_sensitive,
93
+ all_regions,
94
+ require_price,
95
+ realtime=True)
96
+
97
+
98
+ def _list_accelerators(
99
+ gpus_only: bool,
100
+ name_filter: Optional[str],
101
+ region_filter: Optional[str],
102
+ quantity_filter: Optional[int],
103
+ case_sensitive: bool = True,
104
+ all_regions: bool = False,
105
+ require_price: bool = True,
106
+ realtime: bool = False
107
+ ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
108
+ int]]:
109
+ """List accelerators in the Kubernetes cluster.
110
+
111
+ If realtime is True, the function will query the cluster to fetch real-time
112
+ GPU usage, which is returned in total_accelerators_available. Note that
113
+ this may require an expensive list_pod_for_all_namespaces call, which
114
+ requires cluster-wide pod read permissions.
115
+
116
+ If the user does not have sufficient permissions to list pods in all
117
+ namespaces, the function will return free GPUs as -1.
118
+
119
+ Returns:
120
+ A tuple of three dictionaries:
121
+ - qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
122
+ objects with quantity information.
123
+ - total_accelerators_capacity: Dict mapping accelerator names to their
124
+ total capacity in the cluster.
125
+ - total_accelerators_available: Dict mapping accelerator names to their
126
+ current availability. Returns -1 for each accelerator if
127
+ realtime=False or if insufficient permissions.
128
+ """
129
+ # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
130
+ # function from kubernetes_utils.
68
131
  del all_regions, require_price # Unused.
69
- k8s_cloud = Kubernetes()
70
- if not any(
71
- map(k8s_cloud.is_same_cloud,
72
- sky_check.get_cached_enabled_clouds_or_refresh())
73
- ) or not kubernetes_utils.check_credentials()[0]:
132
+
133
+ # First check if Kubernetes is enabled. This ensures k8s python client is
134
+ # installed. Do not put any k8s-specific logic before this check.
135
+ enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
136
+ if not sky_clouds.cloud_in_iterable(sky_clouds.Kubernetes(),
137
+ enabled_clouds):
138
+ return {}, {}, {}
139
+
140
+ # TODO(zhwu): this should return all accelerators in multiple kubernetes
141
+ # clusters defined by allowed_contexts.
142
+ if region_filter is None:
143
+ context = kubernetes_utils.get_current_kube_config_context_name()
144
+ if context is None and kubernetes_utils.is_incluster_config_available():
145
+ # If context is None and we are running in a kubernetes pod, use the
146
+ # in-cluster context as the current context.
147
+ context = kubernetes.in_cluster_context_name()
148
+ else:
149
+ context = region_filter
150
+ if context is None:
74
151
  return {}, {}, {}
75
152
 
76
- has_gpu = kubernetes_utils.detect_gpu_resource()
153
+ # Verify that the credentials are still valid.
154
+ if not kubernetes_utils.check_credentials(context)[0]:
155
+ return {}, {}, {}
156
+
157
+ has_gpu = kubernetes_utils.detect_accelerator_resource(context)
77
158
  if not has_gpu:
78
159
  return {}, {}, {}
79
160
 
80
- label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter()
81
- if not label_formatter:
161
+ lf, _ = kubernetes_utils.detect_gpu_label_formatter(context)
162
+ if not lf:
82
163
  return {}, {}, {}
83
164
 
84
165
  accelerators_qtys: Set[Tuple[str, int]] = set()
85
- key = label_formatter.get_label_key()
86
- nodes = kubernetes_utils.get_kubernetes_nodes()
87
- # Get the pods to get the real-time GPU usage
88
- pods = kubernetes_utils.get_kubernetes_pods()
166
+ keys = lf.get_label_keys()
167
+ nodes = kubernetes_utils.get_kubernetes_nodes(context)
168
+ pods = None
169
+ if realtime:
170
+ # Get the pods to get the real-time GPU usage
171
+ try:
172
+ pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
173
+ except kubernetes.api_exception() as e:
174
+ if e.status == 403:
175
+ logger.warning(
176
+ 'Failed to get pods in the Kubernetes cluster '
177
+ '(forbidden). Please check if your account has '
178
+ 'necessary permissions to list pods. Realtime GPU '
179
+ 'availability information may be incorrect.')
180
+ else:
181
+ raise
89
182
  # Total number of GPUs in the cluster
90
183
  total_accelerators_capacity: Dict[str, int] = {}
91
184
  # Total number of GPUs currently available in the cluster
@@ -93,58 +186,88 @@ def list_accelerators_realtime(
93
186
  min_quantity_filter = quantity_filter if quantity_filter else 1
94
187
 
95
188
  for node in nodes:
96
- if key in node.metadata.labels:
97
- allocated_qty = 0
98
- accelerator_name = label_formatter.get_accelerator_from_label_value(
99
- node.metadata.labels.get(key))
100
-
101
- # Check if name_filter regex matches the accelerator_name
102
- regex_flags = 0 if case_sensitive else re.IGNORECASE
103
- if name_filter and not re.match(
104
- name_filter, accelerator_name, flags=regex_flags):
105
- continue
106
-
107
- accelerator_count = int(
108
- node.status.allocatable.get('nvidia.com/gpu', 0))
109
-
110
- # Generate the GPU quantities for the accelerators
111
- if accelerator_name and accelerator_count > 0:
112
- for count in range(1, accelerator_count + 1):
113
- accelerators_qtys.add((accelerator_name, count))
114
-
115
- for pod in pods:
116
- # Get all the pods running on the node
117
- if (pod.spec.node_name == node.metadata.name and
118
- pod.status.phase in ['Running', 'Pending']):
119
- # Iterate over all the containers in the pod and sum the
120
- # GPU requests
121
- for container in pod.spec.containers:
122
- if container.resources.requests:
123
- allocated_qty += int(
124
- container.resources.requests.get(
125
- 'nvidia.com/gpu', 0))
126
-
127
- accelerators_available = accelerator_count - allocated_qty
128
-
129
- if accelerator_count >= min_quantity_filter:
130
- quantized_count = (min_quantity_filter *
131
- (accelerator_count // min_quantity_filter))
132
- if accelerator_name not in total_accelerators_capacity:
133
- total_accelerators_capacity[
134
- accelerator_name] = quantized_count
135
- else:
136
- total_accelerators_capacity[
137
- accelerator_name] += quantized_count
138
-
139
- if accelerators_available >= min_quantity_filter:
140
- quantized_availability = min_quantity_filter * (
141
- accelerators_available // min_quantity_filter)
189
+ for key in keys:
190
+ if key in node.metadata.labels:
191
+ allocated_qty = 0
192
+ accelerator_name = lf.get_accelerator_from_label_value(
193
+ node.metadata.labels.get(key))
194
+
195
+ # Exclude multi-host TPUs from being processed.
196
+ # TODO(Doyoung): Remove the logic when adding support for
197
+ # multi-host TPUs.
198
+ if kubernetes_utils.is_multi_host_tpu(node.metadata.labels):
199
+ continue
200
+
201
+ # Check if name_filter regex matches the accelerator_name
202
+ regex_flags = 0 if case_sensitive else re.IGNORECASE
203
+ if name_filter and not re.match(
204
+ name_filter, accelerator_name, flags=regex_flags):
205
+ continue
206
+
207
+ # Generate the accelerator quantities
208
+ accelerator_count = (
209
+ kubernetes_utils.get_node_accelerator_count(
210
+ node.status.allocatable))
211
+
212
+ if accelerator_name and accelerator_count > 0:
213
+ # TPUs are counted in a different way compared to GPUs.
214
+ # Multi-node GPUs can be split into smaller units and be
215
+ # provisioned, but TPUs are considered as an atomic unit.
216
+ if kubernetes_utils.is_tpu_on_gke(accelerator_name):
217
+ accelerators_qtys.add(
218
+ (accelerator_name, accelerator_count))
219
+ else:
220
+ count = 1
221
+ while count <= accelerator_count:
222
+ accelerators_qtys.add((accelerator_name, count))
223
+ count *= 2
224
+ # Add the accelerator count if it's not already in the
225
+ # set (e.g., if there's 12 GPUs, we should have qtys 1,
226
+ # 2, 4, 8, 12)
227
+ if accelerator_count not in accelerators_qtys:
228
+ accelerators_qtys.add(
229
+ (accelerator_name, accelerator_count))
230
+
231
+ if accelerator_count >= min_quantity_filter:
232
+ quantized_count = (
233
+ min_quantity_filter *
234
+ (accelerator_count // min_quantity_filter))
235
+ if accelerator_name not in total_accelerators_capacity:
236
+ total_accelerators_capacity[
237
+ accelerator_name] = quantized_count
238
+ else:
239
+ total_accelerators_capacity[
240
+ accelerator_name] += quantized_count
241
+
242
+ if pods is None:
243
+ # If we can't get the pods, we can't get the GPU usage
244
+ total_accelerators_available[accelerator_name] = -1
245
+ continue
246
+
247
+ for pod in pods:
248
+ # Get all the pods running on the node
249
+ if (pod.spec.node_name == node.metadata.name and
250
+ pod.status.phase in ['Running', 'Pending']):
251
+ # Iterate over all the containers in the pod and sum
252
+ # the GPU requests
253
+ for container in pod.spec.containers:
254
+ if container.resources.requests:
255
+ allocated_qty += (
256
+ kubernetes_utils.get_node_accelerator_count(
257
+ container.resources.requests))
258
+
259
+ accelerators_available = accelerator_count - allocated_qty
260
+
261
+ # Initialize the entry if it doesn't exist yet
142
262
  if accelerator_name not in total_accelerators_available:
143
- total_accelerators_available[
144
- accelerator_name] = quantized_availability
145
- else:
146
- total_accelerators_available[
147
- accelerator_name] += quantized_availability
263
+ total_accelerators_available[accelerator_name] = 0
264
+
265
+ if accelerators_available >= min_quantity_filter:
266
+ quantized_availability = min_quantity_filter * (
267
+ accelerators_available // min_quantity_filter)
268
+ total_accelerators_available[accelerator_name] = (
269
+ total_accelerators_available.get(accelerator_name, 0) +
270
+ quantized_availability)
148
271
 
149
272
  result = []
150
273
 
@@ -160,7 +283,7 @@ def list_accelerators_realtime(
160
283
  memory=None,
161
284
  price=0.0,
162
285
  spot_price=0.0,
163
- region='kubernetes'))
286
+ region=context))
164
287
 
165
288
  df = pd.DataFrame(result,
166
289
  columns=[
@@ -175,7 +298,6 @@ def list_accelerators_realtime(
175
298
  qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only,
176
299
  name_filter, region_filter,
177
300
  quantity_filter, case_sensitive)
178
-
179
301
  return qtys_map, total_accelerators_capacity, total_accelerators_available
180
302
 
181
303
 
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
4
4
  instance types and pricing information for Lambda.
5
5
  """
6
6
  import typing
7
- from typing import Dict, List, Optional, Tuple
7
+ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  from sky.clouds.service_catalog import common
10
10
  from sky.utils import resources_utils
@@ -13,7 +13,12 @@ from sky.utils import ux_utils
13
13
  if typing.TYPE_CHECKING:
14
14
  from sky.clouds import cloud
15
15
 
16
- _df = common.read_catalog('lambda/vms.csv')
16
+ # Keep it synced with the frequency in
17
+ # skypilot-catalog/.github/workflows/update-lambda-catalog.yml
18
+ _PULL_FREQUENCY_HOURS = 7
19
+
20
+ _df = common.read_catalog('lambda/vms.csv',
21
+ pull_frequency_hours=_PULL_FREQUENCY_HOURS)
17
22
 
18
23
  # Number of vCPUS for gpu_1x_a10
19
24
  _DEFAULT_NUM_VCPUS = 30
@@ -67,7 +72,7 @@ def get_default_instance_type(
67
72
 
68
73
 
69
74
  def get_accelerators_from_instance_type(
70
- instance_type: str) -> Optional[Dict[str, int]]:
75
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
71
76
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
72
77
 
73
78