skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -26,6 +26,9 @@ spec:
26
26
  hostname: smarter-device-management
27
27
  hostNetwork: true
28
28
  dnsPolicy: ClusterFirstWithHostNet
29
+ tolerations:
30
+ - effect: NoSchedule
31
+ operator: Exists
29
32
  containers:
30
33
  - name: smarter-device-manager
31
34
  image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/smarter-device-manager:v1.1.2
@@ -1,6 +1,7 @@
1
1
  """Kubernetes network provisioning."""
2
2
  from typing import Any, Dict, List, Optional
3
3
 
4
+ from sky import sky_logging
4
5
  from sky.adaptors import kubernetes
5
6
  from sky.provision import common
6
7
  from sky.provision.kubernetes import network_utils
@@ -8,6 +9,8 @@ from sky.provision.kubernetes import utils as kubernetes_utils
8
9
  from sky.utils import kubernetes_enums
9
10
  from sky.utils.resources_utils import port_ranges_to_set
10
11
 
12
+ logger = sky_logging.init_logger(__name__)
13
+
11
14
  _PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}'
12
15
  _LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb'
13
16
 
@@ -55,7 +58,8 @@ def _open_ports_using_loadbalancer(
55
58
  kubernetes_utils.merge_custom_metadata(content['service_spec']['metadata'])
56
59
 
57
60
  network_utils.create_or_replace_namespaced_service(
58
- namespace=provider_config.get('namespace', 'default'),
61
+ namespace=kubernetes_utils.get_namespace_from_config(provider_config),
62
+ context=kubernetes_utils.get_context_from_config(provider_config),
59
63
  service_name=service_name,
60
64
  service_spec=content['service_spec'])
61
65
 
@@ -65,8 +69,9 @@ def _open_ports_using_ingress(
65
69
  ports: List[int],
66
70
  provider_config: Dict[str, Any],
67
71
  ) -> None:
72
+ context = kubernetes_utils.get_context_from_config(provider_config)
68
73
  # Check if an ingress controller exists
69
- if not network_utils.ingress_controller_exists():
74
+ if not network_utils.ingress_controller_exists(context):
70
75
  raise Exception(
71
76
  'Ingress controller not found. '
72
77
  'Install Nginx ingress controller first: '
@@ -74,13 +79,14 @@ def _open_ports_using_ingress(
74
79
  )
75
80
 
76
81
  # Prepare service names, ports, for template rendering
77
- service_details = [(f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
78
- _PATH_PREFIX.format(
79
- cluster_name_on_cloud=cluster_name_on_cloud,
80
- port=port,
81
- namespace=kubernetes_utils.
82
- get_current_kube_config_context_namespace()).rstrip(
83
- '/').lstrip('/')) for port in ports]
82
+ service_details = [
83
+ (f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
84
+ _PATH_PREFIX.format(
85
+ cluster_name_on_cloud=cluster_name_on_cloud,
86
+ port=port,
87
+ namespace=kubernetes_utils.get_kube_config_context_namespace(
88
+ context)).rstrip('/').lstrip('/')) for port in ports
89
+ ]
84
90
 
85
91
  # Generate ingress and services specs
86
92
  # We batch ingress rule creation because each rule triggers a hot reload of
@@ -105,7 +111,9 @@ def _open_ports_using_ingress(
105
111
  # Update metadata from config
106
112
  kubernetes_utils.merge_custom_metadata(service_spec['metadata'])
107
113
  network_utils.create_or_replace_namespaced_service(
108
- namespace=provider_config.get('namespace', 'default'),
114
+ namespace=kubernetes_utils.get_namespace_from_config(
115
+ provider_config),
116
+ context=kubernetes_utils.get_context_from_config(provider_config),
109
117
  service_name=service_name,
110
118
  service_spec=service_spec,
111
119
  )
@@ -113,7 +121,8 @@ def _open_ports_using_ingress(
113
121
  kubernetes_utils.merge_custom_metadata(content['ingress_spec']['metadata'])
114
122
  # Create or update the single ingress for all services
115
123
  network_utils.create_or_replace_namespaced_ingress(
116
- namespace=provider_config.get('namespace', 'default'),
124
+ namespace=kubernetes_utils.get_namespace_from_config(provider_config),
125
+ context=kubernetes_utils.get_context_from_config(provider_config),
117
126
  ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
118
127
  ingress_spec=content['ingress_spec'],
119
128
  )
@@ -163,14 +172,16 @@ def _cleanup_ports_for_ingress(
163
172
  for port in ports:
164
173
  service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
165
174
  network_utils.delete_namespaced_service(
166
- namespace=provider_config.get('namespace', 'default'),
175
+ namespace=provider_config.get('namespace',
176
+ kubernetes_utils.DEFAULT_NAMESPACE),
167
177
  service_name=service_name,
168
178
  )
169
179
 
170
180
  # Delete the single ingress used for all ports
171
181
  ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
172
182
  network_utils.delete_namespaced_ingress(
173
- namespace=provider_config.get('namespace', 'default'),
183
+ namespace=kubernetes_utils.get_namespace_from_config(provider_config),
184
+ context=kubernetes_utils.get_context_from_config(provider_config),
174
185
  ingress_name=ingress_name,
175
186
  )
176
187
 
@@ -199,11 +210,13 @@ def query_ports(
199
210
  return _query_ports_for_ingress(
200
211
  cluster_name_on_cloud=cluster_name_on_cloud,
201
212
  ports=ports,
213
+ provider_config=provider_config,
202
214
  )
203
215
  elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP:
204
216
  return _query_ports_for_podip(
205
217
  cluster_name_on_cloud=cluster_name_on_cloud,
206
218
  ports=ports,
219
+ provider_config=provider_config,
207
220
  )
208
221
  else:
209
222
  return {}
@@ -218,12 +231,23 @@ def _query_ports_for_loadbalancer(
218
231
  ports: List[int],
219
232
  provider_config: Dict[str, Any],
220
233
  ) -> Dict[int, List[common.Endpoint]]:
234
+ logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
221
235
  result: Dict[int, List[common.Endpoint]] = {}
222
236
  service_name = _LOADBALANCER_SERVICE_NAME.format(
223
237
  cluster_name_on_cloud=cluster_name_on_cloud)
238
+ context = provider_config.get(
239
+ 'context', kubernetes_utils.get_current_kube_config_context_name())
240
+ namespace = provider_config.get(
241
+ 'namespace',
242
+ kubernetes_utils.get_kube_config_context_namespace(context))
224
243
  external_ip = network_utils.get_loadbalancer_ip(
225
- namespace=provider_config.get('namespace', 'default'),
226
- service_name=service_name)
244
+ context=context,
245
+ namespace=namespace,
246
+ service_name=service_name,
247
+ # Timeout is set so that we can retry the query when the
248
+ # cluster is firstly created and the load balancer is not ready yet.
249
+ timeout=60,
250
+ )
227
251
 
228
252
  if external_ip is None:
229
253
  return {}
@@ -237,19 +261,24 @@ def _query_ports_for_loadbalancer(
237
261
  def _query_ports_for_ingress(
238
262
  cluster_name_on_cloud: str,
239
263
  ports: List[int],
264
+ provider_config: Dict[str, Any],
240
265
  ) -> Dict[int, List[common.Endpoint]]:
241
- ingress_details = network_utils.get_ingress_external_ip_and_ports()
266
+ context = provider_config.get(
267
+ 'context', kubernetes_utils.get_current_kube_config_context_name())
268
+ ingress_details = network_utils.get_ingress_external_ip_and_ports(context)
242
269
  external_ip, external_ports = ingress_details
243
270
  if external_ip is None:
244
271
  return {}
245
272
 
273
+ namespace = provider_config.get(
274
+ 'namespace',
275
+ kubernetes_utils.get_kube_config_context_namespace(context))
246
276
  result: Dict[int, List[common.Endpoint]] = {}
247
277
  for port in ports:
248
278
  path_prefix = _PATH_PREFIX.format(
249
279
  cluster_name_on_cloud=cluster_name_on_cloud,
250
280
  port=port,
251
- namespace=kubernetes_utils.
252
- get_current_kube_config_context_namespace())
281
+ namespace=namespace)
253
282
 
254
283
  http_port, https_port = external_ports \
255
284
  if external_ports is not None else (None, None)
@@ -268,10 +297,15 @@ def _query_ports_for_ingress(
268
297
  def _query_ports_for_podip(
269
298
  cluster_name_on_cloud: str,
270
299
  ports: List[int],
300
+ provider_config: Dict[str, Any],
271
301
  ) -> Dict[int, List[common.Endpoint]]:
272
- namespace = kubernetes_utils.get_current_kube_config_context_namespace()
302
+ context = provider_config.get(
303
+ 'context', kubernetes_utils.get_current_kube_config_context_name())
304
+ namespace = provider_config.get(
305
+ 'namespace',
306
+ kubernetes_utils.get_kube_config_context_namespace(context))
273
307
  pod_name = kubernetes_utils.get_head_pod_name(cluster_name_on_cloud)
274
- pod_ip = network_utils.get_pod_ip(namespace, pod_name)
308
+ pod_ip = network_utils.get_pod_ip(context, namespace, pod_name)
275
309
 
276
310
  result: Dict[int, List[common.Endpoint]] = {}
277
311
  if pod_ip is None:
@@ -1,5 +1,6 @@
1
1
  """Kubernetes network provisioning utils."""
2
2
  import os
3
+ import time
3
4
  from typing import Dict, List, Optional, Tuple, Union
4
5
 
5
6
  import jinja2
@@ -7,12 +8,15 @@ import yaml
7
8
 
8
9
  import sky
9
10
  from sky import exceptions
11
+ from sky import sky_logging
10
12
  from sky import skypilot_config
11
13
  from sky.adaptors import kubernetes
12
14
  from sky.provision.kubernetes import utils as kubernetes_utils
13
15
  from sky.utils import kubernetes_enums
14
16
  from sky.utils import ux_utils
15
17
 
18
+ logger = sky_logging.init_logger(__name__)
19
+
16
20
  _INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2'
17
21
  _LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2'
18
22
 
@@ -43,6 +47,23 @@ def get_port_mode(
43
47
  return port_mode
44
48
 
45
49
 
50
+ def get_networking_mode(
51
+ mode_str: Optional[str] = None
52
+ ) -> kubernetes_enums.KubernetesNetworkingMode:
53
+ """Get the networking mode from the provider config."""
54
+ mode_str = mode_str or skypilot_config.get_nested(
55
+ ('kubernetes', 'networking_mode'),
56
+ kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
57
+ try:
58
+ networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
59
+ mode_str)
60
+ except ValueError as e:
61
+ with ux_utils.print_exception_no_traceback():
62
+ raise ValueError(str(e) +
63
+ ' Please check: ~/.sky/config.yaml.') from None
64
+ return networking_mode
65
+
66
+
46
67
  def fill_loadbalancer_template(namespace: str, service_name: str,
47
68
  ports: List[int], selector_key: str,
48
69
  selector_value: str) -> Dict:
@@ -54,6 +75,10 @@ def fill_loadbalancer_template(namespace: str, service_name: str,
54
75
 
55
76
  with open(template_path, 'r', encoding='utf-8') as fin:
56
77
  template = fin.read()
78
+ annotations = skypilot_config.get_nested(
79
+ ('kubernetes', 'custom_metadata', 'annotations'), {})
80
+ labels = skypilot_config.get_nested(
81
+ ('kubernetes', 'custom_metadata', 'labels'), {})
57
82
  j2_template = jinja2.Template(template)
58
83
  cont = j2_template.render(
59
84
  namespace=namespace,
@@ -61,6 +86,8 @@ def fill_loadbalancer_template(namespace: str, service_name: str,
61
86
  ports=ports,
62
87
  selector_key=selector_key,
63
88
  selector_value=selector_value,
89
+ annotations=annotations,
90
+ labels=labels,
64
91
  )
65
92
  content = yaml.safe_load(cont)
66
93
  return content
@@ -77,6 +104,10 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
77
104
  f'Template "{_INGRESS_TEMPLATE_NAME}" does not exist.')
78
105
  with open(template_path, 'r', encoding='utf-8') as fin:
79
106
  template = fin.read()
107
+ annotations = skypilot_config.get_nested(
108
+ ('kubernetes', 'custom_metadata', 'annotations'), {})
109
+ labels = skypilot_config.get_nested(
110
+ ('kubernetes', 'custom_metadata', 'labels'), {})
80
111
  j2_template = jinja2.Template(template)
81
112
  cont = j2_template.render(
82
113
  namespace=namespace,
@@ -88,6 +119,8 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
88
119
  ingress_name=ingress_name,
89
120
  selector_key=selector_key,
90
121
  selector_value=selector_value,
122
+ annotations=annotations,
123
+ labels=labels,
91
124
  )
92
125
  content = yaml.safe_load(cont)
93
126
 
@@ -99,10 +132,10 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
99
132
 
100
133
 
101
134
  def create_or_replace_namespaced_ingress(
102
- namespace: str, ingress_name: str,
135
+ namespace: str, context: Optional[str], ingress_name: str,
103
136
  ingress_spec: Dict[str, Union[str, int]]) -> None:
104
137
  """Creates an ingress resource for the specified service."""
105
- networking_api = kubernetes.networking_api()
138
+ networking_api = kubernetes.networking_api(context)
106
139
 
107
140
  try:
108
141
  networking_api.read_namespaced_ingress(
@@ -123,9 +156,10 @@ def create_or_replace_namespaced_ingress(
123
156
  _request_timeout=kubernetes.API_TIMEOUT)
124
157
 
125
158
 
126
- def delete_namespaced_ingress(namespace: str, ingress_name: str) -> None:
159
+ def delete_namespaced_ingress(namespace: str, context: Optional[str],
160
+ ingress_name: str) -> None:
127
161
  """Deletes an ingress resource."""
128
- networking_api = kubernetes.networking_api()
162
+ networking_api = kubernetes.networking_api(context)
129
163
  try:
130
164
  networking_api.delete_namespaced_ingress(
131
165
  ingress_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
@@ -137,10 +171,10 @@ def delete_namespaced_ingress(namespace: str, ingress_name: str) -> None:
137
171
 
138
172
 
139
173
  def create_or_replace_namespaced_service(
140
- namespace: str, service_name: str,
174
+ namespace: str, context: Optional[str], service_name: str,
141
175
  service_spec: Dict[str, Union[str, int]]) -> None:
142
176
  """Creates a service resource for the specified service."""
143
- core_api = kubernetes.core_api()
177
+ core_api = kubernetes.core_api(context)
144
178
 
145
179
  try:
146
180
  core_api.read_namespaced_service(
@@ -174,9 +208,10 @@ def delete_namespaced_service(namespace: str, service_name: str) -> None:
174
208
  raise e
175
209
 
176
210
 
177
- def ingress_controller_exists(ingress_class_name: str = 'nginx') -> bool:
211
+ def ingress_controller_exists(context: Optional[str],
212
+ ingress_class_name: str = 'nginx') -> bool:
178
213
  """Checks if an ingress controller exists in the cluster."""
179
- networking_api = kubernetes.networking_api()
214
+ networking_api = kubernetes.networking_api(context)
180
215
  ingress_classes = networking_api.list_ingress_class(
181
216
  _request_timeout=kubernetes.API_TIMEOUT).items
182
217
  return any(
@@ -185,16 +220,17 @@ def ingress_controller_exists(ingress_class_name: str = 'nginx') -> bool:
185
220
 
186
221
 
187
222
  def get_ingress_external_ip_and_ports(
223
+ context: Optional[str],
188
224
  namespace: str = 'ingress-nginx'
189
225
  ) -> Tuple[Optional[str], Optional[Tuple[int, int]]]:
190
226
  """Returns external ip and ports for the ingress controller."""
191
- core_api = kubernetes.core_api()
227
+ core_api = kubernetes.core_api(context)
192
228
  ingress_services = [
193
229
  item for item in core_api.list_namespaced_service(
194
230
  namespace, _request_timeout=kubernetes.API_TIMEOUT).items
195
231
  if item.metadata.name == 'ingress-nginx-controller'
196
232
  ]
197
- if len(ingress_services) == 0:
233
+ if not ingress_services:
198
234
  return (None, None)
199
235
 
200
236
  ingress_service = ingress_services[0]
@@ -222,23 +258,36 @@ def get_ingress_external_ip_and_ports(
222
258
  return external_ip, None
223
259
 
224
260
 
225
- def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]:
261
+ def get_loadbalancer_ip(context: Optional[str],
262
+ namespace: str,
263
+ service_name: str,
264
+ timeout: int = 0) -> Optional[str]:
226
265
  """Returns the IP address of the load balancer."""
227
- core_api = kubernetes.core_api()
228
- service = core_api.read_namespaced_service(
229
- service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
266
+ core_api = kubernetes.core_api(context)
230
267
 
231
- if service.status.load_balancer.ingress is None:
232
- return None
268
+ ip = None
233
269
 
234
- ip = service.status.load_balancer.ingress[
235
- 0].ip or service.status.load_balancer.ingress[0].hostname
236
- return ip if ip is not None else None
270
+ start_time = time.time()
271
+ retry_cnt = 0
272
+ while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout):
273
+ service = core_api.read_namespaced_service(
274
+ service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
275
+ if service.status.load_balancer.ingress is not None:
276
+ ip = (service.status.load_balancer.ingress[0].ip or
277
+ service.status.load_balancer.ingress[0].hostname)
278
+ if ip is None:
279
+ retry_cnt += 1
280
+ if retry_cnt % 5 == 0:
281
+ logger.debug('Waiting for load balancer IP to be assigned'
282
+ '...')
283
+ time.sleep(1)
284
+ return ip
237
285
 
238
286
 
239
- def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]:
287
+ def get_pod_ip(context: Optional[str], namespace: str,
288
+ pod_name: str) -> Optional[str]:
240
289
  """Returns the IP address of the pod."""
241
- core_api = kubernetes.core_api()
290
+ core_api = kubernetes.core_api(context)
242
291
  pod = core_api.read_namespaced_pod(pod_name,
243
292
  namespace,
244
293
  _request_timeout=kubernetes.API_TIMEOUT)