skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,9 @@ import yaml
9
9
 
10
10
  from sky.adaptors import kubernetes
11
11
  from sky.provision import common
12
+ from sky.provision.kubernetes import network_utils
12
13
  from sky.provision.kubernetes import utils as kubernetes_utils
14
+ from sky.utils import kubernetes_enums
13
15
 
14
16
  logger = logging.getLogger(__name__)
15
17
 
@@ -21,11 +23,16 @@ def bootstrap_instances(
21
23
  region: str, cluster_name: str,
22
24
  config: common.ProvisionConfig) -> common.ProvisionConfig:
23
25
  del region, cluster_name # unused
24
- namespace = kubernetes_utils.get_current_kube_config_context_namespace()
26
+ namespace = kubernetes_utils.get_namespace_from_config(
27
+ config.provider_config)
28
+ context = kubernetes_utils.get_context_from_config(config.provider_config)
25
29
 
26
- _configure_services(namespace, config.provider_config)
30
+ _configure_services(namespace, context, config.provider_config)
27
31
 
28
- config = _configure_ssh_jump(namespace, config)
32
+ networking_mode = network_utils.get_networking_mode(
33
+ config.provider_config.get('networking_mode'))
34
+ if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
35
+ config = _configure_ssh_jump(namespace, context, config)
29
36
 
30
37
  requested_service_account = config.node_config['spec']['serviceAccountName']
31
38
  if (requested_service_account ==
@@ -35,26 +42,47 @@ def bootstrap_instances(
35
42
  # necessary roles and role bindings.
36
43
  # If not, set up the roles and bindings for skypilot-service-account
37
44
  # here.
38
- _configure_autoscaler_service_account(namespace, config.provider_config)
45
+ _configure_autoscaler_service_account(namespace, context,
46
+ config.provider_config)
39
47
  _configure_autoscaler_role(namespace,
48
+ context,
40
49
  config.provider_config,
41
50
  role_field='autoscaler_role')
42
51
  _configure_autoscaler_role_binding(
43
52
  namespace,
53
+ context,
44
54
  config.provider_config,
45
55
  binding_field='autoscaler_role_binding')
46
- _configure_autoscaler_cluster_role(namespace, config.provider_config)
47
- _configure_autoscaler_cluster_role_binding(namespace,
56
+ _configure_autoscaler_cluster_role(namespace, context,
57
+ config.provider_config)
58
+ _configure_autoscaler_cluster_role_binding(namespace, context,
48
59
  config.provider_config)
60
+ # SkyPilot system namespace is required for FUSE mounting. Here we just
61
+ # create the namespace and set up the necessary permissions.
62
+ #
63
+ # We need to setup the namespace outside the
64
+ # if config.provider_config.get('fuse_device_required') block below
65
+ # because if we put in the if block, the following happens:
66
+ # 1. User launches job controller on Kubernetes with SERVICE_ACCOUNT. No
67
+ # namespace is created at this point since the controller does not
68
+ # require FUSE.
69
+ # 2. User submits a job requiring FUSE.
70
+ # 3. The namespace is created here, but since the job controller is
71
+ # using DEFAULT_SERVICE_ACCOUNT_NAME, it does not have the necessary
72
+ # permissions to create a role for itself to create the FUSE manager.
73
+ # 4. The job fails to launch.
74
+ _configure_skypilot_system_namespace(config.provider_config)
49
75
  if config.provider_config.get('port_mode', 'loadbalancer') == 'ingress':
50
76
  logger.info('Port mode is set to ingress, setting up ingress role '
51
77
  'and role binding.')
52
78
  try:
53
79
  _configure_autoscaler_role(namespace,
80
+ context,
54
81
  config.provider_config,
55
82
  role_field='autoscaler_ingress_role')
56
83
  _configure_autoscaler_role_binding(
57
84
  namespace,
85
+ context,
58
86
  config.provider_config,
59
87
  binding_field='autoscaler_ingress_role_binding')
60
88
  except kubernetes.api_exception() as e:
@@ -69,26 +97,8 @@ def bootstrap_instances(
69
97
  elif requested_service_account != 'default':
70
98
  logger.info(f'Using service account {requested_service_account!r}, '
71
99
  'skipping role and role binding setup.')
72
-
73
- # SkyPilot system namespace is required for FUSE mounting. Here we just
74
- # create the namespace and set up the necessary permissions.
75
- #
76
- # We need to setup the namespace outside the if block below because if
77
- # we put in the if block, the following happens:
78
- # 1. User launches job controller on Kubernetes with SERVICE_ACCOUNT. No
79
- # namespace is created at this point since the controller does not
80
- # require FUSE.
81
- # 2. User submits a job requiring FUSE.
82
- # 3. The namespace is created here, but since the job controller is using
83
- # SERVICE_ACCOUNT, it does not have the necessary permissions to create
84
- # a role for itself to create the FUSE device manager.
85
- # 4. The job fails to launch.
86
- _configure_skypilot_system_namespace(config.provider_config,
87
- requested_service_account)
88
-
89
100
  if config.provider_config.get('fuse_device_required', False):
90
101
  _configure_fuse_mounting(config.provider_config)
91
-
92
102
  return config
93
103
 
94
104
 
@@ -222,7 +232,7 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str,
222
232
  # Look for keys containing the resource_name. For example,
223
233
  # the key 'nvidia.com/gpu' contains the key 'gpu'.
224
234
  matching_keys = [key for key in resources if resource_name in key.lower()]
225
- if len(matching_keys) == 0:
235
+ if not matching_keys:
226
236
  return float('inf')
227
237
  if len(matching_keys) > 1:
228
238
  # Should have only one match -- mostly relevant for gpu.
@@ -237,7 +247,8 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str,
237
247
 
238
248
 
239
249
  def _configure_autoscaler_service_account(
240
- namespace: str, provider_config: Dict[str, Any]) -> None:
250
+ namespace: str, context: Optional[str],
251
+ provider_config: Dict[str, Any]) -> None:
241
252
  account_field = 'autoscaler_service_account'
242
253
  if account_field not in provider_config:
243
254
  logger.info('_configure_autoscaler_service_account: '
@@ -252,9 +263,9 @@ def _configure_autoscaler_service_account(
252
263
 
253
264
  name = account['metadata']['name']
254
265
  field_selector = f'metadata.name={name}'
255
- accounts = (kubernetes.core_api().list_namespaced_service_account(
266
+ accounts = (kubernetes.core_api(context).list_namespaced_service_account(
256
267
  namespace, field_selector=field_selector).items)
257
- if len(accounts) > 0:
268
+ if accounts:
258
269
  assert len(accounts) == 1
259
270
  # Nothing to check for equality and patch here,
260
271
  # since the service_account.metadata.name is the only important
@@ -265,12 +276,14 @@ def _configure_autoscaler_service_account(
265
276
 
266
277
  logger.info('_configure_autoscaler_service_account: '
267
278
  f'{not_found_msg(account_field, name)}')
268
- kubernetes.core_api().create_namespaced_service_account(namespace, account)
279
+ kubernetes.core_api(context).create_namespaced_service_account(
280
+ namespace, account)
269
281
  logger.info('_configure_autoscaler_service_account: '
270
282
  f'{created_msg(account_field, name)}')
271
283
 
272
284
 
273
- def _configure_autoscaler_role(namespace: str, provider_config: Dict[str, Any],
285
+ def _configure_autoscaler_role(namespace: str, context: Optional[str],
286
+ provider_config: Dict[str, Any],
274
287
  role_field: str) -> None:
275
288
  """ Reads the role from the provider config, creates if it does not exist.
276
289
 
@@ -293,9 +306,9 @@ def _configure_autoscaler_role(namespace: str, provider_config: Dict[str, Any],
293
306
 
294
307
  name = role['metadata']['name']
295
308
  field_selector = f'metadata.name={name}'
296
- roles = (kubernetes.auth_api().list_namespaced_role(
309
+ roles = (kubernetes.auth_api(context).list_namespaced_role(
297
310
  namespace, field_selector=field_selector).items)
298
- if len(roles) > 0:
311
+ if roles:
299
312
  assert len(roles) == 1
300
313
  existing_role = roles[0]
301
314
  # Convert to k8s object to compare
@@ -306,17 +319,19 @@ def _configure_autoscaler_role(namespace: str, provider_config: Dict[str, Any],
306
319
  return
307
320
  logger.info('_configure_autoscaler_role: '
308
321
  f'{updating_existing_msg(role_field, name)}')
309
- kubernetes.auth_api().patch_namespaced_role(name, namespace, role)
322
+ kubernetes.auth_api(context).patch_namespaced_role(
323
+ name, namespace, role)
310
324
  return
311
325
 
312
326
  logger.info('_configure_autoscaler_role: '
313
327
  f'{not_found_msg(role_field, name)}')
314
- kubernetes.auth_api().create_namespaced_role(namespace, role)
328
+ kubernetes.auth_api(context).create_namespaced_role(namespace, role)
315
329
  logger.info(f'_configure_autoscaler_role: {created_msg(role_field, name)}')
316
330
 
317
331
 
318
332
  def _configure_autoscaler_role_binding(
319
333
  namespace: str,
334
+ context: Optional[str],
320
335
  provider_config: Dict[str, Any],
321
336
  binding_field: str,
322
337
  override_name: Optional[str] = None,
@@ -357,9 +372,9 @@ def _configure_autoscaler_role_binding(
357
372
  name = binding['metadata']['name']
358
373
 
359
374
  field_selector = f'metadata.name={name}'
360
- role_bindings = (kubernetes.auth_api().list_namespaced_role_binding(
375
+ role_bindings = (kubernetes.auth_api(context).list_namespaced_role_binding(
361
376
  rb_namespace, field_selector=field_selector).items)
362
- if len(role_bindings) > 0:
377
+ if role_bindings:
363
378
  assert len(role_bindings) == 1
364
379
  existing_binding = role_bindings[0]
365
380
  new_rb = kubernetes_utils.dict_to_k8s_object(binding, 'V1RoleBinding')
@@ -370,18 +385,19 @@ def _configure_autoscaler_role_binding(
370
385
  return
371
386
  logger.info('_configure_autoscaler_role_binding: '
372
387
  f'{updating_existing_msg(binding_field, name)}')
373
- kubernetes.auth_api().patch_namespaced_role_binding(
388
+ kubernetes.auth_api(context).patch_namespaced_role_binding(
374
389
  name, rb_namespace, binding)
375
390
  return
376
391
 
377
392
  logger.info('_configure_autoscaler_role_binding: '
378
393
  f'{not_found_msg(binding_field, name)}')
379
- kubernetes.auth_api().create_namespaced_role_binding(rb_namespace, binding)
394
+ kubernetes.auth_api(context).create_namespaced_role_binding(
395
+ rb_namespace, binding)
380
396
  logger.info('_configure_autoscaler_role_binding: '
381
397
  f'{created_msg(binding_field, name)}')
382
398
 
383
399
 
384
- def _configure_autoscaler_cluster_role(namespace,
400
+ def _configure_autoscaler_cluster_role(namespace, context,
385
401
  provider_config: Dict[str, Any]) -> None:
386
402
  role_field = 'autoscaler_cluster_role'
387
403
  if role_field not in provider_config:
@@ -397,9 +413,9 @@ def _configure_autoscaler_cluster_role(namespace,
397
413
 
398
414
  name = role['metadata']['name']
399
415
  field_selector = f'metadata.name={name}'
400
- cluster_roles = (kubernetes.auth_api().list_cluster_role(
416
+ cluster_roles = (kubernetes.auth_api(context).list_cluster_role(
401
417
  field_selector=field_selector).items)
402
- if len(cluster_roles) > 0:
418
+ if cluster_roles:
403
419
  assert len(cluster_roles) == 1
404
420
  existing_cr = cluster_roles[0]
405
421
  new_cr = kubernetes_utils.dict_to_k8s_object(role, 'V1ClusterRole')
@@ -409,18 +425,18 @@ def _configure_autoscaler_cluster_role(namespace,
409
425
  return
410
426
  logger.info('_configure_autoscaler_cluster_role: '
411
427
  f'{updating_existing_msg(role_field, name)}')
412
- kubernetes.auth_api().patch_cluster_role(name, role)
428
+ kubernetes.auth_api(context).patch_cluster_role(name, role)
413
429
  return
414
430
 
415
431
  logger.info('_configure_autoscaler_cluster_role: '
416
432
  f'{not_found_msg(role_field, name)}')
417
- kubernetes.auth_api().create_cluster_role(role)
433
+ kubernetes.auth_api(context).create_cluster_role(role)
418
434
  logger.info(
419
435
  f'_configure_autoscaler_cluster_role: {created_msg(role_field, name)}')
420
436
 
421
437
 
422
438
  def _configure_autoscaler_cluster_role_binding(
423
- namespace, provider_config: Dict[str, Any]) -> None:
439
+ namespace, context, provider_config: Dict[str, Any]) -> None:
424
440
  binding_field = 'autoscaler_cluster_role_binding'
425
441
  if binding_field not in provider_config:
426
442
  logger.info('_configure_autoscaler_cluster_role_binding: '
@@ -442,9 +458,9 @@ def _configure_autoscaler_cluster_role_binding(
442
458
 
443
459
  name = binding['metadata']['name']
444
460
  field_selector = f'metadata.name={name}'
445
- cr_bindings = (kubernetes.auth_api().list_cluster_role_binding(
461
+ cr_bindings = (kubernetes.auth_api(context).list_cluster_role_binding(
446
462
  field_selector=field_selector).items)
447
- if len(cr_bindings) > 0:
463
+ if cr_bindings:
448
464
  assert len(cr_bindings) == 1
449
465
  existing_binding = cr_bindings[0]
450
466
  new_binding = kubernetes_utils.dict_to_k8s_object(
@@ -456,17 +472,17 @@ def _configure_autoscaler_cluster_role_binding(
456
472
  return
457
473
  logger.info('_configure_autoscaler_cluster_role_binding: '
458
474
  f'{updating_existing_msg(binding_field, name)}')
459
- kubernetes.auth_api().patch_cluster_role_binding(name, binding)
475
+ kubernetes.auth_api(context).patch_cluster_role_binding(name, binding)
460
476
  return
461
477
 
462
478
  logger.info('_configure_autoscaler_cluster_role_binding: '
463
479
  f'{not_found_msg(binding_field, name)}')
464
- kubernetes.auth_api().create_cluster_role_binding(binding)
480
+ kubernetes.auth_api(context).create_cluster_role_binding(binding)
465
481
  logger.info('_configure_autoscaler_cluster_role_binding: '
466
482
  f'{created_msg(binding_field, name)}')
467
483
 
468
484
 
469
- def _configure_ssh_jump(namespace, config: common.ProvisionConfig):
485
+ def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
470
486
  """Creates a SSH jump pod to connect to the cluster.
471
487
 
472
488
  Also updates config['auth']['ssh_proxy_command'] to use the newly created
@@ -497,13 +513,12 @@ def _configure_ssh_jump(namespace, config: common.ProvisionConfig):
497
513
  # service is missing, we should raise an error.
498
514
 
499
515
  kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
500
- ssh_key_secret_name, namespace)
516
+ ssh_key_secret_name, namespace, context)
501
517
  return config
502
518
 
503
519
 
504
520
  def _configure_skypilot_system_namespace(
505
- provider_config: Dict[str,
506
- Any], service_account: Optional[str]) -> None:
521
+ provider_config: Dict[str, Any]) -> None:
507
522
  """Creates the namespace for skypilot-system mounting if it does not exist.
508
523
 
509
524
  Also patches the SkyPilot service account to have the necessary permissions
@@ -511,36 +526,33 @@ def _configure_skypilot_system_namespace(
511
526
  """
512
527
  svc_account_namespace = provider_config['namespace']
513
528
  skypilot_system_namespace = provider_config['skypilot_system_namespace']
514
- kubernetes_utils.create_namespace(skypilot_system_namespace)
515
-
516
- # Setup permissions if using the default service account.
517
- # If the user has requested a different service account (via
518
- # remote_identity in ~/.sky/config.yaml), we assume they have already set
519
- # up the necessary roles and role bindings.
520
- if service_account == kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME:
521
- # Note - this must be run only after the service account has been
522
- # created in the cluster (in bootstrap_instances).
523
- # Create the role in the skypilot-system namespace if it does not exist.
524
- _configure_autoscaler_role(skypilot_system_namespace,
525
- provider_config,
526
- role_field='autoscaler_skypilot_system_role')
527
- # We must create a unique role binding per-namespace that SkyPilot is
528
- # running in, so we override the name with a unique name identifying
529
- # the namespace. This is required for multi-tenant setups where
530
- # different SkyPilot instances may be running in different namespaces.
531
- override_name = provider_config[
532
- 'autoscaler_skypilot_system_role_binding']['metadata'][
533
- 'name'] + '-' + svc_account_namespace
534
-
535
- # Create the role binding in the skypilot-system namespace, and have
536
- # the subject namespace be the namespace that the SkyPilot service
537
- # account is created in.
538
- _configure_autoscaler_role_binding(
539
- skypilot_system_namespace,
540
- provider_config,
541
- binding_field='autoscaler_skypilot_system_role_binding',
542
- override_name=override_name,
543
- override_subject_namespace=svc_account_namespace)
529
+ context = kubernetes_utils.get_context_from_config(provider_config)
530
+ kubernetes_utils.create_namespace(skypilot_system_namespace, context)
531
+
532
+ # Note - this must be run only after the service account has been
533
+ # created in the cluster (in bootstrap_instances).
534
+ # Create the role in the skypilot-system namespace if it does not exist.
535
+ _configure_autoscaler_role(skypilot_system_namespace,
536
+ context,
537
+ provider_config,
538
+ role_field='autoscaler_skypilot_system_role')
539
+ # We must create a unique role binding per-namespace that SkyPilot is
540
+ # running in, so we override the name with a unique name identifying
541
+ # the namespace. This is required for multi-tenant setups where
542
+ # different SkyPilot instances may be running in different namespaces.
543
+ override_name = provider_config['autoscaler_skypilot_system_role_binding'][
544
+ 'metadata']['name'] + '-' + svc_account_namespace
545
+
546
+ # Create the role binding in the skypilot-system namespace, and have
547
+ # the subject namespace be the namespace that the SkyPilot service
548
+ # account is created in.
549
+ _configure_autoscaler_role_binding(
550
+ skypilot_system_namespace,
551
+ context,
552
+ provider_config,
553
+ binding_field='autoscaler_skypilot_system_role_binding',
554
+ override_name=override_name,
555
+ override_subject_namespace=svc_account_namespace)
544
556
 
545
557
 
546
558
  def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
@@ -560,6 +572,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
560
572
  logger.info('_configure_fuse_mounting: Setting up FUSE device manager.')
561
573
 
562
574
  fuse_device_manager_namespace = provider_config['skypilot_system_namespace']
575
+ context = kubernetes_utils.get_context_from_config(provider_config)
563
576
 
564
577
  # Read the device manager YAMLs from the manifests directory
565
578
  root_dir = os.path.dirname(os.path.dirname(__file__))
@@ -572,7 +585,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
572
585
  config_map = yaml.safe_load(file)
573
586
  kubernetes_utils.merge_custom_metadata(config_map['metadata'])
574
587
  try:
575
- kubernetes.core_api().create_namespaced_config_map(
588
+ kubernetes.core_api(context).create_namespaced_config_map(
576
589
  fuse_device_manager_namespace, config_map)
577
590
  except kubernetes.api_exception() as e:
578
591
  if e.status == 409:
@@ -592,7 +605,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
592
605
  daemonset = yaml.safe_load(file)
593
606
  kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
594
607
  try:
595
- kubernetes.apps_api().create_namespaced_daemon_set(
608
+ kubernetes.apps_api(context).create_namespaced_daemon_set(
596
609
  fuse_device_manager_namespace, daemonset)
597
610
  except kubernetes.api_exception() as e:
598
611
  if e.status == 409:
@@ -608,8 +621,8 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
608
621
  f'in namespace {fuse_device_manager_namespace!r}')
609
622
 
610
623
 
611
- def _configure_services(namespace: str, provider_config: Dict[str,
612
- Any]) -> None:
624
+ def _configure_services(namespace: str, context: Optional[str],
625
+ provider_config: Dict[str, Any]) -> None:
613
626
  service_field = 'services'
614
627
  if service_field not in provider_config:
615
628
  logger.info(f'_configure_services: {not_provided_msg(service_field)}')
@@ -624,9 +637,9 @@ def _configure_services(namespace: str, provider_config: Dict[str,
624
637
 
625
638
  name = service['metadata']['name']
626
639
  field_selector = f'metadata.name={name}'
627
- services = (kubernetes.core_api().list_namespaced_service(
640
+ services = (kubernetes.core_api(context).list_namespaced_service(
628
641
  namespace, field_selector=field_selector).items)
629
- if len(services) > 0:
642
+ if services:
630
643
  assert len(services) == 1
631
644
  existing_service = services[0]
632
645
  # Convert to k8s object to compare
@@ -638,12 +651,13 @@ def _configure_services(namespace: str, provider_config: Dict[str,
638
651
  else:
639
652
  logger.info('_configure_services: '
640
653
  f'{updating_existing_msg("service", name)}')
641
- kubernetes.core_api().patch_namespaced_service(
654
+ kubernetes.core_api(context).patch_namespaced_service(
642
655
  name, namespace, service)
643
656
  else:
644
657
  logger.info(
645
658
  f'_configure_services: {not_found_msg("service", name)}')
646
- kubernetes.core_api().create_namespaced_service(namespace, service)
659
+ kubernetes.core_api(context).create_namespaced_service(
660
+ namespace, service)
647
661
  logger.info(f'_configure_services: {created_msg("service", name)}')
648
662
 
649
663
 
@@ -0,0 +1,8 @@
1
+ """Constants for Kubernetes provisioning."""
2
+
3
+ NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
4
+ 'nvidia.com/gpu resource is available on the nodes and '
5
+ 'the node labels for identifying GPUs '
6
+ '(e.g., skypilot.co/accelerator) are setup correctly. ')
7
+
8
+ KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR = 'SKYPILOT_IN_CLUSTER_NAMESPACE'