skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,12 @@
1
1
  """Kubernetes adaptors"""
2
-
3
- # pylint: disable=import-outside-toplevel
4
-
2
+ import logging
5
3
  import os
4
+ from typing import Any, Callable, Optional, Set
6
5
 
7
6
  from sky.adaptors import common
8
- from sky.utils import env_options
7
+ from sky.sky_logging import set_logging_level
8
+ from sky.utils import annotations
9
+ from sky.utils import common_utils
9
10
  from sky.utils import ux_utils
10
11
 
11
12
  _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
@@ -15,117 +16,140 @@ kubernetes = common.LazyImport('kubernetes',
15
16
  urllib3 = common.LazyImport('urllib3',
16
17
  import_error_message=_IMPORT_ERROR_MESSAGE)
17
18
 
18
- _configured = False
19
- _core_api = None
20
- _auth_api = None
21
- _networking_api = None
22
- _custom_objects_api = None
23
- _node_api = None
24
- _apps_api = None
25
- _api_client = None
26
-
27
19
  # Timeout to use for API calls
28
20
  API_TIMEOUT = 5
29
21
 
22
+ DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
23
+ # The name for the environment variable that stores the in-cluster context name
24
+ # for Kubernetes clusters. This is used to associate a name with the current
25
+ # context when running with in-cluster auth. If not set, the context name is
26
+ # set to DEFAULT_IN_CLUSTER_REGION.
27
+ IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
28
+
29
+
30
+ def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
31
+ for attr_name in dir(obj):
32
+ attr = getattr(obj, attr_name)
33
+ # Skip methods starting with '__' since they are invoked through one
34
+ # of the main methods, which are already decorated.
35
+ if callable(attr) and not attr_name.startswith('__'):
36
+ decorated_types: Set[str] = getattr(attr, '_sky_decorator_types',
37
+ set())
38
+ if decoration_type not in decorated_types:
39
+ decorated_attr = decorator(attr)
40
+ decorated_attr._sky_decorator_types = ( # pylint: disable=protected-access
41
+ decorated_types | {decoration_type})
42
+ setattr(obj, attr_name, decorated_attr)
43
+ return obj
44
+
45
+
46
+ def _api_logging_decorator(logger: str, level: int):
47
+ """Decorator to set logging level for API calls.
48
+
49
+ This is used to suppress the verbose logging from urllib3 when calls to the
50
+ Kubernetes API timeout.
51
+ """
52
+
53
+ def decorated_api(api):
54
+
55
+ def wrapped(*args, **kwargs):
56
+ obj = api(*args, **kwargs)
57
+ _decorate_methods(obj, set_logging_level(logger, level), 'api_log')
58
+ return obj
59
+
60
+ return wrapped
61
+
62
+ return decorated_api
30
63
 
31
- def _load_config():
32
- global _configured
33
- if _configured:
34
- return
35
- try:
36
- # Load in-cluster config if running in a pod
37
- # Kubernetes set environment variables for service discovery do not
38
- # show up in SkyPilot tasks. For now, we work around by using
39
- # DNS name instead of environment variables.
40
- # See issue: https://github.com/skypilot-org/skypilot/issues/2287
41
- os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
42
- os.environ['KUBERNETES_SERVICE_PORT'] = '443'
43
- kubernetes.config.load_incluster_config()
44
- except kubernetes.config.config_exception.ConfigException:
64
+
65
+ def _load_config(context: Optional[str] = None):
66
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
67
+
68
+ def _load_config_from_kubeconfig(context: Optional[str] = None):
45
69
  try:
46
- kubernetes.config.load_kube_config()
70
+ kubernetes.config.load_kube_config(context=context)
47
71
  except kubernetes.config.config_exception.ConfigException as e:
48
- suffix = ''
49
- if env_options.Options.SHOW_DEBUG_INFO.get():
50
- suffix += f' Error: {str(e)}'
72
+ suffix = common_utils.format_exception(e, use_bracket=True)
51
73
  # Check if exception was due to no current-context
52
74
  if 'Expected key current-context' in str(e):
53
- err_str = ('Failed to load Kubernetes configuration. '
54
- 'Kubeconfig does not contain any valid context(s).'
55
- f'{suffix}\n'
56
- ' If you were running a local Kubernetes '
57
- 'cluster, run `sky local up` to start the cluster.')
75
+ err_str = (
76
+ f'Failed to load Kubernetes configuration for {context!r}. '
77
+ 'Kubeconfig does not contain any valid context(s).'
78
+ f'\n{suffix}\n'
79
+ ' If you were running a local Kubernetes '
80
+ 'cluster, run `sky local up` to start the cluster.')
58
81
  else:
59
- err_str = ('Failed to load Kubernetes configuration. '
60
- 'Please check if your kubeconfig file exists at '
61
- f'~/.kube/config and is valid.{suffix}')
82
+ err_str = (
83
+ f'Failed to load Kubernetes configuration for {context!r}. '
84
+ 'Please check if your kubeconfig file exists at '
85
+ f'~/.kube/config and is valid.\n{suffix}')
62
86
  err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
63
87
  with ux_utils.print_exception_no_traceback():
64
88
  raise ValueError(err_str) from None
65
- _configured = True
66
-
67
-
68
- def core_api():
69
- global _core_api
70
- if _core_api is None:
71
- _load_config()
72
- _core_api = kubernetes.client.CoreV1Api()
73
-
74
- return _core_api
75
-
76
-
77
- def auth_api():
78
- global _auth_api
79
- if _auth_api is None:
80
- _load_config()
81
- _auth_api = kubernetes.client.RbacAuthorizationV1Api()
82
-
83
- return _auth_api
84
89
 
90
+ if context == in_cluster_context_name() or context is None:
91
+ try:
92
+ # Load in-cluster config if running in a pod and context is None.
93
+ # Kubernetes set environment variables for service discovery do not
94
+ # show up in SkyPilot tasks. For now, we work around by using
95
+ # DNS name instead of environment variables.
96
+ # See issue: https://github.com/skypilot-org/skypilot/issues/2287
97
+ os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
98
+ os.environ['KUBERNETES_SERVICE_PORT'] = '443'
99
+ kubernetes.config.load_incluster_config()
100
+ except kubernetes.config.config_exception.ConfigException:
101
+ _load_config_from_kubeconfig()
102
+ else:
103
+ _load_config_from_kubeconfig(context)
85
104
 
86
- def networking_api():
87
- global _networking_api
88
- if _networking_api is None:
89
- _load_config()
90
- _networking_api = kubernetes.client.NetworkingV1Api()
91
105
 
92
- return _networking_api
106
+ @_api_logging_decorator('urllib3', logging.ERROR)
107
+ @annotations.lru_cache(scope='request')
108
+ def core_api(context: Optional[str] = None):
109
+ _load_config(context)
110
+ return kubernetes.client.CoreV1Api()
93
111
 
94
112
 
95
- def custom_objects_api():
96
- global _custom_objects_api
97
- if _custom_objects_api is None:
98
- _load_config()
99
- _custom_objects_api = kubernetes.client.CustomObjectsApi()
113
+ @_api_logging_decorator('urllib3', logging.ERROR)
114
+ @annotations.lru_cache(scope='request')
115
+ def auth_api(context: Optional[str] = None):
116
+ _load_config(context)
117
+ return kubernetes.client.RbacAuthorizationV1Api()
100
118
 
101
- return _custom_objects_api
102
119
 
120
+ @_api_logging_decorator('urllib3', logging.ERROR)
121
+ @annotations.lru_cache(scope='request')
122
+ def networking_api(context: Optional[str] = None):
123
+ _load_config(context)
124
+ return kubernetes.client.NetworkingV1Api()
103
125
 
104
- def node_api():
105
- global _node_api
106
- if _node_api is None:
107
- _load_config()
108
- _node_api = kubernetes.client.NodeV1Api()
109
126
 
110
- return _node_api
127
+ @_api_logging_decorator('urllib3', logging.ERROR)
128
+ @annotations.lru_cache(scope='request')
129
+ def custom_objects_api(context: Optional[str] = None):
130
+ _load_config(context)
131
+ return kubernetes.client.CustomObjectsApi()
111
132
 
112
133
 
113
- def apps_api():
114
- global _apps_api
115
- if _apps_api is None:
116
- _load_config()
117
- _apps_api = kubernetes.client.AppsV1Api()
134
+ @_api_logging_decorator('urllib3', logging.ERROR)
135
+ @annotations.lru_cache(scope='global')
136
+ def node_api(context: Optional[str] = None):
137
+ _load_config(context)
138
+ return kubernetes.client.NodeV1Api()
118
139
 
119
- return _apps_api
120
140
 
141
+ @_api_logging_decorator('urllib3', logging.ERROR)
142
+ @annotations.lru_cache(scope='request')
143
+ def apps_api(context: Optional[str] = None):
144
+ _load_config(context)
145
+ return kubernetes.client.AppsV1Api()
121
146
 
122
- def api_client():
123
- global _api_client
124
- if _api_client is None:
125
- _load_config()
126
- _api_client = kubernetes.client.ApiClient()
127
147
 
128
- return _api_client
148
+ @_api_logging_decorator('urllib3', logging.ERROR)
149
+ @annotations.lru_cache(scope='request')
150
+ def api_client(context: Optional[str] = None):
151
+ _load_config(context)
152
+ return kubernetes.client.ApiClient()
129
153
 
130
154
 
131
155
  def api_exception():
@@ -142,3 +166,13 @@ def max_retry_error():
142
166
 
143
167
  def stream():
144
168
  return kubernetes.stream.stream
169
+
170
+
171
+ def in_cluster_context_name() -> Optional[str]:
172
+ """Returns the name of the in-cluster context from the environment.
173
+
174
+ If the environment variable is not set, returns the default in-cluster
175
+ context name.
176
+ """
177
+ return (os.environ.get(IN_CLUSTER_CONTEXT_NAME_ENV_VAR) or
178
+ DEFAULT_IN_CLUSTER_REGION)
sky/adaptors/nebius.py ADDED
@@ -0,0 +1,100 @@
1
+ """Nebius cloud adaptor."""
2
+ import os
3
+
4
+ from sky.adaptors import common
5
+
6
+ NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
7
+ NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
8
+ NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
9
+ NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
10
+ NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
11
+ NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
12
+
13
+ MAX_RETRIES_TO_DISK_CREATE = 120
14
+ MAX_RETRIES_TO_INSTANCE_STOP = 120
15
+ MAX_RETRIES_TO_INSTANCE_START = 120
16
+ MAX_RETRIES_TO_INSTANCE_READY = 240
17
+
18
+ MAX_RETRIES_TO_DISK_DELETE = 120
19
+ MAX_RETRIES_TO_INSTANCE_WAIT = 120 # Maximum number of retries
20
+
21
+ POLL_INTERVAL = 5
22
+
23
+ _iam_token = None
24
+ _tenant_id = None
25
+ _project_id = None
26
+
27
+ nebius = common.LazyImport(
28
+ 'nebius',
29
+ import_error_message='Failed to import dependencies for Nebius AI Cloud. '
30
+ 'Try running: pip install "skypilot[nebius]"',
31
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
32
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'}))
33
+
34
+
35
+ def request_error():
36
+ return nebius.aio.service_error.RequestError
37
+
38
+
39
+ def compute():
40
+ # pylint: disable=import-outside-toplevel
41
+ from nebius.api.nebius.compute import v1 as compute_v1
42
+ return compute_v1
43
+
44
+
45
+ def iam():
46
+ # pylint: disable=import-outside-toplevel
47
+ from nebius.api.nebius.iam import v1 as iam_v1
48
+ return iam_v1
49
+
50
+
51
+ def nebius_common():
52
+ # pylint: disable=import-outside-toplevel
53
+ from nebius.api.nebius.common import v1 as common_v1
54
+ return common_v1
55
+
56
+
57
+ def vpc():
58
+ # pylint: disable=import-outside-toplevel
59
+ from nebius.api.nebius.vpc import v1 as vpc_v1
60
+ return vpc_v1
61
+
62
+
63
+ def get_iam_token():
64
+ global _iam_token
65
+ if _iam_token is None:
66
+ try:
67
+ with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
68
+ encoding='utf-8') as file:
69
+ _iam_token = file.read().strip()
70
+ except FileNotFoundError:
71
+ return None
72
+ return _iam_token
73
+
74
+
75
+ def get_project_id():
76
+ global _project_id
77
+ if _project_id is None:
78
+ try:
79
+ with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
80
+ encoding='utf-8') as file:
81
+ _project_id = file.read().strip()
82
+ except FileNotFoundError:
83
+ return None
84
+ return _project_id
85
+
86
+
87
+ def get_tenant_id():
88
+ global _tenant_id
89
+ if _tenant_id is None:
90
+ try:
91
+ with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
92
+ encoding='utf-8') as file:
93
+ _tenant_id = file.read().strip()
94
+ except FileNotFoundError:
95
+ return None
96
+ return _tenant_id
97
+
98
+
99
+ def sdk():
100
+ return nebius.sdk.SDK(credentials=get_iam_token())
sky/adaptors/oci.py CHANGED
@@ -1,8 +1,17 @@
1
1
  """Oracle OCI cloud adaptor"""
2
2
 
3
+ import functools
4
+ import logging
3
5
  import os
4
6
 
5
7
  from sky.adaptors import common
8
+ from sky.clouds.utils import oci_utils
9
+
10
+ # Suppress OCI circuit breaker logging before lazy import, because
11
+ # oci modules prints additional message during imports, i.e., the
12
+ # set_logger in the LazyImport called after imports will not take
13
+ # effect.
14
+ logging.getLogger('oci.circuit_breaker').setLevel(logging.WARNING)
6
15
 
7
16
  CONFIG_PATH = '~/.oci/config'
8
17
  ENV_VAR_OCI_CONFIG = 'OCI_CONFIG'
@@ -23,10 +32,16 @@ def get_config_file() -> str:
23
32
 
24
33
  def get_oci_config(region=None, profile='DEFAULT'):
25
34
  conf_file_path = get_config_file()
35
+ if not profile or profile == 'DEFAULT':
36
+ config_profile = oci_utils.oci_config.get_profile()
37
+ else:
38
+ config_profile = profile
39
+
26
40
  oci_config = oci.config.from_file(file_location=conf_file_path,
27
- profile_name=profile)
41
+ profile_name=config_profile)
28
42
  if region is not None:
29
43
  oci_config['region'] = region
44
+
30
45
  return oci_config
31
46
 
32
47
 
@@ -47,6 +62,29 @@ def get_identity_client(region=None, profile='DEFAULT'):
47
62
  return oci.identity.IdentityClient(get_oci_config(region, profile))
48
63
 
49
64
 
65
+ def get_object_storage_client(region=None, profile='DEFAULT'):
66
+ return oci.object_storage.ObjectStorageClient(
67
+ get_oci_config(region, profile))
68
+
69
+
50
70
  def service_exception():
51
71
  """OCI service exception."""
52
72
  return oci.exceptions.ServiceError
73
+
74
+
75
+ def with_oci_env(f):
76
+
77
+ @functools.wraps(f)
78
+ def wrapper(*args, **kwargs):
79
+ # pylint: disable=line-too-long
80
+ enter_env_cmds = [
81
+ 'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
82
+ '. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
83
+ 'conda activate sky-oci-cli-env', 'pip install oci-cli',
84
+ 'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
85
+ ]
86
+ operation_cmd = [f(*args, **kwargs)]
87
+ leave_env_cmds = ['conda deactivate']
88
+ return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)
89
+
90
+ return wrapper
sky/adaptors/vast.py ADDED
@@ -0,0 +1,29 @@
1
+ """Vast cloud adaptor."""
2
+
3
+ import functools
4
+
5
+ _vast_sdk = None
6
+
7
+
8
+ def import_package(func):
9
+
10
+ @functools.wraps(func)
11
+ def wrapper(*args, **kwargs):
12
+ global _vast_sdk
13
+
14
+ if _vast_sdk is None:
15
+ try:
16
+ import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
17
+ _vast_sdk = _vast.VastAI()
18
+ except ImportError as e:
19
+ raise ImportError(f'Fail to import dependencies for vast: {e}\n'
20
+ 'Try pip install "skypilot[vast]"') from None
21
+ return func(*args, **kwargs)
22
+
23
+ return wrapper
24
+
25
+
26
+ @import_package
27
+ def vast():
28
+ """Return the vast package."""
29
+ return _vast_sdk
sky/admin_policy.py ADDED
@@ -0,0 +1,101 @@
1
+ """Interface for admin-defined policy for user requests."""
2
+ import abc
3
+ import dataclasses
4
+ import typing
5
+ from typing import Optional
6
+
7
+ if typing.TYPE_CHECKING:
8
+ import sky
9
+
10
+
11
+ @dataclasses.dataclass
12
+ class RequestOptions:
13
+ """Request options for admin policy.
14
+
15
+ Args:
16
+ cluster_name: Name of the cluster to create/reuse. It is None if not
17
+ specified by the user.
18
+ idle_minutes_to_autostop: Autostop setting requested by a user. The
19
+ cluster will be set to autostop after this many minutes of idleness.
20
+ down: If true, use autodown rather than autostop.
21
+ dryrun: Is the request a dryrun?
22
+ """
23
+ cluster_name: Optional[str]
24
+ idle_minutes_to_autostop: Optional[int]
25
+ down: bool
26
+ dryrun: bool
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class UserRequest:
31
+ """A user request.
32
+
33
+ A "user request" is defined as a `sky launch / exec` command or its API
34
+ equivalent.
35
+
36
+ `sky jobs launch / serve up` involves multiple launch requests, including
37
+ the launch of controller and clusters for a job (which can have multiple
38
+ tasks if it is a pipeline) or service replicas. Each launch is a separate
39
+ request.
40
+
41
+ This class wraps the underlying task, the global skypilot config used to run
42
+ a task, and the request options.
43
+
44
+ Args:
45
+ task: User specified task.
46
+ skypilot_config: Global skypilot config to be used in this request.
47
+ request_options: Request options. It is None for jobs and services.
48
+ """
49
+ task: 'sky.Task'
50
+ skypilot_config: 'sky.Config'
51
+ request_options: Optional['RequestOptions'] = None
52
+
53
+
54
+ @dataclasses.dataclass
55
+ class MutatedUserRequest:
56
+ task: 'sky.Task'
57
+ skypilot_config: 'sky.Config'
58
+
59
+
60
+ # pylint: disable=line-too-long
61
+ class AdminPolicy:
62
+ """Abstract interface of an admin-defined policy for all user requests.
63
+
64
+ Admins can implement a subclass of AdminPolicy with the following signature:
65
+
66
+ import sky
67
+
68
+ class SkyPilotPolicyV1(sky.AdminPolicy):
69
+ def validate_and_mutate(user_request: UserRequest) -> MutatedUserRequest:
70
+ ...
71
+ return MutatedUserRequest(task=..., skypilot_config=...)
72
+
73
+ The policy can mutate both task and skypilot_config. Admins then distribute
74
+ a simple module that contains this implementation, installable in a way
75
+ that it can be imported by users from the same Python environment where
76
+ SkyPilot is running.
77
+
78
+ Users can register a subclass of AdminPolicy in the SkyPilot config file
79
+ under the key 'admin_policy', e.g.
80
+
81
+ admin_policy: my_package.SkyPilotPolicyV1
82
+ """
83
+
84
+ @classmethod
85
+ @abc.abstractmethod
86
+ def validate_and_mutate(cls,
87
+ user_request: UserRequest) -> MutatedUserRequest:
88
+ """Validates and mutates the user request and returns mutated request.
89
+
90
+ Args:
91
+ user_request: The user request to validate and mutate.
92
+ UserRequest contains (sky.Task, sky.Config)
93
+
94
+ Returns:
95
+ MutatedUserRequest: The mutated user request.
96
+
97
+ Raises:
98
+ Exception to throw if the user request failed the validation.
99
+ """
100
+ raise NotImplementedError(
101
+ 'Your policy must implement validate_and_mutate')