skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/adaptors/kubernetes.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
"""Kubernetes adaptors"""
|
2
|
-
|
3
|
-
# pylint: disable=import-outside-toplevel
|
4
|
-
|
2
|
+
import logging
|
5
3
|
import os
|
4
|
+
from typing import Any, Callable, Optional, Set
|
6
5
|
|
7
6
|
from sky.adaptors import common
|
8
|
-
from sky.
|
7
|
+
from sky.sky_logging import set_logging_level
|
8
|
+
from sky.utils import annotations
|
9
|
+
from sky.utils import common_utils
|
9
10
|
from sky.utils import ux_utils
|
10
11
|
|
11
12
|
_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
|
@@ -15,117 +16,140 @@ kubernetes = common.LazyImport('kubernetes',
|
|
15
16
|
urllib3 = common.LazyImport('urllib3',
|
16
17
|
import_error_message=_IMPORT_ERROR_MESSAGE)
|
17
18
|
|
18
|
-
_configured = False
|
19
|
-
_core_api = None
|
20
|
-
_auth_api = None
|
21
|
-
_networking_api = None
|
22
|
-
_custom_objects_api = None
|
23
|
-
_node_api = None
|
24
|
-
_apps_api = None
|
25
|
-
_api_client = None
|
26
|
-
|
27
19
|
# Timeout to use for API calls
|
28
20
|
API_TIMEOUT = 5
|
29
21
|
|
22
|
+
DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
|
23
|
+
# The name for the environment variable that stores the in-cluster context name
|
24
|
+
# for Kubernetes clusters. This is used to associate a name with the current
|
25
|
+
# context when running with in-cluster auth. If not set, the context name is
|
26
|
+
# set to DEFAULT_IN_CLUSTER_REGION.
|
27
|
+
IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
|
28
|
+
|
29
|
+
|
30
|
+
def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
|
31
|
+
for attr_name in dir(obj):
|
32
|
+
attr = getattr(obj, attr_name)
|
33
|
+
# Skip methods starting with '__' since they are invoked through one
|
34
|
+
# of the main methods, which are already decorated.
|
35
|
+
if callable(attr) and not attr_name.startswith('__'):
|
36
|
+
decorated_types: Set[str] = getattr(attr, '_sky_decorator_types',
|
37
|
+
set())
|
38
|
+
if decoration_type not in decorated_types:
|
39
|
+
decorated_attr = decorator(attr)
|
40
|
+
decorated_attr._sky_decorator_types = ( # pylint: disable=protected-access
|
41
|
+
decorated_types | {decoration_type})
|
42
|
+
setattr(obj, attr_name, decorated_attr)
|
43
|
+
return obj
|
44
|
+
|
45
|
+
|
46
|
+
def _api_logging_decorator(logger: str, level: int):
|
47
|
+
"""Decorator to set logging level for API calls.
|
48
|
+
|
49
|
+
This is used to suppress the verbose logging from urllib3 when calls to the
|
50
|
+
Kubernetes API timeout.
|
51
|
+
"""
|
52
|
+
|
53
|
+
def decorated_api(api):
|
54
|
+
|
55
|
+
def wrapped(*args, **kwargs):
|
56
|
+
obj = api(*args, **kwargs)
|
57
|
+
_decorate_methods(obj, set_logging_level(logger, level), 'api_log')
|
58
|
+
return obj
|
59
|
+
|
60
|
+
return wrapped
|
61
|
+
|
62
|
+
return decorated_api
|
30
63
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
# Load in-cluster config if running in a pod
|
37
|
-
# Kubernetes set environment variables for service discovery do not
|
38
|
-
# show up in SkyPilot tasks. For now, we work around by using
|
39
|
-
# DNS name instead of environment variables.
|
40
|
-
# See issue: https://github.com/skypilot-org/skypilot/issues/2287
|
41
|
-
os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
|
42
|
-
os.environ['KUBERNETES_SERVICE_PORT'] = '443'
|
43
|
-
kubernetes.config.load_incluster_config()
|
44
|
-
except kubernetes.config.config_exception.ConfigException:
|
64
|
+
|
65
|
+
def _load_config(context: Optional[str] = None):
|
66
|
+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
67
|
+
|
68
|
+
def _load_config_from_kubeconfig(context: Optional[str] = None):
|
45
69
|
try:
|
46
|
-
kubernetes.config.load_kube_config()
|
70
|
+
kubernetes.config.load_kube_config(context=context)
|
47
71
|
except kubernetes.config.config_exception.ConfigException as e:
|
48
|
-
suffix =
|
49
|
-
if env_options.Options.SHOW_DEBUG_INFO.get():
|
50
|
-
suffix += f' Error: {str(e)}'
|
72
|
+
suffix = common_utils.format_exception(e, use_bracket=True)
|
51
73
|
# Check if exception was due to no current-context
|
52
74
|
if 'Expected key current-context' in str(e):
|
53
|
-
err_str = (
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
75
|
+
err_str = (
|
76
|
+
f'Failed to load Kubernetes configuration for {context!r}. '
|
77
|
+
'Kubeconfig does not contain any valid context(s).'
|
78
|
+
f'\n{suffix}\n'
|
79
|
+
' If you were running a local Kubernetes '
|
80
|
+
'cluster, run `sky local up` to start the cluster.')
|
58
81
|
else:
|
59
|
-
err_str = (
|
60
|
-
|
61
|
-
|
82
|
+
err_str = (
|
83
|
+
f'Failed to load Kubernetes configuration for {context!r}. '
|
84
|
+
'Please check if your kubeconfig file exists at '
|
85
|
+
f'~/.kube/config and is valid.\n{suffix}')
|
62
86
|
err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
|
63
87
|
with ux_utils.print_exception_no_traceback():
|
64
88
|
raise ValueError(err_str) from None
|
65
|
-
_configured = True
|
66
|
-
|
67
|
-
|
68
|
-
def core_api():
|
69
|
-
global _core_api
|
70
|
-
if _core_api is None:
|
71
|
-
_load_config()
|
72
|
-
_core_api = kubernetes.client.CoreV1Api()
|
73
|
-
|
74
|
-
return _core_api
|
75
|
-
|
76
|
-
|
77
|
-
def auth_api():
|
78
|
-
global _auth_api
|
79
|
-
if _auth_api is None:
|
80
|
-
_load_config()
|
81
|
-
_auth_api = kubernetes.client.RbacAuthorizationV1Api()
|
82
|
-
|
83
|
-
return _auth_api
|
84
89
|
|
90
|
+
if context == in_cluster_context_name() or context is None:
|
91
|
+
try:
|
92
|
+
# Load in-cluster config if running in a pod and context is None.
|
93
|
+
# Kubernetes set environment variables for service discovery do not
|
94
|
+
# show up in SkyPilot tasks. For now, we work around by using
|
95
|
+
# DNS name instead of environment variables.
|
96
|
+
# See issue: https://github.com/skypilot-org/skypilot/issues/2287
|
97
|
+
os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
|
98
|
+
os.environ['KUBERNETES_SERVICE_PORT'] = '443'
|
99
|
+
kubernetes.config.load_incluster_config()
|
100
|
+
except kubernetes.config.config_exception.ConfigException:
|
101
|
+
_load_config_from_kubeconfig()
|
102
|
+
else:
|
103
|
+
_load_config_from_kubeconfig(context)
|
85
104
|
|
86
|
-
def networking_api():
|
87
|
-
global _networking_api
|
88
|
-
if _networking_api is None:
|
89
|
-
_load_config()
|
90
|
-
_networking_api = kubernetes.client.NetworkingV1Api()
|
91
105
|
|
92
|
-
|
106
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
107
|
+
@annotations.lru_cache(scope='request')
|
108
|
+
def core_api(context: Optional[str] = None):
|
109
|
+
_load_config(context)
|
110
|
+
return kubernetes.client.CoreV1Api()
|
93
111
|
|
94
112
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
113
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
114
|
+
@annotations.lru_cache(scope='request')
|
115
|
+
def auth_api(context: Optional[str] = None):
|
116
|
+
_load_config(context)
|
117
|
+
return kubernetes.client.RbacAuthorizationV1Api()
|
100
118
|
|
101
|
-
return _custom_objects_api
|
102
119
|
|
120
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
121
|
+
@annotations.lru_cache(scope='request')
|
122
|
+
def networking_api(context: Optional[str] = None):
|
123
|
+
_load_config(context)
|
124
|
+
return kubernetes.client.NetworkingV1Api()
|
103
125
|
|
104
|
-
def node_api():
|
105
|
-
global _node_api
|
106
|
-
if _node_api is None:
|
107
|
-
_load_config()
|
108
|
-
_node_api = kubernetes.client.NodeV1Api()
|
109
126
|
|
110
|
-
|
127
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
128
|
+
@annotations.lru_cache(scope='request')
|
129
|
+
def custom_objects_api(context: Optional[str] = None):
|
130
|
+
_load_config(context)
|
131
|
+
return kubernetes.client.CustomObjectsApi()
|
111
132
|
|
112
133
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
134
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
135
|
+
@annotations.lru_cache(scope='global')
|
136
|
+
def node_api(context: Optional[str] = None):
|
137
|
+
_load_config(context)
|
138
|
+
return kubernetes.client.NodeV1Api()
|
118
139
|
|
119
|
-
return _apps_api
|
120
140
|
|
141
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
142
|
+
@annotations.lru_cache(scope='request')
|
143
|
+
def apps_api(context: Optional[str] = None):
|
144
|
+
_load_config(context)
|
145
|
+
return kubernetes.client.AppsV1Api()
|
121
146
|
|
122
|
-
def api_client():
|
123
|
-
global _api_client
|
124
|
-
if _api_client is None:
|
125
|
-
_load_config()
|
126
|
-
_api_client = kubernetes.client.ApiClient()
|
127
147
|
|
128
|
-
|
148
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
|
149
|
+
@annotations.lru_cache(scope='request')
|
150
|
+
def api_client(context: Optional[str] = None):
|
151
|
+
_load_config(context)
|
152
|
+
return kubernetes.client.ApiClient()
|
129
153
|
|
130
154
|
|
131
155
|
def api_exception():
|
@@ -142,3 +166,13 @@ def max_retry_error():
|
|
142
166
|
|
143
167
|
def stream():
|
144
168
|
return kubernetes.stream.stream
|
169
|
+
|
170
|
+
|
171
|
+
def in_cluster_context_name() -> Optional[str]:
|
172
|
+
"""Returns the name of the in-cluster context from the environment.
|
173
|
+
|
174
|
+
If the environment variable is not set, returns the default in-cluster
|
175
|
+
context name.
|
176
|
+
"""
|
177
|
+
return (os.environ.get(IN_CLUSTER_CONTEXT_NAME_ENV_VAR) or
|
178
|
+
DEFAULT_IN_CLUSTER_REGION)
|
sky/adaptors/nebius.py
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
"""Nebius cloud adaptor."""
|
2
|
+
import os
|
3
|
+
|
4
|
+
from sky.adaptors import common
|
5
|
+
|
6
|
+
NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
|
7
|
+
NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
|
8
|
+
NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
|
9
|
+
NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
|
10
|
+
NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
|
11
|
+
NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
|
12
|
+
|
13
|
+
MAX_RETRIES_TO_DISK_CREATE = 120
|
14
|
+
MAX_RETRIES_TO_INSTANCE_STOP = 120
|
15
|
+
MAX_RETRIES_TO_INSTANCE_START = 120
|
16
|
+
MAX_RETRIES_TO_INSTANCE_READY = 240
|
17
|
+
|
18
|
+
MAX_RETRIES_TO_DISK_DELETE = 120
|
19
|
+
MAX_RETRIES_TO_INSTANCE_WAIT = 120 # Maximum number of retries
|
20
|
+
|
21
|
+
POLL_INTERVAL = 5
|
22
|
+
|
23
|
+
_iam_token = None
|
24
|
+
_tenant_id = None
|
25
|
+
_project_id = None
|
26
|
+
|
27
|
+
nebius = common.LazyImport(
|
28
|
+
'nebius',
|
29
|
+
import_error_message='Failed to import dependencies for Nebius AI Cloud. '
|
30
|
+
'Try running: pip install "skypilot[nebius]"',
|
31
|
+
# https://github.com/grpc/grpc/issues/37642 to avoid spam in console
|
32
|
+
set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'}))
|
33
|
+
|
34
|
+
|
35
|
+
def request_error():
|
36
|
+
return nebius.aio.service_error.RequestError
|
37
|
+
|
38
|
+
|
39
|
+
def compute():
|
40
|
+
# pylint: disable=import-outside-toplevel
|
41
|
+
from nebius.api.nebius.compute import v1 as compute_v1
|
42
|
+
return compute_v1
|
43
|
+
|
44
|
+
|
45
|
+
def iam():
|
46
|
+
# pylint: disable=import-outside-toplevel
|
47
|
+
from nebius.api.nebius.iam import v1 as iam_v1
|
48
|
+
return iam_v1
|
49
|
+
|
50
|
+
|
51
|
+
def nebius_common():
|
52
|
+
# pylint: disable=import-outside-toplevel
|
53
|
+
from nebius.api.nebius.common import v1 as common_v1
|
54
|
+
return common_v1
|
55
|
+
|
56
|
+
|
57
|
+
def vpc():
|
58
|
+
# pylint: disable=import-outside-toplevel
|
59
|
+
from nebius.api.nebius.vpc import v1 as vpc_v1
|
60
|
+
return vpc_v1
|
61
|
+
|
62
|
+
|
63
|
+
def get_iam_token():
|
64
|
+
global _iam_token
|
65
|
+
if _iam_token is None:
|
66
|
+
try:
|
67
|
+
with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
|
68
|
+
encoding='utf-8') as file:
|
69
|
+
_iam_token = file.read().strip()
|
70
|
+
except FileNotFoundError:
|
71
|
+
return None
|
72
|
+
return _iam_token
|
73
|
+
|
74
|
+
|
75
|
+
def get_project_id():
|
76
|
+
global _project_id
|
77
|
+
if _project_id is None:
|
78
|
+
try:
|
79
|
+
with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
|
80
|
+
encoding='utf-8') as file:
|
81
|
+
_project_id = file.read().strip()
|
82
|
+
except FileNotFoundError:
|
83
|
+
return None
|
84
|
+
return _project_id
|
85
|
+
|
86
|
+
|
87
|
+
def get_tenant_id():
|
88
|
+
global _tenant_id
|
89
|
+
if _tenant_id is None:
|
90
|
+
try:
|
91
|
+
with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
|
92
|
+
encoding='utf-8') as file:
|
93
|
+
_tenant_id = file.read().strip()
|
94
|
+
except FileNotFoundError:
|
95
|
+
return None
|
96
|
+
return _tenant_id
|
97
|
+
|
98
|
+
|
99
|
+
def sdk():
|
100
|
+
return nebius.sdk.SDK(credentials=get_iam_token())
|
sky/adaptors/oci.py
CHANGED
@@ -1,8 +1,17 @@
|
|
1
1
|
"""Oracle OCI cloud adaptor"""
|
2
2
|
|
3
|
+
import functools
|
4
|
+
import logging
|
3
5
|
import os
|
4
6
|
|
5
7
|
from sky.adaptors import common
|
8
|
+
from sky.clouds.utils import oci_utils
|
9
|
+
|
10
|
+
# Suppress OCI circuit breaker logging before lazy import, because
|
11
|
+
# oci modules prints additional message during imports, i.e., the
|
12
|
+
# set_logger in the LazyImport called after imports will not take
|
13
|
+
# effect.
|
14
|
+
logging.getLogger('oci.circuit_breaker').setLevel(logging.WARNING)
|
6
15
|
|
7
16
|
CONFIG_PATH = '~/.oci/config'
|
8
17
|
ENV_VAR_OCI_CONFIG = 'OCI_CONFIG'
|
@@ -23,10 +32,16 @@ def get_config_file() -> str:
|
|
23
32
|
|
24
33
|
def get_oci_config(region=None, profile='DEFAULT'):
|
25
34
|
conf_file_path = get_config_file()
|
35
|
+
if not profile or profile == 'DEFAULT':
|
36
|
+
config_profile = oci_utils.oci_config.get_profile()
|
37
|
+
else:
|
38
|
+
config_profile = profile
|
39
|
+
|
26
40
|
oci_config = oci.config.from_file(file_location=conf_file_path,
|
27
|
-
profile_name=
|
41
|
+
profile_name=config_profile)
|
28
42
|
if region is not None:
|
29
43
|
oci_config['region'] = region
|
44
|
+
|
30
45
|
return oci_config
|
31
46
|
|
32
47
|
|
@@ -47,6 +62,29 @@ def get_identity_client(region=None, profile='DEFAULT'):
|
|
47
62
|
return oci.identity.IdentityClient(get_oci_config(region, profile))
|
48
63
|
|
49
64
|
|
65
|
+
def get_object_storage_client(region=None, profile='DEFAULT'):
|
66
|
+
return oci.object_storage.ObjectStorageClient(
|
67
|
+
get_oci_config(region, profile))
|
68
|
+
|
69
|
+
|
50
70
|
def service_exception():
|
51
71
|
"""OCI service exception."""
|
52
72
|
return oci.exceptions.ServiceError
|
73
|
+
|
74
|
+
|
75
|
+
def with_oci_env(f):
|
76
|
+
|
77
|
+
@functools.wraps(f)
|
78
|
+
def wrapper(*args, **kwargs):
|
79
|
+
# pylint: disable=line-too-long
|
80
|
+
enter_env_cmds = [
|
81
|
+
'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
|
82
|
+
'. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
|
83
|
+
'conda activate sky-oci-cli-env', 'pip install oci-cli',
|
84
|
+
'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
|
85
|
+
]
|
86
|
+
operation_cmd = [f(*args, **kwargs)]
|
87
|
+
leave_env_cmds = ['conda deactivate']
|
88
|
+
return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)
|
89
|
+
|
90
|
+
return wrapper
|
sky/adaptors/vast.py
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
"""Vast cloud adaptor."""
|
2
|
+
|
3
|
+
import functools
|
4
|
+
|
5
|
+
_vast_sdk = None
|
6
|
+
|
7
|
+
|
8
|
+
def import_package(func):
|
9
|
+
|
10
|
+
@functools.wraps(func)
|
11
|
+
def wrapper(*args, **kwargs):
|
12
|
+
global _vast_sdk
|
13
|
+
|
14
|
+
if _vast_sdk is None:
|
15
|
+
try:
|
16
|
+
import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
|
17
|
+
_vast_sdk = _vast.VastAI()
|
18
|
+
except ImportError as e:
|
19
|
+
raise ImportError(f'Fail to import dependencies for vast: {e}\n'
|
20
|
+
'Try pip install "skypilot[vast]"') from None
|
21
|
+
return func(*args, **kwargs)
|
22
|
+
|
23
|
+
return wrapper
|
24
|
+
|
25
|
+
|
26
|
+
@import_package
|
27
|
+
def vast():
|
28
|
+
"""Return the vast package."""
|
29
|
+
return _vast_sdk
|
sky/admin_policy.py
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
"""Interface for admin-defined policy for user requests."""
|
2
|
+
import abc
|
3
|
+
import dataclasses
|
4
|
+
import typing
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
if typing.TYPE_CHECKING:
|
8
|
+
import sky
|
9
|
+
|
10
|
+
|
11
|
+
@dataclasses.dataclass
|
12
|
+
class RequestOptions:
|
13
|
+
"""Request options for admin policy.
|
14
|
+
|
15
|
+
Args:
|
16
|
+
cluster_name: Name of the cluster to create/reuse. It is None if not
|
17
|
+
specified by the user.
|
18
|
+
idle_minutes_to_autostop: Autostop setting requested by a user. The
|
19
|
+
cluster will be set to autostop after this many minutes of idleness.
|
20
|
+
down: If true, use autodown rather than autostop.
|
21
|
+
dryrun: Is the request a dryrun?
|
22
|
+
"""
|
23
|
+
cluster_name: Optional[str]
|
24
|
+
idle_minutes_to_autostop: Optional[int]
|
25
|
+
down: bool
|
26
|
+
dryrun: bool
|
27
|
+
|
28
|
+
|
29
|
+
@dataclasses.dataclass
|
30
|
+
class UserRequest:
|
31
|
+
"""A user request.
|
32
|
+
|
33
|
+
A "user request" is defined as a `sky launch / exec` command or its API
|
34
|
+
equivalent.
|
35
|
+
|
36
|
+
`sky jobs launch / serve up` involves multiple launch requests, including
|
37
|
+
the launch of controller and clusters for a job (which can have multiple
|
38
|
+
tasks if it is a pipeline) or service replicas. Each launch is a separate
|
39
|
+
request.
|
40
|
+
|
41
|
+
This class wraps the underlying task, the global skypilot config used to run
|
42
|
+
a task, and the request options.
|
43
|
+
|
44
|
+
Args:
|
45
|
+
task: User specified task.
|
46
|
+
skypilot_config: Global skypilot config to be used in this request.
|
47
|
+
request_options: Request options. It is None for jobs and services.
|
48
|
+
"""
|
49
|
+
task: 'sky.Task'
|
50
|
+
skypilot_config: 'sky.Config'
|
51
|
+
request_options: Optional['RequestOptions'] = None
|
52
|
+
|
53
|
+
|
54
|
+
@dataclasses.dataclass
|
55
|
+
class MutatedUserRequest:
|
56
|
+
task: 'sky.Task'
|
57
|
+
skypilot_config: 'sky.Config'
|
58
|
+
|
59
|
+
|
60
|
+
# pylint: disable=line-too-long
|
61
|
+
class AdminPolicy:
|
62
|
+
"""Abstract interface of an admin-defined policy for all user requests.
|
63
|
+
|
64
|
+
Admins can implement a subclass of AdminPolicy with the following signature:
|
65
|
+
|
66
|
+
import sky
|
67
|
+
|
68
|
+
class SkyPilotPolicyV1(sky.AdminPolicy):
|
69
|
+
def validate_and_mutate(user_request: UserRequest) -> MutatedUserRequest:
|
70
|
+
...
|
71
|
+
return MutatedUserRequest(task=..., skypilot_config=...)
|
72
|
+
|
73
|
+
The policy can mutate both task and skypilot_config. Admins then distribute
|
74
|
+
a simple module that contains this implementation, installable in a way
|
75
|
+
that it can be imported by users from the same Python environment where
|
76
|
+
SkyPilot is running.
|
77
|
+
|
78
|
+
Users can register a subclass of AdminPolicy in the SkyPilot config file
|
79
|
+
under the key 'admin_policy', e.g.
|
80
|
+
|
81
|
+
admin_policy: my_package.SkyPilotPolicyV1
|
82
|
+
"""
|
83
|
+
|
84
|
+
@classmethod
|
85
|
+
@abc.abstractmethod
|
86
|
+
def validate_and_mutate(cls,
|
87
|
+
user_request: UserRequest) -> MutatedUserRequest:
|
88
|
+
"""Validates and mutates the user request and returns mutated request.
|
89
|
+
|
90
|
+
Args:
|
91
|
+
user_request: The user request to validate and mutate.
|
92
|
+
UserRequest contains (sky.Task, sky.Config)
|
93
|
+
|
94
|
+
Returns:
|
95
|
+
MutatedUserRequest: The mutated user request.
|
96
|
+
|
97
|
+
Raises:
|
98
|
+
Exception to throw if the user request failed the validation.
|
99
|
+
"""
|
100
|
+
raise NotImplementedError(
|
101
|
+
'Your policy must implement validate_and_mutate')
|