skypilot-nightly 1.0.0.dev20241111__py3-none-any.whl → 1.0.0.dev20241112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +7 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +34 -11
- sky/provision/kubernetes/utils.py +26 -14
- sky/utils/kubernetes/generate_kubeconfig.sh +3 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/RECORD +11 -11
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '140125eaad5fb64da37934c8f6650d68aa135f77'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241111'
|
38
|
+
__version__ = '1.0.0.dev20241112'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -3102,6 +3102,7 @@ def show_gpus(
|
|
3102
3102
|
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
|
3103
3103
|
kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
|
3104
3104
|
sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
|
3105
|
+
no_permissions_str = '<no permissions>'
|
3105
3106
|
|
3106
3107
|
def _list_to_str(lst):
|
3107
3108
|
return ', '.join([str(e) for e in lst])
|
@@ -3146,9 +3147,11 @@ def show_gpus(
|
|
3146
3147
|
debug_msg)
|
3147
3148
|
raise ValueError(full_err_msg)
|
3148
3149
|
for gpu, _ in sorted(counts.items()):
|
3150
|
+
available_qty = available[gpu] if available[gpu] != -1 else (
|
3151
|
+
no_permissions_str)
|
3149
3152
|
realtime_gpu_table.add_row([
|
3150
3153
|
gpu,
|
3151
|
-
_list_to_str(counts.pop(gpu)), capacity[gpu],
|
3154
|
+
_list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
|
3152
3155
|
])
|
3153
3156
|
return realtime_gpu_table
|
3154
3157
|
|
@@ -3158,10 +3161,11 @@ def show_gpus(
|
|
3158
3161
|
|
3159
3162
|
node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
|
3160
3163
|
for node_name, node_info in node_info_dict.items():
|
3164
|
+
available = node_info.free['nvidia.com/gpu'] if node_info.free[
|
3165
|
+
'nvidia.com/gpu'] != -1 else no_permissions_str
|
3161
3166
|
node_table.add_row([
|
3162
3167
|
node_name, node_info.gpu_type,
|
3163
|
-
node_info.total['nvidia.com/gpu'],
|
3164
|
-
node_info.free['nvidia.com/gpu']
|
3168
|
+
node_info.total['nvidia.com/gpu'], available
|
3165
3169
|
])
|
3166
3170
|
return node_table
|
3167
3171
|
|
@@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Set, Tuple
|
|
10
10
|
from sky import check as sky_check
|
11
11
|
from sky import sky_logging
|
12
12
|
from sky.adaptors import common as adaptors_common
|
13
|
+
from sky.adaptors import kubernetes
|
13
14
|
from sky.clouds import Kubernetes
|
14
15
|
from sky.clouds.service_catalog import CloudFilter
|
15
16
|
from sky.clouds.service_catalog import common
|
@@ -22,6 +23,8 @@ if typing.TYPE_CHECKING:
|
|
22
23
|
else:
|
23
24
|
pd = adaptors_common.LazyImport('pandas')
|
24
25
|
|
26
|
+
logger = sky_logging.init_logger(__name__)
|
27
|
+
|
25
28
|
_PULL_FREQUENCY_HOURS = 7
|
26
29
|
|
27
30
|
# We keep pull_frequency_hours so we can remotely update the default image paths
|
@@ -77,6 +80,11 @@ def list_accelerators_realtime(
|
|
77
80
|
require_price: bool = True
|
78
81
|
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
|
79
82
|
int]]:
|
83
|
+
"""List accelerators in the Kubernetes cluster.
|
84
|
+
|
85
|
+
If the user does not have sufficient permissions to list pods in all
|
86
|
+
namespaces, the function will return free GPUs as -1.
|
87
|
+
"""
|
80
88
|
# TODO(romilb): This should be refactored to use get_kubernetes_node_info()
|
81
89
|
# function from kubernetes_utils.
|
82
90
|
del all_regions, require_price # Unused.
|
@@ -108,7 +116,17 @@ def list_accelerators_realtime(
|
|
108
116
|
key = label_formatter.get_label_key()
|
109
117
|
nodes = kubernetes_utils.get_kubernetes_nodes(context)
|
110
118
|
# Get the pods to get the real-time GPU usage
|
111
|
-
|
119
|
+
try:
|
120
|
+
pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
|
121
|
+
except kubernetes.api_exception() as e:
|
122
|
+
if e.status == 403:
|
123
|
+
logger.warning('Failed to get pods in the Kubernetes cluster '
|
124
|
+
'(forbidden). Please check if your account has '
|
125
|
+
'necessary permissions to list pods. Realtime GPU '
|
126
|
+
'availability information may be incorrect.')
|
127
|
+
pods = None
|
128
|
+
else:
|
129
|
+
raise
|
112
130
|
# Total number of GPUs in the cluster
|
113
131
|
total_accelerators_capacity: Dict[str, int] = {}
|
114
132
|
# Total number of GPUs currently available in the cluster
|
@@ -141,6 +159,21 @@ def list_accelerators_realtime(
|
|
141
159
|
if accelerator_count not in accelerators_qtys:
|
142
160
|
accelerators_qtys.add((accelerator_name, accelerator_count))
|
143
161
|
|
162
|
+
if accelerator_count >= min_quantity_filter:
|
163
|
+
quantized_count = (min_quantity_filter *
|
164
|
+
(accelerator_count // min_quantity_filter))
|
165
|
+
if accelerator_name not in total_accelerators_capacity:
|
166
|
+
total_accelerators_capacity[
|
167
|
+
accelerator_name] = quantized_count
|
168
|
+
else:
|
169
|
+
total_accelerators_capacity[
|
170
|
+
accelerator_name] += quantized_count
|
171
|
+
|
172
|
+
if pods is None:
|
173
|
+
# If we can't get the pods, we can't get the GPU usage
|
174
|
+
total_accelerators_available[accelerator_name] = -1
|
175
|
+
continue
|
176
|
+
|
144
177
|
for pod in pods:
|
145
178
|
# Get all the pods running on the node
|
146
179
|
if (pod.spec.node_name == node.metadata.name and
|
@@ -155,16 +188,6 @@ def list_accelerators_realtime(
|
|
155
188
|
|
156
189
|
accelerators_available = accelerator_count - allocated_qty
|
157
190
|
|
158
|
-
if accelerator_count >= min_quantity_filter:
|
159
|
-
quantized_count = (min_quantity_filter *
|
160
|
-
(accelerator_count // min_quantity_filter))
|
161
|
-
if accelerator_name not in total_accelerators_capacity:
|
162
|
-
total_accelerators_capacity[
|
163
|
-
accelerator_name] = quantized_count
|
164
|
-
else:
|
165
|
-
total_accelerators_capacity[
|
166
|
-
accelerator_name] += quantized_count
|
167
|
-
|
168
191
|
if accelerator_name not in total_accelerators_available:
|
169
192
|
total_accelerators_available[accelerator_name] = 0
|
170
193
|
if accelerators_available >= min_quantity_filter:
|
@@ -1801,13 +1801,22 @@ def get_kubernetes_node_info(
|
|
1801
1801
|
number of GPUs available on the node and the number of free GPUs on the
|
1802
1802
|
node.
|
1803
1803
|
|
1804
|
+
If the user does not have sufficient permissions to list pods in all
|
1805
|
+
namespaces, the function will return free GPUs as -1.
|
1806
|
+
|
1804
1807
|
Returns:
|
1805
1808
|
Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
|
1806
1809
|
key and the KubernetesNodeInfo object as value
|
1807
1810
|
"""
|
1808
1811
|
nodes = get_kubernetes_nodes(context)
|
1809
1812
|
# Get the pods to get the real-time resource usage
|
1810
|
-
|
1813
|
+
try:
|
1814
|
+
pods = get_all_pods_in_kubernetes_cluster(context)
|
1815
|
+
except kubernetes.api_exception() as e:
|
1816
|
+
if e.status == 403:
|
1817
|
+
pods = None
|
1818
|
+
else:
|
1819
|
+
raise
|
1811
1820
|
|
1812
1821
|
label_formatter, _ = detect_gpu_label_formatter(context)
|
1813
1822
|
if not label_formatter:
|
@@ -1828,19 +1837,22 @@ def get_kubernetes_node_info(
|
|
1828
1837
|
accelerator_count = int(node.status.allocatable.get(
|
1829
1838
|
'nvidia.com/gpu', 0))
|
1830
1839
|
|
1831
|
-
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
1835
|
-
|
1836
|
-
#
|
1837
|
-
|
1838
|
-
|
1839
|
-
|
1840
|
-
|
1841
|
-
|
1842
|
-
|
1843
|
-
|
1840
|
+
if pods is None:
|
1841
|
+
accelerators_available = -1
|
1842
|
+
|
1843
|
+
else:
|
1844
|
+
for pod in pods:
|
1845
|
+
# Get all the pods running on the node
|
1846
|
+
if (pod.spec.node_name == node.metadata.name and
|
1847
|
+
pod.status.phase in ['Running', 'Pending']):
|
1848
|
+
# Iterate over all the containers in the pod and sum the
|
1849
|
+
# GPU requests
|
1850
|
+
for container in pod.spec.containers:
|
1851
|
+
if container.resources.requests:
|
1852
|
+
allocated_qty += int(
|
1853
|
+
container.resources.requests.get(
|
1854
|
+
'nvidia.com/gpu', 0))
|
1855
|
+
accelerators_available = accelerator_count - allocated_qty
|
1844
1856
|
|
1845
1857
|
node_info_dict[node.metadata.name] = KubernetesNodeInfo(
|
1846
1858
|
name=node.metadata.name,
|
@@ -112,6 +112,9 @@ rules:
|
|
112
112
|
- apiGroups: ["networking.k8s.io"] # Required for exposing services through ingresses
|
113
113
|
resources: ["ingressclasses"]
|
114
114
|
verbs: ["get", "list", "watch"]
|
115
|
+
- apiGroups: [""] # Required for sky show-gpus command
|
116
|
+
resources: ["pods"]
|
117
|
+
verbs: ["get", "list"]
|
115
118
|
---
|
116
119
|
# ClusterRoleBinding for the service account
|
117
120
|
apiVersion: rbac.authorization.k8s.io/v1
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.dev20241111
|
3
|
+
Version: 1.0.0.dev20241112
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -309,7 +309,7 @@ Runnable examples:
|
|
309
309
|
- [LocalGPT](./llm/localgpt)
|
310
310
|
- [Falcon](./llm/falcon)
|
311
311
|
- Add yours here & see more in [`llm/`](./llm)!
|
312
|
-
- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/
|
312
|
+
- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
|
313
313
|
|
314
314
|
Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/)
|
315
315
|
|
{skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241112.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=8VuuTyDTVZB1BaeWD7OwBFZwpwweQkb0DNyPpsitRQs,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=oGBQrCYVWqRTcWR-yCKZY7dmUOUnP5Xuvz_zcFXzqlw,212342
|
6
6
|
sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
|
7
7
|
sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
|
8
8
|
sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
|
@@ -65,7 +65,7 @@ sky/clouds/service_catalog/cudo_catalog.py,sha256=V_takvL6dWTGQaTLCEvjKIotCDPnMu
|
|
65
65
|
sky/clouds/service_catalog/fluidstack_catalog.py,sha256=21-cvrYEYTIi7n3ZNF2e7_0QX-PF4BkhlVJUWQOvKrY,5059
|
66
66
|
sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
|
67
67
|
sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
|
68
|
-
sky/clouds/service_catalog/kubernetes_catalog.py,sha256=
|
68
|
+
sky/clouds/service_catalog/kubernetes_catalog.py,sha256=c6Oot8RC1ujcFmfJbkeJKUWsw3aX0iNvKL1fJg-FoOc,10020
|
69
69
|
sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
|
70
70
|
sky/clouds/service_catalog/oci_catalog.py,sha256=cyA6ZqwHGOKuPxUl_dKmFGdeWdQGMrvl_-o2MtyF998,8580
|
71
71
|
sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
|
@@ -140,7 +140,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
|
|
140
140
|
sky/provision/kubernetes/instance.py,sha256=MFtTh-dNIuTZcHD20PQG_QuULFRFaPxwlUczR6sRnsk,43601
|
141
141
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
142
142
|
sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
|
143
|
-
sky/provision/kubernetes/utils.py,sha256=
|
143
|
+
sky/provision/kubernetes/utils.py,sha256=PEDyZnf-dSmQ4dXyS_0x9OYHt9SbY7A6urd436f-WyQ,89923
|
144
144
|
sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
|
145
145
|
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
|
146
146
|
sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
|
@@ -269,15 +269,15 @@ sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnv
|
|
269
269
|
sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
|
270
270
|
sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=vGj0mD0tejHDRy8ulwKOvOF2mfLyT5J8fp7GVqEe_EY,8478
|
271
271
|
sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
|
272
|
-
sky/utils/kubernetes/generate_kubeconfig.sh,sha256=
|
272
|
+
sky/utils/kubernetes/generate_kubeconfig.sh,sha256=livvxDKV-_xx8-dYWNyo4wlg3sOldeHefI37JXKLXu0,9398
|
273
273
|
sky/utils/kubernetes/gpu_labeler.py,sha256=MEUv0U4ACDcNwtFVltlv017XJMjxx1Bndf6fL0i6eqg,6960
|
274
274
|
sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7ZWF5gdVIZPupCCo9A,1224
|
275
275
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
276
276
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
277
277
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
282
|
-
skypilot_nightly-1.0.0.
|
283
|
-
skypilot_nightly-1.0.0.
|
278
|
+
skypilot_nightly-1.0.0.dev20241112.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
279
|
+
skypilot_nightly-1.0.0.dev20241112.dist-info/METADATA,sha256=Ui6L9CmuvZsIg2D0paU-NiqfVLtywzq5GLpCrJes-eY,19699
|
280
|
+
skypilot_nightly-1.0.0.dev20241112.dist-info/WHEEL,sha256=a7TGlA-5DaHMRrarXjVbQagU3Man_dCnGIWMJr5kRWo,91
|
281
|
+
skypilot_nightly-1.0.0.dev20241112.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
282
|
+
skypilot_nightly-1.0.0.dev20241112.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
283
|
+
skypilot_nightly-1.0.0.dev20241112.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|