skypilot-nightly 1.0.0.dev20241111__py3-none-any.whl → 1.0.0.dev20241112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '91323d86baaeb1341c6953e15bbf19f2896b67ad'
8
+ _SKYPILOT_COMMIT_SHA = '140125eaad5fb64da37934c8f6650d68aa135f77'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241111'
38
+ __version__ = '1.0.0.dev20241112'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/cli.py CHANGED
@@ -3102,6 +3102,7 @@ def show_gpus(
3102
3102
  kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
3103
3103
  kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
3104
3104
  sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
3105
+ no_permissions_str = '<no permissions>'
3105
3106
 
3106
3107
  def _list_to_str(lst):
3107
3108
  return ', '.join([str(e) for e in lst])
@@ -3146,9 +3147,11 @@ def show_gpus(
3146
3147
  debug_msg)
3147
3148
  raise ValueError(full_err_msg)
3148
3149
  for gpu, _ in sorted(counts.items()):
3150
+ available_qty = available[gpu] if available[gpu] != -1 else (
3151
+ no_permissions_str)
3149
3152
  realtime_gpu_table.add_row([
3150
3153
  gpu,
3151
- _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]
3154
+ _list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
3152
3155
  ])
3153
3156
  return realtime_gpu_table
3154
3157
 
@@ -3158,10 +3161,11 @@ def show_gpus(
3158
3161
 
3159
3162
  node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
3160
3163
  for node_name, node_info in node_info_dict.items():
3164
+ available = node_info.free['nvidia.com/gpu'] if node_info.free[
3165
+ 'nvidia.com/gpu'] != -1 else no_permissions_str
3161
3166
  node_table.add_row([
3162
3167
  node_name, node_info.gpu_type,
3163
- node_info.total['nvidia.com/gpu'],
3164
- node_info.free['nvidia.com/gpu']
3168
+ node_info.total['nvidia.com/gpu'], available
3165
3169
  ])
3166
3170
  return node_table
3167
3171
 
@@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Set, Tuple
10
10
  from sky import check as sky_check
11
11
  from sky import sky_logging
12
12
  from sky.adaptors import common as adaptors_common
13
+ from sky.adaptors import kubernetes
13
14
  from sky.clouds import Kubernetes
14
15
  from sky.clouds.service_catalog import CloudFilter
15
16
  from sky.clouds.service_catalog import common
@@ -22,6 +23,8 @@ if typing.TYPE_CHECKING:
22
23
  else:
23
24
  pd = adaptors_common.LazyImport('pandas')
24
25
 
26
+ logger = sky_logging.init_logger(__name__)
27
+
25
28
  _PULL_FREQUENCY_HOURS = 7
26
29
 
27
30
  # We keep pull_frequency_hours so we can remotely update the default image paths
@@ -77,6 +80,11 @@ def list_accelerators_realtime(
77
80
  require_price: bool = True
78
81
  ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
79
82
  int]]:
83
+ """List accelerators in the Kubernetes cluster.
84
+
85
+ If the user does not have sufficient permissions to list pods in all
86
+ namespaces, the function will return free GPUs as -1.
87
+ """
80
88
  # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
81
89
  # function from kubernetes_utils.
82
90
  del all_regions, require_price # Unused.
@@ -108,7 +116,17 @@ def list_accelerators_realtime(
108
116
  key = label_formatter.get_label_key()
109
117
  nodes = kubernetes_utils.get_kubernetes_nodes(context)
110
118
  # Get the pods to get the real-time GPU usage
111
- pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
119
+ try:
120
+ pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
121
+ except kubernetes.api_exception() as e:
122
+ if e.status == 403:
123
+ logger.warning('Failed to get pods in the Kubernetes cluster '
124
+ '(forbidden). Please check if your account has '
125
+ 'necessary permissions to list pods. Realtime GPU '
126
+ 'availability information may be incorrect.')
127
+ pods = None
128
+ else:
129
+ raise
112
130
  # Total number of GPUs in the cluster
113
131
  total_accelerators_capacity: Dict[str, int] = {}
114
132
  # Total number of GPUs currently available in the cluster
@@ -141,6 +159,21 @@ def list_accelerators_realtime(
141
159
  if accelerator_count not in accelerators_qtys:
142
160
  accelerators_qtys.add((accelerator_name, accelerator_count))
143
161
 
162
+ if accelerator_count >= min_quantity_filter:
163
+ quantized_count = (min_quantity_filter *
164
+ (accelerator_count // min_quantity_filter))
165
+ if accelerator_name not in total_accelerators_capacity:
166
+ total_accelerators_capacity[
167
+ accelerator_name] = quantized_count
168
+ else:
169
+ total_accelerators_capacity[
170
+ accelerator_name] += quantized_count
171
+
172
+ if pods is None:
173
+ # If we can't get the pods, we can't get the GPU usage
174
+ total_accelerators_available[accelerator_name] = -1
175
+ continue
176
+
144
177
  for pod in pods:
145
178
  # Get all the pods running on the node
146
179
  if (pod.spec.node_name == node.metadata.name and
@@ -155,16 +188,6 @@ def list_accelerators_realtime(
155
188
 
156
189
  accelerators_available = accelerator_count - allocated_qty
157
190
 
158
- if accelerator_count >= min_quantity_filter:
159
- quantized_count = (min_quantity_filter *
160
- (accelerator_count // min_quantity_filter))
161
- if accelerator_name not in total_accelerators_capacity:
162
- total_accelerators_capacity[
163
- accelerator_name] = quantized_count
164
- else:
165
- total_accelerators_capacity[
166
- accelerator_name] += quantized_count
167
-
168
191
  if accelerator_name not in total_accelerators_available:
169
192
  total_accelerators_available[accelerator_name] = 0
170
193
  if accelerators_available >= min_quantity_filter:
@@ -1801,13 +1801,22 @@ def get_kubernetes_node_info(
1801
1801
  number of GPUs available on the node and the number of free GPUs on the
1802
1802
  node.
1803
1803
 
1804
+ If the user does not have sufficient permissions to list pods in all
1805
+ namespaces, the function will return free GPUs as -1.
1806
+
1804
1807
  Returns:
1805
1808
  Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
1806
1809
  key and the KubernetesNodeInfo object as value
1807
1810
  """
1808
1811
  nodes = get_kubernetes_nodes(context)
1809
1812
  # Get the pods to get the real-time resource usage
1810
- pods = get_all_pods_in_kubernetes_cluster(context)
1813
+ try:
1814
+ pods = get_all_pods_in_kubernetes_cluster(context)
1815
+ except kubernetes.api_exception() as e:
1816
+ if e.status == 403:
1817
+ pods = None
1818
+ else:
1819
+ raise
1811
1820
 
1812
1821
  label_formatter, _ = detect_gpu_label_formatter(context)
1813
1822
  if not label_formatter:
@@ -1828,19 +1837,22 @@ def get_kubernetes_node_info(
1828
1837
  accelerator_count = int(node.status.allocatable.get(
1829
1838
  'nvidia.com/gpu', 0))
1830
1839
 
1831
- for pod in pods:
1832
- # Get all the pods running on the node
1833
- if (pod.spec.node_name == node.metadata.name and
1834
- pod.status.phase in ['Running', 'Pending']):
1835
- # Iterate over all the containers in the pod and sum the
1836
- # GPU requests
1837
- for container in pod.spec.containers:
1838
- if container.resources.requests:
1839
- allocated_qty += int(
1840
- container.resources.requests.get(
1841
- 'nvidia.com/gpu', 0))
1842
-
1843
- accelerators_available = accelerator_count - allocated_qty
1840
+ if pods is None:
1841
+ accelerators_available = -1
1842
+
1843
+ else:
1844
+ for pod in pods:
1845
+ # Get all the pods running on the node
1846
+ if (pod.spec.node_name == node.metadata.name and
1847
+ pod.status.phase in ['Running', 'Pending']):
1848
+ # Iterate over all the containers in the pod and sum the
1849
+ # GPU requests
1850
+ for container in pod.spec.containers:
1851
+ if container.resources.requests:
1852
+ allocated_qty += int(
1853
+ container.resources.requests.get(
1854
+ 'nvidia.com/gpu', 0))
1855
+ accelerators_available = accelerator_count - allocated_qty
1844
1856
 
1845
1857
  node_info_dict[node.metadata.name] = KubernetesNodeInfo(
1846
1858
  name=node.metadata.name,
@@ -112,6 +112,9 @@ rules:
112
112
  - apiGroups: ["networking.k8s.io"] # Required for exposing services through ingresses
113
113
  resources: ["ingressclasses"]
114
114
  verbs: ["get", "list", "watch"]
115
+ - apiGroups: [""] # Required for sky show-gpus command
116
+ resources: ["pods"]
117
+ verbs: ["get", "list"]
115
118
  ---
116
119
  # ClusterRoleBinding for the service account
117
120
  apiVersion: rbac.authorization.k8s.io/v1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241111
3
+ Version: 1.0.0.dev20241112
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -309,7 +309,7 @@ Runnable examples:
309
309
  - [LocalGPT](./llm/localgpt)
310
310
  - [Falcon](./llm/falcon)
311
311
  - Add yours here & see more in [`llm/`](./llm)!
312
- - Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
312
+ - Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
313
313
 
314
314
  Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/)
315
315
 
@@ -1,8 +1,8 @@
1
- sky/__init__.py,sha256=JxZi3opPkeceUxnwl2tlNNr19fC_0QQ_mQ9N6cSQb-Q,5882
1
+ sky/__init__.py,sha256=8VuuTyDTVZB1BaeWD7OwBFZwpwweQkb0DNyPpsitRQs,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
5
- sky/cli.py,sha256=jEjXs5Z0u263eJIsTHoKyG9oOY6giqw19s2di9kEv1s,212088
5
+ sky/cli.py,sha256=oGBQrCYVWqRTcWR-yCKZY7dmUOUnP5Xuvz_zcFXzqlw,212342
6
6
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
7
  sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
8
8
  sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
@@ -65,7 +65,7 @@ sky/clouds/service_catalog/cudo_catalog.py,sha256=V_takvL6dWTGQaTLCEvjKIotCDPnMu
65
65
  sky/clouds/service_catalog/fluidstack_catalog.py,sha256=21-cvrYEYTIi7n3ZNF2e7_0QX-PF4BkhlVJUWQOvKrY,5059
66
66
  sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
67
67
  sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
68
- sky/clouds/service_catalog/kubernetes_catalog.py,sha256=5ilQ-JK1ZS2EZp8GpCKok0H3S1fdI_aAznzIDWCY1NY,9110
68
+ sky/clouds/service_catalog/kubernetes_catalog.py,sha256=c6Oot8RC1ujcFmfJbkeJKUWsw3aX0iNvKL1fJg-FoOc,10020
69
69
  sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
70
70
  sky/clouds/service_catalog/oci_catalog.py,sha256=cyA6ZqwHGOKuPxUl_dKmFGdeWdQGMrvl_-o2MtyF998,8580
71
71
  sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
@@ -140,7 +140,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
140
140
  sky/provision/kubernetes/instance.py,sha256=MFtTh-dNIuTZcHD20PQG_QuULFRFaPxwlUczR6sRnsk,43601
141
141
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
142
142
  sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
143
- sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
143
+ sky/provision/kubernetes/utils.py,sha256=PEDyZnf-dSmQ4dXyS_0x9OYHt9SbY7A6urd436f-WyQ,89923
144
144
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
145
145
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
146
146
  sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -269,15 +269,15 @@ sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnv
269
269
  sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
270
270
  sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=vGj0mD0tejHDRy8ulwKOvOF2mfLyT5J8fp7GVqEe_EY,8478
271
271
  sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
272
- sky/utils/kubernetes/generate_kubeconfig.sh,sha256=AcYhuuG5jXWGHUmyRuH-oKy5qcn92gXhu6bXOt6eD6g,9274
272
+ sky/utils/kubernetes/generate_kubeconfig.sh,sha256=livvxDKV-_xx8-dYWNyo4wlg3sOldeHefI37JXKLXu0,9398
273
273
  sky/utils/kubernetes/gpu_labeler.py,sha256=MEUv0U4ACDcNwtFVltlv017XJMjxx1Bndf6fL0i6eqg,6960
274
274
  sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7ZWF5gdVIZPupCCo9A,1224
275
275
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
276
276
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
277
277
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
278
- skypilot_nightly-1.0.0.dev20241111.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
279
- skypilot_nightly-1.0.0.dev20241111.dist-info/METADATA,sha256=ILiS9hM4X6WG3syvXek7BxYF7SvXnZ9o8h5bmcFL2sI,19708
280
- skypilot_nightly-1.0.0.dev20241111.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
281
- skypilot_nightly-1.0.0.dev20241111.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
282
- skypilot_nightly-1.0.0.dev20241111.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
283
- skypilot_nightly-1.0.0.dev20241111.dist-info/RECORD,,
278
+ skypilot_nightly-1.0.0.dev20241112.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
279
+ skypilot_nightly-1.0.0.dev20241112.dist-info/METADATA,sha256=Ui6L9CmuvZsIg2D0paU-NiqfVLtywzq5GLpCrJes-eY,19699
280
+ skypilot_nightly-1.0.0.dev20241112.dist-info/WHEEL,sha256=a7TGlA-5DaHMRrarXjVbQagU3Man_dCnGIWMJr5kRWo,91
281
+ skypilot_nightly-1.0.0.dev20241112.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
282
+ skypilot_nightly-1.0.0.dev20241112.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
283
+ skypilot_nightly-1.0.0.dev20241112.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.0)
2
+ Generator: setuptools (75.4.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5