skypilot-nightly 1.0.0.dev20241025__py3-none-any.whl → 1.0.0.dev20241026__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +1 -1
- sky/clouds/azure.py +3 -3
- sky/clouds/oci.py +5 -1
- sky/clouds/service_catalog/azure_catalog.py +13 -3
- sky/jobs/controller.py +1 -1
- sky/provision/kubernetes/instance.py +66 -2
- {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/RECORD +13 -13
- {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '0e915d3430d8027aa40b766605bb13c889ffc62f'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241025'
|
38
|
+
__version__ = '1.0.0.dev20241026'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -3519,7 +3519,7 @@ def jobs():
|
|
3519
3519
|
default=None,
|
3520
3520
|
type=str,
|
3521
3521
|
hidden=True,
|
3522
|
-
help=('Alias for --name, the name of the
|
3522
|
+
help=('Alias for --name, the name of the managed job.'))
|
3523
3523
|
@click.option('--job-recovery',
|
3524
3524
|
default=None,
|
3525
3525
|
type=str,
|
sky/clouds/azure.py
CHANGED
@@ -39,9 +39,9 @@ _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB = 30
|
|
39
39
|
_DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB = 150
|
40
40
|
_DEFAULT_SKYPILOT_IMAGE_GB = 30
|
41
41
|
|
42
|
-
_DEFAULT_CPU_IMAGE_ID = 'skypilot:
|
43
|
-
_DEFAULT_GPU_IMAGE_ID = 'skypilot:gpu-ubuntu-
|
44
|
-
_DEFAULT_V1_IMAGE_ID = 'skypilot:
|
42
|
+
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-v2'
|
43
|
+
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
|
44
|
+
_DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
|
45
45
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
46
46
|
_FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
|
47
47
|
|
sky/clouds/oci.py
CHANGED
@@ -468,7 +468,11 @@ class OCI(clouds.Cloud):
|
|
468
468
|
api_key_file = oci_cfg[
|
469
469
|
'key_file'] if 'key_file' in oci_cfg else 'BadConf'
|
470
470
|
sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
|
471
|
-
|
471
|
+
# Must catch ImportError before any oci_adaptor.oci.exceptions
|
472
|
+
# because oci_adaptor.oci.exceptions can throw ImportError.
|
473
|
+
except ImportError:
|
474
|
+
return {}
|
475
|
+
except oci_adaptor.oci.exceptions.ConfigFileNotFound:
|
472
476
|
return {}
|
473
477
|
|
474
478
|
# OCI config and API key file are mandatory
|
sky/clouds/service_catalog/azure_catalog.py
CHANGED
@@ -7,11 +7,14 @@ import re
|
|
7
7
|
from typing import Dict, List, Optional, Tuple
|
8
8
|
|
9
9
|
from sky import clouds as cloud_lib
|
10
|
+
from sky import sky_logging
|
10
11
|
from sky.clouds import Azure
|
11
12
|
from sky.clouds.service_catalog import common
|
12
13
|
from sky.utils import resources_utils
|
13
14
|
from sky.utils import ux_utils
|
14
15
|
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
17
|
+
|
15
18
|
# This list should match the list of regions in
|
16
19
|
# skypilot image generation Packer script's replication_regions
|
17
20
|
# sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl
|
@@ -191,9 +194,16 @@ def list_accelerators(
|
|
191
194
|
|
192
195
|
def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
|
193
196
|
"""Returns the image id from the tag."""
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
+
global _image_df
|
198
|
+
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
|
199
|
+
if image_id is None:
|
200
|
+
# Refresh the image catalog and try again, if the image tag is not
|
201
|
+
# found.
|
202
|
+
logger.debug('Refreshing the image catalog and trying again.')
|
203
|
+
_image_df = common.read_catalog('azure/images.csv',
|
204
|
+
pull_frequency_hours=0)
|
205
|
+
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
|
206
|
+
return image_id
|
197
207
|
|
198
208
|
|
199
209
|
def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
|
sky/jobs/controller.py
CHANGED
@@ -215,7 +215,7 @@ class JobsController:
|
|
215
215
|
end_time=end_time,
|
216
216
|
callback_func=callback_func)
|
217
217
|
logger.info(
|
218
|
-
f'
|
218
|
+
f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
|
219
219
|
f'Cleaning up the cluster {cluster_name}.')
|
220
220
|
# Only clean up the cluster, not the storages, because tasks may
|
221
221
|
# share storages.
|
sky/provision/kubernetes/instance.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Kubernetes instance provisioning."""
|
2
2
|
import copy
|
3
|
+
import json
|
3
4
|
import time
|
4
5
|
from typing import Any, Dict, List, Optional
|
5
6
|
import uuid
|
@@ -425,6 +426,70 @@ def _label_pod(namespace: str, context: Optional[str], pod_name: str,
|
|
425
426
|
_request_timeout=kubernetes.API_TIMEOUT)
|
426
427
|
|
427
428
|
|
429
|
+
def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
|
430
|
+
context: Optional[str]) -> Any:
|
431
|
+
"""Attempts to create a Kubernetes Pod and handle any errors.
|
432
|
+
|
433
|
+
Currently, we handle errors due to the AppArmor annotation and retry if
|
434
|
+
it fails due to the `FieldValueForbidden` error.
|
435
|
+
See https://github.com/skypilot-org/skypilot/issues/4174 for details.
|
436
|
+
|
437
|
+
Returns: The created Pod object.
|
438
|
+
"""
|
439
|
+
try:
|
440
|
+
# Attempt to create the Pod with the AppArmor annotation
|
441
|
+
pod = kubernetes.core_api(context).create_namespaced_pod(
|
442
|
+
namespace, pod_spec)
|
443
|
+
return pod
|
444
|
+
except kubernetes.api_exception() as e:
|
445
|
+
try:
|
446
|
+
error_body = json.loads(e.body)
|
447
|
+
error_message = error_body.get('message', '')
|
448
|
+
except json.JSONDecodeError:
|
449
|
+
error_message = str(e.body)
|
450
|
+
# Check if the error is due to the AppArmor annotation and retry.
|
451
|
+
# We add an AppArmor annotation to set it as unconfined in our
|
452
|
+
# base template in kubernetes-ray.yml.j2. This is required for
|
453
|
+
# FUSE to work in the pod on most Kubernetes distributions.
|
454
|
+
# However, some distributions do not support the AppArmor annotation
|
455
|
+
# and will fail to create the pod. In this case, we retry without
|
456
|
+
# the annotation.
|
457
|
+
if (e.status == 422 and 'FieldValueForbidden' in error_message and
|
458
|
+
'AppArmorProfile: nil' in error_message):
|
459
|
+
logger.warning('AppArmor annotation caused pod creation to fail. '
|
460
|
+
'Retrying without the annotation. '
|
461
|
+
'Note: this may cause bucket mounting to fail.')
|
462
|
+
|
463
|
+
# Remove the AppArmor annotation
|
464
|
+
annotations = pod_spec.get('metadata', {}).get('annotations', {})
|
465
|
+
if ('container.apparmor.security.beta.kubernetes.io/ray-node'
|
466
|
+
in annotations):
|
467
|
+
del annotations[
|
468
|
+
'container.apparmor.security.beta.kubernetes.io/ray-node']
|
469
|
+
pod_spec['metadata']['annotations'] = annotations
|
470
|
+
logger.info('AppArmor annotation removed from Pod spec.')
|
471
|
+
else:
|
472
|
+
logger.warning('AppArmor annotation not found in pod spec, '
|
473
|
+
'retrying will not help. '
|
474
|
+
f'Current annotations: {annotations}')
|
475
|
+
raise e
|
476
|
+
|
477
|
+
# Retry Pod creation without the AppArmor annotation
|
478
|
+
try:
|
479
|
+
pod = kubernetes.core_api(context).create_namespaced_pod(
|
480
|
+
namespace, pod_spec)
|
481
|
+
logger.info(f'Pod {pod.metadata.name} created successfully '
|
482
|
+
'without AppArmor annotation.')
|
483
|
+
return pod
|
484
|
+
except kubernetes.api_exception() as retry_exception:
|
485
|
+
logger.info('Failed to create Pod without AppArmor annotation: '
|
486
|
+
f'{retry_exception}')
|
487
|
+
raise retry_exception
|
488
|
+
else:
|
489
|
+
# Re-raise the exception if it's a different error
|
490
|
+
raise e
|
491
|
+
|
492
|
+
|
428
493
|
def _create_pods(region: str, cluster_name_on_cloud: str,
|
429
494
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
430
495
|
"""Create pods based on the config."""
|
@@ -546,8 +611,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
546
611
|
}
|
547
612
|
}
|
548
613
|
|
549
|
-
pod = kubernetes.core_api(context).create_namespaced_pod(
|
550
|
-
namespace, pod_spec)
|
614
|
+
pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
|
551
615
|
created_pods[pod.metadata.name] = pod
|
552
616
|
if head_pod_name is None:
|
553
617
|
head_pod_name = pod.metadata.name
|
{skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=N9HA1yPUbTF3VIZz0NOVgz7dHAcWh0f_GR82a0uJYe8,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=VoPwWKGeNZZcFNLvw3VPR_F0WpKnM5EvfffNS8kcKc0,210360
|
6
6
|
sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
|
7
7
|
sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
|
8
8
|
sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
|
@@ -41,7 +41,7 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
|
|
41
41
|
sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
|
42
42
|
sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
|
43
43
|
sky/clouds/aws.py,sha256=XJVbOSkVVUHp9HbHDp0rFdHX113JHbY-3sgokGdNJVE,49527
|
44
|
-
sky/clouds/azure.py,sha256=
|
44
|
+
sky/clouds/azure.py,sha256=SOaJQ-E6ENJJviveNjY6OU4CmV5VMIqJEV5A1Rltqpg,30178
|
45
45
|
sky/clouds/cloud.py,sha256=BBu1G-gkmylffldL50cvJ2DkDJ8vjVPziOPUAsvgJ2o,34948
|
46
46
|
sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
|
47
47
|
sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
|
@@ -50,14 +50,14 @@ sky/clouds/gcp.py,sha256=m_dH04HqgU-DdW4R9wrSr66IpPt9JMKHEvHEGGFpeRo,54655
|
|
50
50
|
sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
|
51
51
|
sky/clouds/kubernetes.py,sha256=j3imm_sbtyyZXvJ6qbqZmXok2C9OQIcGpyulljbTSJ4,28696
|
52
52
|
sky/clouds/lambda_cloud.py,sha256=11dKUSunHUgaPZ1t8O85X29_NJ-o26sCt5DjwAPFgl4,12697
|
53
|
-
sky/clouds/oci.py,sha256=
|
53
|
+
sky/clouds/oci.py,sha256=gefhbQlFLW1K4pyPOp9fHnTawe_Ozy8m3nPUYlVszSc,27137
|
54
54
|
sky/clouds/paperspace.py,sha256=lmUZPYAblaqiBmGQwCunccMiTF_dVA1o3vqY9Q_Nc28,10921
|
55
55
|
sky/clouds/runpod.py,sha256=lstUC6f4JDhtcH9NfwkbpCJMmfmvMigoanhPXPbTYds,11540
|
56
56
|
sky/clouds/scp.py,sha256=2KLTuNSMdBzK8CLwSesv7efOuiLidIMoyNG4AOt5Sqw,15870
|
57
57
|
sky/clouds/vsphere.py,sha256=7eZFYIDtY5sX_ATr8h7kwwkY9t8Z-EYMJ9HCjoRBoxI,12309
|
58
58
|
sky/clouds/service_catalog/__init__.py,sha256=e0K-c64jQV9d6zly5OnIXMsYaZXs_Ko9osAbDaRlOOw,14743
|
59
59
|
sky/clouds/service_catalog/aws_catalog.py,sha256=1wX1-wOMw2LZ7RkV_Ah7c42RLRYm-m5_GAXzn32M5a8,13038
|
60
|
-
sky/clouds/service_catalog/azure_catalog.py,sha256=
|
60
|
+
sky/clouds/service_catalog/azure_catalog.py,sha256=JpULm-1WYkpkFwePjtYsxYxo7h_DFkMMsdrQxpgiKH0,8127
|
61
61
|
sky/clouds/service_catalog/common.py,sha256=PA3llB0zZh4v0DO_gDDCKGhRIBx16CAp2WJZNxhjNOA,27266
|
62
62
|
sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZuLPvEVYA,1793
|
63
63
|
sky/clouds/service_catalog/constants.py,sha256=ai2yOlsVqBnEpbxaEHXt61COsHBLwOfw6GZXntEPj7k,411
|
@@ -95,7 +95,7 @@ sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
|
|
95
95
|
sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
|
96
96
|
sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
|
97
97
|
sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
|
98
|
-
sky/jobs/controller.py,sha256=
|
98
|
+
sky/jobs/controller.py,sha256=JcgHsghFGweTlvj4_-tdHSpeT015EQTrqWrXkjGsJBA,26704
|
99
99
|
sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
|
100
100
|
sky/jobs/recovery_strategy.py,sha256=UOEaVGSpRbCnCzlD8cgyjhCPIBIeBeCXCutoSic5aiA,25545
|
101
101
|
sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
|
@@ -137,7 +137,7 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
|
|
137
137
|
sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
|
138
138
|
sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
|
139
139
|
sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
|
140
|
-
sky/provision/kubernetes/instance.py,sha256=
|
140
|
+
sky/provision/kubernetes/instance.py,sha256=1dN2vdh-ZdeIe39ZxH5DAnnc8kXHWpzD6q-f14-8cDE,41576
|
141
141
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
142
142
|
sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
|
143
143
|
sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
|
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
274
274
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
275
275
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
276
276
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
277
|
-
skypilot_nightly-1.0.0.dev20241025.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.dev20241025.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
280
|
-
skypilot_nightly-1.0.0.dev20241025.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
-
skypilot_nightly-1.0.0.dev20241025.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
-
skypilot_nightly-1.0.0.dev20241025.dist-info/RECORD,,
|
277
|
+
skypilot_nightly-1.0.0.dev20241026.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
+
skypilot_nightly-1.0.0.dev20241026.dist-info/METADATA,sha256=Gt6EEjuIDVBchJXkKxLBtgvYeJab6tYp-FhMeIPw9hc,19540
|
279
|
+
skypilot_nightly-1.0.0.dev20241026.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
280
|
+
skypilot_nightly-1.0.0.dev20241026.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
+
skypilot_nightly-1.0.0.dev20241026.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
+
skypilot_nightly-1.0.0.dev20241026.dist-info/RECORD,,
|
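{skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/LICENSE
RENAMED
File without changes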
|
{skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/WHEEL
RENAMED
File without changes
|
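{skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/entry_points.txt
RENAMED
File without changes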
|
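{skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241026.dist-info}/top_level.txt
RENAMED
File without changes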
|