skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl
This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +91 -38
- sky/client/cli.py +91 -38
- sky/client/sdk.py +3 -2
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +8 -2
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +42 -19
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +18 -10
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/optimizer.py +35 -11
- sky/provision/common.py +2 -5
- sky/provision/docker_utils.py +22 -16
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +276 -93
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +36 -24
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/skylet/constants.py +7 -0
- sky/skypilot_config.py +27 -6
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +145 -15
- sky/templates/nebius-ray.yml.j2 +63 -0
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '7b804dafe2f6b775f8a357ac6e147b83e792af93'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250427'
+__version__ = '1.0.0.dev20250429'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
sky/adaptors/nebius.py
CHANGED
@@ -29,11 +29,6 @@ MAX_RETRIES_TO_INSTANCE_WAIT = 120  # Maximum number of retries

 POLL_INTERVAL = 5

-_iam_token = None
-_sdk = None
-_tenant_id = None
-_project_id = None
-
 _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Nebius AI Cloud.'
                          'Try pip install "skypilot[nebius]"')

@@ -81,56 +76,49 @@ def vpc():
     return vpc_v1


+@annotations.lru_cache(scope='request')
 def get_iam_token():
-    ...
-    except FileNotFoundError:
-        return None
-    return _iam_token
+    try:
+        with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
+                  encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None


+@annotations.lru_cache(scope='request')
 def is_token_or_cred_file_exist():
     return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
             os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))


+@annotations.lru_cache(scope='request')
 def get_project_id():
-    ...
-    except FileNotFoundError:
-        return None
-    return _project_id
+    try:
+        with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
+                  encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None


+@annotations.lru_cache(scope='request')
 def get_tenant_id():
-    ...
-    except FileNotFoundError:
-        return None
-    return _tenant_id
+    try:
+        with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
+                  encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None


+@annotations.lru_cache(scope='request')
 def sdk():
-    ...
-    if ...
-    _sdk = nebius.sdk.SDK(
-        credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
-    return _sdk
+    token = get_iam_token()
+    if token is not None:
+        return nebius.sdk.SDK(credentials=token)
+    return nebius.sdk.SDK(
+        credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))


 def get_nebius_credentials(boto3_session):
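Note: this refactor replaces the module-level _iam_token/_sdk/_tenant_id/_project_id caches with SkyPilot's annotations.lru_cache(scope='request') decorator, so credentials are re-read per API request instead of being pinned for the process lifetime. A minimal sketch of the same memoization pattern using only the standard library (the path constant below is hypothetical, and scope='request' is a SkyPilot-internal extension with no stdlib equivalent; functools.lru_cache caches for the whole process instead):

    import functools
    import os

    _TOKEN_PATH = '~/.nebius/iam-token'  # hypothetical path, for illustration only

    @functools.lru_cache(maxsize=1)
    def get_iam_token():
        # Memoized read: replaces a hand-rolled module-global cache.
        try:
            with open(os.path.expanduser(_TOKEN_PATH), encoding='utf-8') as f:
                return f.read().strip()
        except FileNotFoundError:
            return None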
sky/backends/backend_utils.py
CHANGED
@@ -179,6 +179,9 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
     ('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
     ('available_node_types', 'ray.head.default', 'node_config',
      'azure_arm_parameters', 'cloudInitSetupCommands'),
+    ('available_node_types', 'ray_head_default', 'node_config', 'pvc_spec'),
+    ('available_node_types', 'ray_head_default', 'node_config',
+     'deployment_spec'),
 ]
 # These keys are expected to change when provisioning on an existing cluster,
 # but they don't actually represent a change that requires re-provisioning the
@@ -705,6 +708,13 @@ def write_cluster_config(
     is_custom_docker = ('true' if to_provision.extract_docker_image()
                         is not None else 'false')

+    # Here, if users specify the controller to be high availability, we will
+    # provision a high availability controller. Whether the cloud supports
+    # this feature has been checked by
+    # CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS
+    high_availability_specified = controller_utils.high_availability_specified(
+        cluster_name_on_cloud)
+
     # Use a tmp file path to avoid incomplete YAML file being re-used in the
     # future.
     tmp_yaml_path = yaml_path + '.tmp'
@@ -790,6 +800,9 @@ def write_cluster_config(
             'sky_wheel_hash': wheel_hash,
             # Authentication (optional).
             **auth_config,
+
+            # High availability
+            'high_availability': high_availability_specified,
         }),
         output_path=tmp_yaml_path)
     config_dict['cluster_name'] = cluster_name
@@ -802,8 +815,12 @@ def write_cluster_config(
             cluster_config_overrides=to_provision.cluster_config_overrides)
         kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
         yaml_obj = common_utils.read_yaml(tmp_yaml_path)
-        pod_config = yaml_obj['available_node_types'][
-            'node_config']
+        pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
+            'ray_head_default']['node_config']
+
+        # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
+        pod_config.pop('deployment_spec', None)
+        pod_config.pop('pvc_spec', None)
         valid, message = kubernetes_utils.check_pod_config(pod_config)
         if not valid:
             raise exceptions.InvalidCloudConfigs(
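Note: 'pvc_spec' and 'deployment_spec' are siblings of the Kubernetes pod spec rather than part of it, so they are stripped before pod validation. A minimal sketch of that validate-the-pod-spec-only flow, with a stand-in validator (the real check_pod_config lives in kubernetes_utils; the dict contents below are illustrative only):

    from typing import Any, Dict, Tuple

    def check_pod_config(pod_config: Dict[str, Any]) -> Tuple[bool, str]:
        # Stand-in: reject unknown top-level keys, as a real schema check might.
        extra = set(pod_config) - {'metadata', 'spec'}
        return (not extra, f'unexpected keys: {sorted(extra)}' if extra else '')

    node_config = {
        'metadata': {'labels': {'parent': 'skypilot'}},
        'spec': {'containers': []},
        # High-availability controllers also carry these sibling specs.
        'pvc_spec': {},
        'deployment_spec': {},
    }

    # Validate the pod spec only; the PVC/Deployment specs are handled separately.
    pod_config = dict(node_config)
    pod_config.pop('deployment_spec', None)
    pod_config.pop('pvc_spec', None)
    valid, message = check_pod_config(pod_config)
    assert valid, message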
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1583,6 +1583,10 @@ class RetryingVmProvisioner(object):
         except provision_common.StopFailoverError:
             with ux_utils.print_exception_no_traceback():
                 raise
+        except exceptions.InconsistentHighAvailabilityError:
+            # No teardown happens for this error.
+            with ux_utils.print_exception_no_traceback():
+                raise
         except Exception as e:  # pylint: disable=broad-except
             # NOTE: We try to cleanup the cluster even if the previous
             # cluster does not exist. Also we are fast at
@@ -2032,6 +2036,7 @@ class RetryingVmProvisioner(object):
             # Recheck cluster name as the 'except:' block below may
             # change the cloud assignment.
             common_utils.check_cluster_name_is_valid(cluster_name)
+
             if dryrun:
                 cloud_user = None
             else:
@@ -2459,6 +2464,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 'Tried to use cached cluster info, but it\'s missing for '
                 f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
+
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
             self.cached_cluster_info.provider_name, self.cached_cluster_info,
@@ -2689,6 +2695,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._optimize_target) or common.OptimizeTarget.COST
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
+        self._dump_final_script = kwargs.pop('dump_final_script', False)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'

     def check_resources_fit_cluster(
@@ -3272,18 +3279,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 env_vars=setup_envs)
             encoded_script = shlex.quote(setup_script)

-        def ...
+        def _dump_final_script(
+                setup_script: str,
+                target_dir: str = remote_setup_file_name) -> None:
             with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
                 f.write(setup_script)
                 f.flush()
                 setup_sh_path = f.name
                 runner.rsync(source=setup_sh_path,
-                             target=...
+                             target=target_dir,
                              up=True,
                              stream_logs=False)

+        # Always dump the full setup script to the persistent path first
+        # In high availability mode, we need to dump the full setup script
+        # to a persistent path BEFORE any other operations. This ensures
+        # that if the pod restarts, it can find and execute the complete
+        # setup script, rather than a reference to a temporary file that
+        # would no longer exist after restart.
+        if self._dump_final_script:
+            _dump_final_script(setup_script,
+                               constants.PERSISTENT_SETUP_SCRIPT_PATH)
+
         if detach_setup or _is_command_length_over_limit(encoded_script):
-            ...
+            _dump_final_script(setup_script)
             create_script_code = 'true'
         else:
             create_script_code = (f'{{ echo {encoded_script} > '
@@ -3335,7 +3354,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'Failed to run setup command inline due to '
                     'command length limit. Dumping setup script to '
                     'file and running it with SSH.')
-                ...
+                _dump_final_script(setup_script)
                 returncode = _run_setup(setup_cmd)

         def error_message() -> str:
@@ -3426,14 +3445,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

-        def _dump_code_to_file(codegen: str ...
+        def _dump_code_to_file(codegen: str,
+                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
             runners = handle.get_command_runners()
             head_runner = runners[0]
             with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
                 fp.write(codegen)
                 fp.flush()
-                script_path = os.path.join( ...
-                                           f'sky_job_{job_id}')
+                script_path = os.path.join(target_dir, f'sky_job_{job_id}')
                 # We choose to sync code + exec, because the alternative of 'ray
                 # submit' may not work as it may use system python (python2) to
                 # execute the script. Happens for AWS.
@@ -3442,6 +3461,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                              up=True,
                              stream_logs=False)

+        # Should also be ealier than _is_command_length_over_limit
+        # Same reason as in _setup
+        if self._dump_final_script:
+            _dump_code_to_file(job_submit_cmd,
+                               constants.PERSISTENT_RUN_SCRIPT_DIR)
+
         if _is_command_length_over_limit(job_submit_cmd):
             _dump_code_to_file(codegen)
             job_submit_cmd = f'{mkdir_code} && {code}'
@@ -3457,7 +3482,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # We cannot set the managed job to PENDING state in the job template
         # (jobs-controller.yaml.j2), as it may need to wait for the run
         # commands to be scheduled on the job controller in high-load cases.
-        job_submit_cmd ...
+        job_submit_cmd += ' && ' + managed_job_code

         returncode, stdout, stderr = self.run_on_head(handle,
                                                       job_submit_cmd,
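Note: in both the setup and execute paths above, the persistent dump deliberately happens before the command-length check. The inline path only leaves the script in a temporary file that dies with the pod, while the persistent copy lets a restarted high-availability controller replay the full script. A minimal sketch of that ordering (PERSISTENT_SETUP_SCRIPT_PATH and PERSISTENT_RUN_SCRIPT_DIR are SkyPilot constants; the paths and limit below are placeholders, not SkyPilot's values):

    import os

    _CMD_LENGTH_LIMIT = 100_000  # placeholder for the real length check
    PERSISTENT_PATH = os.path.expanduser('~/.sky/ha_recovery.sh')  # placeholder
    SCRATCH_PATH = '/tmp/sky_setup.sh'  # placeholder

    def _write(path: str, content: str) -> None:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(content)

    def prepare_command(script: str, dump_final_script: bool) -> str:
        """Return the command to run remotely, persisting the script first."""
        if dump_final_script:
            # Persist BEFORE the length check: a restarted HA controller pod
            # can replay this file even after temporaries are gone.
            _write(PERSISTENT_PATH, script)
        if len(script) > _CMD_LENGTH_LIMIT:
            # Too long to inline: dump to a scratch file and execute that.
            _write(SCRATCH_PATH, script)
            return f'bash {SCRATCH_PATH}'
        return script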
sky/backends/local_docker_backend.py
CHANGED
@@ -276,7 +276,6 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                      detach_run: bool,
                      dryrun: bool = False) -> None:
         """ Launches the container."""
-
         if detach_run:
             raise NotImplementedError('detach_run=True is not supported in '
                                       'LocalDockerBackend.')
@@ -364,7 +363,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             if k.startswith(_DOCKER_LABEL_PREFIX):
                 # Remove 'skymeta_' from key
                 metadata[k[len(_DOCKER_LABEL_PREFIX):]] = v
-        self.images[c.name] = ...
+        self.images[c.name] = (c.image, metadata)
         self.containers[c.name] = c

     def _execute_task_one_node(self, handle: LocalDockerResourceHandle,
sky/cli.py
CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
 listed in "sky --help". Take care to put logically connected commands close to
 each other.
 """
+import collections
 import copy
 import datetime
 import functools
@@ -162,7 +163,7 @@ def _get_cluster_records_and_set_ssh_config(
                 '-o StrictHostKeyChecking=no '
                 '-o UserKnownHostsFile=/dev/null '
                 '-o IdentitiesOnly=yes '
-                '-W %h:%p '
+                '-W \'[%h]:%p\' '
                 f'{handle.ssh_user}@127.0.0.1 '
                 '-o ProxyCommand='
                 # TODO(zhwu): write the template to a temp file, don't use
@@ -3413,7 +3414,7 @@ def show_gpus(

     # TODO(zhwu,romilb): We should move most of these kubernetes related
     # queries into the backend, especially behind the server.
-    def ...
+    def _get_kubernetes_realtime_gpu_tables(
             context: Optional[str] = None,
             name_filter: Optional[str] = None,
             quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ def show_gpus(
         else:
             qty_header = 'REQUESTABLE_QTY_PER_NODE'
             free_header = 'TOTAL_FREE_GPUS'
-
-        realtime_gpu_availability_list = sdk.stream_and_get(
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
             sdk.realtime_kubernetes_gpu_availability(
                 context=context,
                 name_filter=name_filter,
                 quantity_filter=quantity_filter))
-        if not realtime_gpu_availability_list:
-            err_msg = 'No GPUs found in Kubernetes cluster. '
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
             debug_msg = 'To further debug, run: sky check '
             if name_filter is not None:
                 gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ def show_gpus(
                 gpu_info_msg += (' with requested quantity'
                                  f' {quantity_filter}')
             err_msg = (f'Resources{gpu_info_msg} not found '
-                       'in Kubernetes cluster. ')
+                       'in any allowed Kubernetes cluster. ')
             debug_msg = ('To show available accelerators on kubernetes,'
                          ' run: sky show-gpus --cloud kubernetes ')
             full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
-        ...
-                available_qty ...
-        ...
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (ctx, availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'TOTAL_GPUS', free_header])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                available_qty = (gpu_availability.available
+                                 if gpu_availability.available != -1 else
+                                 no_permissions_str)
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(gpu_availability.counts),
+                    gpu_availability.capacity,
+                    available_qty,
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                # we want total, so skip permission denied.
+                available = max(gpu_availability.available, 0)
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((ctx, realtime_gpu_table))
+
+        # display an aggregated table for all contexts
+        # if there are more than one contexts with GPUs
+        if len(realtime_gpu_infos) > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'TOTAL_GPUS', free_header])
+            for gpu, stats in total_gpu_info.items():
+                total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table

     def _format_kubernetes_node_info(context: Optional[str]):
         node_table = log_utils.create_table(
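Note: the new helper makes one pass over the per-context availability lists: each context gets its own table, while a defaultdict keyed by GPU name accumulates [capacity, available] pairs for the cross-context summary. Permission-denied contexts report available == -1, which is clamped to 0 so they still count toward total capacity. A self-contained sketch of that accumulation (the rows below are made-up sample data):

    import collections
    from typing import Dict, List

    # (context, gpu, capacity, available) rows; -1 means no permission.
    rows = [
        ('ctx-a', 'H100', 8, 4),
        ('ctx-b', 'H100', 8, -1),
        ('ctx-b', 'A100', 4, 2),
    ]

    total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
        lambda: [0, 0])
    for _ctx, gpu, capacity, available in rows:
        if capacity > 0:
            total_gpu_info[gpu][0] += capacity
            total_gpu_info[gpu][1] += max(available, 0)  # clamp -1 to 0

    for gpu, (total, free) in total_gpu_info.items():
        print(f'{gpu}: {total} total, {free} free')
    # H100: 16 total, 4 free
    # A100: 4 total, 2 free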
@@ -3479,7 +3505,7 @@ def show_gpus(
                     'Kubernetes per node accelerator availability ')
         if nodes_info.hint:
             k8s_per_node_acc_message += nodes_info.hint
-        return (f'{colorama.Fore. ...
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
                 f'{k8s_per_node_acc_message}'
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ def show_gpus(
             # If --cloud kubernetes is not specified, we want to catch
             # the case where no GPUs are available on the cluster and
             # print the warning at the end.
-            ...
-                context)
+            k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
         except ValueError as e:
             if not cloud_is_kubernetes:
                 # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ def show_gpus(
                 k8s_messages += str(e)
             else:
                 print_section_titles = True
-                ...
+
+                # print total table
+                if total_table is not None:
+                    yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                           'Total Kubernetes GPUs'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from total_table.get_string()
+                    yield '\n-----\n\n'
+
+                # print individual infos.
+                for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                    context_str = f'(Context: {ctx})' if ctx else ''
+                    yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Kubernetes GPUs {context_str}'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from k8s_realtime_table.get_string()
+                    yield '\n\n'
+                    yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
             if kubernetes_autoscaling:
                 k8s_messages += (
                     '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ def show_gpus(
         # Print section title if not showing all and instead a specific
         # accelerator is requested
         print_section_titles = True
-        yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-               f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
         # TODO(romilb): Show filtered per node GPU availability here as well
         try:
-            ...
+            k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(  # pylint: disable=line-too-long
+                context=region,
+                name_filter=name,
+                quantity_filter=quantity)
+
+            # print total table
+            if total_table is not None:
+                yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                       'Total Kubernetes GPUs'
+                       f'{colorama.Style.RESET_ALL}\n')
+                yield from total_table.get_string()
+                yield '\n-----\n\n'
+
+            # print individual tables
+            for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                context_str = f'(Context: {ctx})' if ctx else ''
+                yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                       f'Kubernetes GPUs {context_str}'
+                       f'{colorama.Style.RESET_ALL}\n')
+                yield from k8s_realtime_table.get_string()
+                yield '\n\n'
         except ValueError as e:
             # In the case of a specific accelerator, show the error message
             # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
     user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
     user_hash = common_utils.get_user_hash()
     dashboard_url = server_common.get_dashboard_url(url)
-    click.echo(f'Using SkyPilot API server: {url} ...
+    click.echo(f'Using SkyPilot API server: {url}\n'
               f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
               f'commit: {api_server_info["commit"]}, '
               f'version: {api_server_info["version"]}\n'
-              f'{ux_utils. ...
+              f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
+              f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')


 def main():
|