skypilot-nightly 1.0.0.dev20241222__py3-none-any.whl → 1.0.0.dev20241224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +0 -4
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/benchmark/benchmark_utils.py +1 -1
- sky/check.py +1 -1
- sky/cli.py +28 -30
- sky/cloud_stores.py +1 -1
- sky/clouds/gcp.py +1 -1
- sky/clouds/kubernetes.py +1 -1
- sky/clouds/service_catalog/common.py +11 -10
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/utils/scp_utils.py +3 -3
- sky/core.py +3 -3
- sky/data/data_utils.py +25 -32
- sky/data/storage.py +58 -15
- sky/jobs/core.py +2 -2
- sky/jobs/state.py +2 -2
- sky/jobs/utils.py +6 -6
- sky/optimizer.py +3 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/gcp/config.py +3 -3
- sky/provision/kubernetes/config.py +7 -7
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +2 -2
- sky/provision/lambda_cloud/lambda_utils.py +3 -3
- sky/provision/oci/query_utils.py +3 -3
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +6 -7
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +4 -4
- sky/serve/autoscalers.py +2 -2
- sky/serve/core.py +4 -4
- sky/serve/replica_managers.py +1 -1
- sky/serve/serve_state.py +1 -1
- sky/serve/serve_utils.py +11 -10
- sky/serve/service_spec.py +8 -5
- sky/sky_logging.py +17 -1
- sky/skylet/job_lib.py +1 -1
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +1 -1
- sky/skylet/providers/scp/node_provider.py +7 -7
- sky/task.py +1 -1
- sky/utils/accelerator_registry.py +1 -1
- sky/utils/common_utils.py +1 -1
- sky/utils/dag_utils.py +1 -1
- sky/utils/kubernetes/gpu_labeler.py +1 -1
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- {skypilot_nightly-1.0.0.dev20241222.dist-info → skypilot_nightly-1.0.0.dev20241224.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241222.dist-info → skypilot_nightly-1.0.0.dev20241224.dist-info}/RECORD +54 -54
- {skypilot_nightly-1.0.0.dev20241222.dist-info → skypilot_nightly-1.0.0.dev20241224.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241222.dist-info → skypilot_nightly-1.0.0.dev20241224.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241222.dist-info → skypilot_nightly-1.0.0.dev20241224.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241222.dist-info → skypilot_nightly-1.0.0.dev20241224.dist-info}/top_level.txt +0 -0
sky/data/storage.py
CHANGED
@@ -72,6 +72,8 @@ _BUCKET_EXTERNALLY_DELETED_DEBUG_MESSAGE = (
     'Bucket {bucket_name!r} does not exist. '
     'It may have been deleted externally.')
 
+_STORAGE_LOG_FILE_NAME = 'storage_sync.log'
+
 
 def get_cached_enabled_storage_clouds_or_refresh(
         raise_if_no_cloud_access: bool = False) -> List[str]:
@@ -1067,7 +1069,7 @@ class Storage(object):
         add_if_not_none('source', self.source)
 
         stores = None
-        if
+        if self.stores:
             stores = ','.join([store.value for store in self.stores])
         add_if_not_none('store', stores)
         add_if_not_none('persistent', self.persistent)
@@ -1331,17 +1333,24 @@ class S3Store(AbstractStore):
         else:
             source_message = source_path_list[0]
 
+        log_path = sky_logging.generate_tmp_logging_file_path(
+            _STORAGE_LOG_FILE_NAME)
+        sync_path = f'{source_message} -> s3://{self.name}/'
         with rich_utils.safe_status(
-                ux_utils.spinner_message(f'Syncing {
-
+                ux_utils.spinner_message(f'Syncing {sync_path}',
+                                         log_path=log_path)):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
                 get_dir_sync_command,
+                log_path,
                 self.name,
                 self._ACCESS_DENIED_MESSAGE,
                 create_dirs=create_dirs,
                 max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS)
+        logger.info(
+            ux_utils.finishing_message(f'Storage synced: {sync_path}',
+                                       log_path))
 
     def _transfer_to_s3(self) -> None:
         assert isinstance(self.source, str), self.source
@@ -1741,13 +1750,19 @@ class GcsStore(AbstractStore):
         gsutil_alias, alias_gen = data_utils.get_gsutil_command()
         sync_command = (f'{alias_gen}; echo "{copy_list}" | {gsutil_alias} '
                         f'cp -e -n -r -I gs://{self.name}')
-
+        log_path = sky_logging.generate_tmp_logging_file_path(
+            _STORAGE_LOG_FILE_NAME)
+        sync_path = f'{source_message} -> gs://{self.name}/'
         with rich_utils.safe_status(
-                ux_utils.spinner_message(f'Syncing {
-
+                ux_utils.spinner_message(f'Syncing {sync_path}',
+                                         log_path=log_path)):
             data_utils.run_upload_cli(sync_command,
                                       self._ACCESS_DENIED_MESSAGE,
-                                      bucket_name=self.name
+                                      bucket_name=self.name,
+                                      log_path=log_path)
+        logger.info(
+            ux_utils.finishing_message(f'Storage synced: {sync_path}',
+                                       log_path))
 
     def batch_gsutil_rsync(self,
                            source_path_list: List[Path],
@@ -1797,17 +1812,24 @@ class GcsStore(AbstractStore):
         else:
             source_message = source_path_list[0]
 
+        log_path = sky_logging.generate_tmp_logging_file_path(
+            _STORAGE_LOG_FILE_NAME)
+        sync_path = f'{source_message} -> gs://{self.name}/'
         with rich_utils.safe_status(
-                ux_utils.spinner_message(f'Syncing {
-
+                ux_utils.spinner_message(f'Syncing {sync_path}',
+                                         log_path=log_path)):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
                 get_dir_sync_command,
+                log_path,
                 self.name,
                 self._ACCESS_DENIED_MESSAGE,
                 create_dirs=create_dirs,
                 max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS)
+        logger.info(
+            ux_utils.finishing_message(f'Storage synced: {sync_path}',
+                                       log_path))
 
     def _transfer_to_gcs(self) -> None:
         if isinstance(self.source, str) and self.source.startswith('s3://'):
@@ -2535,17 +2557,24 @@ class AzureBlobStore(AbstractStore):
         container_endpoint = data_utils.AZURE_CONTAINER_URL.format(
             storage_account_name=self.storage_account_name,
             container_name=self.name)
+        log_path = sky_logging.generate_tmp_logging_file_path(
+            _STORAGE_LOG_FILE_NAME)
+        sync_path = f'{source_message} -> {container_endpoint}/'
         with rich_utils.safe_status(
-                ux_utils.spinner_message(
-
+                ux_utils.spinner_message(f'Syncing {sync_path}',
+                                         log_path=log_path)):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
                 get_dir_sync_command,
+                log_path,
                 self.name,
                 self._ACCESS_DENIED_MESSAGE,
                 create_dirs=create_dirs,
                 max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS)
+        logger.info(
+            ux_utils.finishing_message(f'Storage synced: {sync_path}',
+                                       log_path))
 
     def _get_bucket(self) -> Tuple[str, bool]:
         """Obtains the AZ Container.
@@ -2938,17 +2967,24 @@ class R2Store(AbstractStore):
         else:
             source_message = source_path_list[0]
 
+        log_path = sky_logging.generate_tmp_logging_file_path(
+            _STORAGE_LOG_FILE_NAME)
+        sync_path = f'{source_message} -> r2://{self.name}/'
         with rich_utils.safe_status(
-                ux_utils.spinner_message(
-
+                ux_utils.spinner_message(f'Syncing {sync_path}',
+                                         log_path=log_path)):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
                 get_dir_sync_command,
+                log_path,
                 self.name,
                 self._ACCESS_DENIED_MESSAGE,
                 create_dirs=create_dirs,
                 max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS)
+        logger.info(
+            ux_utils.finishing_message(f'Storage synced: {sync_path}',
+                                       log_path))
 
     def _transfer_to_r2(self) -> None:
         assert isinstance(self.source, str), self.source
@@ -3379,17 +3415,24 @@ class IBMCosStore(AbstractStore):
         else:
             source_message = source_path_list[0]
 
+        log_path = sky_logging.generate_tmp_logging_file_path(
+            _STORAGE_LOG_FILE_NAME)
+        sync_path = f'{source_message} -> cos://{self.region}/{self.name}/'
         with rich_utils.safe_status(
-                ux_utils.spinner_message(f'Syncing {
-
+                ux_utils.spinner_message(f'Syncing {sync_path}',
+                                         log_path=log_path)):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
                 get_dir_sync_command,
+                log_path,
                 self.name,
                 self._ACCESS_DENIED_MESSAGE,
                 create_dirs=create_dirs,
                 max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS)
+        logger.info(
+            ux_utils.finishing_message(f'Storage synced: {sync_path}',
+                                       log_path))
 
     def _get_bucket(self) -> Tuple[StorageHandle, bool]:
         """returns IBM COS bucket object if exists, otherwise creates it.
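Every storage backend hunk above applies the same change: each bucket sync now writes its output to a per-run log file, the spinner and the final "Storage synced" message both point at that file, and the log path is threaded into the upload helpers. A minimal standalone sketch of that pattern follows; it uses only the standard library (tempfile, logging, subprocess) rather than SkyPilot's sky_logging/ux_utils/rich_utils helpers, and the sync command and bucket name in the usage comment are hypothetical.

import logging
import os
import subprocess
import tempfile

logger = logging.getLogger(__name__)


def sync_with_log_file(sync_command: str, sync_path: str) -> None:
    """Run a sync command, sending all of its output to a temp log file."""
    # Rough stand-in for sky_logging.generate_tmp_logging_file_path(...).
    fd, log_path = tempfile.mkstemp(prefix='storage_sync_', suffix='.log')
    logger.info('Syncing %s (log: %s)', sync_path, log_path)
    with os.fdopen(fd, 'w', encoding='utf-8') as log_file:
        # Stand-in for running the upload CLI with log_path=...: the
        # command's stdout/stderr go to the log instead of the console.
        subprocess.run(sync_command, shell=True, stdout=log_file,
                       stderr=subprocess.STDOUT, check=True)
    logger.info('Storage synced: %s (log: %s)', sync_path, log_path)


# Hypothetical usage:
# sync_with_log_file('aws s3 sync ./data s3://my-bucket/',
#                    './data -> s3://my-bucket/')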
sky/jobs/core.py
CHANGED
@@ -347,8 +347,8 @@ def cancel(name: Optional[str] = None,
             stopped_message='All managed jobs should have finished.')
 
     job_id_str = ','.join(map(str, job_ids))
-    if sum([
-        argument_str = f'job_ids={job_id_str}' if
+    if sum([bool(job_ids), name is not None, all]) != 1:
+        argument_str = f'job_ids={job_id_str}' if job_ids else ''
         argument_str += f' name={name}' if name is not None else ''
         argument_str += ' all' if all else ''
         with ux_utils.print_exception_no_traceback():
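The rewritten check in cancel() uses sum([bool(job_ids), name is not None, all]) != 1 to require exactly one of the three selectors. A small sketch of the same exactly-one-of validation, with the surrounding logic omitted (the error message wording here is illustrative, not the one in the diff):

from typing import List, Optional


def cancel(job_ids: Optional[List[int]] = None,
           name: Optional[str] = None,
           all: bool = False) -> None:  # pylint: disable=redefined-builtin
    # Each term is truthy only when that selector was provided; the sum
    # therefore counts how many selectors the caller passed.
    provided = sum([bool(job_ids), name is not None, all])
    if provided != 1:
        raise ValueError(
            f'Specify exactly one of job_ids, name, or all (got {provided}).')
    ...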
sky/jobs/state.py
CHANGED
@@ -591,7 +591,7 @@ def get_latest_task_id_status(
     If the job_id does not exist, (None, None) will be returned.
     """
     id_statuses = _get_all_task_ids_statuses(job_id)
-    if
+    if not id_statuses:
         return None, None
     task_id, status = id_statuses[-1]
     for task_id, status in id_statuses:
@@ -617,7 +617,7 @@ def get_failure_reason(job_id: int) -> Optional[str]:
             WHERE spot_job_id=(?)
             ORDER BY task_id ASC""", (job_id,)).fetchall()
     reason = [r[0] for r in reason if r[0] is not None]
-    if
+    if not reason:
         return None
     return reason[0]
 
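Most of the one-line edits in this release replace explicit emptiness comparisons with plain truthiness tests (`if not seq:` / `if seq:`), the form PEP 8 recommends for sequences. A generic illustration of the idiom, not SkyPilot code:

from typing import List, Optional, Tuple


def latest_task_status(
        id_statuses: List[Tuple[int, str]]) -> Optional[Tuple[int, str]]:
    # An empty list is falsy, so no explicit len() == 0 comparison is needed.
    if not id_statuses:
        return None
    return id_statuses[-1]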
sky/jobs/utils.py
CHANGED
@@ -234,11 +234,11 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
     if job_ids is None:
         job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
     job_ids = list(set(job_ids))
-    if
+    if not job_ids:
         return 'No job to cancel.'
     job_id_str = ', '.join(map(str, job_ids))
     logger.info(f'Cancelling jobs {job_id_str}.')
-    cancelled_job_ids = []
+    cancelled_job_ids: List[int] = []
     for job_id in job_ids:
         # Check the status of the managed job status. If it is in
         # terminal state, we can safely skip it.
@@ -268,7 +268,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
             shutil.copy(str(signal_file), str(legacy_signal_file))
         cancelled_job_ids.append(job_id)
 
-    if
+    if not cancelled_job_ids:
         return 'No job to cancel.'
     identity_str = f'Job with ID {cancelled_job_ids[0]} is'
     if len(cancelled_job_ids) > 1:
@@ -281,7 +281,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
 def cancel_job_by_name(job_name: str) -> str:
     """Cancel a job by name."""
     job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
-    if
+    if not job_ids:
         return f'No running job found with name {job_name!r}.'
     if len(job_ids) > 1:
         return (f'{colorama.Fore.RED}Multiple running jobs found '
@@ -515,7 +515,7 @@ def stream_logs(job_id: Optional[int],
             for job in managed_jobs
             if job['job_name'] == job_name
         }
-        if
+        if not managed_job_ids:
             return f'No managed job found with name {job_name!r}.'
         if len(managed_job_ids) > 1:
             job_ids_str = ', '.join(
@@ -541,7 +541,7 @@ def stream_logs(job_id: Optional[int],
     if job_id is None:
         assert job_name is not None
         job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
-        if
+        if not job_ids:
             return f'No running managed job found with name {job_name!r}.'
         if len(job_ids) > 1:
             raise ValueError(
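The `cancelled_job_ids: List[int] = []` change annotates an initially empty list so a type checker knows its element type up front instead of inferring it later or treating it as List[Any]. A minimal sketch:

from typing import List

cancelled_job_ids: List[int] = []   # element type is explicit from the start
cancelled_job_ids.append(42)        # OK
# cancelled_job_ids.append('42')    # a type checker would flag this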
sky/optimizer.py
CHANGED
@@ -188,7 +188,7 @@ class Optimizer:
         """Removes special Source and Sink nodes."""
         source = [t for t in dag.tasks if t.name == _DUMMY_SOURCE_NAME]
         sink = [t for t in dag.tasks if t.name == _DUMMY_SINK_NAME]
-        if
+        if not source and not sink:
             return
         assert len(source) == len(sink) == 1, dag.tasks
         dag.remove(source[0])
@@ -1298,7 +1298,7 @@ def _fill_in_launchable_resources(
                 resources, num_nodes=task.num_nodes)
             if feasible_resources.hint is not None:
                 hints[cloud] = feasible_resources.hint
-            if
+            if feasible_resources.resources_list:
                 # Assume feasible_resources is sorted by prices. Guaranteed by
                 # the implementation of get_feasible_launchable_resources and
                 # the underlying service_catalog filtering
@@ -1310,7 +1310,7 @@ def _fill_in_launchable_resources(
             else:
                 all_fuzzy_candidates.update(
                     feasible_resources.fuzzy_candidate_list)
-        if
+        if not launchable[resources]:
             clouds_str = str(clouds_list) if len(clouds_list) > 1 else str(
                 clouds_list[0])
             num_node_str = ''
sky/provision/aws/config.py
CHANGED
@@ -279,7 +279,7 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
     logger.debug(f'subnet {subnet_id} route tables: {route_tables}')
     if _has_igw_route(route_tables):
         return True
-    if
+    if route_tables:
         return False
 
     # Handle the case that a "main" route table is implicitly associated with
@@ -454,7 +454,7 @@ def _vpc_id_from_security_group_ids(ec2, sg_ids: List[str]) -> Any:
 
     no_sg_msg = ('Failed to detect a security group with id equal to any of '
                  'the configured SecurityGroupIds.')
-    assert
+    assert vpc_ids, no_sg_msg
 
     return vpc_ids[0]
 
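The second AWS hunk turns a bare assert into `assert vpc_ids, no_sg_msg`, attaching the prepared message to the AssertionError when the list is empty. A small generic illustration (the function name here is made up):

from typing import List


def first_vpc_id(vpc_ids: List[str]) -> str:
    no_sg_msg = ('Failed to detect a security group with id equal to any of '
                 'the configured SecurityGroupIds.')
    # When vpc_ids is empty, the second operand becomes the error message.
    assert vpc_ids, no_sg_msg
    return vpc_ids[0]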
sky/provision/gcp/config.py
CHANGED
@@ -397,7 +397,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     operation = compute.networks().getEffectiveFirewalls(project=project_id,
                                                          network=vpc_name)
     response = operation.execute()
-    if
+    if not response:
         return False
     effective_rules = response['firewalls']
 
@@ -515,7 +515,7 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
         rule_list = _list_firewall_rules(project_id,
                                          compute,
                                          filter=f'(name={rule_name})')
-        if
+        if rule_list:
             _delete_firewall_rule(project_id, compute, rule_name)
 
         body = rule.copy()
@@ -624,7 +624,7 @@ def get_usable_vpc_and_subnet(
     vpc_list = _list_vpcnets(project_id,
                              compute,
                              filter=f'name={constants.SKYPILOT_VPC_NAME}')
-    if
+    if not vpc_list:
         body = constants.VPC_TEMPLATE.copy()
         body['name'] = body['name'].format(VPC_NAME=constants.SKYPILOT_VPC_NAME)
         body['selfLink'] = body['selfLink'].format(
sky/provision/kubernetes/config.py
CHANGED
@@ -232,7 +232,7 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str,
     # Look for keys containing the resource_name. For example,
     # the key 'nvidia.com/gpu' contains the key 'gpu'.
     matching_keys = [key for key in resources if resource_name in key.lower()]
-    if
+    if not matching_keys:
         return float('inf')
     if len(matching_keys) > 1:
         # Should have only one match -- mostly relevant for gpu.
@@ -265,7 +265,7 @@ def _configure_autoscaler_service_account(
     field_selector = f'metadata.name={name}'
     accounts = (kubernetes.core_api(context).list_namespaced_service_account(
         namespace, field_selector=field_selector).items)
-    if
+    if accounts:
         assert len(accounts) == 1
         # Nothing to check for equality and patch here,
         # since the service_account.metadata.name is the only important
@@ -308,7 +308,7 @@ def _configure_autoscaler_role(namespace: str, context: Optional[str],
     field_selector = f'metadata.name={name}'
     roles = (kubernetes.auth_api(context).list_namespaced_role(
         namespace, field_selector=field_selector).items)
-    if
+    if roles:
         assert len(roles) == 1
         existing_role = roles[0]
         # Convert to k8s object to compare
@@ -374,7 +374,7 @@ def _configure_autoscaler_role_binding(
     field_selector = f'metadata.name={name}'
     role_bindings = (kubernetes.auth_api(context).list_namespaced_role_binding(
         rb_namespace, field_selector=field_selector).items)
-    if
+    if role_bindings:
         assert len(role_bindings) == 1
         existing_binding = role_bindings[0]
         new_rb = kubernetes_utils.dict_to_k8s_object(binding, 'V1RoleBinding')
@@ -415,7 +415,7 @@ def _configure_autoscaler_cluster_role(namespace, context,
     field_selector = f'metadata.name={name}'
     cluster_roles = (kubernetes.auth_api(context).list_cluster_role(
         field_selector=field_selector).items)
-    if
+    if cluster_roles:
         assert len(cluster_roles) == 1
         existing_cr = cluster_roles[0]
         new_cr = kubernetes_utils.dict_to_k8s_object(role, 'V1ClusterRole')
@@ -460,7 +460,7 @@ def _configure_autoscaler_cluster_role_binding(
     field_selector = f'metadata.name={name}'
     cr_bindings = (kubernetes.auth_api(context).list_cluster_role_binding(
         field_selector=field_selector).items)
-    if
+    if cr_bindings:
         assert len(cr_bindings) == 1
         existing_binding = cr_bindings[0]
         new_binding = kubernetes_utils.dict_to_k8s_object(
@@ -639,7 +639,7 @@ def _configure_services(namespace: str, context: Optional[str],
     field_selector = f'metadata.name={name}'
     services = (kubernetes.core_api(context).list_namespaced_service(
         namespace, field_selector=field_selector).items)
-    if
+    if services:
         assert len(services) == 1
         existing_service = services[0]
         # Convert to k8s object to compare
sky/provision/kubernetes/network_utils.py
CHANGED
@@ -230,7 +230,7 @@ def get_ingress_external_ip_and_ports(
             namespace, _request_timeout=kubernetes.API_TIMEOUT).items
         if item.metadata.name == 'ingress-nginx-controller'
     ]
-    if
+    if not ingress_services:
         return (None, None)
 
     ingress_service = ingress_services[0]
sky/provision/kubernetes/utils.py
CHANGED
@@ -583,7 +583,7 @@ def check_instance_fits(context: Optional[str],
         node for node in nodes if gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] == gpu_label_val
     ]
-    assert
+    assert gpu_nodes, 'GPU nodes not found'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -1526,7 +1526,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
     def find(l, predicate):
         """Utility function to find element in given list"""
         results = [x for x in l if predicate(x)]
-        return results[0] if
+        return results[0] if results else None
 
     # Get the SSH jump pod name from the head pod
     try:
sky/provision/lambda_cloud/lambda_utils.py
CHANGED
@@ -50,7 +50,7 @@ class Metadata:
         if value is None:
             if instance_id in metadata:
                 metadata.pop(instance_id)  # del entry
-            if
+            if not metadata:
                 if os.path.exists(self.path):
                     os.remove(self.path)
                 return
@@ -69,7 +69,7 @@ class Metadata:
         for instance_id in list(metadata.keys()):
             if instance_id not in instance_ids:
                 del metadata[instance_id]
-        if
+        if not metadata:
             os.remove(self.path)
             return
         with open(self.path, 'w', encoding='utf-8') as f:
@@ -150,7 +150,7 @@ class LambdaCloudClient:
             ['regions_with_capacity_available'])
         available_regions = [reg['name'] for reg in available_regions]
         if region not in available_regions:
-            if
+            if available_regions:
                 aval_reg = ' '.join(available_regions)
             else:
                 aval_reg = 'None'
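One of the sky/provision/kubernetes/utils.py hunks above completes a small `find` helper so it returns the first match or None. The same first-match-or-None utility written generically (the pod-lookup call in the comment is hypothetical):

from typing import Callable, Iterable, Optional, TypeVar

T = TypeVar('T')


def find(items: Iterable[T], predicate: Callable[[T], bool]) -> Optional[T]:
    """Return the first element satisfying predicate, or None if none does."""
    results = [x for x in items if predicate(x)]
    return results[0] if results else None


# Hypothetical usage:
# head_pod = find(pods, lambda p: p.metadata.name.endswith('-head'))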
sky/provision/oci/query_utils.py
CHANGED
@@ -248,7 +248,7 @@ class QueryHelper:
             limit=1)
 
         compartments = list_compartments_response.data
-        if
+        if compartments:
             skypilot_compartment = compartments[0].id
             return skypilot_compartment
 
@@ -274,7 +274,7 @@ class QueryHelper:
             display_name=oci_utils.oci_config.VCN_NAME,
             lifecycle_state='AVAILABLE')
         vcns = list_vcns_response.data
-        if
+        if vcns:
             # Found the VCN.
             skypilot_vcn = vcns[0].id
             list_subnets_response = net_client.list_subnets(
@@ -359,7 +359,7 @@ class QueryHelper:
             if str(s.cidr_block).startswith('all-') and str(s.cidr_block).
             endswith('-services-in-oracle-services-network')
         ]
-        if
+        if services:
             # Create service gateway for regional services.
             create_sg_response = net_client.create_service_gateway(
                 create_service_gateway_details=oci_adaptor.oci.core.models.
sky/provision/vsphere/common/vim_utils.py
CHANGED
@@ -56,7 +56,7 @@ def get_hosts_by_cluster_names(content, vcenter_name, cluster_name_dicts=None):
         'name': cluster.name
     } for cluster in cluster_view.view]
     cluster_view.Destroy()
-    if
+    if not cluster_name_dicts:
         logger.warning(f'vCenter \'{vcenter_name}\' has no clusters')
 
     # Retrieve all cluster names from the cluster_name_dicts
sky/provision/vsphere/instance.py
CHANGED
@@ -162,7 +162,7 @@ def _create_instances(
     if not gpu_instance:
         # Find an image for CPU
         images_df = images_df[images_df['GpuTags'] == '\'[]\'']
-    if
+    if not images_df:
         logger.error(
             f'Can not find an image for instance type: {instance_type}.')
         raise Exception(
@@ -185,7 +185,7 @@ def _create_instances(
     image_instance_mapping_df = image_instance_mapping_df[
         image_instance_mapping_df['InstanceType'] == instance_type]
 
-    if
+    if not image_instance_mapping_df:
         raise Exception(f"""There is no image can match instance type named
         {instance_type}
         If you are using CPU-only instance, assign an image with tag
@@ -218,10 +218,9 @@ def _create_instances(
     hosts_df = hosts_df[(hosts_df['AvailableCPUs'] /
                          hosts_df['cpuMhz']) >= cpus_needed]
     hosts_df = hosts_df[hosts_df['AvailableMemory(MB)'] >= memory_needed]
-    assert
-
-
-        f'cpus and {memory_needed}MB memory are required.')
+    assert hosts_df, (f'There is no host available to create the instance '
+                      f'{vms_item["InstanceType"]}, at least {cpus_needed} '
+                      f'cpus and {memory_needed}MB memory are required.')
 
     # Sort the hosts df by AvailableCPUs to get the compatible host with the
     # least resource
@@ -365,7 +364,7 @@ def _choose_vsphere_cluster_name(config: common.ProvisionConfig, region: str,
     skypilot framework-optimized availability_zones"""
     vsphere_cluster_name = None
     vsphere_cluster_name_str = config.provider_config['availability_zone']
-    if
+    if vc_object.clusters:
         for optimized_cluster_name in vsphere_cluster_name_str.split(','):
             if optimized_cluster_name in [
                 item['name'] for item in vc_object.clusters
sky/provision/vsphere/vsphere_utils.py
CHANGED
@@ -257,7 +257,7 @@ class VsphereClient:
         # hard code here. should support configure later.
         profile_name = 'skypilot_policy'
         storage_profile_id = None
-        if
+        if profile_ids:
             profiles = pm.PbmRetrieveContent(profileIds=profile_ids)
             for profile in profiles:
                 if profile_name in profile.name:
sky/resources.py
CHANGED
@@ -661,7 +661,7 @@ class Resources:
                 continue
             valid_clouds.append(cloud)
 
-        if
+        if not valid_clouds:
             if len(enabled_clouds) == 1:
                 cloud_str = f'for cloud {enabled_clouds[0]}'
             else:
@@ -773,7 +773,7 @@ class Resources:
         for cloud in enabled_clouds:
             if cloud.instance_type_exists(self._instance_type):
                 valid_clouds.append(cloud)
-        if
+        if not valid_clouds:
             if len(enabled_clouds) == 1:
                 cloud_str = f'for cloud {enabled_clouds[0]}'
             else:
@@ -1008,7 +1008,7 @@ class Resources:
                     f'Label rejected due to {cloud}: {err_msg}'
                 ])
                 break
-        if
+        if invalid_table.rows:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
                     'The following labels are invalid:'
@@ -1283,7 +1283,7 @@ class Resources:
             _cluster_config_overrides=override.pop(
                 '_cluster_config_overrides', self._cluster_config_overrides),
         )
-        assert
+        assert not override
        return resources
 
     def valid_on_region_zones(self, region: str, zones: List[str]) -> bool:
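The resources.py copy helper pops every recognized key out of `override` and then asserts the dict is empty, so an unrecognized override key fails immediately instead of being silently ignored. A hedged standalone sketch of that pop-then-assert pattern with made-up field names:

from typing import Any


class Config:
    def __init__(self, cpus: int = 1, memory_gb: int = 4) -> None:
        self.cpus = cpus
        self.memory_gb = memory_gb

    def copy(self, **override: Any) -> 'Config':
        new = Config(
            cpus=override.pop('cpus', self.cpus),
            memory_gb=override.pop('memory_gb', self.memory_gb),
        )
        # All supported keys were popped above; anything left over is a typo
        # or an unsupported override, so fail fast (cf. `assert not override`).
        assert not override, f'Unexpected override keys: {list(override)}'
        return new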
sky/serve/autoscalers.py
CHANGED
@@ -320,8 +320,8 @@ class RequestRateAutoscaler(Autoscaler):
         """Select outdated replicas to scale down."""
 
         if self.update_mode == serve_utils.UpdateMode.ROLLING:
-            latest_ready_replicas = []
-            old_nonterminal_replicas = []
+            latest_ready_replicas: List['replica_managers.ReplicaInfo'] = []
+            old_nonterminal_replicas: List['replica_managers.ReplicaInfo'] = []
             for info in replica_infos:
                 if info.version == self.latest_version:
                     if info.is_ready:
sky/serve/core.py
CHANGED
@@ -360,7 +360,7 @@ def update(
             raise RuntimeError(e.error_msg) from e
 
     service_statuses = serve_utils.load_service_status(serve_status_payload)
-    if
+    if not service_statuses:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'Cannot find service {service_name!r}.'
                                f'To spin up a service, use {ux_utils.BOLD}'
@@ -491,9 +491,9 @@ def down(
             stopped_message='All services should have terminated.')
 
     service_names_str = ','.join(service_names)
-    if sum([
-        argument_str = f'service_names={service_names_str}'
-
+    if sum([bool(service_names), all]) != 1:
+        argument_str = (f'service_names={service_names_str}'
+                        if service_names else '')
         argument_str += ' all' if all else ''
         raise ValueError('Can only specify one of service_names or all. '
                          f'Provided {argument_str!r}.')
sky/serve/replica_managers.py
CHANGED
@@ -172,7 +172,7 @@ def _get_resources_ports(task_yaml: str) -> str:
     """Get the resources ports used by the task."""
     task = sky.Task.from_yaml(task_yaml)
     # Already checked all ports are the same in sky.serve.core.up
-    assert
+    assert task.resources, task
     task_resources: 'resources.Resources' = list(task.resources)[0]
     # Already checked the resources have and only have one port
     # before upload the task yaml.
sky/serve/serve_state.py
CHANGED
@@ -226,7 +226,7 @@ class ServiceStatus(enum.Enum):
                    for status in ReplicaStatus.failed_statuses()) > 0:
             return cls.FAILED
         # When min_replicas = 0, there is no (provisioning) replica.
-        if
+        if not replica_statuses:
             return cls.NO_REPLICA
         return cls.REPLICA_INIT
 