skypilot-nightly 1.0.0.dev20250318__py3-none-any.whl → 1.0.0.dev20250320__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/cloudflare.py +4 -0
- sky/check.py +156 -53
- sky/clouds/aws.py +5 -0
- sky/clouds/azure.py +5 -0
- sky/clouds/cloud.py +12 -0
- sky/clouds/gcp.py +55 -33
- sky/clouds/ibm.py +5 -0
- sky/clouds/oci.py +5 -0
- sky/clouds/utils/gcp_utils.py +11 -1
- sky/core.py +3 -1
- sky/data/storage.py +7 -9
- sky/execution.py +6 -1
- sky/global_user_state.py +30 -0
- sky/optimizer.py +10 -5
- sky/provision/gcp/config.py +3 -3
- sky/provision/gcp/constants.py +16 -2
- sky/provision/gcp/instance.py +4 -1
- sky/provision/kubernetes/utils.py +37 -24
- sky/serve/replica_managers.py +10 -1
- sky/server/requests/executor.py +33 -19
- sky/server/server.py +4 -1
- sky/utils/controller_utils.py +7 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +3 -1
- sky/utils/subprocess_utils.py +47 -25
- {skypilot_nightly-1.0.0.dev20250318.dist-info → skypilot_nightly-1.0.0.dev20250320.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20250318.dist-info → skypilot_nightly-1.0.0.dev20250320.dist-info}/RECORD +31 -31
- {skypilot_nightly-1.0.0.dev20250318.dist-info → skypilot_nightly-1.0.0.dev20250320.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250318.dist-info → skypilot_nightly-1.0.0.dev20250320.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250318.dist-info → skypilot_nightly-1.0.0.dev20250320.dist-info/licenses}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250318.dist-info → skypilot_nightly-1.0.0.dev20250320.dist-info}/top_level.txt +0 -0
sky/optimizer.py
CHANGED
@@ -1225,7 +1225,8 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         # Explicitly check again to update the enabled cloud list.
         sky_check.check(quiet=True,
                         clouds=list(clouds_need_recheck -
-                                    global_disabled_clouds)
+                                    global_disabled_clouds),
+                        capability=sky_check.CloudCapability.COMPUTE)
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
             raise_if_no_cloud_access=True)
         disabled_clouds = (clouds_need_recheck -
@@ -1328,13 +1329,17 @@ def _fill_in_launchable_resources(
                     f'{colorama.Style.RESET_ALL}')
             else:
                 if resources.cpus is not None:
-                    logger.info('- Try specifying a different CPU count, '
+                    logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                                '- Try specifying a different CPU count, '
                                 'or add "+" to the end of the CPU count '
-                                'to allow for larger instances.')
+                                'to allow for larger instances.'
+                                f'{colorama.Style.RESET_ALL}')
                 if resources.memory is not None:
-                    logger.info('- Try specifying a different memory size, '
+                    logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                                '- Try specifying a different memory size, '
                                 'or add "+" to the end of the memory size '
-                                'to allow for larger instances.')
+                                'to allow for larger instances.'
+                                f'{colorama.Style.RESET_ALL}')
         for cloud, hint in hints.items():
             logger.info(f'{repr(cloud)}: {hint}')
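The hints are now wrapped in colorama's dim foreground styling. A minimal, self-contained sketch of that pattern, assuming only that colorama is installed (the logger below is a local stand-in, not SkyPilot's):

```python
# Sketch of the dimmed-hint logging style; `logger` is a stand-in logger.
import logging

import colorama

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)

colorama.init()
logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
            '- Try specifying a different CPU count, '
            'or add "+" to the end of the CPU count '
            'to allow for larger instances.'
            f'{colorama.Style.RESET_ALL}')
```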
sky/provision/gcp/config.py
CHANGED
@@ -297,8 +297,8 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
 def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     """Setup a gcp service account with IAM roles.
 
-    Creates a gcp service
-
+    Creates a gcp service account and binds IAM roles which allow it to control
+    storage/compute services. Specifically, the head node needs to have
     an IAM role that allows it to create further gce instances and store items
     in google cloud storage.
 
@@ -311,7 +311,7 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     )
     service_account = _get_service_account(email, project_id, iam)
 
-    permissions = gcp_utils.
+    permissions = gcp_utils.get_minimal_compute_permissions()
     roles = constants.DEFAULT_SERVICE_ACCOUNT_ROLES
     if config.provider_config.get(constants.HAS_TPU_PROVIDER_FIELD, False):
         roles = (constants.DEFAULT_SERVICE_ACCOUNT_ROLES +
sky/provision/gcp/constants.py
CHANGED
@@ -141,6 +141,11 @@ FIREWALL_RULES_TEMPLATE = [
     },
 ]
 
+GCP_MINIMAL_PERMISSIONS = [
+    'serviceusage.services.enable',
+    'serviceusage.services.list',
+]
+
 # A list of permissions required to run SkyPilot on GCP.
 # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
 VM_MINIMAL_PERMISSIONS = [
@@ -170,13 +175,22 @@ VM_MINIMAL_PERMISSIONS = [
     # Check: sky.provision.gcp.config::_is_permission_satisfied
     # 'iam.serviceAccounts.actAs',
     'iam.serviceAccounts.get',
-    'serviceusage.services.enable',
-    'serviceusage.services.list',
     'serviceusage.services.use',
     'resourcemanager.projects.get',
    'resourcemanager.projects.getIamPolicy',
 ]
 
+STORAGE_MINIMAL_PERMISSIONS = [
+    'storage.buckets.create',
+    'storage.buckets.get',
+    'storage.buckets.delete',
+    'storage.objects.create',
+    'storage.objects.update',
+    'storage.objects.delete',
+    'storage.objects.get',
+    'storage.objects.list',
+]
+
 # Permissions implied by GCP built-in roles. We hardcode these here, as we
 # cannot get the permissions of built-in role from the GCP Python API.
 # The lists are not exhaustive, but should cover the permissions listed in
sky/provision/gcp/instance.py
CHANGED
@@ -586,8 +586,11 @@ def open_ports(
     }
     handlers: List[Type[instance_utils.GCPInstance]] = [
         instance_utils.GCPComputeInstance,
-        instance_utils.GCPTPUVMInstance,
     ]
+    use_tpu_vms = provider_config.get('_has_tpus', False)
+    if use_tpu_vms:
+        handlers.append(instance_utils.GCPTPUVMInstance)
+
     handler_to_instances = _filter_instances(handlers, project_id, zone,
                                              label_filters, lambda _: None)
     operations = collections.defaultdict(list)
sky/provision/kubernetes/utils.py
CHANGED
@@ -97,6 +97,7 @@ GKE_TPU_ACCELERATOR_TO_GENERATION = {
     # Multi-host compatible v5e TPU configurations allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
+    'tpu-v6e-slice': 'v6e',
 }
 
 POD_STATUSES = {
@@ -359,7 +360,8 @@ class GKELabelFormatter(GPULabelFormatter):
     # label to use in an autoscaling environment. For list of topologies, see:
     # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
     # tpu v5p: https://cloud.google.com/tpu/docs/v5p
-    # TODO(romilb): Add support for TPU v4.
+    # tpu v6e: https://cloud.google.com/tpu/docs/v6e
+    # TODO(romilb): Add support for TPU v4.
     GKE_TPU_TOPOLOGIES = {
         'tpu-v5-lite-podslice': {
             1: '1x1',
@@ -374,6 +376,11 @@ class GKELabelFormatter(GPULabelFormatter):
         'tpu-v5p-slice': {
             4: '2x2x1'
         },
+        'tpu-v6e-slice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        }
     }
 
     @classmethod
@@ -602,6 +609,7 @@ class GKEAutoscaler(Autoscaler):
     _pip_install_gcp_hint_last_sent = 0.0
 
     @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
     def can_create_new_instance_of_type(cls, context: str,
                                         instance_type: str) -> bool:
         """Looks at each node pool in the cluster and checks if
@@ -655,18 +663,25 @@ class GKEAutoscaler(Autoscaler):
 
         # Check if any node pool with autoscaling enabled can
         # fit the instance type.
-
-
+        node_pools = cluster.get('nodePools', [])
+        for node_pool in node_pools:
+            name = node_pool.get('name', '')
+            logger.debug(f'checking if node pool {name} '
                          'has autoscaling enabled.')
-
-
-
-                logger.debug(
-
-
-
-
+            autoscaling_enabled = (node_pool.get('autoscaling',
+                                                 {}).get('enabled', False))
+            if autoscaling_enabled:
+                logger.debug(f'node pool {name} has autoscaling enabled. '
+                             'Checking if it can create a node '
+                             f'satisfying {instance_type}')
+                try:
+                    if cls._check_instance_fits_gke_autoscaler_node_pool(
+                            instance_type, node_pool):
+                        return True
+                except KeyError:
+                    logger.debug('encountered KeyError while checking if '
+                                 f'node pool {name} can create a node '
+                                 f'satisfying {instance_type}.')
                     return True
         return False
 
@@ -768,9 +783,9 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
-            node_accelerator_type =
-                get_accelerator_from_label_value(
-                    accelerator['acceleratorType'])
+            node_accelerator_type = (
+                GKELabelFormatter.get_accelerator_from_label_value(
+                    accelerator['acceleratorType']))
             node_accelerator_count = accelerator['acceleratorCount']
             if node_accelerator_type == requested_gpu_type and int(
                     node_accelerator_count) >= requested_gpu_count:
@@ -784,6 +799,7 @@ class GKEAutoscaler(Autoscaler):
         """Check if the node pool has enough TPU capacity
         to fit the instance type.
         """
+
         if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
             # This node does not have TPUs.
             return False
@@ -803,25 +819,22 @@ class GKEAutoscaler(Autoscaler):
     @classmethod
     def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
         """Infer the number of TPU chips from the instance type."""
-        machine_type_parts = machine_type.split('-')
         # according to
         # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
         # GKE TPU machine types have the format of
-        # ct<version
+        # ct<version>-<type>-<node-chip-count>t
         logger.debug(
             f'inferring TPU chip count from machine type: {machine_type}')
-
-
-
-            not machine_type_parts[2].endswith('t') or
-                not machine_type_parts[2].strip('t').isdigit()):
+        pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
+        search = re.search(pattern, machine_type)
+        if search is None:
             logger.debug(f'machine type {machine_type} is not a '
                          'valid TPU machine type format.')
             return 0
-        num_tpu_chips =
+        num_tpu_chips = search.group(1)
         logger.debug(
             f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
-        return num_tpu_chips
+        return int(num_tpu_chips)
 
     @classmethod
     def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
sky/serve/replica_managers.py
CHANGED
@@ -1205,7 +1205,16 @@ class SkyPilotReplicaManager(ReplicaManager):
             for key in ['service']:
                 old_config.pop(key)
             # Bump replica version if all fields except for service are
-            # the same.
+            # the same.
+            # Here, we manually convert the any_of field to a set to avoid
+            # only the difference in the random order of the any_of fields.
+            old_config_any_of = old_config.get('resources',
+                                               {}).pop('any_of', [])
+            new_config_any_of = new_config.get('resources',
+                                               {}).pop('any_of', [])
+            if set(old_config_any_of) != set(new_config_any_of):
+                continue
+            # File mounts should both be empty, as update always
             # create new buckets if they are not empty.
             if (old_config == new_config and
                     old_config.get('file_mounts', None) == {}):
sky/server/requests/executor.py
CHANGED
@@ -49,7 +49,6 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import types
@@ -221,6 +220,10 @@ def _restore_output(original_stdout: int, original_stderr: int) -> None:
     os.close(original_stderr)
 
 
+def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
+    raise KeyboardInterrupt
+
+
 def _request_execution_wrapper(request_id: str,
                                ignore_return_value: bool) -> None:
     """Wrapper for a request execution.
@@ -232,12 +235,8 @@ def _request_execution_wrapper(request_id: str,
     3. Redirect the stdout and stderr of the execution to log file;
     4. Handle the SIGTERM signal to abort the request gracefully.
     """
-
-    def sigterm_handler(signum: int,
-                        frame: Optional['types.FrameType']) -> None:
-        raise KeyboardInterrupt
-
-    signal.signal(signal.SIGTERM, sigterm_handler)
+    # Handle the SIGTERM signal to abort the request processing gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
 
     pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
@@ -355,6 +354,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
+    # Handle the SIGTERM signal to abort the executor process gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
     proc_group = f'{worker.schedule_type.value}-{worker.id}'
     setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
@@ -388,19 +389,11 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
             logger.info(f'[{worker}] Finished request: {request_id}')
         else:
             logger.info(f'[{worker}] Submitted request: {request_id}')
-    except KeyboardInterrupt:
-        # Interrupt the worker process will stop request execution, but
-        # the SIGTERM request should be respected anyway since it might
-        # be explicitly sent by user.
-        # TODO(aylei): crash the API server or recreate the worker process
-        # to avoid broken state.
-        logger.error(f'[{worker}] Worker process interrupted')
-        with ux_utils.print_exception_no_traceback():
-            raise
     except (Exception, SystemExit) as e:  # pylint: disable=broad-except
         # Catch any other exceptions to avoid crashing the worker process.
         logger.error(
-            f'[{worker}] Error processing request
+            f'[{worker}] Error processing request: '
+            f'{request_id if "request_id" in locals() else ""} '
             f'{common_utils.format_exception(e, use_bracket=True)}')
@@ -409,12 +402,33 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     # We use executor instead of individual multiprocessing.Process to avoid
     # the overhead of forking a new process for each request, which can be about
     # 1s delay.
-    executor = concurrent.futures.ProcessPoolExecutor(
-        max_workers=max_parallel_size,
-        initializer=executor_initializer,
-        initargs=(proc_group,))
-    while True:
-        process_request(executor)
+    try:
+        executor = concurrent.futures.ProcessPoolExecutor(
+            max_workers=max_parallel_size,
+            initializer=executor_initializer,
+            initargs=(proc_group,))
+        while True:
+            process_request(executor)
+    # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # In most cases, here we receive either ctrl-c in foreground execution
+        # or SIGTERM on server exiting. Gracefully exit the worker process and
+        # the executor.
+        # TODO(aylei): worker may also be killed by system daemons like OOM
+        # killer, crash the API server or recreate the worker process to avoid
+        # broken state in such cases.
+        logger.info(f'[{worker}] Worker process interrupted')
+        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        executor.shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
sky/server/server.py
CHANGED
@@ -1140,6 +1140,9 @@ if __name__ == '__main__':
         # The process may not be started yet, close it anyway.
         proc.close()
 
+    # Terminate processes in reverse order in case dependency, especially
+    # queue server. Terminate queue server first does not affect the
+    # correctness of cleanup but introduce redundant error messages.
     subprocess_utils.run_in_parallel(cleanup,
-                                     sub_procs,
+                                     list(reversed(sub_procs)),
                                      num_threads=len(sub_procs))
sky/utils/controller_utils.py
CHANGED
@@ -215,7 +215,13 @@ def _get_cloud_dependencies_installation_commands(
     commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
                     f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
 
-    for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
+    enabled_compute_clouds = set(
+        sky_check.get_cached_enabled_clouds_or_refresh())
+    enabled_storage_clouds = set(
+        sky_check.get_cached_enabled_storage_clouds_or_refresh())
+    enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
+
+    for cloud in enabled_clouds:
         cloud_python_dependencies: List[str] = copy.deepcopy(
             dependencies.extras_require[cloud.canonical_name()])
 
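A sketch of the capability-union logic above, with plain strings standing in for SkyPilot cloud objects:

```python
# Controllers need dependencies for every cloud enabled for *either*
# capability, hence the union rather than an intersection.
enabled_compute_clouds = {'aws', 'kubernetes'}
enabled_storage_clouds = {'aws', 'cloudflare'}

enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
print(sorted(enabled_clouds))  # ['aws', 'cloudflare', 'kubernetes']
```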
sky/utils/kubernetes/kubernetes_deploy_utils.py
CHANGED
@@ -167,7 +167,9 @@ def deploy_local_cluster(gpus: bool):
                          f'\nError: {stderr}')
     # Run sky check
     with rich_utils.safe_status('[bold cyan]Running sky check...'):
-        sky_check.check(clouds=['kubernetes'],
+        sky_check.check(clouds=['kubernetes'],
+                        quiet=True,
+                        capability=sky_check.CloudCapability.COMPUTE)
     if cluster_created:
         # Prepare completion message which shows CPU and GPU count
         # Get number of CPUs
sky/utils/subprocess_utils.py
CHANGED
@@ -1,4 +1,5 @@
 """Utility functions for subprocesses."""
+import multiprocessing
 from multiprocessing import pool
 import os
 import random
@@ -181,29 +182,6 @@ def kill_children_processes(parent_pids: Optional[Union[
     if isinstance(parent_pids, int):
         parent_pids = [parent_pids]
 
-    def kill(proc: psutil.Process):
-        if not proc.is_running():
-            # Skip if the process is not running.
-            return
-        logger.debug(f'Killing process {proc.pid}')
-        try:
-            if force:
-                proc.kill()
-            else:
-                proc.terminate()
-                proc.wait(timeout=10)
-        except psutil.NoSuchProcess:
-            # The child process may have already been terminated.
-            pass
-        except psutil.TimeoutExpired:
-            logger.debug(
-                f'Process {proc.pid} did not terminate after 10 seconds')
-            # Attempt to force kill if the normal termination fails
-            if not force:
-                logger.debug(f'Force killing process {proc.pid}')
-                proc.kill()
-                proc.wait(timeout=5)  # Shorter timeout after force kill
-
     parent_processes = []
     if parent_pids is None:
         parent_processes = [psutil.Process()]
@@ -218,10 +196,54 @@ def kill_children_processes(parent_pids: Optional[Union[
     for parent_process in parent_processes:
         child_processes = parent_process.children(recursive=True)
         if parent_pids is not None:
-            kill(parent_process)
+            kill_process_with_grace_period(parent_process, force=force)
         logger.debug(f'Killing child processes: {child_processes}')
         for child in child_processes:
-            kill(child)
+            kill_process_with_grace_period(child, force=force)
+
+
+def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
+                                               psutil.Process],
+                                   force: bool = False,
+                                   grace_period: int = 10) -> None:
+    """Kill a process with SIGTERM and wait for it to exit.
+
+    Args:
+        proc: The process to kill, either a multiprocessing.Process or a
+            psutil.Process.
+        force: Whether to force kill the process.
+        grace_period: The grace period seconds to wait for the process to exit.
+    """
+    if isinstance(proc, psutil.Process):
+        alive = proc.is_running
+        wait = proc.wait
+    else:
+        alive = proc.is_alive
+        wait = proc.join
+    if not alive():
+        # Skip if the process is not running.
+        return
+    logger.debug(f'Killing process {proc.pid}')
+    try:
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        wait(timeout=grace_period)
+    except (psutil.NoSuchProcess, ValueError):
+        # The child process may have already been terminated.
+        return
+    except psutil.TimeoutExpired:
+        # Pass to finally to force kill the process.
+        pass
+    finally:
+        logger.debug(f'Process {proc.pid} did not terminate after '
+                     f'{grace_period} seconds')
+        # Attempt to force kill if the normal termination fails
+        if not force:
+            logger.debug(f'Force killing process {proc.pid}')
+            # Shorter timeout after force kill
+            kill_process_with_grace_period(proc, force=True, grace_period=5)
 
 
 def run_with_retries(
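The new helper unifies graceful termination for both psutil and multiprocessing process handles: SIGTERM first, wait out a grace period, then escalate to SIGKILL. A runnable sketch of that flow for a multiprocessing child, written inline rather than through SkyPilot's helper:

```python
# Terminate-then-kill flow with a grace period, mirroring the helper above.
import multiprocessing
import time


def _busy() -> None:
    while True:
        time.sleep(1)


if __name__ == '__main__':
    proc = multiprocessing.Process(target=_busy)
    proc.start()
    proc.terminate()       # SIGTERM first
    proc.join(timeout=10)  # grace period
    if proc.is_alive():    # escalate only if SIGTERM was ignored
        proc.kill()
        proc.join(timeout=5)  # shorter timeout after force kill
    print('exitcode:', proc.exitcode)
```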
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
3
|
+
Version: 1.0.0.dev20250320
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -156,6 +156,7 @@ Dynamic: classifier
|
|
156
156
|
Dynamic: description
|
157
157
|
Dynamic: description-content-type
|
158
158
|
Dynamic: license
|
159
|
+
Dynamic: license-file
|
159
160
|
Dynamic: project-url
|
160
161
|
Dynamic: provides-extra
|
161
162
|
Dynamic: requires-dist
|