skypilot-nightly 1.0.0.dev20250318__py3-none-any.whl → 1.0.0.dev20250320__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/optimizer.py CHANGED
@@ -1225,7 +1225,8 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         # Explicitly check again to update the enabled cloud list.
         sky_check.check(quiet=True,
                         clouds=list(clouds_need_recheck -
-                                    global_disabled_clouds))
+                                    global_disabled_clouds),
+                        capability=sky_check.CloudCapability.COMPUTE)
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
             raise_if_no_cloud_access=True)
         disabled_clouds = (clouds_need_recheck -
@@ -1328,13 +1329,17 @@ def _fill_in_launchable_resources(
                     f'{colorama.Style.RESET_ALL}')
             else:
                 if resources.cpus is not None:
-                    logger.info('Try specifying a different CPU count, '
+                    logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                                '- Try specifying a different CPU count, '
                                 'or add "+" to the end of the CPU count '
-                                'to allow for larger instances.')
+                                'to allow for larger instances.'
+                                f'{colorama.Style.RESET_ALL}')
                 if resources.memory is not None:
-                    logger.info('Try specifying a different memory size, '
+                    logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                                '- Try specifying a different memory size, '
                                 'or add "+" to the end of the memory size '
-                                'to allow for larger instances.')
+                                'to allow for larger instances.'
+                                f'{colorama.Style.RESET_ALL}')
             for cloud, hint in hints.items():
                 logger.info(f'{repr(cloud)}: {hint}')
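
The hints above are now rendered dimmed via colorama's LIGHTBLACK_EX foreground color. A minimal, self-contained sketch of the effect (the message text mirrors the hunk; colorama.init() is only needed on Windows):

    import colorama

    colorama.init()
    print(f'{colorama.Fore.LIGHTBLACK_EX}'
          '- Try specifying a different CPU count, '
          'or add "+" to the end of the CPU count '
          'to allow for larger instances.'
          f'{colorama.Style.RESET_ALL}')
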
@@ -297,8 +297,8 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
 def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     """Setup a gcp service account with IAM roles.
 
-    Creates a gcp service acconut and binds IAM roles which allow it to control
-    control storage/compute services. Specifically, the head node needs to have
+    Creates a gcp service account and binds IAM roles which allow it to control
+    storage/compute services. Specifically, the head node needs to have
     an IAM role that allows it to create further gce instances and store items
     in google cloud storage.
 
@@ -311,7 +311,7 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     )
     service_account = _get_service_account(email, project_id, iam)
 
-    permissions = gcp_utils.get_minimal_permissions()
+    permissions = gcp_utils.get_minimal_compute_permissions()
     roles = constants.DEFAULT_SERVICE_ACCOUNT_ROLES
     if config.provider_config.get(constants.HAS_TPU_PROVIDER_FIELD, False):
         roles = (constants.DEFAULT_SERVICE_ACCOUNT_ROLES +
@@ -141,6 +141,11 @@ FIREWALL_RULES_TEMPLATE = [
     },
 ]
 
+GCP_MINIMAL_PERMISSIONS = [
+    'serviceusage.services.enable',
+    'serviceusage.services.list',
+]
+
 # A list of permissions required to run SkyPilot on GCP.
 # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
 VM_MINIMAL_PERMISSIONS = [
@@ -170,13 +175,22 @@ VM_MINIMAL_PERMISSIONS = [
     # Check: sky.provision.gcp.config::_is_permission_satisfied
     # 'iam.serviceAccounts.actAs',
     'iam.serviceAccounts.get',
-    'serviceusage.services.enable',
-    'serviceusage.services.list',
     'serviceusage.services.use',
     'resourcemanager.projects.get',
     'resourcemanager.projects.getIamPolicy',
 ]
 
+STORAGE_MINIMAL_PERMISSIONS = [
+    'storage.buckets.create',
+    'storage.buckets.get',
+    'storage.buckets.delete',
+    'storage.objects.create',
+    'storage.objects.update',
+    'storage.objects.delete',
+    'storage.objects.get',
+    'storage.objects.list',
+]
+
 # Permissions implied by GCP built-in roles. We hardcode these here, as we
 # cannot get the permissions of built-in role from the GCP Python API.
 # The lists are not exhaustive, but should cover the permissions listed in
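
With the shared serviceusage permissions factored out into GCP_MINIMAL_PERMISSIONS, compute and storage permission sets can be assembled per capability. A hypothetical sketch of how helpers like gcp_utils.get_minimal_compute_permissions() (referenced in the _configure_iam_role hunk above) might compose the lists; the real implementations may differ:

    def get_minimal_compute_permissions() -> list:
        # Shared project/service-usage permissions plus VM-specific ones.
        return GCP_MINIMAL_PERMISSIONS + VM_MINIMAL_PERMISSIONS

    def get_minimal_storage_permissions() -> list:
        # Shared project/service-usage permissions plus GCS bucket/object ones.
        return GCP_MINIMAL_PERMISSIONS + STORAGE_MINIMAL_PERMISSIONS
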
@@ -586,8 +586,11 @@ def open_ports(
     }
     handlers: List[Type[instance_utils.GCPInstance]] = [
         instance_utils.GCPComputeInstance,
-        instance_utils.GCPTPUVMInstance,
     ]
+    use_tpu_vms = provider_config.get('_has_tpus', False)
+    if use_tpu_vms:
+        handlers.append(instance_utils.GCPTPUVMInstance)
+
     handler_to_instances = _filter_instances(handlers, project_id, zone,
                                              label_filters, lambda _: None)
     operations = collections.defaultdict(list)
@@ -97,6 +97,7 @@ GKE_TPU_ACCELERATOR_TO_GENERATION = {
     # Multi-host compatible v5e TPU configurations allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
+    'tpu-v6e-slice': 'v6e',
 }
 
 POD_STATUSES = {
@@ -359,7 +360,8 @@ class GKELabelFormatter(GPULabelFormatter):
     # label to use in an autoscaling environment. For list of topologies, see:
     # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
     # tpu v5p: https://cloud.google.com/tpu/docs/v5p
-    # TODO(romilb): Add support for TPU v4 and v6.
+    # tpu v6e: https://cloud.google.com/tpu/docs/v6e
+    # TODO(romilb): Add support for TPU v4.
     GKE_TPU_TOPOLOGIES = {
         'tpu-v5-lite-podslice': {
             1: '1x1',
@@ -374,6 +376,11 @@ class GKELabelFormatter(GPULabelFormatter):
         'tpu-v5p-slice': {
             4: '2x2x1'
         },
+        'tpu-v6e-slice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        }
     }
 
     @classmethod
@@ -602,6 +609,7 @@ class GKEAutoscaler(Autoscaler):
     _pip_install_gcp_hint_last_sent = 0.0
 
     @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
     def can_create_new_instance_of_type(cls, context: str,
                                         instance_type: str) -> bool:
         """Looks at each node pool in the cluster and checks if
@@ -655,18 +663,25 @@ class GKEAutoscaler(Autoscaler):
 
         # Check if any node pool with autoscaling enabled can
         # fit the instance type.
-        for node_pool in cluster['nodePools']:
-            logger.debug(f'checking if node pool {node_pool["name"]} '
+        node_pools = cluster.get('nodePools', [])
+        for node_pool in node_pools:
+            name = node_pool.get('name', '')
+            logger.debug(f'checking if node pool {name} '
                          'has autoscaling enabled.')
-            if (node_pool['autoscaling'] is not None and
-                    'enabled' in node_pool['autoscaling'] and
-                    node_pool['autoscaling']['enabled']):
-                logger.debug(
-                    f'node pool {node_pool["name"]} has autoscaling enabled. '
-                    'Checking if it can create a node '
-                    f'satisfying {instance_type}')
-                if cls._check_instance_fits_gke_autoscaler_node_pool(
-                        instance_type, node_pool):
+            autoscaling_enabled = (node_pool.get('autoscaling',
+                                                 {}).get('enabled', False))
+            if autoscaling_enabled:
+                logger.debug(f'node pool {name} has autoscaling enabled. '
+                             'Checking if it can create a node '
+                             f'satisfying {instance_type}')
+                try:
+                    if cls._check_instance_fits_gke_autoscaler_node_pool(
+                            instance_type, node_pool):
+                        return True
+                except KeyError:
+                    logger.debug('encountered KeyError while checking if '
+                                 f'node pool {name} can create a node '
+                                 f'satisfying {instance_type}.')
                     return True
         return False
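
The rewrite replaces direct indexing into the GKE API response with chained dict.get() calls plus a KeyError guard, so missing fields degrade gracefully instead of crashing the check. The core pattern:

    node_pool = {'name': 'pool-a'}  # no 'autoscaling' key in the response

    # Chained .get() calls fall back to False instead of raising:
    enabled = node_pool.get('autoscaling', {}).get('enabled', False)
    assert enabled is False

    # The old form would raise instead:
    # node_pool['autoscaling']['enabled']  ->  KeyError: 'autoscaling'
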
@@ -768,9 +783,9 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
-            node_accelerator_type = GKELabelFormatter. \
-                get_accelerator_from_label_value(
-                    accelerator['acceleratorType'])
+            node_accelerator_type = (
+                GKELabelFormatter.get_accelerator_from_label_value(
+                    accelerator['acceleratorType']))
             node_accelerator_count = accelerator['acceleratorCount']
             if node_accelerator_type == requested_gpu_type and int(
                     node_accelerator_count) >= requested_gpu_count:
@@ -784,6 +799,7 @@ class GKEAutoscaler(Autoscaler):
         """Check if the node pool has enough TPU capacity
         to fit the instance type.
         """
+
         if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
             # This node does not have TPUs.
             return False
@@ -803,25 +819,22 @@ class GKEAutoscaler(Autoscaler):
     @classmethod
     def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
         """Infer the number of TPU chips from the instance type."""
-        machine_type_parts = machine_type.split('-')
         # according to
         # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
         # GKE TPU machine types have the format of
-        # ct<version>-hightpu-<node-chip-count>t
+        # ct<version>-<type>-<node-chip-count>t
         logger.debug(
             f'inferring TPU chip count from machine type: {machine_type}')
-        if (len(machine_type_parts) != 3 or
-                not machine_type_parts[0].startswith('ct') or
-                machine_type_parts[1] != 'hightpu' or
-                not machine_type_parts[2].endswith('t') or
-                not machine_type_parts[2].strip('t').isdigit()):
+        pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
+        search = re.search(pattern, machine_type)
+        if search is None:
             logger.debug(f'machine type {machine_type} is not a '
                          'valid TPU machine type format.')
             return 0
-        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        num_tpu_chips = search.group(1)
         logger.debug(
             f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
-        return num_tpu_chips
+        return int(num_tpu_chips)
 
     @classmethod
     def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
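
The regex generalizes the old hightpu-only parsing to any ct<version>-<type>-<chip-count>t machine name, which covers v6e machine types such as ct6e-standard-8t. A runnable check (the first two names are real GKE TPU machine types; the last is a non-TPU control):

    import re

    pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
    for machine_type in ('ct5lp-hightpu-4t', 'ct6e-standard-8t', 'n2-standard-8'):
        search = re.search(pattern, machine_type)
        print(machine_type, '->', int(search.group(1)) if search else 0)
    # ct5lp-hightpu-4t -> 4
    # ct6e-standard-8t -> 8
    # n2-standard-8 -> 0
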
@@ -1205,7 +1205,16 @@ class SkyPilotReplicaManager(ReplicaManager):
             for key in ['service']:
                 old_config.pop(key)
             # Bump replica version if all fields except for service are
-            # the same. File mounts should both be empty, as update always
+            # the same.
+            # Convert the any_of field to a set so that a difference only
+            # in the random ordering of the any_of entries is ignored.
+            old_config_any_of = old_config.get('resources',
+                                               {}).pop('any_of', [])
+            new_config_any_of = new_config.get('resources',
+                                               {}).pop('any_of', [])
+            if set(old_config_any_of) != set(new_config_any_of):
+                continue
+            # File mounts should both be empty, as update always
             # create new buckets if they are not empty.
             if (old_config == new_config and
                     old_config.get('file_mounts', None) == {}):
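
The intent is an order-insensitive comparison of the two any_of lists. A sketch of the idea, assuming hashable entries; entries that are mappings would first need a canonical encoding, for example:

    import json

    def canonical(entries):
        # Encode each entry deterministically so that neither list order
        # nor dict key order affects the comparison.
        return {json.dumps(e, sort_keys=True) for e in entries}

    old_any_of = [{'cloud': 'aws'}, {'cloud': 'gcp'}]
    new_any_of = [{'cloud': 'gcp'}, {'cloud': 'aws'}]
    assert canonical(old_any_of) == canonical(new_any_of)
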
@@ -49,7 +49,6 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import types
@@ -221,6 +220,10 @@ def _restore_output(original_stdout: int, original_stderr: int) -> None:
     os.close(original_stderr)
 
 
+def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
+    raise KeyboardInterrupt
+
+
 def _request_execution_wrapper(request_id: str,
                                ignore_return_value: bool) -> None:
     """Wrapper for a request execution.
@@ -232,12 +235,8 @@ def _request_execution_wrapper(request_id: str,
     3. Redirect the stdout and stderr of the execution to log file;
     4. Handle the SIGTERM signal to abort the request gracefully.
     """
-
-    def sigterm_handler(signum: int,
-                        frame: Optional['types.FrameType']) -> None:
-        raise KeyboardInterrupt
-
-    signal.signal(signal.SIGTERM, sigterm_handler)
+    # Handle the SIGTERM signal to abort the request processing gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
 
     pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
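
Hoisting _sigterm_handler to module level lets the request wrapper and the worker loop share the same behavior: SIGTERM is translated into KeyboardInterrupt so ordinary try/except/finally cleanup runs. A small POSIX demo of the pattern:

    import os
    import signal
    import time

    def _sigterm_handler(signum, frame):
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, _sigterm_handler)
    try:
        os.kill(os.getpid(), signal.SIGTERM)  # simulate an external SIGTERM
        time.sleep(1)
    except KeyboardInterrupt:
        print('SIGTERM surfaced as KeyboardInterrupt; cleanup can run here')
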
@@ -355,6 +354,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
+    # Handle the SIGTERM signal to abort the executor process gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
     proc_group = f'{worker.schedule_type.value}-{worker.id}'
     setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
388
389
  logger.info(f'[{worker}] Finished request: {request_id}')
389
390
  else:
390
391
  logger.info(f'[{worker}] Submitted request: {request_id}')
391
- except KeyboardInterrupt:
392
- # Interrupt the worker process will stop request execution, but
393
- # the SIGTERM request should be respected anyway since it might
394
- # be explicitly sent by user.
395
- # TODO(aylei): crash the API server or recreate the worker process
396
- # to avoid broken state.
397
- logger.error(f'[{worker}] Worker process interrupted')
398
- with ux_utils.print_exception_no_traceback():
399
- raise
400
392
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
401
393
  # Catch any other exceptions to avoid crashing the worker process.
402
394
  logger.error(
403
- f'[{worker}] Error processing request {request_id}: '
395
+ f'[{worker}] Error processing request: '
396
+ f'{request_id if "request_id" in locals() else ""} '
404
397
  f'{common_utils.format_exception(e, use_bracket=True)}')
405
398
 
406
399
  # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
@@ -409,12 +402,33 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
 # We use executor instead of individual multiprocessing.Process to avoid
 # the overhead of forking a new process for each request, which can be about
 # 1s delay.
-    with concurrent.futures.ProcessPoolExecutor(
+    try:
+        executor = concurrent.futures.ProcessPoolExecutor(
             max_workers=max_parallel_size,
             initializer=executor_initializer,
-            initargs=(proc_group,)) as executor:
+            initargs=(proc_group,))
         while True:
             process_request(executor)
+    # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # In most cases, here we receive either ctrl-c in foreground execution
+        # or SIGTERM on server exiting. Gracefully exit the worker process and
+        # the executor.
+        # TODO(aylei): the worker may also be killed by system daemons like
+        # the OOM killer; crash the API server or recreate the worker process
+        # to avoid broken state in such cases.
+        logger.info(f'[{worker}] Worker process interrupted')
+        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        executor.shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
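
Replacing the with-block avoids ProcessPoolExecutor's implicit shutdown(wait=True) on exit, which would block until every running request finishes. The non-blocking variant in isolation (a sketch; the grace-period kill defined in the subprocess_utils hunk below then reaps stragglers):

    import concurrent.futures
    import time

    if __name__ == '__main__':
        executor = concurrent.futures.ProcessPoolExecutor(max_workers=2)
        future = executor.submit(time.sleep, 0.1)
        # Stop accepting new work without blocking on the in-flight task.
        executor.shutdown(wait=False)
        print(future.result())  # the already-submitted task still completes
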
sky/server/server.py CHANGED
@@ -1140,6 +1140,9 @@ if __name__ == '__main__':
         # The process may not be started yet, close it anyway.
         proc.close()
 
+    # Terminate processes in reverse order in case of dependencies, especially
+    # the queue server: terminating the queue server first does not affect the
+    # correctness of cleanup, but it does produce redundant error messages.
     subprocess_utils.run_in_parallel(cleanup,
-                                     sub_procs,
+                                     list(reversed(sub_procs)),
                                      num_threads=len(sub_procs))
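
Reversing teardown order is the usual way to respect start-up dependencies: dependents go down before the services they rely on. Illustratively (process names are made up):

    # Started in dependency order: the queue server first, then its consumers.
    sub_procs = ['queue_server', 'worker', 'api_frontend']

    # Clean up in reverse so nothing outlives a service it depends on.
    for name in reversed(sub_procs):
        print(f'cleaning up {name}')
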
@@ -215,7 +215,13 @@ def _get_cloud_dependencies_installation_commands(
     commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
                     f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
 
-    for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
+    enabled_compute_clouds = set(
+        sky_check.get_cached_enabled_clouds_or_refresh())
+    enabled_storage_clouds = set(
+        sky_check.get_cached_enabled_storage_clouds_or_refresh())
+    enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
+
+    for cloud in enabled_clouds:
         cloud_python_dependencies: List[str] = copy.deepcopy(
             dependencies.extras_require[cloud.canonical_name()])
 
@@ -167,7 +167,9 @@ def deploy_local_cluster(gpus: bool):
                          f'\nError: {stderr}')
     # Run sky check
     with rich_utils.safe_status('[bold cyan]Running sky check...'):
-        sky_check.check(clouds=['kubernetes'], quiet=True)
+        sky_check.check(clouds=['kubernetes'],
+                        quiet=True,
+                        capability=sky_check.CloudCapability.COMPUTE)
     if cluster_created:
         # Prepare completion message which shows CPU and GPU count
         # Get number of CPUs
@@ -1,4 +1,5 @@
 """Utility functions for subprocesses."""
+import multiprocessing
 from multiprocessing import pool
 import os
 import random
@@ -181,29 +182,6 @@ def kill_children_processes(parent_pids: Optional[Union[
     if isinstance(parent_pids, int):
         parent_pids = [parent_pids]
 
-    def kill(proc: psutil.Process):
-        if not proc.is_running():
-            # Skip if the process is not running.
-            return
-        logger.debug(f'Killing process {proc.pid}')
-        try:
-            if force:
-                proc.kill()
-            else:
-                proc.terminate()
-                proc.wait(timeout=10)
-        except psutil.NoSuchProcess:
-            # The child process may have already been terminated.
-            pass
-        except psutil.TimeoutExpired:
-            logger.debug(
-                f'Process {proc.pid} did not terminate after 10 seconds')
-            # Attempt to force kill if the normal termination fails
-            if not force:
-                logger.debug(f'Force killing process {proc.pid}')
-                proc.kill()
-                proc.wait(timeout=5)  # Shorter timeout after force kill
-
     parent_processes = []
     if parent_pids is None:
         parent_processes = [psutil.Process()]
@@ -218,10 +196,54 @@ def kill_children_processes(parent_pids: Optional[Union[
     for parent_process in parent_processes:
         child_processes = parent_process.children(recursive=True)
         if parent_pids is not None:
-            kill(parent_process)
+            kill_process_with_grace_period(parent_process, force=force)
         logger.debug(f'Killing child processes: {child_processes}')
         for child in child_processes:
-            kill(child)
+            kill_process_with_grace_period(child, force=force)
+
+
+def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
+                                               psutil.Process],
+                                   force: bool = False,
+                                   grace_period: int = 10) -> None:
+    """Kill a process with SIGTERM and wait for it to exit.
+
+    Args:
+        proc: The process to kill, either a multiprocessing.Process or a
+            psutil.Process.
+        force: Whether to force kill the process.
+        grace_period: The grace period in seconds to wait for the process
+            to exit.
+    """
+    if isinstance(proc, psutil.Process):
+        alive = proc.is_running
+        wait = proc.wait
+    else:
+        alive = proc.is_alive
+        wait = proc.join
+    if not alive():
+        # Skip if the process is not running.
+        return
+    logger.debug(f'Killing process {proc.pid}')
+    try:
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        wait(timeout=grace_period)
+    except (psutil.NoSuchProcess, ValueError):
+        # The child process may have already been terminated.
+        return
+    except psutil.TimeoutExpired:
+        # Pass to finally to force kill the process.
+        pass
+    finally:
+        logger.debug(f'Process {proc.pid} did not terminate after '
+                     f'{grace_period} seconds')
+        # Attempt to force kill if the normal termination fails
+        if not force:
+            logger.debug(f'Force killing process {proc.pid}')
+            # Shorter timeout after force kill
+            kill_process_with_grace_period(proc, force=True, grace_period=5)
 
 
 def run_with_retries(
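
A usage sketch of the new helper, assuming it is imported from sky.utils.subprocess_utils:

    import multiprocessing
    import time

    from sky.utils.subprocess_utils import kill_process_with_grace_period

    def _worker():
        while True:
            time.sleep(1)

    if __name__ == '__main__':
        proc = multiprocessing.Process(target=_worker)
        proc.start()
        # SIGTERM first; force-kills if still alive after the grace period.
        kill_process_with_grace_period(proc, grace_period=5)
        print('worker exited:', not proc.is_alive())
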
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250318
+Version: 1.0.0.dev20250320
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -156,6 +156,7 @@ Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: license
+Dynamic: license-file
 Dynamic: project-url
 Dynamic: provides-extra
 Dynamic: requires-dist