skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (68)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +125 -22
  3. sky/backends/cloud_vm_ray_backend.py +224 -72
  4. sky/catalog/__init__.py +7 -0
  5. sky/catalog/aws_catalog.py +4 -0
  6. sky/catalog/common.py +18 -0
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +2 -71
  9. sky/client/sdk_async.py +5 -2
  10. sky/clouds/aws.py +23 -5
  11. sky/clouds/cloud.py +8 -0
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +34 -0
  31. sky/jobs/client/sdk_async.py +4 -2
  32. sky/jobs/controller.py +4 -2
  33. sky/jobs/recovery_strategy.py +1 -1
  34. sky/jobs/state.py +26 -16
  35. sky/jobs/utils.py +6 -11
  36. sky/logs/agent.py +10 -2
  37. sky/provision/kubernetes/config.py +7 -2
  38. sky/provision/kubernetes/instance.py +84 -41
  39. sky/provision/vast/instance.py +1 -1
  40. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  41. sky/server/config.py +14 -5
  42. sky/server/metrics.py +41 -8
  43. sky/server/requests/executor.py +41 -4
  44. sky/server/server.py +1 -0
  45. sky/server/uvicorn.py +11 -5
  46. sky/skylet/constants.py +12 -7
  47. sky/skylet/log_lib.py +11 -0
  48. sky/skylet/log_lib.pyi +9 -0
  49. sky/task.py +62 -0
  50. sky/templates/kubernetes-ray.yml.j2 +120 -3
  51. sky/utils/accelerator_registry.py +3 -1
  52. sky/utils/command_runner.py +35 -11
  53. sky/utils/command_runner.pyi +22 -0
  54. sky/utils/context_utils.py +15 -2
  55. sky/utils/db/migration_utils.py +1 -1
  56. sky/utils/git.py +559 -1
  57. sky/utils/resource_checker.py +8 -7
  58. sky/workspaces/core.py +57 -21
  59. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
  60. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
  61. sky/client/cli/git.py +0 -549
  62. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  63. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  64. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  66. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  67. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  68. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -7,9 +7,11 @@ import json
 import math
 import os
 import pathlib
+import random
 import re
 import shlex
 import signal
+import socket
 import subprocess
 import sys
 import tempfile
@@ -48,6 +50,7 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
@@ -87,7 +90,11 @@ if typing.TYPE_CHECKING:
     from sky.schemas.generated import autostopv1_pb2_grpc
 else:
     # To avoid requiring grpcio to be installed on the client side.
-    grpc = adaptors_common.LazyImport('grpc')
+    grpc = adaptors_common.LazyImport(
+        'grpc',
+        # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+        if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
     autostopv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2')
     autostopv1_pb2_grpc = adaptors_common.LazyImport(
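
Editor's note: the `set_loggers` hook runs just before the deferred module is actually imported, so `GRPC_VERBOSITY=NONE` is already in the environment when gRPC's C core initializes. A minimal sketch of the idea, assuming a simplified `LazyImport` (the real `adaptors_common.LazyImport` has more machinery):

    import importlib
    from typing import Callable, Optional

    class LazyImport:
        # Defer the import until first attribute access (illustrative sketch).
        def __init__(self, name: str,
                     set_loggers: Optional[Callable[[], None]] = None):
            self._name = name
            self._set_loggers = set_loggers
            self._module = None

        def __getattr__(self, attr):
            if self._module is None:
                if self._set_loggers is not None:
                    # Runs before the real import, so env vars such as
                    # GRPC_VERBOSITY take effect when the extension loads.
                    self._set_loggers()
                self._module = importlib.import_module(self._name)
            return getattr(self._module, attr)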
@@ -330,6 +337,8 @@ class RayCodeGen:
 
             SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
 
+            CANCELLED_RETURN_CODE = 137
+
             kwargs = dict()
             # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
             # the directory exists for backward compatibility for the VM
@@ -345,8 +354,10 @@ class RayCodeGen:
             def get_or_fail(futures, pg) -> List[int]:
                 \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
                 if not futures:
-                    return []
+                    return [], []
                 returncodes = [1] * len(futures)
+                pids = [None] * len(futures)
+                failed = False
                 # Wait for 1 task to be ready.
                 ready = []
                 # Keep invoking ray.wait if ready is empty. This is because
@@ -355,12 +366,22 @@
                 # before becoming ready.
                 # (Such tasks are common in serving jobs.)
                 # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+                def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+                    nonlocal returncodes, pids, failed
+                    for task in tasks:
+                        idx = futures.index(task)
+                        res = ray.get(task)
+                        returncodes[idx] = res['return_code']
+                        pids[idx] = res['pid']
+                        if res['return_code'] != 0:
+                            failed = True
+
                 while not ready:
                     ready, unready = ray.wait(futures)
-                    idx = futures.index(ready[0])
-                    returncodes[idx] = ray.get(ready[0])
+                    handle_ready_tasks(ready)
                 while unready:
-                    if returncodes[idx] != 0:
+                    if failed:
                         for task in unready:
                             # ray.cancel without force fails to kill tasks.
                             # We use force=True to kill unready tasks.
@@ -368,17 +389,16 @@
                             # Use SIGKILL=128+9 to indicate the task is forcely
                             # killed.
                             idx = futures.index(task)
-                            returncodes[idx] = 137
+                            returncodes[idx] = CANCELLED_RETURN_CODE
                         break
                     ready, unready = ray.wait(unready)
-                    idx = futures.index(ready[0])
-                    returncodes[idx] = ray.get(ready[0])
+                    handle_ready_tasks(ready)
                 # Remove the placement group after all tasks are done, so that
                 # the next job can be scheduled on the released resources
                 # immediately.
                 ray_util.remove_placement_group(pg)
                 sys.stdout.flush()
-                return returncodes
+                return returncodes, pids
 
             run_fn = None
             futures = []
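
Editor's note: the rewrite changes `get_or_fail`'s contract so that each remote task resolves to a dict carrying both the exit code and the worker's PID, and the function returns a `(returncodes, pids)` pair. A hedged sketch of the task-side shape this implies (the real body lives in `sky/skylet/log_lib.py` and streams logs while the command runs):

    import os
    import subprocess
    from typing import Any, Dict

    def run_bash_command_with_log_and_return_pid(
            bash_command: str, **kwargs: Any) -> Dict[str, int]:
        # Illustrative only: run the command, then report both the exit
        # code and the PID of the worker process that ran it.
        proc = subprocess.Popen(['/bin/bash', '-c', bash_command])
        return {'return_code': proc.wait(), 'pid': os.getpid()}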
@@ -394,7 +414,10 @@
             inspect.getsource(log_lib.make_task_bash_script),
             inspect.getsource(log_lib.add_ray_env_vars),
             inspect.getsource(log_lib.run_bash_command_with_log),
-            'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
+            inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+            'run_bash_command_with_log = run_bash_command_with_log',
+            'run_bash_command_with_log_and_return_pid = \
+                ray.remote(run_bash_command_with_log_and_return_pid)',
         ]
         # Currently, the codegen program is/can only be submitted to the head
         # node, due to using job_lib for updating job statuses, and using
@@ -499,7 +522,7 @@
             total_num_nodes = len(ray.nodes())
             setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
             setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-            setup_workers = [run_bash_command_with_log \\
+            setup_workers = [run_bash_command_with_log_and_return_pid \\
                 .options(
                     name='setup',
                     num_cpus=_SETUP_CPUS,
@@ -514,15 +537,25 @@
                     stream_logs=True,
                     with_ray=True,
                 ) for i in range(total_num_nodes)]
-            setup_returncodes = get_or_fail(setup_workers, setup_pg)
-            if sum(setup_returncodes) != 0:
+            setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+            success = True
+            failed_workers_and_returncodes = []
+            for i in range(len(setup_returncodes)):
+                returncode = setup_returncodes[i]
+                pid = setup_pids[i]
+                if pid == None:
+                    pid = os.getpid()
+                if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+                    success = False
+                    failed_workers_and_returncodes.append((pid, returncode))
+            if not success:
+                msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+                msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+                msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+                print(msg, flush=True)
                 job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                 # This waits for all streaming logs to finish.
                 time.sleep(1)
-                print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-                      'return code list:{colorama.Style.RESET_ALL}',
-                      setup_returncodes,
-                      flush=True)
                 # Need this to set the job status in ray job to be FAILED.
                 sys.exit(1)
         """)
@@ -695,7 +728,7 @@
 
             sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
 
-            futures.append(run_bash_command_with_log \\
+            futures.append(run_bash_command_with_log_and_return_pid \\
                 .options(name=name_str, {options_str}) \\
                 .remote(
                     script,
@@ -714,7 +747,7 @@
 
         self._code += [
             textwrap.dedent(f"""\
-            returncodes = get_or_fail(futures, pg)
+            returncodes, _ = get_or_fail(futures, pg)
             if sum(returncodes) != 0:
                 job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                 # Schedule the next pending job immediately to make the job
@@ -1340,6 +1373,34 @@ class RetryingVmProvisioner(object):
             zones = [clouds.Zone(name=to_provision.zone)]
             yield zones
 
+    def _insufficient_resources_msg(
+        self,
+        to_provision: resources_lib.Resources,
+        requested_resources: Set[resources_lib.Resources],
+        insufficient_resources: Optional[List[str]],
+    ) -> str:
+        insufficent_resource_msg = ('' if insufficient_resources is None else
+                                    f' ({", ".join(insufficient_resources)})')
+        message = f'Failed to acquire resources{insufficent_resource_msg} '
+        if to_provision.zone is not None:
+            message += (f'in {to_provision.zone} for {requested_resources}. ')
+        elif to_provision.region is not None and to_provision.cloud is not None:
+            # For public clouds, provision.region is always set.
+            if clouds.SSH().is_same_cloud(to_provision.cloud):
+                message += (
+                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'for {requested_resources}. The SSH Node Pool may not '
+                    'have enough resources.')
+            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+                message += (f'in context {to_provision.region} for '
+                            f'{requested_resources}. ')
+            else:
+                message += (f'in all zones in {to_provision.region} for '
+                            f'{requested_resources}. ')
+        else:
+            message += (f'{to_provision.cloud} for {requested_resources}. ')
+        return message
+
     def _retry_zones(
         self,
         to_provision: resources_lib.Resources,
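
Editor's note: pulling the message construction into `_insufficient_resources_msg` also lets the Kubernetes failover path prepend the specific resources reported short. Roughly, the shapes produced (illustrative values, not actual output):

    # zone set:       'Failed to acquire resources (cpu, memory) in us-east-1a for {...}. '
    # Kubernetes:     'Failed to acquire resources in context my-ctx for {...}. '
    # region, public: 'Failed to acquire resources in all zones in us-east-1 for {...}. '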
@@ -1418,6 +1479,7 @@ class RetryingVmProvisioner(object):
             f'To request quotas, check the instruction: '
             f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
 
+        insufficient_resources = None
         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
                                        prev_cluster_ever_up):
@@ -1630,6 +1692,24 @@
                 # No teardown happens for this error.
                 with ux_utils.print_exception_no_traceback():
                     raise
+            except config_lib.KubernetesError as e:
+                if e.insufficent_resources:
+                    insufficient_resources = e.insufficent_resources
+                # NOTE: We try to cleanup the cluster even if the previous
+                # cluster does not exist. Also we are fast at
+                # cleaning up clusters now if there is no existing node.
+                CloudVmRayBackend().post_teardown_cleanup(
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False,
+                    failover=True,
+                )
+                # TODO(suquark): other clouds may have different zone
+                # blocking strategy. See '_update_blocklist_on_error'
+                # for details.
+                FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                    self._blocked_resources, to_provision, region, zones, e)
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
@@ -1760,26 +1840,9 @@ class RetryingVmProvisioner(object):
                 terminate=terminate_or_stop,
                 remove_from_db=False)
 
-        if to_provision.zone is not None:
-            message = (
-                f'Failed to acquire resources in {to_provision.zone} for '
-                f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            if clouds.SSH().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in SSH Node Pool '
-                           f'({to_provision.region.lstrip("ssh-")}) for '
-                           f'{requested_resources}. The SSH Node Pool may not '
-                           'have enough resources.')
-            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in context '
-                           f'{to_provision.region} for {requested_resources}. ')
-            else:
-                message = ('Failed to acquire resources in all zones in '
-                           f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -2261,8 +2324,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
-    # compaitibility logic in __setstate__.
-    _VERSION = 11
+    # compatibility logic in __setstate__ and/or __getstate__.
+    _VERSION = 12
 
     def __init__(
         self,
@@ -2296,7 +2359,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
         self.is_grpc_enabled = True
-        self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2313,8 +2375,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
                 f'\n\tssh_user={self.ssh_user},'
-                f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
-                f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2643,11 +2704,74 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                                               cluster_config_file)
         self.docker_user = docker_user
 
+    def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+        metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name)
+        if metadata is None:
+            return None
+        return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+    def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+        global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name,
+            (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
     def get_grpc_channel(self) -> 'grpc.Channel':
-        if self.skylet_ssh_tunnel is None:
-            self.open_and_update_skylet_tunnel()
-        assert self.skylet_ssh_tunnel is not None
-        return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
+        # It's fine to not grab the lock here, as we're only reading,
+        # and writes are very rare.
+        # It's acceptable to read while another process is opening a tunnel,
+        # because it will only happen on:
+        # 1. A new cluster who has no tunnel yet, or
+        # 2. A cluster with an unhealthy tunnel
+        # For (2), for processes that read the "stale" tunnel, it will fail
+        # and on the next retry, it will call get_grpc_channel again
+        # and get the new tunnel.
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is not None:
+            try:
+                # Check if the tunnel is open.
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.settimeout(0.5)
+                    s.connect(('localhost', tunnel.port))
+                return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            except socket.error as e:
+                logger.warning(
+                    'Failed to connect to SSH tunnel for cluster '
+                    f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                    'acquiring lock')
+                pass
+        lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+        lock_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+        lock = locks.get_lock(lock_id, lock_timeout)
+        try:
+            with lock.acquire(blocking=True):
+                # Re-read the tunnel from the DB.
+                tunnel = self._get_skylet_ssh_tunnel()
+                if tunnel is None:
+                    logger.debug('No SSH tunnel found for cluster '
+                                 f'{self.cluster_name!r}, '
+                                 'opening the tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                try:
+                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                        s.settimeout(0.5)
+                        s.connect(('localhost', tunnel.port))
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                except socket.error as e:
+                    logger.warning(
+                        'Failed to connect to SSH tunnel for cluster '
+                        f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                        'opening new tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+        except locks.LockTimeout as e:
+            raise RuntimeError(
+                'Failed to get gRPC channel for cluster '
+                f'{self.cluster_name!r} due to a timeout when waiting for the '
+                'SSH tunnel to be opened. Please try again or manually remove '
+                f'the lock at {lock_id}. '
+                f'{common_utils.format_exception(e)}') from e
 
     def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
         """Clean up an SSH tunnel by terminating the process."""
@@ -2668,31 +2792,48 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             logger.warning(
                 f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
 
-    def open_and_update_skylet_tunnel(self) -> None:
+    def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
         """Opens an SSH tunnel to the Skylet on the head node,
         updates the cluster handle, and persists it to the database."""
-        local_port = common_utils.find_free_port(10000)
-        runners = self.get_command_runners()
-        head_runner = runners[0]
-        if isinstance(head_runner, command_runner.SSHCommandRunner):
-            # Disabling ControlMaster makes things easier to reason about
-            # with respect to resource management/ownership,
-            # as killing the process will close the tunnel too.
-            head_runner.disable_control_master = True
-
-        cmd = head_runner.port_forward_command([(local_port,
-                                                 constants.SKYLET_GRPC_PORT)])
-        ssh_tunnel_proc = subprocess.Popen(cmd)
-        tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
+        max_attempts = 3
+        # There could be a race condition here, as multiple processes may
+        # attempt to open the same port at the same time.
+        for attempt in range(max_attempts):
+            runners = self.get_command_runners()
+            head_runner = runners[0]
+            local_port = random.randint(10000, 65535)
+            try:
+                ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+                    head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+            except exceptions.CommandError as e:
+                # Don't retry if the error is due to timeout,
+                # connection refused, Kubernetes pods not found,
+                # or an in-progress termination.
+                if (e.detailed_reason is not None and
+                    (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+                        e.detailed_reason) or
+                     backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+                         e.detailed_reason) or attempt == max_attempts - 1)):
+                    raise e
+                logger.warning(
+                    f'Failed to open SSH tunnel on port {local_port} '
+                    f'({attempt + 1}/{max_attempts}). '
+                    f'{e.error_msg}\n{e.detailed_reason}')
+                continue
+            tunnel_info = SSHTunnelInfo(port=local_port,
+                                        pid=ssh_tunnel_proc.pid)
+            break
+
         try:
             grpc.channel_ready_future(
                 grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
                     timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
             # Clean up existing tunnel before setting up the new one.
-            if self.skylet_ssh_tunnel is not None:
-                self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
-            self.skylet_ssh_tunnel = tunnel_info
-            global_user_state.update_cluster_handle(self.cluster_name, self)
+            old_tunnel = self._get_skylet_ssh_tunnel()
+            if old_tunnel is not None:
+                self._cleanup_ssh_tunnel(old_tunnel)
+            self._set_skylet_ssh_tunnel(tunnel_info)
+            return tunnel_info
         except grpc.FutureTimeoutError as e:
             self._cleanup_ssh_tunnel(tunnel_info)
             logger.warning(
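
Editor's note: choosing a random port in [10000, 65535] spreads concurrent callers out, replacing the old sequential `find_free_port(10000)` probe that made them collide. Any probe-then-bind scheme still races between picking the port and the ssh process binding it, hence the bounded retry. A hedged illustration of that race-tolerant loop:

    import random
    import socket

    def pick_free_local_port(max_attempts: int = 3) -> int:
        # Best-effort probe; the probe-to-bind race is why callers still
        # retry, much as the diff does around open_ssh_tunnel.
        for attempt in range(max_attempts):
            port = random.randint(10000, 65535)
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(('localhost', port))
                return port
            except OSError:
                continue
        raise RuntimeError(f'No free port found after {max_attempts} attempts.')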
@@ -2752,6 +2893,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
         return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # For backwards compatibility. Refer to
+        # https://github.com/skypilot-org/skypilot/pull/7133
+        state.setdefault('skylet_ssh_tunnel', None)
+        return state
+
     def __setstate__(self, state):
         self._version = self._VERSION
 
@@ -2809,6 +2957,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             state['is_grpc_enabled'] = False
             state['skylet_ssh_tunnel'] = None
 
+        if version >= 12:
+            # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+            state.pop('skylet_ssh_tunnel', None)
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
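
Editor's note: `__getstate__` and `__setstate__` form the usual pickle-migration pair here: `_VERSION` is stamped into new pickles, old pickles are upgraded on load, and `__getstate__` keeps the dropped field present so older readers can still unpickle handles written by new code. A condensed sketch of the pattern (class and field names mirror the diff but the body is illustrative):

    class Handle:
        # Bump whenever pickled fields change; migrate old pickles in
        # __setstate__, keep older readers happy in __getstate__.
        _VERSION = 12

        def __getstate__(self):
            state = self.__dict__.copy()
            # Older readers still expect this attribute to exist.
            state.setdefault('skylet_ssh_tunnel', None)
            return state

        def __setstate__(self, state):
            version = state.pop('_version', 1)
            if version >= 12:
                # Moved to the DB in v12; drop it from new-format pickles.
                state.pop('skylet_ssh_tunnel', None)
            self._version = self._VERSION
            self.__dict__.update(state)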
@@ -3115,7 +3267,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                  colorama.Style.RESET_ALL +
                                  colorama.Style.DIM +
                                  'Check concurrent requests: ' +
-                                 'sky api status '))
+                                 'sky api status -v | grep '
+                                 f'{cluster_name}'))
 
     def _locked_provision(
         self,
@@ -4974,9 +5127,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
                 down=down,
             )
-            backend_utils.invoke_skylet_with_retries(
-                handle, lambda: SkyletClient(handle.get_grpc_channel()).
-                set_autostop(request))
+            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                handle.get_grpc_channel()).set_autostop(request))
         else:
             code = autostop_lib.AutostopCodeGen.set_autostop(
                 idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5167,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         try:
             request = autostopv1_pb2.IsAutostoppingRequest()
             response = backend_utils.invoke_skylet_with_retries(
-                handle, lambda: SkyletClient(handle.get_grpc_channel()).
-                is_autostopping(request))
+                lambda: SkyletClient(handle.get_grpc_channel()
+                                    ).is_autostopping(request))
             return response.is_autostopping
         except Exception as e:  # pylint: disable=broad-except
             # The cluster may have been terminated, causing the gRPC call
sky/catalog/__init__.py CHANGED
@@ -247,6 +247,13 @@ def get_accelerators_from_instance_type(
                                instance_type)
 
 
+def get_arch_from_instance_type(instance_type: str,
+                                clouds: CloudFilter = None) -> Optional[str]:
+    """Returns the arch from a instance type."""
+    return _map_clouds_catalog(clouds, 'get_arch_from_instance_type',
+                               instance_type)
+
+
 def get_instance_type_for_accelerator(
     acc_name: str,
     acc_count: Union[int, float],
sky/catalog/aws_catalog.py CHANGED
@@ -271,6 +271,10 @@ def get_accelerators_from_instance_type(
         _get_df(), instance_type)
 
 
+def get_arch_from_instance_type(instance_type: str) -> Optional[str]:
+    return common.get_arch_from_instance_type_impl(_get_df(), instance_type)
+
+
 def get_instance_type_for_accelerator(
     acc_name: str,
     acc_count: int,
sky/catalog/common.py CHANGED
@@ -527,6 +527,24 @@ def get_accelerators_from_instance_type_impl(
     return {acc_name: _convert(acc_count)}
 
 
+def get_arch_from_instance_type_impl(
+        df: 'pd.DataFrame',
+        instance_type: str,
+) -> Optional[str]:
+    df = _get_instance_type(df, instance_type, None)
+    if df.empty:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'No instance type {instance_type} found.')
+    row = df.iloc[0]
+    if 'Arch' not in row:
+        return None
+    arch = row['Arch']
+    if pd.isnull(arch):
+        return None
+
+    return arch
+
+
 def get_instance_type_for_accelerator_impl(
         df: 'pd.DataFrame',
         acc_name: str,
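
Editor's note: the lookup semantics are plain pandas: find the catalog row for the instance type and return its `Arch` cell unless the column is absent or null. A tiny illustration with a made-up one-row catalog:

    import pandas as pd

    df = pd.DataFrame([{'InstanceType': 'm6g.large', 'Arch': 'arm64'}])
    row = df[df['InstanceType'] == 'm6g.large'].iloc[0]
    arch = row['Arch'] if 'Arch' in row and not pd.isnull(row['Arch']) else None
    print(arch)  # arm64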
sky/catalog/data_fetchers/fetch_aws.py CHANGED
@@ -67,7 +67,7 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
 # The following columns will be included in the final catalog.
 USEFUL_COLUMNS = [
     'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
-    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
+    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
 ]
 
 # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
@@ -275,6 +275,17 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
             return None, np.nan
         return accelerator['Name'], accelerator['Count']
 
+    def get_arch(row) -> Optional[str]:
+        if 'ProcessorInfo' in row:
+            processor = row['ProcessorInfo']
+            if 'SupportedArchitectures' in processor:
+                archs = processor['SupportedArchitectures']
+                if isinstance(archs, list):
+                    return archs[0]
+                elif isinstance(archs, str):
+                    return archs
+        return None
+
     def get_vcpus(row) -> float:
         if not np.isnan(row['vCPU']):
             return float(row['vCPU'])
@@ -332,6 +343,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
             'AcceleratorCount': acc_count,
             'vCPUs': get_vcpus(row),
             'MemoryGiB': get_memory_gib(row),
+            'Arch': get_arch(row),
         })
 
     # The AWS API may not have all the instance types in the pricing table,
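
Editor's note: `get_arch` tolerates the two shapes the EC2 instance-type data can take for `ProcessorInfo.SupportedArchitectures` (a list or a bare string) and falls back to `None`. A quick dict-based re-statement of the same branches, checked against made-up rows:

    rows = [
        {'ProcessorInfo': {'SupportedArchitectures': ['x86_64', 'i386']}},
        {'ProcessorInfo': {'SupportedArchitectures': 'arm64'}},
        {},  # no ProcessorInfo at all
    ]

    def get_arch(row):
        processor = row.get('ProcessorInfo', {})
        archs = processor.get('SupportedArchitectures')
        if isinstance(archs, list):
            return archs[0]
        if isinstance(archs, str):
            return archs
        return None

    print([get_arch(r) for r in rows])  # ['x86_64', 'arm64', None]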