skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic; review the file changes listed below for details.

Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +160 -23
  4. sky/backends/cloud_vm_ray_backend.py +226 -74
  5. sky/catalog/__init__.py +7 -0
  6. sky/catalog/aws_catalog.py +4 -0
  7. sky/catalog/common.py +18 -0
  8. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  9. sky/client/cli/command.py +2 -71
  10. sky/client/sdk.py +20 -0
  11. sky/client/sdk_async.py +23 -18
  12. sky/clouds/aws.py +26 -6
  13. sky/clouds/cloud.py +8 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  17. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  18. sky/dashboard/out/clusters/[cluster].html +1 -1
  19. sky/dashboard/out/clusters.html +1 -1
  20. sky/dashboard/out/config.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra/[context].html +1 -1
  23. sky/dashboard/out/infra.html +1 -1
  24. sky/dashboard/out/jobs/[job].html +1 -1
  25. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/volumes.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/data/storage.py +5 -1
  33. sky/execution.py +21 -14
  34. sky/global_user_state.py +34 -0
  35. sky/jobs/client/sdk_async.py +4 -2
  36. sky/jobs/constants.py +3 -0
  37. sky/jobs/controller.py +734 -310
  38. sky/jobs/recovery_strategy.py +251 -129
  39. sky/jobs/scheduler.py +247 -174
  40. sky/jobs/server/core.py +20 -4
  41. sky/jobs/server/utils.py +2 -2
  42. sky/jobs/state.py +709 -508
  43. sky/jobs/utils.py +90 -40
  44. sky/logs/agent.py +10 -2
  45. sky/provision/aws/config.py +4 -1
  46. sky/provision/gcp/config.py +6 -1
  47. sky/provision/kubernetes/config.py +7 -2
  48. sky/provision/kubernetes/instance.py +84 -41
  49. sky/provision/kubernetes/utils.py +17 -8
  50. sky/provision/provisioner.py +1 -0
  51. sky/provision/vast/instance.py +1 -1
  52. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  53. sky/serve/replica_managers.py +0 -7
  54. sky/serve/serve_utils.py +5 -0
  55. sky/serve/server/impl.py +1 -2
  56. sky/serve/service.py +0 -2
  57. sky/server/common.py +8 -3
  58. sky/server/config.py +55 -27
  59. sky/server/constants.py +1 -0
  60. sky/server/daemons.py +7 -11
  61. sky/server/metrics.py +41 -8
  62. sky/server/requests/executor.py +41 -4
  63. sky/server/requests/serializers/encoders.py +1 -1
  64. sky/server/server.py +9 -1
  65. sky/server/uvicorn.py +11 -5
  66. sky/setup_files/dependencies.py +4 -2
  67. sky/skylet/attempt_skylet.py +1 -0
  68. sky/skylet/constants.py +14 -7
  69. sky/skylet/events.py +2 -10
  70. sky/skylet/log_lib.py +11 -0
  71. sky/skylet/log_lib.pyi +9 -0
  72. sky/task.py +62 -0
  73. sky/templates/kubernetes-ray.yml.j2 +120 -3
  74. sky/utils/accelerator_registry.py +3 -1
  75. sky/utils/command_runner.py +35 -11
  76. sky/utils/command_runner.pyi +25 -3
  77. sky/utils/common_utils.py +11 -1
  78. sky/utils/context_utils.py +15 -2
  79. sky/utils/controller_utils.py +5 -0
  80. sky/utils/db/db_utils.py +31 -2
  81. sky/utils/db/migration_utils.py +1 -1
  82. sky/utils/git.py +559 -1
  83. sky/utils/resource_checker.py +8 -7
  84. sky/utils/rich_utils.py +3 -1
  85. sky/utils/subprocess_utils.py +9 -0
  86. sky/volumes/volume.py +2 -0
  87. sky/workspaces/core.py +57 -21
  88. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
  89. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
  90. sky/client/cli/git.py +0 -549
  91. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  92. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  93. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  94. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  95. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
7
7
  from sky.utils import directory_utils
8
8
 
9
9
  # Replaced with the current commit when building the wheels.
10
- _SKYPILOT_COMMIT_SHA = 'a5750884dc37f2134ae8d1ddacc247d12d8fa74e'
10
+ _SKYPILOT_COMMIT_SHA = '16e9ad33e647ab3859a2b5624be7386721c9ef8b'
11
11
 
12
12
 
13
13
  def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
37
37
 
38
38
 
39
39
  __commit__ = _get_git_commit()
40
- __version__ = '1.0.0.dev20250909'
40
+ __version__ = '1.0.0.dev20250912'
41
41
  __root_dir__ = directory_utils.get_sky_dir()
42
42
 
43
43
 
sky/authentication.py CHANGED
@@ -207,6 +207,24 @@ def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
207
207
  return config
208
208
 
209
209
 
210
+ def parse_gcp_project_oslogin(project):
211
+ """Helper function to parse GCP project metadata."""
212
+ common_metadata = project.get('commonInstanceMetadata', {})
213
+ if not isinstance(common_metadata, dict):
214
+ common_metadata = {}
215
+
216
+ metadata_items = common_metadata.get('items', [])
217
+ if not isinstance(metadata_items, list):
218
+ metadata_items = []
219
+
220
+ project_oslogin = next(
221
+ (item for item in metadata_items
222
+ if isinstance(item, dict) and item.get('key') == 'enable-oslogin'),
223
+ {}).get('value', 'False')
224
+
225
+ return project_oslogin
226
+
227
+
210
228
  # Snippets of code inspired from
211
229
  # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py
212
230
  # Takes in config, a yaml dict and outputs a postprocessed dict
@@ -264,10 +282,7 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
264
282
  'Please check your network connection.')
265
283
  raise
266
284
 
267
- project_oslogin: str = next( # type: ignore
268
- (item for item in project['commonInstanceMetadata'].get('items', [])
269
- if item['key'] == 'enable-oslogin'), {}).get('value', 'False')
270
-
285
+ project_oslogin = parse_gcp_project_oslogin(project)
271
286
  if project_oslogin.lower() == 'true':
272
287
  logger.info(
273
288
  f'OS Login is enabled for GCP project {project_id}. Running '
@@ -1,4 +1,5 @@
1
1
  """Util constants/functions for the backends."""
2
+ import asyncio
2
3
  from datetime import datetime
3
4
  import enum
4
5
  import fnmatch
@@ -6,20 +7,24 @@ import hashlib
6
7
  import os
7
8
  import pathlib
8
9
  import pprint
10
+ import queue as queue_lib
9
11
  import re
10
12
  import shlex
11
13
  import subprocess
12
14
  import sys
13
15
  import tempfile
16
+ import threading
14
17
  import time
15
18
  import typing
16
19
  from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
17
20
  TypeVar, Union)
18
21
  import uuid
19
22
 
23
+ import aiohttp
24
+ from aiohttp import ClientTimeout
25
+ from aiohttp import TCPConnector
20
26
  import colorama
21
27
  from packaging import version
22
- import psutil
23
28
  from typing_extensions import Literal
24
29
 
25
30
  import sky
@@ -107,8 +112,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
107
112
  # 10.133.0.5: ray.worker.default,
108
113
  _LAUNCHING_IP_PATTERN = re.compile(
109
114
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
115
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
116
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
110
117
  _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
111
118
  re.IGNORECASE)
119
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
120
+ re.IGNORECASE)
112
121
  _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
113
122
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
114
123
 
@@ -131,6 +140,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
131
140
 
132
141
  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
133
142
  WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
143
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
134
144
 
135
145
  # Remote dir that holds our runtime files.
136
146
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -209,6 +219,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
209
219
  ('provider', 'availability_zone'),
210
220
  ]
211
221
 
222
+ _ACK_MESSAGE = 'ack'
223
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
224
+
212
225
 
213
226
  def is_ip(s: str) -> bool:
214
227
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -535,7 +548,7 @@ def get_expirable_clouds(
535
548
  # get all custom contexts
536
549
  contexts = kubernetes_utils.get_custom_config_k8s_contexts()
537
550
  # add remote_identity of each context if it exists
538
- remote_identities = None
551
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
539
552
  for context in contexts:
540
553
  context_remote_identity = skypilot_config.get_effective_region_config(
541
554
  cloud='kubernetes',
@@ -546,9 +559,11 @@ def get_expirable_clouds(
546
559
  if remote_identities is None:
547
560
  remote_identities = []
548
561
  if isinstance(context_remote_identity, str):
562
+ assert isinstance(remote_identities, list)
549
563
  remote_identities.append(
550
564
  {context: context_remote_identity})
551
565
  elif isinstance(context_remote_identity, list):
566
+ assert isinstance(remote_identities, list)
552
567
  remote_identities.extend(context_remote_identity)
553
568
  # add global kubernetes remote identity if it exists, if not, add default
554
569
  global_remote_identity = skypilot_config.get_effective_region_config(
@@ -560,8 +575,10 @@ def get_expirable_clouds(
560
575
  if remote_identities is None:
561
576
  remote_identities = []
562
577
  if isinstance(global_remote_identity, str):
578
+ assert isinstance(remote_identities, list)
563
579
  remote_identities.append({'*': global_remote_identity})
564
580
  elif isinstance(global_remote_identity, list):
581
+ assert isinstance(remote_identities, list)
565
582
  remote_identities.extend(global_remote_identity)
566
583
  if remote_identities is None:
567
584
  remote_identities = schemas.get_default_remote_identity(
@@ -1784,6 +1801,32 @@ def check_network_connection():
1784
1801
  'Network seems down.')
1785
1802
 
1786
1803
 
1804
+ async def async_check_network_connection():
1805
+ """Check if the network connection is available.
1806
+
1807
+ Tolerates 3 retries as it is observed that connections can fail.
1808
+ Uses aiohttp for async HTTP requests.
1809
+ """
1810
+ # Create a session with retry logic
1811
+ timeout = ClientTimeout(total=15)
1812
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
1813
+
1814
+ async with aiohttp.ClientSession(timeout=timeout,
1815
+ connector=connector) as session:
1816
+ for i, ip in enumerate(_TEST_IP_LIST):
1817
+ try:
1818
+ async with session.head(ip) as response:
1819
+ if response.status < 400: # Any 2xx or 3xx status is good
1820
+ return
1821
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
1822
+ if i == len(_TEST_IP_LIST) - 1:
1823
+ raise exceptions.NetworkError(
1824
+ 'Could not refresh the cluster. '
1825
+ 'Network seems down.') from e
1826
+ # If not the last IP, continue to try the next one
1827
+ continue
1828
+
1829
+
1787
1830
  @timeline.event
1788
1831
  def check_owner_identity(cluster_name: str) -> None:
1789
1832
  """Check if current user is the same as the user who created the cluster.
@@ -2638,7 +2681,7 @@ def refresh_cluster_record(
2638
2681
  'Refreshing status: Failed get the lock for cluster '
2639
2682
  f'{cluster_name!r}. Using the cached status.')
2640
2683
  return record
2641
- time.sleep(0.05)
2684
+ time.sleep(lock.poll_interval)
2642
2685
 
2643
2686
  # Refresh for next loop iteration.
2644
2687
  record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3548,19 +3591,126 @@ def workspace_lock_id(workspace_name: str) -> str:
3548
3591
  return f'{workspace_name}_workspace'
3549
3592
 
3550
3593
 
3594
+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
3595
+ """Get the lock ID for cluster tunnel operations."""
3596
+ return f'{cluster_name}_ssh_tunnel'
3597
+
3598
+
3599
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
3600
+ command_runner.KubernetesCommandRunner],
3601
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
3602
+ local_port, remote_port = port_forward
3603
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
3604
+ # Disabling ControlMaster makes things easier to reason about
3605
+ # with respect to resource management/ownership,
3606
+ # as killing the process will close the tunnel too.
3607
+ head_runner.disable_control_master = True
3608
+ head_runner.port_forward_execute_remote_command = True
3609
+
3610
+ # The default connect_timeout of 1s is too short for
3611
+ # connecting to clusters using a jump server.
3612
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
3613
+ # which is counted towards non-idleness.
3614
+ cmd: List[str] = head_runner.port_forward_command(
3615
+ [(local_port, remote_port)],
3616
+ connect_timeout=5,
3617
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
3618
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
3619
+ # cat so the command doesn't exit until we kill it
3620
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
3621
+ cmd_str = ' '.join(cmd)
3622
+ logger.debug(f'Running port forward command: {cmd_str}')
3623
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
3624
+ shell=True,
3625
+ stdin=subprocess.PIPE,
3626
+ stdout=subprocess.PIPE,
3627
+ stderr=subprocess.PIPE,
3628
+ start_new_session=True,
3629
+ text=True)
3630
+ # Wait until we receive an ack from the remote cluster or
3631
+ # the SSH connection times out.
3632
+ queue: queue_lib.Queue = queue_lib.Queue()
3633
+ stdout_thread = threading.Thread(
3634
+ target=lambda queue, stdout: queue.put(stdout.readline()),
3635
+ args=(queue, ssh_tunnel_proc.stdout),
3636
+ daemon=True)
3637
+ stdout_thread.start()
3638
+ while ssh_tunnel_proc.poll() is None:
3639
+ try:
3640
+ ack = queue.get_nowait()
3641
+ except queue_lib.Empty:
3642
+ ack = None
3643
+ time.sleep(0.1)
3644
+ continue
3645
+ assert ack is not None
3646
+ if isinstance(
3647
+ head_runner,
3648
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
3649
+ break
3650
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
3651
+ ) and _FORWARDING_FROM_MESSAGE in ack:
3652
+ # On kind clusters, this error occurs if we make a request
3653
+ # immediately after the port-forward is established on a new pod:
3654
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
3655
+ # failed to execute portforward in network namespace
3656
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
3657
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
3658
+ # connect: connection refused
3659
+ # So we need to poll the port on the pod to check if it is open.
3660
+ # We did not observe this with real Kubernetes clusters.
3661
+ timeout = 5
3662
+ port_check_cmd = (
3663
+ # We install netcat in our ray-node container,
3664
+ # so we can use it here.
3665
+ # (See kubernetes-ray.yml.j2)
3666
+ f'end=$((SECONDS+{timeout})); '
3667
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
3668
+ 'if (( SECONDS >= end )); then exit 1; fi; '
3669
+ 'sleep 0.1; '
3670
+ 'done')
3671
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
3672
+ require_outputs=True,
3673
+ stream_logs=False)
3674
+ if returncode != 0:
3675
+ try:
3676
+ ssh_tunnel_proc.terminate()
3677
+ ssh_tunnel_proc.wait(timeout=5)
3678
+ except subprocess.TimeoutExpired:
3679
+ ssh_tunnel_proc.kill()
3680
+ ssh_tunnel_proc.wait()
3681
+ finally:
3682
+ error_msg = (f'Failed to check remote port {remote_port}')
3683
+ if stdout:
3684
+ error_msg += f'\n-- stdout --\n{stdout}\n'
3685
+ raise exceptions.CommandError(returncode=returncode,
3686
+ command=cmd_str,
3687
+ error_msg=error_msg,
3688
+ detailed_reason=stderr)
3689
+ break
3690
+
3691
+ if ssh_tunnel_proc.poll() is not None:
3692
+ stdout, stderr = ssh_tunnel_proc.communicate()
3693
+ error_msg = 'Port forward failed'
3694
+ if stdout:
3695
+ error_msg += f'\n-- stdout --\n{stdout}\n'
3696
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
3697
+ command=cmd_str,
3698
+ error_msg=error_msg,
3699
+ detailed_reason=stderr)
3700
+ return ssh_tunnel_proc
3701
+
3702
+
3551
3703
  T = TypeVar('T')
3552
3704
 
3553
3705
 
3554
- def invoke_skylet_with_retries(
3555
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
3556
- func: Callable[..., T]) -> T:
3706
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
3557
3707
  """Generic helper for making Skylet gRPC requests.
3558
3708
 
3559
3709
  This method handles the common pattern of:
3560
3710
  1. Try the gRPC request
3561
3711
  2. If SSH tunnel is closed, recreate it and retry
3562
3712
  """
3563
- max_attempts = 3
3713
+ max_attempts = 5
3564
3714
  backoff = common_utils.Backoff(initial_backoff=0.5)
3565
3715
  last_exception: Optional[Exception] = None
3566
3716
 
@@ -3573,22 +3723,9 @@ def invoke_skylet_with_retries(
3573
3723
  with ux_utils.print_exception_no_traceback():
3574
3724
  raise exceptions.SkyletInternalError(e.details())
3575
3725
  elif e.code() == grpc.StatusCode.UNAVAILABLE:
3576
- recreate_tunnel = True
3577
- try:
3578
- if handle.skylet_ssh_tunnel is not None:
3579
- proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
3580
- if proc.is_running(
3581
- ) and proc.status() != psutil.STATUS_ZOMBIE:
3582
- recreate_tunnel = False
3583
- except psutil.NoSuchProcess:
3584
- pass
3585
-
3586
- if recreate_tunnel:
3587
- handle.open_and_update_skylet_tunnel()
3588
-
3589
3726
  time.sleep(backoff.current_backoff())
3590
3727
  else:
3591
3728
  raise e
3592
-
3593
- raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
3594
- ) from last_exception
3729
+ raise RuntimeError(
3730
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
3731
+ ) from last_exception