skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly has been flagged as potentially problematic; see the package registry's advisory page for more details.

Files changed (68)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +125 -22
  3. sky/backends/cloud_vm_ray_backend.py +224 -72
  4. sky/catalog/__init__.py +7 -0
  5. sky/catalog/aws_catalog.py +4 -0
  6. sky/catalog/common.py +18 -0
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +2 -71
  9. sky/client/sdk_async.py +5 -2
  10. sky/clouds/aws.py +23 -5
  11. sky/clouds/cloud.py +8 -0
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +34 -0
  31. sky/jobs/client/sdk_async.py +4 -2
  32. sky/jobs/controller.py +4 -2
  33. sky/jobs/recovery_strategy.py +1 -1
  34. sky/jobs/state.py +26 -16
  35. sky/jobs/utils.py +6 -11
  36. sky/logs/agent.py +10 -2
  37. sky/provision/kubernetes/config.py +7 -2
  38. sky/provision/kubernetes/instance.py +84 -41
  39. sky/provision/vast/instance.py +1 -1
  40. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  41. sky/server/config.py +14 -5
  42. sky/server/metrics.py +41 -8
  43. sky/server/requests/executor.py +41 -4
  44. sky/server/server.py +1 -0
  45. sky/server/uvicorn.py +11 -5
  46. sky/skylet/constants.py +12 -7
  47. sky/skylet/log_lib.py +11 -0
  48. sky/skylet/log_lib.pyi +9 -0
  49. sky/task.py +62 -0
  50. sky/templates/kubernetes-ray.yml.j2 +120 -3
  51. sky/utils/accelerator_registry.py +3 -1
  52. sky/utils/command_runner.py +35 -11
  53. sky/utils/command_runner.pyi +22 -0
  54. sky/utils/context_utils.py +15 -2
  55. sky/utils/db/migration_utils.py +1 -1
  56. sky/utils/git.py +559 -1
  57. sky/utils/resource_checker.py +8 -7
  58. sky/workspaces/core.py +57 -21
  59. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
  60. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
  61. sky/client/cli/git.py +0 -549
  62. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  63. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  64. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  66. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  67. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  68. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
7
7
  from sky.utils import directory_utils
8
8
 
9
9
  # Replaced with the current commit when building the wheels.
10
- _SKYPILOT_COMMIT_SHA = 'acc7b0392fd7c54b450ac11a29f7e114c4651d66'
10
+ _SKYPILOT_COMMIT_SHA = '16e9ad33e647ab3859a2b5624be7386721c9ef8b'
11
11
 
12
12
 
13
13
  def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
37
37
 
38
38
 
39
39
  __commit__ = _get_git_commit()
40
- __version__ = '1.0.0.dev20250910'
40
+ __version__ = '1.0.0.dev20250912'
41
41
  __root_dir__ = directory_utils.get_sky_dir()
42
42
 
43
43
 
@@ -7,11 +7,13 @@ import hashlib
7
7
  import os
8
8
  import pathlib
9
9
  import pprint
10
+ import queue as queue_lib
10
11
  import re
11
12
  import shlex
12
13
  import subprocess
13
14
  import sys
14
15
  import tempfile
16
+ import threading
15
17
  import time
16
18
  import typing
17
19
  from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
@@ -23,7 +25,6 @@ from aiohttp import ClientTimeout
23
25
  from aiohttp import TCPConnector
24
26
  import colorama
25
27
  from packaging import version
26
- import psutil
27
28
  from typing_extensions import Literal
28
29
 
29
30
  import sky
@@ -111,8 +112,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
111
112
  # 10.133.0.5: ray.worker.default,
112
113
  _LAUNCHING_IP_PATTERN = re.compile(
113
114
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
115
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
116
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
114
117
  _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
115
118
  re.IGNORECASE)
119
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
120
+ re.IGNORECASE)
116
121
  _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
117
122
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
118
123
 
@@ -135,6 +140,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
135
140
 
136
141
  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
137
142
  WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
143
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
138
144
 
139
145
  # Remote dir that holds our runtime files.
140
146
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -213,6 +219,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
213
219
  ('provider', 'availability_zone'),
214
220
  ]
215
221
 
222
+ _ACK_MESSAGE = 'ack'
223
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
224
+
216
225
 
217
226
  def is_ip(s: str) -> bool:
218
227
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -2672,7 +2681,7 @@ def refresh_cluster_record(
2672
2681
  'Refreshing status: Failed get the lock for cluster '
2673
2682
  f'{cluster_name!r}. Using the cached status.')
2674
2683
  return record
2675
- time.sleep(0.05)
2684
+ time.sleep(lock.poll_interval)
2676
2685
 
2677
2686
  # Refresh for next loop iteration.
2678
2687
  record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3582,19 +3591,126 @@ def workspace_lock_id(workspace_name: str) -> str:
3582
3591
  return f'{workspace_name}_workspace'
3583
3592
 
3584
3593
 
3594
+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
3595
+ """Get the lock ID for cluster tunnel operations."""
3596
+ return f'{cluster_name}_ssh_tunnel'
3597
+
3598
+
3599
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
3600
+ command_runner.KubernetesCommandRunner],
3601
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
3602
+ local_port, remote_port = port_forward
3603
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
3604
+ # Disabling ControlMaster makes things easier to reason about
3605
+ # with respect to resource management/ownership,
3606
+ # as killing the process will close the tunnel too.
3607
+ head_runner.disable_control_master = True
3608
+ head_runner.port_forward_execute_remote_command = True
3609
+
3610
+ # The default connect_timeout of 1s is too short for
3611
+ # connecting to clusters using a jump server.
3612
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
3613
+ # which is counted towards non-idleness.
3614
+ cmd: List[str] = head_runner.port_forward_command(
3615
+ [(local_port, remote_port)],
3616
+ connect_timeout=5,
3617
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
3618
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
3619
+ # cat so the command doesn't exit until we kill it
3620
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
3621
+ cmd_str = ' '.join(cmd)
3622
+ logger.debug(f'Running port forward command: {cmd_str}')
3623
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
3624
+ shell=True,
3625
+ stdin=subprocess.PIPE,
3626
+ stdout=subprocess.PIPE,
3627
+ stderr=subprocess.PIPE,
3628
+ start_new_session=True,
3629
+ text=True)
3630
+ # Wait until we receive an ack from the remote cluster or
3631
+ # the SSH connection times out.
3632
+ queue: queue_lib.Queue = queue_lib.Queue()
3633
+ stdout_thread = threading.Thread(
3634
+ target=lambda queue, stdout: queue.put(stdout.readline()),
3635
+ args=(queue, ssh_tunnel_proc.stdout),
3636
+ daemon=True)
3637
+ stdout_thread.start()
3638
+ while ssh_tunnel_proc.poll() is None:
3639
+ try:
3640
+ ack = queue.get_nowait()
3641
+ except queue_lib.Empty:
3642
+ ack = None
3643
+ time.sleep(0.1)
3644
+ continue
3645
+ assert ack is not None
3646
+ if isinstance(
3647
+ head_runner,
3648
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
3649
+ break
3650
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
3651
+ ) and _FORWARDING_FROM_MESSAGE in ack:
3652
+ # On kind clusters, this error occurs if we make a request
3653
+ # immediately after the port-forward is established on a new pod:
3654
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
3655
+ # failed to execute portforward in network namespace
3656
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
3657
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
3658
+ # connect: connection refused
3659
+ # So we need to poll the port on the pod to check if it is open.
3660
+ # We did not observe this with real Kubernetes clusters.
3661
+ timeout = 5
3662
+ port_check_cmd = (
3663
+ # We install netcat in our ray-node container,
3664
+ # so we can use it here.
3665
+ # (See kubernetes-ray.yml.j2)
3666
+ f'end=$((SECONDS+{timeout})); '
3667
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
3668
+ 'if (( SECONDS >= end )); then exit 1; fi; '
3669
+ 'sleep 0.1; '
3670
+ 'done')
3671
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
3672
+ require_outputs=True,
3673
+ stream_logs=False)
3674
+ if returncode != 0:
3675
+ try:
3676
+ ssh_tunnel_proc.terminate()
3677
+ ssh_tunnel_proc.wait(timeout=5)
3678
+ except subprocess.TimeoutExpired:
3679
+ ssh_tunnel_proc.kill()
3680
+ ssh_tunnel_proc.wait()
3681
+ finally:
3682
+ error_msg = (f'Failed to check remote port {remote_port}')
3683
+ if stdout:
3684
+ error_msg += f'\n-- stdout --\n{stdout}\n'
3685
+ raise exceptions.CommandError(returncode=returncode,
3686
+ command=cmd_str,
3687
+ error_msg=error_msg,
3688
+ detailed_reason=stderr)
3689
+ break
3690
+
3691
+ if ssh_tunnel_proc.poll() is not None:
3692
+ stdout, stderr = ssh_tunnel_proc.communicate()
3693
+ error_msg = 'Port forward failed'
3694
+ if stdout:
3695
+ error_msg += f'\n-- stdout --\n{stdout}\n'
3696
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
3697
+ command=cmd_str,
3698
+ error_msg=error_msg,
3699
+ detailed_reason=stderr)
3700
+ return ssh_tunnel_proc
3701
+
3702
+
3585
3703
  T = TypeVar('T')
3586
3704
 
3587
3705
 
3588
- def invoke_skylet_with_retries(
3589
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
3590
- func: Callable[..., T]) -> T:
3706
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
3591
3707
  """Generic helper for making Skylet gRPC requests.
3592
3708
 
3593
3709
  This method handles the common pattern of:
3594
3710
  1. Try the gRPC request
3595
3711
  2. If SSH tunnel is closed, recreate it and retry
3596
3712
  """
3597
- max_attempts = 3
3713
+ max_attempts = 5
3598
3714
  backoff = common_utils.Backoff(initial_backoff=0.5)
3599
3715
  last_exception: Optional[Exception] = None
3600
3716
 
@@ -3607,22 +3723,9 @@ def invoke_skylet_with_retries(
3607
3723
  with ux_utils.print_exception_no_traceback():
3608
3724
  raise exceptions.SkyletInternalError(e.details())
3609
3725
  elif e.code() == grpc.StatusCode.UNAVAILABLE:
3610
- recreate_tunnel = True
3611
- try:
3612
- if handle.skylet_ssh_tunnel is not None:
3613
- proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
3614
- if proc.is_running(
3615
- ) and proc.status() != psutil.STATUS_ZOMBIE:
3616
- recreate_tunnel = False
3617
- except psutil.NoSuchProcess:
3618
- pass
3619
-
3620
- if recreate_tunnel:
3621
- handle.open_and_update_skylet_tunnel()
3622
-
3623
3726
  time.sleep(backoff.current_backoff())
3624
3727
  else:
3625
3728
  raise e
3626
-
3627
- raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
3628
- ) from last_exception
3729
+ raise RuntimeError(
3730
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
3731
+ ) from last_exception