skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +125 -22
- sky/backends/cloud_vm_ray_backend.py +224 -72
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +34 -0
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +6 -11
- sky/logs/agent.py +10 -2
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/skylet/constants.py +12 -7
- sky/skylet/log_lib.py +11 -0
- sky/skylet/log_lib.pyi +9 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
|
@@ -7,7 +7,7 @@ import urllib.request
|
|
|
7
7
|
from sky.utils import directory_utils
|
|
8
8
|
|
|
9
9
|
# Replaced with the current commit when building the wheels.
|
|
10
|
-
_SKYPILOT_COMMIT_SHA = '
|
|
10
|
+
_SKYPILOT_COMMIT_SHA = '16e9ad33e647ab3859a2b5624be7386721c9ef8b'
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def _get_git_commit():
|
|
@@ -37,7 +37,7 @@ def _get_git_commit():
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
__commit__ = _get_git_commit()
|
|
40
|
-
__version__ = '1.0.0.dev20250910'
|
|
40
|
+
__version__ = '1.0.0.dev20250912'
|
|
41
41
|
__root_dir__ = directory_utils.get_sky_dir()
|
|
42
42
|
|
|
43
43
|
|
sky/backends/backend_utils.py
CHANGED
|
@@ -7,11 +7,13 @@ import hashlib
|
|
|
7
7
|
import os
|
|
8
8
|
import pathlib
|
|
9
9
|
import pprint
|
|
10
|
+
import queue as queue_lib
|
|
10
11
|
import re
|
|
11
12
|
import shlex
|
|
12
13
|
import subprocess
|
|
13
14
|
import sys
|
|
14
15
|
import tempfile
|
|
16
|
+
import threading
|
|
15
17
|
import time
|
|
16
18
|
import typing
|
|
17
19
|
from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
|
|
@@ -23,7 +25,6 @@ from aiohttp import ClientTimeout
|
|
|
23
25
|
from aiohttp import TCPConnector
|
|
24
26
|
import colorama
|
|
25
27
|
from packaging import version
|
|
26
|
-
import psutil
|
|
27
28
|
from typing_extensions import Literal
|
|
28
29
|
|
|
29
30
|
import sky
|
|
@@ -111,8 +112,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
|
|
|
111
112
|
# 10.133.0.5: ray.worker.default,
|
|
112
113
|
_LAUNCHING_IP_PATTERN = re.compile(
|
|
113
114
|
r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
|
|
115
|
+
SSH_CONNECTION_ERROR_PATTERN = re.compile(
|
|
116
|
+
r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
|
|
114
117
|
_SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
|
|
115
118
|
re.IGNORECASE)
|
|
119
|
+
K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
|
|
120
|
+
re.IGNORECASE)
|
|
116
121
|
_RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
|
|
117
122
|
WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
|
|
118
123
|
|
|
@@ -135,6 +140,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
|
|
135
140
|
|
|
136
141
|
CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
|
|
137
142
|
WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
|
|
143
|
+
CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
|
|
138
144
|
|
|
139
145
|
# Remote dir that holds our runtime files.
|
|
140
146
|
_REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
|
|
@@ -213,6 +219,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
|
|
|
213
219
|
('provider', 'availability_zone'),
|
|
214
220
|
]
|
|
215
221
|
|
|
222
|
+
_ACK_MESSAGE = 'ack'
|
|
223
|
+
_FORWARDING_FROM_MESSAGE = 'Forwarding from'
|
|
224
|
+
|
|
216
225
|
|
|
217
226
|
def is_ip(s: str) -> bool:
|
|
218
227
|
"""Returns whether this string matches IP_ADDR_REGEX."""
|
|
@@ -2672,7 +2681,7 @@ def refresh_cluster_record(
|
|
|
2672
2681
|
'Refreshing status: Failed get the lock for cluster '
|
|
2673
2682
|
f'{cluster_name!r}. Using the cached status.')
|
|
2674
2683
|
return record
|
|
2675
|
-
time.sleep(
|
|
2684
|
+
time.sleep(lock.poll_interval)
|
|
2676
2685
|
|
|
2677
2686
|
# Refresh for next loop iteration.
|
|
2678
2687
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
|
@@ -3582,19 +3591,126 @@ def workspace_lock_id(workspace_name: str) -> str:
|
|
|
3582
3591
|
return f'{workspace_name}_workspace'
|
|
3583
3592
|
|
|
3584
3593
|
|
|
3594
|
+
def cluster_tunnel_lock_id(cluster_name: str) -> str:
|
|
3595
|
+
"""Get the lock ID for cluster tunnel operations."""
|
|
3596
|
+
return f'{cluster_name}_ssh_tunnel'
|
|
3597
|
+
|
|
3598
|
+
|
|
3599
|
+
def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
|
|
3600
|
+
command_runner.KubernetesCommandRunner],
|
|
3601
|
+
port_forward: Tuple[int, int]) -> subprocess.Popen:
|
|
3602
|
+
local_port, remote_port = port_forward
|
|
3603
|
+
if isinstance(head_runner, command_runner.SSHCommandRunner):
|
|
3604
|
+
# Disabling ControlMaster makes things easier to reason about
|
|
3605
|
+
# with respect to resource management/ownership,
|
|
3606
|
+
# as killing the process will close the tunnel too.
|
|
3607
|
+
head_runner.disable_control_master = True
|
|
3608
|
+
head_runner.port_forward_execute_remote_command = True
|
|
3609
|
+
|
|
3610
|
+
# The default connect_timeout of 1s is too short for
|
|
3611
|
+
# connecting to clusters using a jump server.
|
|
3612
|
+
# We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
|
|
3613
|
+
# which is counted towards non-idleness.
|
|
3614
|
+
cmd: List[str] = head_runner.port_forward_command(
|
|
3615
|
+
[(local_port, remote_port)],
|
|
3616
|
+
connect_timeout=5,
|
|
3617
|
+
ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
|
|
3618
|
+
if isinstance(head_runner, command_runner.SSHCommandRunner):
|
|
3619
|
+
# cat so the command doesn't exit until we kill it
|
|
3620
|
+
cmd += [f'"echo {_ACK_MESSAGE} && cat"']
|
|
3621
|
+
cmd_str = ' '.join(cmd)
|
|
3622
|
+
logger.debug(f'Running port forward command: {cmd_str}')
|
|
3623
|
+
ssh_tunnel_proc = subprocess.Popen(cmd_str,
|
|
3624
|
+
shell=True,
|
|
3625
|
+
stdin=subprocess.PIPE,
|
|
3626
|
+
stdout=subprocess.PIPE,
|
|
3627
|
+
stderr=subprocess.PIPE,
|
|
3628
|
+
start_new_session=True,
|
|
3629
|
+
text=True)
|
|
3630
|
+
# Wait until we receive an ack from the remote cluster or
|
|
3631
|
+
# the SSH connection times out.
|
|
3632
|
+
queue: queue_lib.Queue = queue_lib.Queue()
|
|
3633
|
+
stdout_thread = threading.Thread(
|
|
3634
|
+
target=lambda queue, stdout: queue.put(stdout.readline()),
|
|
3635
|
+
args=(queue, ssh_tunnel_proc.stdout),
|
|
3636
|
+
daemon=True)
|
|
3637
|
+
stdout_thread.start()
|
|
3638
|
+
while ssh_tunnel_proc.poll() is None:
|
|
3639
|
+
try:
|
|
3640
|
+
ack = queue.get_nowait()
|
|
3641
|
+
except queue_lib.Empty:
|
|
3642
|
+
ack = None
|
|
3643
|
+
time.sleep(0.1)
|
|
3644
|
+
continue
|
|
3645
|
+
assert ack is not None
|
|
3646
|
+
if isinstance(
|
|
3647
|
+
head_runner,
|
|
3648
|
+
command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
|
|
3649
|
+
break
|
|
3650
|
+
elif isinstance(head_runner, command_runner.KubernetesCommandRunner
|
|
3651
|
+
) and _FORWARDING_FROM_MESSAGE in ack:
|
|
3652
|
+
# On kind clusters, this error occurs if we make a request
|
|
3653
|
+
# immediately after the port-forward is established on a new pod:
|
|
3654
|
+
# "Unhandled Error" err="an error occurred forwarding ... -> 46590:
|
|
3655
|
+
# failed to execute portforward in network namespace
|
|
3656
|
+
# "/var/run/netns/cni-...": failed to connect to localhost:46590
|
|
3657
|
+
# inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
|
|
3658
|
+
# connect: connection refused
|
|
3659
|
+
# So we need to poll the port on the pod to check if it is open.
|
|
3660
|
+
# We did not observe this with real Kubernetes clusters.
|
|
3661
|
+
timeout = 5
|
|
3662
|
+
port_check_cmd = (
|
|
3663
|
+
# We install netcat in our ray-node container,
|
|
3664
|
+
# so we can use it here.
|
|
3665
|
+
# (See kubernetes-ray.yml.j2)
|
|
3666
|
+
f'end=$((SECONDS+{timeout})); '
|
|
3667
|
+
f'while ! nc -z -w 1 localhost {remote_port}; do '
|
|
3668
|
+
'if (( SECONDS >= end )); then exit 1; fi; '
|
|
3669
|
+
'sleep 0.1; '
|
|
3670
|
+
'done')
|
|
3671
|
+
returncode, stdout, stderr = head_runner.run(port_check_cmd,
|
|
3672
|
+
require_outputs=True,
|
|
3673
|
+
stream_logs=False)
|
|
3674
|
+
if returncode != 0:
|
|
3675
|
+
try:
|
|
3676
|
+
ssh_tunnel_proc.terminate()
|
|
3677
|
+
ssh_tunnel_proc.wait(timeout=5)
|
|
3678
|
+
except subprocess.TimeoutExpired:
|
|
3679
|
+
ssh_tunnel_proc.kill()
|
|
3680
|
+
ssh_tunnel_proc.wait()
|
|
3681
|
+
finally:
|
|
3682
|
+
error_msg = (f'Failed to check remote port {remote_port}')
|
|
3683
|
+
if stdout:
|
|
3684
|
+
error_msg += f'\n-- stdout --\n{stdout}\n'
|
|
3685
|
+
raise exceptions.CommandError(returncode=returncode,
|
|
3686
|
+
command=cmd_str,
|
|
3687
|
+
error_msg=error_msg,
|
|
3688
|
+
detailed_reason=stderr)
|
|
3689
|
+
break
|
|
3690
|
+
|
|
3691
|
+
if ssh_tunnel_proc.poll() is not None:
|
|
3692
|
+
stdout, stderr = ssh_tunnel_proc.communicate()
|
|
3693
|
+
error_msg = 'Port forward failed'
|
|
3694
|
+
if stdout:
|
|
3695
|
+
error_msg += f'\n-- stdout --\n{stdout}\n'
|
|
3696
|
+
raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
|
|
3697
|
+
command=cmd_str,
|
|
3698
|
+
error_msg=error_msg,
|
|
3699
|
+
detailed_reason=stderr)
|
|
3700
|
+
return ssh_tunnel_proc
|
|
3701
|
+
|
|
3702
|
+
|
|
3585
3703
|
T = TypeVar('T')
|
|
3586
3704
|
|
|
3587
3705
|
|
|
3588
|
-
def invoke_skylet_with_retries(
|
|
3589
|
-
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
3590
|
-
func: Callable[..., T]) -> T:
|
|
3706
|
+
def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
|
|
3591
3707
|
"""Generic helper for making Skylet gRPC requests.
|
|
3592
3708
|
|
|
3593
3709
|
This method handles the common pattern of:
|
|
3594
3710
|
1. Try the gRPC request
|
|
3595
3711
|
2. If SSH tunnel is closed, recreate it and retry
|
|
3596
3712
|
"""
|
|
3597
|
-
max_attempts =
|
|
3713
|
+
max_attempts = 5
|
|
3598
3714
|
backoff = common_utils.Backoff(initial_backoff=0.5)
|
|
3599
3715
|
last_exception: Optional[Exception] = None
|
|
3600
3716
|
|
|
@@ -3607,22 +3723,9 @@ def invoke_skylet_with_retries(
|
|
|
3607
3723
|
with ux_utils.print_exception_no_traceback():
|
|
3608
3724
|
raise exceptions.SkyletInternalError(e.details())
|
|
3609
3725
|
elif e.code() == grpc.StatusCode.UNAVAILABLE:
|
|
3610
|
-
recreate_tunnel = True
|
|
3611
|
-
try:
|
|
3612
|
-
if handle.skylet_ssh_tunnel is not None:
|
|
3613
|
-
proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
|
|
3614
|
-
if proc.is_running(
|
|
3615
|
-
) and proc.status() != psutil.STATUS_ZOMBIE:
|
|
3616
|
-
recreate_tunnel = False
|
|
3617
|
-
except psutil.NoSuchProcess:
|
|
3618
|
-
pass
|
|
3619
|
-
|
|
3620
|
-
if recreate_tunnel:
|
|
3621
|
-
handle.open_and_update_skylet_tunnel()
|
|
3622
|
-
|
|
3623
3726
|
time.sleep(backoff.current_backoff())
|
|
3624
3727
|
else:
|
|
3625
3728
|
raise e
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3729
|
+
raise RuntimeError(
|
|
3730
|
+
f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
|
|
3731
|
+
) from last_exception
|