skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +160 -23
- sky/backends/cloud_vm_ray_backend.py +226 -74
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +2 -71
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +23 -18
- sky/clouds/aws.py +26 -6
- sky/clouds/cloud.py +8 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/global_user_state.py +34 -0
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +734 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +709 -508
- sky/jobs/utils.py +90 -40
- sky/logs/agent.py +10 -2
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +55 -27
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +9 -1
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +14 -7
- sky/skylet/events.py +2 -10
- sky/skylet/log_lib.py +11 -0
- sky/skylet/log_lib.pyi +9 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +25 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '16e9ad33e647ab3859a2b5624be7386721c9ef8b'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250909'
+__version__ = '1.0.0.dev20250912'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
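Note: the _SKYPILOT_COMMIT_SHA placeholder is stamped at wheel-build time, per the comment above it. A minimal sketch of what such a build step could look like (hypothetical; SkyPilot's actual release tooling may differ):

# Hypothetical build-time stamping of sky/__init__.py; not SkyPilot's
# actual release script.
import re
import subprocess

def stamp_commit_sha(init_path: str = 'sky/__init__.py') -> None:
    # Resolve the current commit and substitute it into the placeholder.
    sha = subprocess.run(['git', 'rev-parse', 'HEAD'],
                         capture_output=True, text=True,
                         check=True).stdout.strip()
    with open(init_path, 'r', encoding='utf-8') as f:
        content = f.read()
    content = re.sub(r"_SKYPILOT_COMMIT_SHA = '[^']*'",
                     f"_SKYPILOT_COMMIT_SHA = '{sha}'", content)
    with open(init_path, 'w', encoding='utf-8') as f:
        f.write(content)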
sky/authentication.py
CHANGED
@@ -207,6 +207,24 @@ def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
     return config
 
 
+def parse_gcp_project_oslogin(project):
+    """Helper function to parse GCP project metadata."""
+    common_metadata = project.get('commonInstanceMetadata', {})
+    if not isinstance(common_metadata, dict):
+        common_metadata = {}
+
+    metadata_items = common_metadata.get('items', [])
+    if not isinstance(metadata_items, list):
+        metadata_items = []
+
+    project_oslogin = next(
+        (item for item in metadata_items
+         if isinstance(item, dict) and item.get('key') == 'enable-oslogin'),
+        {}).get('value', 'False')
+
+    return project_oslogin
+
+
 # Snippets of code inspired from
 # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py
 # Takes in config, a yaml dict and outputs a postprocessed dict
@@ -264,10 +282,7 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
                 'Please check your network connection.')
         raise
 
-    project_oslogin = next(
-        (item for item in project['commonInstanceMetadata'].get('items', [])
-         if item['key'] == 'enable-oslogin'), {}).get('value', 'False')
-
+    project_oslogin = parse_gcp_project_oslogin(project)
     if project_oslogin.lower() == 'true':
         logger.info(
             f'OS Login is enabled for GCP project {project_id}. Running '
sky/backends/backend_utils.py
CHANGED
@@ -1,4 +1,5 @@
 """Util constants/functions for the backends."""
+import asyncio
 from datetime import datetime
 import enum
 import fnmatch
@@ -6,20 +7,24 @@ import hashlib
 import os
 import pathlib
 import pprint
+import queue as queue_lib
 import re
 import shlex
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import typing
 from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
                     TypeVar, Union)
 import uuid
 
+import aiohttp
+from aiohttp import ClientTimeout
+from aiohttp import TCPConnector
 import colorama
 from packaging import version
-import psutil
 from typing_extensions import Literal
 
 import sky
@@ -107,8 +112,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
 # 10.133.0.5: ray.worker.default,
 _LAUNCHING_IP_PATTERN = re.compile(
     r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+SSH_CONNECTION_ERROR_PATTERN = re.compile(
+    r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
 _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
                                                re.IGNORECASE)
+K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+                                        re.IGNORECASE)
 _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
 WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 
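SSH_CONNECTION_ERROR_PATTERN broadens the existing timed-out check to also catch connection-refused failures, and K8S_PODS_NOT_FOUND_PATTERN matches kubectl-style not-found errors. A small self-contained check against representative error lines (the sample strings are illustrative, not captured output):

import re

SSH_CONNECTION_ERROR_PATTERN = re.compile(
    r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
                                        re.IGNORECASE)

# Illustrative error lines of the kind these patterns are meant to catch.
assert SSH_CONNECTION_ERROR_PATTERN.match(
    'ssh: connect to host 1.2.3.4 port 22: Connection refused')
assert SSH_CONNECTION_ERROR_PATTERN.match(
    'ssh: connect to host 1.2.3.4 port 22: Operation timed out')
assert K8S_PODS_NOT_FOUND_PATTERN.match(
    'Error from server (NotFound): pods "sky-head" not found')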
@@ -131,6 +140,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
 WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -209,6 +219,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
     ('provider', 'availability_zone'),
 ]
 
+_ACK_MESSAGE = 'ack'
+_FORWARDING_FROM_MESSAGE = 'Forwarding from'
+
 
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
@@ -535,7 +548,7 @@ def get_expirable_clouds(
     # get all custom contexts
     contexts = kubernetes_utils.get_custom_config_k8s_contexts()
     # add remote_identity of each context if it exists
-    remote_identities = None
+    remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
     for context in contexts:
         context_remote_identity = skypilot_config.get_effective_region_config(
             cloud='kubernetes',
@@ -546,9 +559,11 @@ def get_expirable_clouds(
         if remote_identities is None:
             remote_identities = []
         if isinstance(context_remote_identity, str):
+            assert isinstance(remote_identities, list)
             remote_identities.append(
                 {context: context_remote_identity})
         elif isinstance(context_remote_identity, list):
+            assert isinstance(remote_identities, list)
             remote_identities.extend(context_remote_identity)
     # add global kubernetes remote identity if it exists, if not, add default
     global_remote_identity = skypilot_config.get_effective_region_config(
@@ -560,8 +575,10 @@ def get_expirable_clouds(
     if remote_identities is None:
         remote_identities = []
     if isinstance(global_remote_identity, str):
+        assert isinstance(remote_identities, list)
         remote_identities.append({'*': global_remote_identity})
     elif isinstance(global_remote_identity, list):
+        assert isinstance(remote_identities, list)
         remote_identities.extend(global_remote_identity)
     if remote_identities is None:
         remote_identities = schemas.get_default_remote_identity(
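The widened annotation on remote_identities admits both a str and a list, so each mutation site now needs an assert isinstance(..., list) to narrow the type for the checker before calling .append or .extend. A minimal sketch of the same narrowing pattern, outside SkyPilot:

from typing import Dict, List, Optional, Union

def collect(contexts: List[str]) -> List[Dict[str, str]]:
    # The declared type admits a plain str, so the checker must be shown
    # the variable is a list before list methods are called on it.
    acc: Optional[Union[str, List[Dict[str, str]]]] = None
    for ctx in contexts:
        if acc is None:
            acc = []
        assert isinstance(acc, list)  # narrows the Union for mypy
        acc.append({ctx: 'SERVICE_ACCOUNT'})
    return acc if isinstance(acc, list) else []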
@@ -1784,6 +1801,32 @@ def check_network_connection():
             'Network seems down.')
 
 
+async def async_check_network_connection():
+    """Check if the network connection is available.
+
+    Tolerates 3 retries as it is observed that connections can fail.
+    Uses aiohttp for async HTTP requests.
+    """
+    # Create a session with retry logic
+    timeout = ClientTimeout(total=15)
+    connector = TCPConnector(limit=1)  # Limit to 1 connection at a time
+
+    async with aiohttp.ClientSession(timeout=timeout,
+                                     connector=connector) as session:
+        for i, ip in enumerate(_TEST_IP_LIST):
+            try:
+                async with session.head(ip) as response:
+                    if response.status < 400:  # Any 2xx or 3xx status is good
+                        return
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                if i == len(_TEST_IP_LIST) - 1:
+                    raise exceptions.NetworkError(
+                        'Could not refresh the cluster. '
+                        'Network seems down.') from e
+                # If not the last IP, continue to try the next one
+                continue
+
+
 @timeline.event
 def check_owner_identity(cluster_name: str) -> None:
     """Check if current user is the same as the user who created the cluster.
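async_check_network_connection mirrors the synchronous check: it probes each test endpoint with a HEAD request and only raises once the last endpoint fails. A hedged usage sketch, with a made-up URL list standing in for _TEST_IP_LIST and a plain re-raise in place of exceptions.NetworkError:

import asyncio

import aiohttp

async def probe(urls):
    # Same shape as async_check_network_connection: return on the first
    # HEAD response below 400, re-raise only after the last URL fails.
    timeout = aiohttp.ClientTimeout(total=15)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for i, url in enumerate(urls):
            try:
                async with session.head(url) as resp:
                    if resp.status < 400:
                        return
            except (aiohttp.ClientError, asyncio.TimeoutError):
                if i == len(urls) - 1:
                    raise

asyncio.run(probe(['https://www.google.com', 'https://www.cloudflare.com']))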
@@ -2638,7 +2681,7 @@ def refresh_cluster_record(
                 'Refreshing status: Failed get the lock for cluster '
                 f'{cluster_name!r}. Using the cached status.')
             return record
-        time.sleep(
+        time.sleep(lock.poll_interval)
 
     # Refresh for next loop iteration.
     record = global_user_state.get_cluster_from_name(cluster_name)
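The fix sleeps for the lock's own poll_interval between acquisition attempts rather than a hard-coded duration. A generic sketch of the poll-until-acquired shape (the Lock class below is a stand-in, not SkyPilot's lock abstraction):

import time

class Lock:
    # Stand-in lock exposing a poll_interval, to show the retry shape only.
    poll_interval = 0.5

    def acquire(self, blocking: bool = False) -> bool:
        return False  # pretend the lock is held by another process

lock = Lock()
deadline = time.time() + 2  # bounded wait, akin to the cached-status fallback
while not lock.acquire(blocking=False):
    if time.time() > deadline:
        print('Could not get the lock; using the cached status.')
        break
    time.sleep(lock.poll_interval)  # back off at the lock's own cadence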
@@ -3548,19 +3591,126 @@ def workspace_lock_id(workspace_name: str) -> str:
     return f'{workspace_name}_workspace'
 
 
+def cluster_tunnel_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster tunnel operations."""
+    return f'{cluster_name}_ssh_tunnel'
+
+
+def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+                                       command_runner.KubernetesCommandRunner],
+                    port_forward: Tuple[int, int]) -> subprocess.Popen:
+    local_port, remote_port = port_forward
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # Disabling ControlMaster makes things easier to reason about
+        # with respect to resource management/ownership,
+        # as killing the process will close the tunnel too.
+        head_runner.disable_control_master = True
+        head_runner.port_forward_execute_remote_command = True
+
+    # The default connect_timeout of 1s is too short for
+    # connecting to clusters using a jump server.
+    # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+    # which is counted towards non-idleness.
+    cmd: List[str] = head_runner.port_forward_command(
+        [(local_port, remote_port)],
+        connect_timeout=5,
+        ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # cat so the command doesn't exit until we kill it
+        cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+    cmd_str = ' '.join(cmd)
+    logger.debug(f'Running port forward command: {cmd_str}')
+    ssh_tunnel_proc = subprocess.Popen(cmd_str,
+                                       shell=True,
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       start_new_session=True,
+                                       text=True)
+    # Wait until we receive an ack from the remote cluster or
+    # the SSH connection times out.
+    queue: queue_lib.Queue = queue_lib.Queue()
+    stdout_thread = threading.Thread(
+        target=lambda queue, stdout: queue.put(stdout.readline()),
+        args=(queue, ssh_tunnel_proc.stdout),
+        daemon=True)
+    stdout_thread.start()
+    while ssh_tunnel_proc.poll() is None:
+        try:
+            ack = queue.get_nowait()
+        except queue_lib.Empty:
+            ack = None
+            time.sleep(0.1)
+            continue
+        assert ack is not None
+        if isinstance(
+                head_runner,
+                command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+            break
+        elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+                       ) and _FORWARDING_FROM_MESSAGE in ack:
+            # On kind clusters, this error occurs if we make a request
+            # immediately after the port-forward is established on a new pod:
+            # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+            # failed to execute portforward in network namespace
+            # "/var/run/netns/cni-...": failed to connect to localhost:46590
+            # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+            # connect: connection refused
+            # So we need to poll the port on the pod to check if it is open.
+            # We did not observe this with real Kubernetes clusters.
+            timeout = 5
+            port_check_cmd = (
+                # We install netcat in our ray-node container,
+                # so we can use it here.
+                # (See kubernetes-ray.yml.j2)
+                f'end=$((SECONDS+{timeout})); '
+                f'while ! nc -z -w 1 localhost {remote_port}; do '
+                'if (( SECONDS >= end )); then exit 1; fi; '
+                'sleep 0.1; '
+                'done')
+            returncode, stdout, stderr = head_runner.run(port_check_cmd,
+                                                         require_outputs=True,
+                                                         stream_logs=False)
+            if returncode != 0:
+                try:
+                    ssh_tunnel_proc.terminate()
+                    ssh_tunnel_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    ssh_tunnel_proc.kill()
+                    ssh_tunnel_proc.wait()
+                finally:
+                    error_msg = (f'Failed to check remote port {remote_port}')
+                    if stdout:
+                        error_msg += f'\n-- stdout --\n{stdout}\n'
+                    raise exceptions.CommandError(returncode=returncode,
+                                                  command=cmd_str,
+                                                  error_msg=error_msg,
+                                                  detailed_reason=stderr)
+            break
+
+    if ssh_tunnel_proc.poll() is not None:
+        stdout, stderr = ssh_tunnel_proc.communicate()
+        error_msg = 'Port forward failed'
+        if stdout:
+            error_msg += f'\n-- stdout --\n{stdout}\n'
+        raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+                                      command=cmd_str,
+                                      error_msg=error_msg,
+                                      detailed_reason=stderr)
+    return ssh_tunnel_proc
+
+
 T = TypeVar('T')
 
 
-def invoke_skylet_with_retries(
-        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
-        func: Callable[..., T]) -> T:
+def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     """Generic helper for making Skylet gRPC requests.
 
     This method handles the common pattern of:
     1. Try the gRPC request
     2. If SSH tunnel is closed, recreate it and retry
     """
-    max_attempts =
+    max_attempts = 5
     backoff = common_utils.Backoff(initial_backoff=0.5)
     last_exception: Optional[Exception] = None
 
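open_ssh_tunnel avoids blocking on the tunnel's stdout by reading the first line in a daemon thread and handing it over via a queue, while the main loop polls both the queue and the process. A stripped-down sketch of that handshake (a trivial echo-and-idle child on a POSIX shell stands in for the SSH tunnel):

import queue as queue_lib
import subprocess
import threading
import time

# Child process stands in for the SSH tunnel: prints an ack, then idles.
proc = subprocess.Popen('echo ack && sleep 30', shell=True,
                        stdout=subprocess.PIPE, text=True)

q: queue_lib.Queue = queue_lib.Queue()
threading.Thread(target=lambda: q.put(proc.stdout.readline()),
                 daemon=True).start()

# Poll: bail out if the child died, otherwise wait for the ack line.
while proc.poll() is None:
    try:
        line = q.get_nowait()
    except queue_lib.Empty:
        time.sleep(0.1)
        continue
    if line == 'ack\n':
        print('Tunnel is up.')
        break

proc.terminate()  # killing the process closes the tunnel in the real code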
@@ -3573,22 +3723,9 @@ def invoke_skylet_with_retries(
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.SkyletInternalError(e.details())
             elif e.code() == grpc.StatusCode.UNAVAILABLE:
-                recreate_tunnel = True
-                try:
-                    if handle.skylet_ssh_tunnel is not None:
-                        proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
-                        if proc.is_running(
-                        ) and proc.status() != psutil.STATUS_ZOMBIE:
-                            recreate_tunnel = False
-                except psutil.NoSuchProcess:
-                    pass
-
-                if recreate_tunnel:
-                    handle.open_and_update_skylet_tunnel()
-
                 time.sleep(backoff.current_backoff())
             else:
                 raise e
-
-
-
+    raise RuntimeError(
+        f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+    ) from last_exception
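After this change invoke_skylet_with_retries no longer inspects or recreates the SSH tunnel itself (that logic moved out along with the psutil dependency); it simply retries with exponential backoff and raises once the attempts are exhausted. A self-contained sketch of that retry shape (the Backoff class approximates common_utils.Backoff, whose exact behavior may differ, and ConnectionError stands in for a retryable gRPC UNAVAILABLE error):

import time
from typing import Callable, Optional, TypeVar

T = TypeVar('T')

class Backoff:
    # Approximation of common_utils.Backoff: doubles the wait each call.
    def __init__(self, initial_backoff: float = 0.5) -> None:
        self._next = initial_backoff

    def current_backoff(self) -> float:
        wait, self._next = self._next, self._next * 2
        return wait

def invoke_with_retries(func: Callable[[], T], max_attempts: int = 5) -> T:
    backoff = Backoff(initial_backoff=0.5)
    last_exception: Optional[Exception] = None
    for _ in range(max_attempts):
        try:
            return func()
        except ConnectionError as e:  # stand-in for a retryable gRPC error
            last_exception = e
            time.sleep(backoff.current_backoff())
    raise RuntimeError(f'Failed after {max_attempts} attempts: '
                       f'{last_exception}') from last_exception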