skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'bf9b3c4e09e97cf2dafed9f351d0b36438adf4ec'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250910'
+__version__ = '1.0.0.dev20250913'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
@@ -149,6 +149,7 @@ Vsphere = clouds.Vsphere
 Fluidstack = clouds.Fluidstack
 Nebius = clouds.Nebius
 Hyperbolic = clouds.Hyperbolic
+Seeweb = clouds.Seeweb
 
 __all__ = [
     '__version__',
@@ -169,6 +170,7 @@ __all__ = [
     'Fluidstack',
    'Nebius',
     'Hyperbolic',
+    'Seeweb',
     'Optimizer',
     'OptimizeTarget',
     'backends',
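Net effect of these changes: the version is bumped for the 2025-09-13 nightly and Seeweb is exported as a top-level cloud alongside the others. A minimal usage sketch (assumes the skypilot[seeweb] extra is installed and Seeweb credentials are configured; not taken from this release's docs):

import sky

# Seeweb can now be referenced like any other top-level cloud
# (sky.AWS, sky.Nebius, ...).
task = sky.Task(run='echo hello seeweb')
task.set_resources(sky.Resources(cloud=sky.Seeweb()))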
sky/adaptors/seeweb.py
ADDED
@@ -0,0 +1,103 @@
+""" Seeweb Adaptor """
+import configparser
+from pathlib import Path
+
+from sky.adaptors import common
+from sky.utils import annotations
+
+
+class SeewebError(Exception):
+    """Base exception for Seeweb adaptor errors."""
+
+
+class SeewebCredentialsFileNotFound(SeewebError):
+    """Raised when the Seeweb credentials file is missing."""
+
+
+class SeewebApiKeyMissing(SeewebError):
+    """Raised when the Seeweb API key is missing or empty."""
+
+
+class SeewebAuthenticationError(SeewebError):
+    """Raised when authenticating with Seeweb API fails."""
+
+
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Seeweb.'
+                         'Try pip install "skypilot[seeweb]"')
+
+ecsapi = common.LazyImport(
+    'ecsapi',
+    import_error_message=_IMPORT_ERROR_MESSAGE,
+)
+boto3 = common.LazyImport('boto3', import_error_message=_IMPORT_ERROR_MESSAGE)
+botocore = common.LazyImport('botocore',
+                             import_error_message=_IMPORT_ERROR_MESSAGE)
+
+_LAZY_MODULES = (ecsapi, boto3, botocore)
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def check_compute_credentials() -> bool:
+    """Checks if the user has access credentials to Seeweb's compute service.
+
+    Returns True if credentials are valid; otherwise raises a SeewebError.
+    """
+    # Read API key from standard Seeweb configuration file
+    key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
+    if not key_path.exists():
+        raise SeewebCredentialsFileNotFound(
+            'Missing Seeweb API key file ~/.seeweb_cloud/seeweb_keys')
+
+    parser = configparser.ConfigParser()
+    parser.read(key_path)
+    try:
+        api_key = parser['DEFAULT']['api_key'].strip()
+    except KeyError as e:
+        raise SeewebApiKeyMissing(
+            'Missing api_key in ~/.seeweb_cloud/seeweb_keys') from e
+    if not api_key:
+        raise SeewebApiKeyMissing(
+            'Empty api_key in ~/.seeweb_cloud/seeweb_keys')
+
+    # Test connection by fetching servers list to validate the key
+    try:
+        seeweb_client = ecsapi.Api(token=api_key)
+        seeweb_client.fetch_servers()
+    except Exception as e:  # pylint: disable=broad-except
+        raise SeewebAuthenticationError(
+            f'Unable to authenticate with Seeweb API: {e}') from e
+
+    return True
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def check_storage_credentials() -> bool:
+    """Checks if the user has access credentials to Seeweb's storage service.
+
+    Mirrors compute credentials validation.
+    """
+    return check_compute_credentials()
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+@annotations.lru_cache(scope='global', maxsize=1)
+def client():
+    """Returns an authenticated ecsapi.Api object."""
+    # Create authenticated client using the same credential pattern
+    key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
+    if not key_path.exists():
+        raise SeewebCredentialsFileNotFound(
+            'Missing Seeweb API key file ~/.seeweb_cloud/seeweb_keys')
+
+    parser = configparser.ConfigParser()
+    parser.read(key_path)
+    try:
+        api_key = parser['DEFAULT']['api_key'].strip()
+    except KeyError as e:
+        raise SeewebApiKeyMissing(
+            'Missing api_key in ~/.seeweb_cloud/seeweb_keys') from e
+    if not api_key:
+        raise SeewebApiKeyMissing(
+            'Empty api_key in ~/.seeweb_cloud/seeweb_keys')
+
+    return ecsapi.Api(token=api_key)
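Both check_compute_credentials() and client() above parse the key file with configparser and read api_key from the DEFAULT section. A hypothetical way to create a ~/.seeweb_cloud/seeweb_keys file that satisfies this parsing (the placeholder key is illustrative; the exact layout is inferred from the code above, not from Seeweb documentation):

from pathlib import Path

# Minimal credentials file in the format the adaptor reads: an INI file
# whose [DEFAULT] section carries the API key.
key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
key_path.parent.mkdir(parents=True, exist_ok=True)
key_path.write_text('[DEFAULT]\napi_key = <your-seeweb-api-key>\n')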
sky/authentication.py
CHANGED
@@ -40,6 +40,7 @@ from sky.adaptors import gcp
 from sky.adaptors import ibm
 from sky.adaptors import kubernetes
 from sky.adaptors import runpod
+from sky.adaptors import seeweb as seeweb_adaptor
 from sky.adaptors import vast
 from sky.provision.fluidstack import fluidstack_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -601,3 +602,40 @@ def setup_hyperbolic_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     config['auth']['ssh_public_key'] = public_key_path
 
     return configure_ssh_info(config)
+
+
+def setup_seeweb_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Registers the public key with Seeweb and notes the remote name."""
+    # 1. local key pair
+    get_or_generate_keys()
+
+    # 2. public key
+    _, public_key_path = get_or_generate_keys()
+    with open(public_key_path, 'r', encoding='utf-8') as f:
+        public_key = f.read().strip()
+
+    # 3. Seeweb API client
+    client = seeweb_adaptor.client()
+
+    # 4. Check if key is already registered
+    prefix = f'sky-key-{common_utils.get_user_hash()}'
+    remote_name = None
+    for k in client.fetch_ssh_keys():
+        if k.key.strip() == public_key:
+            remote_name = k.label  # already present
+            break
+
+    # 5. doesn't exist, choose a unique name and create it
+    if remote_name is None:
+        suffix = 1
+        remote_name = prefix
+        existing_names = {k.label for k in client.fetch_ssh_keys()}
+        while remote_name in existing_names:
+            suffix += 1
+            remote_name = f'{prefix}-{suffix}'
+        client.create_ssh_key(label=remote_name, key=public_key)
+
+    # 6. Put the remote name in cluster-config (like for Lambda)
+    config['auth']['remote_key_name'] = remote_name
+
+    return config
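The numbered steps mirror SkyPilot's other per-cloud helpers: ensure a local key pair, read the public key, and make sure a matching remote key exists under a sky-key-<user-hash> label. The dedup naming loop in isolation, as a hypothetical standalone sketch:

def pick_unique_label(prefix: str, existing_names: set) -> str:
    # Matches the loop in setup_seeweb_authentication: try `prefix`,
    # then `prefix-2`, `prefix-3`, ... until the label is unused.
    suffix = 1
    name = prefix
    while name in existing_names:
        suffix += 1
        name = f'{prefix}-{suffix}'
    return name

assert pick_unique_label('sky-key-abc', {'sky-key-abc'}) == 'sky-key-abc-2'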
sky/backends/backend_utils.py
CHANGED
@@ -7,11 +7,13 @@ import hashlib
 import os
 import pathlib
 import pprint
+import queue as queue_lib
 import re
 import shlex
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import typing
 from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
@@ -23,7 +25,6 @@ from aiohttp import ClientTimeout
 from aiohttp import TCPConnector
 import colorama
 from packaging import version
-import psutil
 from typing_extensions import Literal
 
 import sky
@@ -111,8 +112,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
 # 10.133.0.5: ray.worker.default,
 _LAUNCHING_IP_PATTERN = re.compile(
     r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+SSH_CONNECTION_ERROR_PATTERN = re.compile(
+    r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
 _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
                                               re.IGNORECASE)
+K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+                                        re.IGNORECASE)
 _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
 WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 
@@ -135,6 +140,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
 WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -213,6 +219,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
     ('provider', 'availability_zone'),
 ]
 
+_ACK_MESSAGE = 'ack'
+_FORWARDING_FROM_MESSAGE = 'Forwarding from'
+
 
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
@@ -1107,6 +1116,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
         config = auth.setup_fluidstack_authentication(config)
     elif isinstance(cloud, clouds.Hyperbolic):
         config = auth.setup_hyperbolic_authentication(config)
+    elif isinstance(cloud, clouds.Seeweb):
+        config = auth.setup_seeweb_authentication(config)
     else:
         assert False, cloud
     yaml_utils.dump_yaml(tmp_yaml_path, config)
@@ -2324,7 +2335,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             handle,
             requested_resources=None,
             ready=True,
-            is_launch=False)
+            is_launch=False,
+            update_only=True)
         return global_user_state.get_cluster_from_name(cluster_name)
 
     # All cases below are transitioning the cluster to non-UP states.
@@ -2534,7 +2546,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             handle,
             requested_resources=None,
             ready=False,
-            is_launch=False)
+            is_launch=False,
+            update_only=True)
         return global_user_state.get_cluster_from_name(cluster_name)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
@@ -2672,7 +2685,7 @@ def refresh_cluster_record(
                     'Refreshing status: Failed get the lock for cluster '
                     f'{cluster_name!r}. Using the cached status.')
                 return record
-            time.sleep(
+            time.sleep(lock.poll_interval)
 
             # Refresh for next loop iteration.
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3582,19 +3595,126 @@ def workspace_lock_id(workspace_name: str) -> str:
     return f'{workspace_name}_workspace'
 
 
+def cluster_tunnel_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster tunnel operations."""
+    return f'{cluster_name}_ssh_tunnel'
+
+
+def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+                                       command_runner.KubernetesCommandRunner],
+                    port_forward: Tuple[int, int]) -> subprocess.Popen:
+    local_port, remote_port = port_forward
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # Disabling ControlMaster makes things easier to reason about
+        # with respect to resource management/ownership,
+        # as killing the process will close the tunnel too.
+        head_runner.disable_control_master = True
+        head_runner.port_forward_execute_remote_command = True
+
+    # The default connect_timeout of 1s is too short for
+    # connecting to clusters using a jump server.
+    # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+    # which is counted towards non-idleness.
+    cmd: List[str] = head_runner.port_forward_command(
+        [(local_port, remote_port)],
+        connect_timeout=5,
+        ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # cat so the command doesn't exit until we kill it
+        cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+    cmd_str = ' '.join(cmd)
+    logger.debug(f'Running port forward command: {cmd_str}')
+    ssh_tunnel_proc = subprocess.Popen(cmd_str,
+                                       shell=True,
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       start_new_session=True,
+                                       text=True)
+    # Wait until we receive an ack from the remote cluster or
+    # the SSH connection times out.
+    queue: queue_lib.Queue = queue_lib.Queue()
+    stdout_thread = threading.Thread(
+        target=lambda queue, stdout: queue.put(stdout.readline()),
+        args=(queue, ssh_tunnel_proc.stdout),
+        daemon=True)
+    stdout_thread.start()
+    while ssh_tunnel_proc.poll() is None:
+        try:
+            ack = queue.get_nowait()
+        except queue_lib.Empty:
+            ack = None
+            time.sleep(0.1)
+            continue
+        assert ack is not None
+        if isinstance(
+                head_runner,
+                command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+            break
+        elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+                       ) and _FORWARDING_FROM_MESSAGE in ack:
+            # On kind clusters, this error occurs if we make a request
+            # immediately after the port-forward is established on a new pod:
+            # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+            # failed to execute portforward in network namespace
+            # "/var/run/netns/cni-...": failed to connect to localhost:46590
+            # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+            # connect: connection refused
+            # So we need to poll the port on the pod to check if it is open.
+            # We did not observe this with real Kubernetes clusters.
+            timeout = 5
+            port_check_cmd = (
+                # We install netcat in our ray-node container,
+                # so we can use it here.
+                # (See kubernetes-ray.yml.j2)
+                f'end=$((SECONDS+{timeout})); '
+                f'while ! nc -z -w 1 localhost {remote_port}; do '
+                'if (( SECONDS >= end )); then exit 1; fi; '
+                'sleep 0.1; '
+                'done')
+            returncode, stdout, stderr = head_runner.run(port_check_cmd,
+                                                         require_outputs=True,
+                                                         stream_logs=False)
+            if returncode != 0:
+                try:
+                    ssh_tunnel_proc.terminate()
+                    ssh_tunnel_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    ssh_tunnel_proc.kill()
+                    ssh_tunnel_proc.wait()
+                finally:
+                    error_msg = (f'Failed to check remote port {remote_port}')
+                    if stdout:
+                        error_msg += f'\n-- stdout --\n{stdout}\n'
+                    raise exceptions.CommandError(returncode=returncode,
+                                                  command=cmd_str,
+                                                  error_msg=error_msg,
+                                                  detailed_reason=stderr)
+            break
+
+    if ssh_tunnel_proc.poll() is not None:
+        stdout, stderr = ssh_tunnel_proc.communicate()
+        error_msg = 'Port forward failed'
+        if stdout:
+            error_msg += f'\n-- stdout --\n{stdout}\n'
+        raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+                                      command=cmd_str,
+                                      error_msg=error_msg,
+                                      detailed_reason=stderr)
+    return ssh_tunnel_proc
+
+
 T = TypeVar('T')
 
 
-def invoke_skylet_with_retries(
-        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
-        func: Callable[..., T]) -> T:
+def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     """Generic helper for making Skylet gRPC requests.
 
     This method handles the common pattern of:
     1. Try the gRPC request
     2. If SSH tunnel is closed, recreate it and retry
     """
-    max_attempts =
+    max_attempts = 5
     backoff = common_utils.Backoff(initial_backoff=0.5)
     last_exception: Optional[Exception] = None
 
@@ -3603,26 +3723,24 @@ def invoke_skylet_with_retries(
             return func()
         except grpc.RpcError as e:
             last_exception = e
-
-
-
-
-
-            try:
-                if handle.skylet_ssh_tunnel is not None:
-                    proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
-                    if proc.is_running(
-                    ) and proc.status() != psutil.STATUS_ZOMBIE:
-                        recreate_tunnel = False
-            except psutil.NoSuchProcess:
-                pass
-
-            if recreate_tunnel:
-                handle.open_and_update_skylet_tunnel()
-
-                time.sleep(backoff.current_backoff())
-            else:
-                raise e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+    ) from last_exception
 
-
-
+
+def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+    if e.code() == grpc.StatusCode.INTERNAL:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.SkyletInternalError(e.details())
+    elif e.code() == grpc.StatusCode.UNAVAILABLE:
+        time.sleep(current_backoff)
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED:
+        # Handle backwards compatibility: old server doesn't implement this RPC.
+        # Let the caller fall back to legacy execution.
+        raise exceptions.SkyletMethodNotImplementedError(
+            f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+        )
+    else:
+        raise e