skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic.

Files changed (105)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'acc7b0392fd7c54b450ac11a29f7e114c4651d66'
+_SKYPILOT_COMMIT_SHA = 'bf9b3c4e09e97cf2dafed9f351d0b36438adf4ec'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250910'
+__version__ = '1.0.0.dev20250913'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
@@ -149,6 +149,7 @@ Vsphere = clouds.Vsphere
 Fluidstack = clouds.Fluidstack
 Nebius = clouds.Nebius
 Hyperbolic = clouds.Hyperbolic
+Seeweb = clouds.Seeweb
 
 __all__ = [
     '__version__',
@@ -169,6 +170,7 @@ __all__ = [
     'Fluidstack',
     'Nebius',
     'Hyperbolic',
+    'Seeweb',
     'Optimizer',
     'OptimizeTarget',
     'backends',
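
With this registration, the new cloud becomes importable from the package top level. A minimal sketch of what that enables (hypothetical usage; assumes the Seeweb extras and credentials are set up):

import sky

# Seeweb can now be referenced like any other registered cloud.
resources = sky.Resources(cloud=sky.Seeweb())
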
sky/adaptors/seeweb.py ADDED
@@ -0,0 +1,103 @@
+""" Seeweb Adaptor """
+import configparser
+from pathlib import Path
+
+from sky.adaptors import common
+from sky.utils import annotations
+
+
+class SeewebError(Exception):
+    """Base exception for Seeweb adaptor errors."""
+
+
+class SeewebCredentialsFileNotFound(SeewebError):
+    """Raised when the Seeweb credentials file is missing."""
+
+
+class SeewebApiKeyMissing(SeewebError):
+    """Raised when the Seeweb API key is missing or empty."""
+
+
+class SeewebAuthenticationError(SeewebError):
+    """Raised when authenticating with Seeweb API fails."""
+
+
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Seeweb.'
+                         'Try pip install "skypilot[seeweb]"')
+
+ecsapi = common.LazyImport(
+    'ecsapi',
+    import_error_message=_IMPORT_ERROR_MESSAGE,
+)
+boto3 = common.LazyImport('boto3', import_error_message=_IMPORT_ERROR_MESSAGE)
+botocore = common.LazyImport('botocore',
+                             import_error_message=_IMPORT_ERROR_MESSAGE)
+
+_LAZY_MODULES = (ecsapi, boto3, botocore)
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def check_compute_credentials() -> bool:
+    """Checks if the user has access credentials to Seeweb's compute service.
+
+    Returns True if credentials are valid; otherwise raises a SeewebError.
+    """
+    # Read API key from standard Seeweb configuration file
+    key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
+    if not key_path.exists():
+        raise SeewebCredentialsFileNotFound(
+            'Missing Seeweb API key file ~/.seeweb_cloud/seeweb_keys')
+
+    parser = configparser.ConfigParser()
+    parser.read(key_path)
+    try:
+        api_key = parser['DEFAULT']['api_key'].strip()
+    except KeyError as e:
+        raise SeewebApiKeyMissing(
+            'Missing api_key in ~/.seeweb_cloud/seeweb_keys') from e
+    if not api_key:
+        raise SeewebApiKeyMissing(
+            'Empty api_key in ~/.seeweb_cloud/seeweb_keys')
+
+    # Test connection by fetching servers list to validate the key
+    try:
+        seeweb_client = ecsapi.Api(token=api_key)
+        seeweb_client.fetch_servers()
+    except Exception as e:  # pylint: disable=broad-except
+        raise SeewebAuthenticationError(
+            f'Unable to authenticate with Seeweb API: {e}') from e
+
+    return True
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def check_storage_credentials() -> bool:
+    """Checks if the user has access credentials to Seeweb's storage service.
+
+    Mirrors compute credentials validation.
+    """
+    return check_compute_credentials()
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+@annotations.lru_cache(scope='global', maxsize=1)
+def client():
+    """Returns an authenticated ecsapi.Api object."""
+    # Create authenticated client using the same credential pattern
+    key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
+    if not key_path.exists():
+        raise SeewebCredentialsFileNotFound(
+            'Missing Seeweb API key file ~/.seeweb_cloud/seeweb_keys')
+
+    parser = configparser.ConfigParser()
+    parser.read(key_path)
+    try:
+        api_key = parser['DEFAULT']['api_key'].strip()
+    except KeyError as e:
+        raise SeewebApiKeyMissing(
+            'Missing api_key in ~/.seeweb_cloud/seeweb_keys') from e
+    if not api_key:
+        raise SeewebApiKeyMissing(
+            'Empty api_key in ~/.seeweb_cloud/seeweb_keys')
+
+    return ecsapi.Api(token=api_key)
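
For reference, both check_compute_credentials() and client() parse an INI-style file with configparser, reading api_key from the DEFAULT section. A minimal sketch of creating a file in the expected layout and validating it (the key value is a placeholder):

from pathlib import Path

from sky.adaptors import seeweb

# Write a credentials file in the layout the adaptor expects.
key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
key_path.parent.mkdir(parents=True, exist_ok=True)
key_path.write_text('[DEFAULT]\napi_key = <your-seeweb-api-key>\n')

# Raises a SeewebError subclass on failure; with a placeholder key this
# would surface as SeewebAuthenticationError.
seeweb.check_compute_credentials()
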
sky/authentication.py CHANGED
@@ -40,6 +40,7 @@ from sky.adaptors import gcp
 from sky.adaptors import ibm
 from sky.adaptors import kubernetes
 from sky.adaptors import runpod
+from sky.adaptors import seeweb as seeweb_adaptor
 from sky.adaptors import vast
 from sky.provision.fluidstack import fluidstack_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -601,3 +602,40 @@ def setup_hyperbolic_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     config['auth']['ssh_public_key'] = public_key_path
 
     return configure_ssh_info(config)
+
+
+def setup_seeweb_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Registers the public key with Seeweb and notes the remote name."""
+    # 1. local key pair
+    get_or_generate_keys()
+
+    # 2. public key
+    _, public_key_path = get_or_generate_keys()
+    with open(public_key_path, 'r', encoding='utf-8') as f:
+        public_key = f.read().strip()
+
+    # 3. Seeweb API client
+    client = seeweb_adaptor.client()
+
+    # 4. Check if key is already registered
+    prefix = f'sky-key-{common_utils.get_user_hash()}'
+    remote_name = None
+    for k in client.fetch_ssh_keys():
+        if k.key.strip() == public_key:
+            remote_name = k.label  # already present
+            break
+
+    # 5. doesn't exist, choose a unique name and create it
+    if remote_name is None:
+        suffix = 1
+        remote_name = prefix
+        existing_names = {k.label for k in client.fetch_ssh_keys()}
+        while remote_name in existing_names:
+            suffix += 1
+            remote_name = f'{prefix}-{suffix}'
+        client.create_ssh_key(label=remote_name, key=public_key)
+
+    # 6. Put the remote name in cluster-config (like for Lambda)
+    config['auth']['remote_key_name'] = remote_name
+
+    return config
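
A minimal sketch of the new helper's contract (the config dict is stripped down; a real cluster config carries many more fields, and the call assumes valid Seeweb credentials):

from sky import authentication as auth

config = {'auth': {}}
config = auth.setup_seeweb_authentication(config)
# The label under which the public key was registered with Seeweb,
# e.g. 'sky-key-<user-hash>' or a '-N' suffixed variant if taken.
print(config['auth']['remote_key_name'])
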
sky/backends/backend_utils.py CHANGED
@@ -7,11 +7,13 @@ import hashlib
 import os
 import pathlib
 import pprint
+import queue as queue_lib
 import re
 import shlex
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import typing
 from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
@@ -23,7 +25,6 @@ from aiohttp import ClientTimeout
 from aiohttp import TCPConnector
 import colorama
 from packaging import version
-import psutil
 from typing_extensions import Literal
 
 import sky
@@ -111,8 +112,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
 # 10.133.0.5: ray.worker.default,
 _LAUNCHING_IP_PATTERN = re.compile(
     r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+SSH_CONNECTION_ERROR_PATTERN = re.compile(
+    r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
 _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
                                                re.IGNORECASE)
+K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+                                        re.IGNORECASE)
 _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
 WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 
@@ -135,6 +140,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
 WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -213,6 +219,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
     ('provider', 'availability_zone'),
 ]
 
+_ACK_MESSAGE = 'ack'
+_FORWARDING_FROM_MESSAGE = 'Forwarding from'
+
 
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
@@ -1107,6 +1116,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
         config = auth.setup_fluidstack_authentication(config)
     elif isinstance(cloud, clouds.Hyperbolic):
         config = auth.setup_hyperbolic_authentication(config)
+    elif isinstance(cloud, clouds.Seeweb):
+        config = auth.setup_seeweb_authentication(config)
     else:
         assert False, cloud
     yaml_utils.dump_yaml(tmp_yaml_path, config)
@@ -2324,7 +2335,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             handle,
             requested_resources=None,
             ready=True,
-            is_launch=False)
+            is_launch=False,
+            update_only=True)
         return global_user_state.get_cluster_from_name(cluster_name)
 
     # All cases below are transitioning the cluster to non-UP states.
@@ -2534,7 +2546,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             handle,
             requested_resources=None,
             ready=False,
-            is_launch=False)
+            is_launch=False,
+            update_only=True)
         return global_user_state.get_cluster_from_name(cluster_name)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
@@ -2672,7 +2685,7 @@ def refresh_cluster_record(
                 'Refreshing status: Failed get the lock for cluster '
                 f'{cluster_name!r}. Using the cached status.')
             return record
-        time.sleep(0.05)
+        time.sleep(lock.poll_interval)
         # Refresh for next loop iteration.
 
         record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3582,19 +3595,126 @@ def workspace_lock_id(workspace_name: str) -> str:
     return f'{workspace_name}_workspace'
 
 
+def cluster_tunnel_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster tunnel operations."""
+    return f'{cluster_name}_ssh_tunnel'
+
+
+def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+                                       command_runner.KubernetesCommandRunner],
+                    port_forward: Tuple[int, int]) -> subprocess.Popen:
+    local_port, remote_port = port_forward
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # Disabling ControlMaster makes things easier to reason about
+        # with respect to resource management/ownership,
+        # as killing the process will close the tunnel too.
+        head_runner.disable_control_master = True
+        head_runner.port_forward_execute_remote_command = True
+
+    # The default connect_timeout of 1s is too short for
+    # connecting to clusters using a jump server.
+    # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+    # which is counted towards non-idleness.
+    cmd: List[str] = head_runner.port_forward_command(
+        [(local_port, remote_port)],
+        connect_timeout=5,
+        ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # cat so the command doesn't exit until we kill it
+        cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+    cmd_str = ' '.join(cmd)
+    logger.debug(f'Running port forward command: {cmd_str}')
+    ssh_tunnel_proc = subprocess.Popen(cmd_str,
+                                       shell=True,
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       start_new_session=True,
+                                       text=True)
+    # Wait until we receive an ack from the remote cluster or
+    # the SSH connection times out.
+    queue: queue_lib.Queue = queue_lib.Queue()
+    stdout_thread = threading.Thread(
+        target=lambda queue, stdout: queue.put(stdout.readline()),
+        args=(queue, ssh_tunnel_proc.stdout),
+        daemon=True)
+    stdout_thread.start()
+    while ssh_tunnel_proc.poll() is None:
+        try:
+            ack = queue.get_nowait()
+        except queue_lib.Empty:
+            ack = None
+            time.sleep(0.1)
+            continue
+        assert ack is not None
+        if isinstance(
+                head_runner,
+                command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+            break
+        elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+                       ) and _FORWARDING_FROM_MESSAGE in ack:
+            # On kind clusters, this error occurs if we make a request
+            # immediately after the port-forward is established on a new pod:
+            # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+            # failed to execute portforward in network namespace
+            # "/var/run/netns/cni-...": failed to connect to localhost:46590
+            # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+            # connect: connection refused
+            # So we need to poll the port on the pod to check if it is open.
+            # We did not observe this with real Kubernetes clusters.
+            timeout = 5
+            port_check_cmd = (
+                # We install netcat in our ray-node container,
                # so we can use it here.
+                # (See kubernetes-ray.yml.j2)
+                f'end=$((SECONDS+{timeout})); '
+                f'while ! nc -z -w 1 localhost {remote_port}; do '
+                'if (( SECONDS >= end )); then exit 1; fi; '
+                'sleep 0.1; '
+                'done')
+            returncode, stdout, stderr = head_runner.run(port_check_cmd,
+                                                         require_outputs=True,
+                                                         stream_logs=False)
+            if returncode != 0:
+                try:
+                    ssh_tunnel_proc.terminate()
+                    ssh_tunnel_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    ssh_tunnel_proc.kill()
+                    ssh_tunnel_proc.wait()
+                finally:
+                    error_msg = (f'Failed to check remote port {remote_port}')
+                    if stdout:
+                        error_msg += f'\n-- stdout --\n{stdout}\n'
+                    raise exceptions.CommandError(returncode=returncode,
+                                                  command=cmd_str,
+                                                  error_msg=error_msg,
+                                                  detailed_reason=stderr)
+            break
+
+    if ssh_tunnel_proc.poll() is not None:
+        stdout, stderr = ssh_tunnel_proc.communicate()
+        error_msg = 'Port forward failed'
+        if stdout:
+            error_msg += f'\n-- stdout --\n{stdout}\n'
+        raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+                                      command=cmd_str,
+                                      error_msg=error_msg,
+                                      detailed_reason=stderr)
+    return ssh_tunnel_proc
+
+
 T = TypeVar('T')
 
 
-def invoke_skylet_with_retries(
-        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
-        func: Callable[..., T]) -> T:
+def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     """Generic helper for making Skylet gRPC requests.
 
     This method handles the common pattern of:
     1. Try the gRPC request
     2. If SSH tunnel is closed, recreate it and retry
     """
-    max_attempts = 3
+    max_attempts = 5
     backoff = common_utils.Backoff(initial_backoff=0.5)
     last_exception: Optional[Exception] = None
 
@@ -3603,26 +3723,24 @@
             return func()
         except grpc.RpcError as e:
             last_exception = e
-            if e.code() == grpc.StatusCode.INTERNAL:
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.SkyletInternalError(e.details())
-            elif e.code() == grpc.StatusCode.UNAVAILABLE:
-                recreate_tunnel = True
-                try:
-                    if handle.skylet_ssh_tunnel is not None:
-                        proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
-                        if proc.is_running(
-                        ) and proc.status() != psutil.STATUS_ZOMBIE:
-                            recreate_tunnel = False
-                except psutil.NoSuchProcess:
-                    pass
-
-                if recreate_tunnel:
-                    handle.open_and_update_skylet_tunnel()
-
-                time.sleep(backoff.current_backoff())
-            else:
-                raise e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+    ) from last_exception
 
-    raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
-                       ) from last_exception
+
+def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+    if e.code() == grpc.StatusCode.INTERNAL:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.SkyletInternalError(e.details())
+    elif e.code() == grpc.StatusCode.UNAVAILABLE:
+        time.sleep(current_backoff)
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED:
+        # Handle backwards compatibility: old server doesn't implement this RPC.
+        # Let the caller fall back to legacy execution.
+        raise exceptions.SkyletMethodNotImplementedError(
+            f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+        )
+    else:
+        raise e
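
Taken together, the retry helper no longer inspects the tunnel process via psutil; it simply backs off on UNAVAILABLE and lets UNIMPLEMENTED signal a legacy fallback. A minimal sketch of how a caller might drive it (the stub and request names are hypothetical):

from sky.backends import backend_utils

def _list_jobs():
    # Placeholder for a gRPC call to the Skylet server, reached over the
    # tunnel from open_ssh_tunnel(). In the retry loop, UNAVAILABLE triggers
    # backoff-and-retry, UNIMPLEMENTED raises SkyletMethodNotImplementedError
    # for legacy fallback, and INTERNAL surfaces as SkyletInternalError.
    return jobs_stub.GetJobQueue(jobs_request)  # hypothetical stub/request

response = backend_utils.invoke_skylet_with_retries(_list_jobs)
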