skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +125 -22
- sky/backends/cloud_vm_ray_backend.py +224 -72
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +34 -0
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +6 -11
- sky/logs/agent.py +10 -2
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/skylet/constants.py +12 -7
- sky/skylet/log_lib.py +11 -0
- sky/skylet/log_lib.pyi +9 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py
CHANGED

@@ -7,9 +7,11 @@ import json
 import math
 import os
 import pathlib
+import random
 import re
 import shlex
 import signal
+import socket
 import subprocess
 import sys
 import tempfile
@@ -48,6 +50,7 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
@@ -87,7 +90,11 @@ if typing.TYPE_CHECKING:
     from sky.schemas.generated import autostopv1_pb2_grpc
 else:
     # To avoid requiring grpcio to be installed on the client side.
-    grpc = adaptors_common.LazyImport(
+    grpc = adaptors_common.LazyImport(
+        'grpc',
+        # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+        if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
     autostopv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2')
     autostopv1_pb2_grpc = adaptors_common.LazyImport(
@@ -330,6 +337,8 @@ class RayCodeGen:
 
 SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
 
+CANCELLED_RETURN_CODE = 137
+
 kwargs = dict()
 # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
 # the directory exists for backward compatibility for the VM
@@ -345,8 +354,10 @@ class RayCodeGen:
 def get_or_fail(futures, pg) -> List[int]:
     \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
     if not futures:
-        return []
+        return [], []
     returncodes = [1] * len(futures)
+    pids = [None] * len(futures)
+    failed = False
     # Wait for 1 task to be ready.
     ready = []
     # Keep invoking ray.wait if ready is empty. This is because
@@ -355,12 +366,22 @@ class RayCodeGen:
     # before becoming ready.
     # (Such tasks are common in serving jobs.)
     # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+    def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+        nonlocal returncodes, pids, failed
+        for task in tasks:
+            idx = futures.index(task)
+            res = ray.get(task)
+            returncodes[idx] = res['return_code']
+            pids[idx] = res['pid']
+            if res['return_code'] != 0:
+                failed = True
+
     while not ready:
         ready, unready = ray.wait(futures)
-
-    returncodes[idx] = ray.get(ready[0])
+    handle_ready_tasks(ready)
     while unready:
-        if
+        if failed:
             for task in unready:
                 # ray.cancel without force fails to kill tasks.
                 # We use force=True to kill unready tasks.
@@ -368,17 +389,16 @@ class RayCodeGen:
                 # Use SIGKILL=128+9 to indicate the task is forcely
                 # killed.
                 idx = futures.index(task)
-                returncodes[idx] =
+                returncodes[idx] = CANCELLED_RETURN_CODE
             break
         ready, unready = ray.wait(unready)
-
-    returncodes[idx] = ray.get(ready[0])
+        handle_ready_tasks(ready)
     # Remove the placement group after all tasks are done, so that
     # the next job can be scheduled on the released resources
     # immediately.
     ray_util.remove_placement_group(pg)
     sys.stdout.flush()
-    return returncodes
+    return returncodes, pids
 
 run_fn = None
 futures = []
@@ -394,7 +414,10 @@ class RayCodeGen:
     inspect.getsource(log_lib.make_task_bash_script),
     inspect.getsource(log_lib.add_ray_env_vars),
     inspect.getsource(log_lib.run_bash_command_with_log),
-
+    inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+    'run_bash_command_with_log = run_bash_command_with_log',
+    'run_bash_command_with_log_and_return_pid = \
+        ray.remote(run_bash_command_with_log_and_return_pid)',
 ]
 # Currently, the codegen program is/can only be submitted to the head
 # node, due to using job_lib for updating job statuses, and using
@@ -499,7 +522,7 @@ class RayCodeGen:
 total_num_nodes = len(ray.nodes())
 setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
 setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-setup_workers = [
+setup_workers = [run_bash_command_with_log_and_return_pid \\
     .options(
         name='setup',
         num_cpus=_SETUP_CPUS,
@@ -514,15 +537,25 @@ class RayCodeGen:
     stream_logs=True,
     with_ray=True,
 ) for i in range(total_num_nodes)]
-setup_returncodes = get_or_fail(setup_workers, setup_pg)
-
+setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+success = True
+failed_workers_and_returncodes = []
+for i in range(len(setup_returncodes)):
+    returncode = setup_returncodes[i]
+    pid = setup_pids[i]
+    if pid == None:
+        pid = os.getpid()
+    if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+        success = False
+        failed_workers_and_returncodes.append((pid, returncode))
+if not success:
+    msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+    msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+    msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+    print(msg, flush=True)
     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
     # This waits for all streaming logs to finish.
     time.sleep(1)
-    print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-          'return code list:{colorama.Style.RESET_ALL}',
-          setup_returncodes,
-          flush=True)
     # Need this to set the job status in ray job to be FAILED.
     sys.exit(1)
 """)
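The hunks above change the generated Ray driver so that each remote task returns a dict carrying both 'return_code' and 'pid', which lets setup failures be reported per worker instead of as a bare list of return codes. The following is a minimal standalone sketch of that pattern, not SkyPilot's implementation: run_cmd and gather are hypothetical stand-ins for run_bash_command_with_log_and_return_pid and get_or_fail.

import os
import subprocess
from typing import Any, Dict, List, Tuple

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def run_cmd(cmd: str) -> Dict[str, Any]:
    # Run a shell command and report both its exit code and the worker-side PID.
    proc = subprocess.run(cmd, shell=True, check=False)
    return {'return_code': proc.returncode, 'pid': os.getpid()}

def gather(futures: List['ray.ObjectRef']) -> Tuple[List[int], List[int]]:
    """Collect (returncodes, pids) in submission order."""
    results = ray.get(futures)
    return ([r['return_code'] for r in results], [r['pid'] for r in results])

futures = [run_cmd.remote('true'), run_cmd.remote('false')]
returncodes, pids = gather(futures)
failed = [(pid, rc) for pid, rc in zip(pids, returncodes) if rc != 0]
if failed:
    print('Failed workers:',
          ', '.join(f'(pid={p}, returncode={rc})' for p, rc in failed))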
@@ -695,7 +728,7 @@ class RayCodeGen:
 
 sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
 
-futures.append(
+futures.append(run_bash_command_with_log_and_return_pid \\
     .options(name=name_str, {options_str}) \\
     .remote(
         script,
@@ -714,7 +747,7 @@ class RayCodeGen:
 
         self._code += [
             textwrap.dedent(f"""\
-            returncodes = get_or_fail(futures, pg)
+            returncodes, _ = get_or_fail(futures, pg)
             if sum(returncodes) != 0:
                 job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                 # Schedule the next pending job immediately to make the job
@@ -1340,6 +1373,34 @@ class RetryingVmProvisioner(object):
             zones = [clouds.Zone(name=to_provision.zone)]
             yield zones
 
+    def _insufficient_resources_msg(
+        self,
+        to_provision: resources_lib.Resources,
+        requested_resources: Set[resources_lib.Resources],
+        insufficient_resources: Optional[List[str]],
+    ) -> str:
+        insufficent_resource_msg = ('' if insufficient_resources is None else
+                                    f' ({", ".join(insufficient_resources)})')
+        message = f'Failed to acquire resources{insufficent_resource_msg} '
+        if to_provision.zone is not None:
+            message += (f'in {to_provision.zone} for {requested_resources}. ')
+        elif to_provision.region is not None and to_provision.cloud is not None:
+            # For public clouds, provision.region is always set.
+            if clouds.SSH().is_same_cloud(to_provision.cloud):
+                message += (
+                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'for {requested_resources}. The SSH Node Pool may not '
+                    'have enough resources.')
+            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+                message += (f'in context {to_provision.region} for '
+                            f'{requested_resources}. ')
+            else:
+                message += (f'in all zones in {to_provision.region} for '
+                            f'{requested_resources}. ')
+        else:
+            message += (f'{to_provision.cloud} for {requested_resources}. ')
+        return message
+
     def _retry_zones(
         self,
         to_provision: resources_lib.Resources,
@@ -1418,6 +1479,7 @@ class RetryingVmProvisioner(object):
                 f'To request quotas, check the instruction: '
                 f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
 
+        insufficient_resources = None
        for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                       prev_cluster_status,
                                       prev_cluster_ever_up):
@@ -1630,6 +1692,24 @@ class RetryingVmProvisioner(object):
                     # No teardown happens for this error.
                     with ux_utils.print_exception_no_traceback():
                         raise
+            except config_lib.KubernetesError as e:
+                if e.insufficent_resources:
+                    insufficient_resources = e.insufficent_resources
+                # NOTE: We try to cleanup the cluster even if the previous
+                # cluster does not exist. Also we are fast at
+                # cleaning up clusters now if there is no existing node.
+                CloudVmRayBackend().post_teardown_cleanup(
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False,
+                    failover=True,
+                )
+                # TODO(suquark): other clouds may have different zone
+                # blocking strategy. See '_update_blocklist_on_error'
+                # for details.
+                FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                    self._blocked_resources, to_provision, region, zones, e)
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
@@ -1760,26 +1840,9 @@ class RetryingVmProvisioner(object):
                 terminate=terminate_or_stop,
                 remove_from_db=False)
 
-
-
-
-                       f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            if clouds.SSH().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in SSH Node Pool '
-                           f'({to_provision.region.lstrip("ssh-")}) for '
-                           f'{requested_resources}. The SSH Node Pool may not '
-                           'have enough resources.')
-            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in context '
-                           f'{to_provision.region} for {requested_resources}. ')
-            else:
-                message = ('Failed to acquire resources in all zones in '
-                           f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
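The hunk above folds the previously inlined failure-message branching into the new _insufficient_resources_msg helper, which also appends the per-resource details carried by KubernetesError.insufficent_resources. A condensed, standalone sketch of that branching follows; FakeResources is an illustrative stand-in for resources_lib.Resources, and the SSH Node Pool and Kubernetes special cases are omitted.

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeResources:
    # Only the fields consulted by the message helper are modeled here.
    cloud: Optional[str] = None
    region: Optional[str] = None
    zone: Optional[str] = None

def insufficient_resources_msg(to_provision: FakeResources,
                               requested: str,
                               insufficient: Optional[List[str]]) -> str:
    suffix = '' if not insufficient else f' ({", ".join(insufficient)})'
    msg = f'Failed to acquire resources{suffix} '
    if to_provision.zone is not None:
        msg += f'in {to_provision.zone} for {requested}. '
    elif to_provision.region is not None and to_provision.cloud is not None:
        msg += f'in all zones in {to_provision.region} for {requested}. '
    else:
        msg += f'{to_provision.cloud} for {requested}. '
    return msg

print(insufficient_resources_msg(
    FakeResources(cloud='kubernetes', region='my-context'),
    '1x {A100: 1}', ['GPU', 'memory']))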
@@ -2261,8 +2324,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
       - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
-    #
-    _VERSION =
+    # compatibility logic in __setstate__ and/or __getstate__.
+    _VERSION = 12
 
     def __init__(
             self,
@@ -2296,7 +2359,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
         self.is_grpc_enabled = True
-        self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2313,8 +2375,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
                 f'\n\tssh_user={self.ssh_user},'
-                f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
-                f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2643,11 +2704,74 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 cluster_config_file)
         self.docker_user = docker_user
 
+    def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+        metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name)
+        if metadata is None:
+            return None
+        return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+    def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+        global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name,
+            (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
     def get_grpc_channel(self) -> 'grpc.Channel':
-
-
-
-
+        # It's fine to not grab the lock here, as we're only reading,
+        # and writes are very rare.
+        # It's acceptable to read while another process is opening a tunnel,
+        # because it will only happen on:
+        # 1. A new cluster who has no tunnel yet, or
+        # 2. A cluster with an unhealthy tunnel
+        # For (2), for processes that read the "stale" tunnel, it will fail
+        # and on the next retry, it will call get_grpc_channel again
+        # and get the new tunnel.
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is not None:
+            try:
+                # Check if the tunnel is open.
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.settimeout(0.5)
+                    s.connect(('localhost', tunnel.port))
+                return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            except socket.error as e:
+                logger.warning(
+                    'Failed to connect to SSH tunnel for cluster '
+                    f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                    'acquiring lock')
+                pass
+        lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+        lock_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+        lock = locks.get_lock(lock_id, lock_timeout)
+        try:
+            with lock.acquire(blocking=True):
+                # Re-read the tunnel from the DB.
+                tunnel = self._get_skylet_ssh_tunnel()
+                if tunnel is None:
+                    logger.debug('No SSH tunnel found for cluster '
+                                 f'{self.cluster_name!r}, '
+                                 'opening the tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                try:
+                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                        s.settimeout(0.5)
+                        s.connect(('localhost', tunnel.port))
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                except socket.error as e:
+                    logger.warning(
+                        'Failed to connect to SSH tunnel for cluster '
+                        f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                        'opening new tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+        except locks.LockTimeout as e:
+            raise RuntimeError(
+                'Failed to get gRPC channel for cluster '
+                f'{self.cluster_name!r} due to a timeout when waiting for the '
+                'SSH tunnel to be opened. Please try again or manually remove '
+                f'the lock at {lock_id}. '
+                f'{common_utils.format_exception(e)}') from e
 
     def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
         """Clean up an SSH tunnel by terminating the process."""
@@ -2668,31 +2792,48 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             logger.warning(
                 f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
 
-    def
+    def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
         """Opens an SSH tunnel to the Skylet on the head node,
         updates the cluster handle, and persists it to the database."""
-
-
-
-
-
-
-
-
-
-
-
-
-
+        max_attempts = 3
+        # There could be a race condition here, as multiple processes may
+        # attempt to open the same port at the same time.
+        for attempt in range(max_attempts):
+            runners = self.get_command_runners()
+            head_runner = runners[0]
+            local_port = random.randint(10000, 65535)
+            try:
+                ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+                    head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+            except exceptions.CommandError as e:
+                # Don't retry if the error is due to timeout,
+                # connection refused, Kubernetes pods not found,
+                # or an in-progress termination.
+                if (e.detailed_reason is not None and
+                        (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+                            e.detailed_reason) or
+                         backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+                             e.detailed_reason) or attempt == max_attempts - 1)):
+                    raise e
+                logger.warning(
+                    f'Failed to open SSH tunnel on port {local_port} '
+                    f'({attempt + 1}/{max_attempts}). '
+                    f'{e.error_msg}\n{e.detailed_reason}')
+                continue
+            tunnel_info = SSHTunnelInfo(port=local_port,
+                                        pid=ssh_tunnel_proc.pid)
+            break
+
         try:
             grpc.channel_ready_future(
                 grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
                     timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
             # Clean up existing tunnel before setting up the new one.
-
-
-
-
+            old_tunnel = self._get_skylet_ssh_tunnel()
+            if old_tunnel is not None:
+                self._cleanup_ssh_tunnel(old_tunnel)
+            self._set_skylet_ssh_tunnel(tunnel_info)
+            return tunnel_info
         except grpc.FutureTimeoutError as e:
            self._cleanup_ssh_tunnel(tunnel_info)
            logger.warning(
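The comment block in get_grpc_channel above describes a double-checked pattern: read the persisted tunnel without a lock, probe the local port, and only take the cluster-level lock (and possibly reopen the tunnel) if the probe fails. Below is a minimal in-process sketch of that pattern, with a threading.Lock and an in-memory port cache standing in for SkyPilot's cross-process lock and the skylet_ssh_tunnel_metadata DB column; it is an illustration under those assumptions, not the real implementation.

import socket
import threading
from typing import Callable, Optional

_lock = threading.Lock()          # stand-in for locks.get_lock(...)
_cached_port: Optional[int] = None  # stand-in for the persisted tunnel metadata

def _port_is_open(port: int, timeout: float = 0.5) -> bool:
    """Probe a local port, the same cheap health check the diff performs."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            s.connect(('localhost', port))
        return True
    except OSError:
        return False

def get_channel_port(open_tunnel: Callable[[], int]) -> int:
    """Double-checked reuse: lock-free read first, then re-check under the lock."""
    global _cached_port
    # Fast path: no lock, just try the cached tunnel.
    if _cached_port is not None and _port_is_open(_cached_port):
        return _cached_port
    with _lock:
        # Another process/thread may have (re)opened the tunnel while we waited.
        if _cached_port is not None and _port_is_open(_cached_port):
            return _cached_port
        # Expensive path: spawn the tunnel and persist its port.
        _cached_port = open_tunnel()
        return _cached_port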
@@ -2752,6 +2893,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
         return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # For backwards compatibility. Refer to
+        # https://github.com/skypilot-org/skypilot/pull/7133
+        state.setdefault('skylet_ssh_tunnel', None)
+        return state
+
     def __setstate__(self, state):
         self._version = self._VERSION
 
@@ -2809,6 +2957,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             state['is_grpc_enabled'] = False
             state['skylet_ssh_tunnel'] = None
 
+        if version >= 12:
+            # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+            state.pop('skylet_ssh_tunnel', None)
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
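The __getstate__/__setstate__ hunks above implement versioned pickle compatibility: the writer still emits a skylet_ssh_tunnel placeholder so older readers can unpickle the handle, while readers at version 12 and above drop the field because the tunnel now lives in the skylet_ssh_tunnel_metadata DB column. A self-contained toy of that general pattern follows; the field names are reused for illustration only and this is not the real handle class.

import pickle

class Handle:
    # Bump whenever pickled fields change; __getstate__/__setstate__ carry the
    # compatibility logic, mirroring the pattern in the diff above.
    _VERSION = 12

    def __init__(self):
        self._version = self._VERSION
        self.is_grpc_enabled = True

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep a placeholder so older readers that still expect the attribute
        # can unpickle this object.
        state.setdefault('skylet_ssh_tunnel', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 0)
        if version >= 12:
            # The field now lives elsewhere (a DB column in the real code);
            # new readers drop it from the in-memory object.
            state.pop('skylet_ssh_tunnel', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)

h = pickle.loads(pickle.dumps(Handle()))
assert not hasattr(h, 'skylet_ssh_tunnel')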
@@ -3115,7 +3267,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 colorama.Style.RESET_ALL +
                 colorama.Style.DIM +
                 'Check concurrent requests: ' +
-                'sky api status '
+                'sky api status -v | grep '
+                f'{cluster_name}'))
 
     def _locked_provision(
             self,
@@ -4974,9 +5127,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
                 down=down,
             )
-            backend_utils.invoke_skylet_with_retries(
-                handle
-                set_autostop(request))
+            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                handle.get_grpc_channel()).set_autostop(request))
         else:
             code = autostop_lib.AutostopCodeGen.set_autostop(
                 idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5167,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             try:
                 request = autostopv1_pb2.IsAutostoppingRequest()
                 response = backend_utils.invoke_skylet_with_retries(
-
-
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).is_autostopping(request))
                 return response.is_autostopping
             except Exception as e:  # pylint: disable=broad-except
                 # The cluster may have been terminated, causing the gRPC call
sky/catalog/__init__.py
CHANGED

@@ -247,6 +247,13 @@ def get_accelerators_from_instance_type(
                               instance_type)
 
 
+def get_arch_from_instance_type(instance_type: str,
+                                clouds: CloudFilter = None) -> Optional[str]:
+    """Returns the arch from a instance type."""
+    return _map_clouds_catalog(clouds, 'get_arch_from_instance_type',
+                               instance_type)
+
+
 def get_instance_type_for_accelerator(
     acc_name: str,
     acc_count: Union[int, float],
sky/catalog/aws_catalog.py
CHANGED

@@ -271,6 +271,10 @@ def get_accelerators_from_instance_type(
         _get_df(), instance_type)
 
 
+def get_arch_from_instance_type(instance_type: str) -> Optional[str]:
+    return common.get_arch_from_instance_type_impl(_get_df(), instance_type)
+
+
 def get_instance_type_for_accelerator(
     acc_name: str,
     acc_count: int,
sky/catalog/common.py
CHANGED

@@ -527,6 +527,24 @@ def get_accelerators_from_instance_type_impl(
     return {acc_name: _convert(acc_count)}
 
 
+def get_arch_from_instance_type_impl(
+    df: 'pd.DataFrame',
+    instance_type: str,
+) -> Optional[str]:
+    df = _get_instance_type(df, instance_type, None)
+    if df.empty:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'No instance type {instance_type} found.')
+    row = df.iloc[0]
+    if 'Arch' not in row:
+        return None
+    arch = row['Arch']
+    if pd.isnull(arch):
+        return None
+
+    return arch
+
+
 def get_instance_type_for_accelerator_impl(
     df: 'pd.DataFrame',
     acc_name: str,
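For context, a sketch of how the new lookup might be called once the catalog carries the Arch column; the instance type, the 'aws' cloud filter value, and the arch strings are illustrative assumptions, with the actual values depending on what the fetcher below records from EC2's SupportedArchitectures.

from sky import catalog

arch = catalog.get_arch_from_instance_type('m6g.large', clouds='aws')
if arch == 'arm64':
    print('Pick an ARM image for this instance type.')
elif arch is None:
    print('Catalog has no architecture info; fall back to a default image.')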
sky/catalog/data_fetchers/fetch_aws.py
CHANGED

@@ -67,7 +67,7 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
 # The following columns will be included in the final catalog.
 USEFUL_COLUMNS = [
     'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
-    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
+    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
 ]
 
 # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
@@ -275,6 +275,17 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
             return None, np.nan
         return accelerator['Name'], accelerator['Count']
 
+    def get_arch(row) -> Optional[str]:
+        if 'ProcessorInfo' in row:
+            processor = row['ProcessorInfo']
+            if 'SupportedArchitectures' in processor:
+                archs = processor['SupportedArchitectures']
+                if isinstance(archs, list):
+                    return archs[0]
+                elif isinstance(archs, str):
+                    return archs
+        return None
+
     def get_vcpus(row) -> float:
         if not np.isnan(row['vCPU']):
             return float(row['vCPU'])
@@ -332,6 +343,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
             'AcceleratorCount': acc_count,
             'vCPUs': get_vcpus(row),
             'MemoryGiB': get_memory_gib(row),
+            'Arch': get_arch(row),
         })
 
     # The AWS API may not have all the instance types in the pricing table,