skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py
@@ -7,9 +7,11 @@ import json
 import math
 import os
 import pathlib
+import random
 import re
 import shlex
 import signal
+import socket
 import subprocess
 import sys
 import tempfile
@@ -48,6 +50,7 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
@@ -85,13 +88,22 @@ if typing.TYPE_CHECKING:
     from sky import dag
     from sky.schemas.generated import autostopv1_pb2
     from sky.schemas.generated import autostopv1_pb2_grpc
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import jobsv1_pb2_grpc
 else:
     # To avoid requiring grpcio to be installed on the client side.
-    grpc = adaptors_common.LazyImport(
+    grpc = adaptors_common.LazyImport(
+        'grpc',
+        # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+        if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
     autostopv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2')
     autostopv1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2_grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.jobsv1_pb2_grpc')
 
 Path = str
 
@@ -218,7 +230,8 @@ def _get_cluster_config_template(cloud):
         clouds.Vast: 'vast-ray.yml.j2',
         clouds.Fluidstack: 'fluidstack-ray.yml.j2',
         clouds.Nebius: 'nebius-ray.yml.j2',
-        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+        clouds.Seeweb: 'seeweb-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]
 
@@ -330,6 +343,8 @@ class RayCodeGen:
 
 SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
 
+CANCELLED_RETURN_CODE = 137
+
 kwargs = dict()
 # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
 # the directory exists for backward compatibility for the VM
@@ -345,8 +360,10 @@ class RayCodeGen:
 def get_or_fail(futures, pg) -> List[int]:
     \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
     if not futures:
-        return []
+        return [], []
     returncodes = [1] * len(futures)
+    pids = [None] * len(futures)
+    failed = False
     # Wait for 1 task to be ready.
     ready = []
     # Keep invoking ray.wait if ready is empty. This is because
@@ -355,12 +372,22 @@ class RayCodeGen:
     # before becoming ready.
     # (Such tasks are common in serving jobs.)
     # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+    def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+        nonlocal returncodes, pids, failed
+        for task in tasks:
+            idx = futures.index(task)
+            res = ray.get(task)
+            returncodes[idx] = res['return_code']
+            pids[idx] = res['pid']
+            if res['return_code'] != 0:
+                failed = True
+
     while not ready:
         ready, unready = ray.wait(futures)
-
-    returncodes[idx] = ray.get(ready[0])
+    handle_ready_tasks(ready)
     while unready:
-        if
+        if failed:
             for task in unready:
                 # ray.cancel without force fails to kill tasks.
                 # We use force=True to kill unready tasks.
@@ -368,17 +395,16 @@ class RayCodeGen:
                 # Use SIGKILL=128+9 to indicate the task is forcely
                 # killed.
                 idx = futures.index(task)
-                returncodes[idx] =
+                returncodes[idx] = CANCELLED_RETURN_CODE
             break
         ready, unready = ray.wait(unready)
-
-        returncodes[idx] = ray.get(ready[0])
+        handle_ready_tasks(ready)
     # Remove the placement group after all tasks are done, so that
     # the next job can be scheduled on the released resources
     # immediately.
     ray_util.remove_placement_group(pg)
     sys.stdout.flush()
-    return returncodes
+    return returncodes, pids
 
 run_fn = None
 futures = []
@@ -394,7 +420,10 @@ class RayCodeGen:
             inspect.getsource(log_lib.make_task_bash_script),
             inspect.getsource(log_lib.add_ray_env_vars),
             inspect.getsource(log_lib.run_bash_command_with_log),
-
+            inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+            'run_bash_command_with_log = run_bash_command_with_log',
+            'run_bash_command_with_log_and_return_pid = \
+                ray.remote(run_bash_command_with_log_and_return_pid)',
         ]
         # Currently, the codegen program is/can only be submitted to the head
         # node, due to using job_lib for updating job statuses, and using
@@ -499,7 +528,7 @@ class RayCodeGen:
 total_num_nodes = len(ray.nodes())
 setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
 setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-setup_workers = [
+setup_workers = [run_bash_command_with_log_and_return_pid \\
     .options(
         name='setup',
         num_cpus=_SETUP_CPUS,
@@ -514,15 +543,25 @@ class RayCodeGen:
         stream_logs=True,
         with_ray=True,
     ) for i in range(total_num_nodes)]
-setup_returncodes = get_or_fail(setup_workers, setup_pg)
-
+setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+success = True
+failed_workers_and_returncodes = []
+for i in range(len(setup_returncodes)):
+    returncode = setup_returncodes[i]
+    pid = setup_pids[i]
+    if pid == None:
+        pid = os.getpid()
+    if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+        success = False
+        failed_workers_and_returncodes.append((pid, returncode))
+if not success:
+    msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+    msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+    msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+    print(msg, flush=True)
     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
     # This waits for all streaming logs to finish.
     time.sleep(1)
-    print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-          'return code list:{colorama.Style.RESET_ALL}',
-          setup_returncodes,
-          flush=True)
     # Need this to set the job status in ray job to be FAILED.
     sys.exit(1)
 """)
@@ -695,7 +734,7 @@ class RayCodeGen:
 
 sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
 
-futures.append(
+futures.append(run_bash_command_with_log_and_return_pid \\
     .options(name=name_str, {options_str}) \\
     .remote(
         script,
@@ -714,7 +753,7 @@ class RayCodeGen:
 
         self._code += [
             textwrap.dedent(f"""\
-            returncodes = get_or_fail(futures, pg)
+            returncodes, _ = get_or_fail(futures, pg)
             if sum(returncodes) != 0:
                 job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                 # Schedule the next pending job immediately to make the job
@@ -1340,6 +1379,34 @@ class RetryingVmProvisioner(object):
             zones = [clouds.Zone(name=to_provision.zone)]
             yield zones
 
+    def _insufficient_resources_msg(
+        self,
+        to_provision: resources_lib.Resources,
+        requested_resources: Set[resources_lib.Resources],
+        insufficient_resources: Optional[List[str]],
+    ) -> str:
+        insufficent_resource_msg = ('' if insufficient_resources is None else
+                                    f' ({", ".join(insufficient_resources)})')
+        message = f'Failed to acquire resources{insufficent_resource_msg} '
+        if to_provision.zone is not None:
+            message += (f'in {to_provision.zone} for {requested_resources}. ')
+        elif to_provision.region is not None and to_provision.cloud is not None:
+            # For public clouds, provision.region is always set.
+            if clouds.SSH().is_same_cloud(to_provision.cloud):
+                message += (
+                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'for {requested_resources}. The SSH Node Pool may not '
+                    'have enough resources.')
+            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+                message += (f'in context {to_provision.region} for '
+                            f'{requested_resources}. ')
+            else:
+                message += (f'in all zones in {to_provision.region} for '
+                            f'{requested_resources}. ')
+        else:
+            message += (f'{to_provision.cloud} for {requested_resources}. ')
+        return message
+
     def _retry_zones(
         self,
         to_provision: resources_lib.Resources,
@@ -1418,6 +1485,7 @@ class RetryingVmProvisioner(object):
                 f'To request quotas, check the instruction: '
                 f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
 
+        insufficient_resources = None
         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
                                        prev_cluster_ever_up):
@@ -1630,6 +1698,24 @@ class RetryingVmProvisioner(object):
                 # No teardown happens for this error.
                 with ux_utils.print_exception_no_traceback():
                     raise
+            except config_lib.KubernetesError as e:
+                if e.insufficent_resources:
+                    insufficient_resources = e.insufficent_resources
+                # NOTE: We try to cleanup the cluster even if the previous
+                # cluster does not exist. Also we are fast at
+                # cleaning up clusters now if there is no existing node.
+                CloudVmRayBackend().post_teardown_cleanup(
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False,
+                    failover=True,
+                )
+                # TODO(suquark): other clouds may have different zone
+                # blocking strategy. See '_update_blocklist_on_error'
+                # for details.
+                FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                    self._blocked_resources, to_provision, region, zones, e)
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
@@ -1760,26 +1846,9 @@ class RetryingVmProvisioner(object):
                 terminate=terminate_or_stop,
                 remove_from_db=False)
 
-
-
-
-                       f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            if clouds.SSH().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in SSH Node Pool '
-                           f'({to_provision.region.lstrip("ssh-")}) for '
-                           f'{requested_resources}. The SSH Node Pool may not '
-                           'have enough resources.')
-            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in context '
-                           f'{to_provision.region} for {requested_resources}. ')
-            else:
-                message = ('Failed to acquire resources in all zones in '
-                           f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -2261,8 +2330,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
-    #
-    _VERSION =
+    # compatibility logic in __setstate__ and/or __getstate__.
+    _VERSION = 12
 
     def __init__(
         self,
@@ -2296,7 +2365,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
         self.is_grpc_enabled = True
-        self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2313,8 +2381,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
                 f'\n\tssh_user={self.ssh_user},'
-                f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
-                f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2643,11 +2710,74 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                                                      cluster_config_file)
         self.docker_user = docker_user
 
+    def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+        metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name)
+        if metadata is None:
+            return None
+        return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+    def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+        global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name,
+            (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
     def get_grpc_channel(self) -> 'grpc.Channel':
-
-
-
-
+        # It's fine to not grab the lock here, as we're only reading,
+        # and writes are very rare.
+        # It's acceptable to read while another process is opening a tunnel,
+        # because it will only happen on:
+        # 1. A new cluster who has no tunnel yet, or
+        # 2. A cluster with an unhealthy tunnel
+        # For (2), for processes that read the "stale" tunnel, it will fail
+        # and on the next retry, it will call get_grpc_channel again
+        # and get the new tunnel.
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is not None:
+            try:
+                # Check if the tunnel is open.
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.settimeout(0.5)
+                    s.connect(('localhost', tunnel.port))
+                return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            except socket.error as e:
+                logger.warning(
+                    'Failed to connect to SSH tunnel for cluster '
+                    f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                    'acquiring lock')
+                pass
+        lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+        lock_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+        lock = locks.get_lock(lock_id, lock_timeout)
+        try:
+            with lock.acquire(blocking=True):
+                # Re-read the tunnel from the DB.
+                tunnel = self._get_skylet_ssh_tunnel()
+                if tunnel is None:
+                    logger.debug('No SSH tunnel found for cluster '
+                                 f'{self.cluster_name!r}, '
+                                 'opening the tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                try:
+                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                        s.settimeout(0.5)
+                        s.connect(('localhost', tunnel.port))
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                except socket.error as e:
+                    logger.warning(
+                        'Failed to connect to SSH tunnel for cluster '
+                        f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                        'opening new tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+        except locks.LockTimeout as e:
+            raise RuntimeError(
+                'Failed to get gRPC channel for cluster '
+                f'{self.cluster_name!r} due to a timeout when waiting for the '
+                'SSH tunnel to be opened. Please try again or manually remove '
+                f'the lock at {lock_id}. '
+                f'{common_utils.format_exception(e)}') from e
 
     def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
         """Clean up an SSH tunnel by terminating the process."""
@@ -2668,31 +2798,48 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             logger.warning(
                 f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
 
-    def
+    def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
         """Opens an SSH tunnel to the Skylet on the head node,
         updates the cluster handle, and persists it to the database."""
-
-
-
-
-
-
-
-
-
-
-
-
-
+        max_attempts = 3
+        # There could be a race condition here, as multiple processes may
+        # attempt to open the same port at the same time.
+        for attempt in range(max_attempts):
+            runners = self.get_command_runners()
+            head_runner = runners[0]
+            local_port = random.randint(10000, 65535)
+            try:
+                ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+                    head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+            except exceptions.CommandError as e:
+                # Don't retry if the error is due to timeout,
+                # connection refused, Kubernetes pods not found,
+                # or an in-progress termination.
+                if (e.detailed_reason is not None and
+                    (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+                        e.detailed_reason) or
+                     backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+                         e.detailed_reason) or attempt == max_attempts - 1)):
+                    raise e
+                logger.warning(
+                    f'Failed to open SSH tunnel on port {local_port} '
+                    f'({attempt + 1}/{max_attempts}). '
+                    f'{e.error_msg}\n{e.detailed_reason}')
+                continue
+            tunnel_info = SSHTunnelInfo(port=local_port,
+                                        pid=ssh_tunnel_proc.pid)
+            break
+
         try:
             grpc.channel_ready_future(
                 grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
                     timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
             # Clean up existing tunnel before setting up the new one.
-
-
-
-
+            old_tunnel = self._get_skylet_ssh_tunnel()
+            if old_tunnel is not None:
+                self._cleanup_ssh_tunnel(old_tunnel)
+            self._set_skylet_ssh_tunnel(tunnel_info)
+            return tunnel_info
         except grpc.FutureTimeoutError as e:
             self._cleanup_ssh_tunnel(tunnel_info)
             logger.warning(
@@ -2752,6 +2899,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
         return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # For backwards compatibility. Refer to
+        # https://github.com/skypilot-org/skypilot/pull/7133
+        state.setdefault('skylet_ssh_tunnel', None)
+        return state
+
     def __setstate__(self, state):
         self._version = self._VERSION
 
@@ -2809,6 +2963,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             state['is_grpc_enabled'] = False
             state['skylet_ssh_tunnel'] = None
 
+        if version >= 12:
+            # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+            state.pop('skylet_ssh_tunnel', None)
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
@@ -2886,21 +3044,93 @@ class SkyletClient:
 
     def __init__(self, channel: 'grpc.Channel'):
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+        self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
 
     def set_autostop(
         self,
         request: 'autostopv1_pb2.SetAutostopRequest',
-        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
     ) -> 'autostopv1_pb2.SetAutostopResponse':
         return self._autostop_stub.SetAutostop(request, timeout=timeout)
 
     def is_autostopping(
         self,
         request: 'autostopv1_pb2.IsAutostoppingRequest',
-        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
     ) -> 'autostopv1_pb2.IsAutostoppingResponse':
         return self._autostop_stub.IsAutostopping(request, timeout=timeout)
 
+    def add_job(
+        self,
+        request: 'jobsv1_pb2.AddJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.AddJobResponse':
+        return self._jobs_stub.AddJob(request, timeout=timeout)
+
+    def queue_job(
+        self,
+        request: 'jobsv1_pb2.QueueJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.QueueJobResponse':
+        return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+    def update_status(
+        self,
+        request: 'jobsv1_pb2.UpdateStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.UpdateStatusResponse':
+        return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+    def get_job_queue(
+        self,
+        request: 'jobsv1_pb2.GetJobQueueRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobQueueResponse':
+        return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+    def cancel_jobs(
+        self,
+        request: 'jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.CancelJobsResponse':
+        return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+    def fail_all_in_progress_jobs(
+        self,
+        request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+        return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+    def get_job_status(
+        self,
+        request: 'jobsv1_pb2.GetJobStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobStatusResponse':
+        return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+    def get_job_submitted_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+        return self._jobs_stub.GetJobSubmittedTimestamp(request,
+                                                        timeout=timeout)
+
+    def get_job_ended_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+        return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+    def get_log_dirs_for_jobs(
+        self,
+        request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+        return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
@@ -3115,7 +3345,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     colorama.Style.RESET_ALL +
                     colorama.Style.DIM +
                     'Check concurrent requests: ' +
-                    'sky api status '
+                    'sky api status -v | grep '
+                    f'{cluster_name}'))
 
     def _locked_provision(
         self,
@@ -3406,16 +3637,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # update_status will query the ray job status for all INIT /
             # PENDING / RUNNING jobs for the real status, since we do not
             # know the actual previous status of the cluster.
-            cmd = job_lib.JobLibCodeGen.update_status()
             logger.debug('Update job queue on remote cluster.')
             with rich_utils.safe_status(
                     ux_utils.spinner_message('Preparing SkyPilot runtime')):
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if handle.is_grpc_enabled_with_flag:
+                    try:
+                        request = jobsv1_pb2.UpdateStatusRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel()
+                                                ).update_status(request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.update_status()
+                    returncode, _, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd, 'Failed to update job status.', stderr)
             if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
                 # Safely set all the previous jobs to FAILED since the cluster
                 # is restarted
@@ -3423,14 +3664,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # 1. A job finishes RUNNING, but right before it update itself
                 # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
                 # 2. On next `sky start`, it gets reset to FAILED.
-
-
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if handle.is_grpc_enabled_with_flag:
+                    try:
+                        fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).fail_all_in_progress_jobs(fail_request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+                    returncode, stdout, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd,
+                        'Failed to set previously in-progress jobs to FAILED',
+                        stdout + stderr)
 
             prev_ports = None
             if prev_handle is not None:
@@ -3789,109 +4041,161 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+        file_name = f'sky_job_{job_id}'
+        script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
         if remote_log_dir is None:
             remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
-
+        def _dump_code_to_file(codegen: str,
+                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+            runners = handle.get_command_runners()
+            head_runner = runners[0]
+            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+                fp.write(codegen)
+                fp.flush()
+                script_path = os.path.join(target_dir, file_name)
+                # We choose to sync code + exec, because the alternative of
+                # 'ray submit' may not work as it may use system python
+                # (python2) to execute the script. Happens for AWS.
+                head_runner.rsync(source=fp.name,
+                                  target=script_path,
+                                  up=True,
+                                  stream_logs=False)
 
+        cd = f'cd {SKY_REMOTE_WORKDIR}'
         mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
                       f'touch {remote_log_path}')
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
-            # JOB_CMD_IDENTIFIER is used for identifying the process
-            # with pid is the same driver process.
+            # JOB_CMD_IDENTIFIER is used for identifying the process
+            # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
             f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
            # Do not use &>, which is not POSIX and may not work.
            # Note that the order of ">filename 2>&1" matters.
            f'> {remote_log_path} 2>&1')
-
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
-        def _dump_code_to_file(codegen: str,
-                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
-            runners = handle.get_command_runners()
-            head_runner = runners[0]
-            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
-                fp.write(codegen)
-                fp.flush()
-                script_path = os.path.join(target_dir, f'sky_job_{job_id}')
-                # We choose to sync code + exec, because the alternative of 'ray
-                # submit' may not work as it may use system python (python2) to
-                # execute the script. Happens for AWS.
-                head_runner.rsync(source=fp.name,
-                                  target=script_path,
-                                  up=True,
-                                  stream_logs=False)
-
         # Should also be ealier than _is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+                if managed_job_dag is not None:
+                    workspace = skypilot_config.get_active_workspace(
+                        force_user_workspace=True)
+                    entrypoint = common_utils.get_current_command()
+
+                    managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+                    for task_id, task in enumerate(managed_job_dag.tasks):
+                        resources_str = backend_utils.get_task_resources_str(
+                            task, is_managed_job=True)
+                        managed_job_tasks.append(
+                            jobsv1_pb2.ManagedJobTask(
+                                task_id=task_id,
+                                name=task.name,
+                                resources_str=resources_str,
+                                metadata_json=task.metadata_json))
+
+                    managed_job_info = jobsv1_pb2.ManagedJobInfo(
+                        name=managed_job_dag.name,
+                        pool=managed_job_dag.pool,
+                        workspace=workspace,
+                        entrypoint=entrypoint,
+                        tasks=managed_job_tasks)
+
+                if _is_command_length_over_limit(codegen):
+                    _dump_code_to_file(codegen)
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        # codegen not set - server assumes script uploaded
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+                else:
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        codegen=codegen,
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+
+                backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                    handle.get_grpc_channel()).queue_job(queue_job_request))
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if _is_command_length_over_limit(job_submit_cmd):
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+
+            def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+                if managed_job_dag is not None:
+                    # Add the managed job to job queue database.
+                    managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+                    managed_job_code = managed_job_codegen.set_pending(
+                        job_id,
+                        managed_job_dag,
+                        skypilot_config.get_active_workspace(
+                            force_user_workspace=True),
+                        entrypoint=common_utils.get_current_command())
+                    # Set the managed job to PENDING state to make sure that
+                    # this managed job appears in the `sky jobs queue`, even
+                    # if it needs to wait to be submitted.
+                    # We cannot set the managed job to PENDING state in the
+                    # job template (jobs-controller.yaml.j2), as it may need
+                    # to wait for the run commands to be scheduled on the job
+                    # controller in high-load cases.
+                    job_submit_cmd += ' && ' + managed_job_code
+                return job_submit_cmd
 
-        returncode, stdout, stderr = self.run_on_head(handle,
-                                                      job_submit_cmd,
-                                                      stream_logs=False,
-                                                      require_outputs=True)
-        # Happens when someone calls `sky exec` but remote is outdated for
-        # running a job. Necessitating calling `sky launch`.
-        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
-                                                    handle.cluster_name)
-        output = stdout + stderr
-        if ((returncode == 255 and 'too long' in output.lower()) or
-                (returncode == 1 and 'request-uri too large' in output.lower())):
-            # If the generated script is too long, we retry it with dumping
-            # the script to a file and running it with SSH. We use a general
-            # length limit check before but it could be inaccurate on some
-            # systems.
-            # When there is a cloudflare proxy in front of the remote, it could
-            # cause `414 Request-URI Too Large` error.
-            logger.debug('Failed to submit job due to command length limit. '
-                         'Dumping job to file and running it with SSH. '
-                         f'Output: {output}')
-            _dump_code_to_file(codegen)
-            job_submit_cmd = f'{mkdir_code} && {code}'
             job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+
             returncode, stdout, stderr = self.run_on_head(handle,
                                                           job_submit_cmd,
                                                           stream_logs=False,
                                                           require_outputs=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # running a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            output = stdout + stderr
+            if ((returncode == 255 and 'too long' in output.lower()) or
+                    (returncode == 1 and
+                     'request-uri too large' in output.lower())):
+                # If the generated script is too long, we retry it with dumping
+                # the script to a file and running it with SSH. We use a general
+                # length limit check before but it could be inaccurate on some
+                # systems.
+                # When there is a cloudflare proxy in front of the remote, it
+                # could cause `414 Request-URI Too Large` error.
+                logger.debug(
+                    'Failed to submit job due to command length limit. '
+                    'Dumping job to file and running it with SSH. '
+                    f'Output: {output}')
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+                job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+                returncode, stdout, stderr = self.run_on_head(
+                    handle,
+                    job_submit_cmd,
+                    stream_logs=False,
+                    require_outputs=True)
 
-
-
-
-
+        subprocess_utils.handle_returncode(
+            returncode,
+            job_submit_cmd,
+            f'Failed to submit job {job_id}.',
+            stderr=stdout + stderr)
 
         controller = controller_utils.Controllers.from_name(handle.cluster_name)
         if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -3912,42 +4216,64 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _add_job(self, handle: CloudVmRayResourceHandle,
                  job_name: Optional[str], resources_str: str,
                  metadata: str) -> Tuple[int, str]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.AddJobRequest(
+                    job_name=job_name,
+                    username=common_utils.get_user_hash(),
+                    run_timestamp=self.run_timestamp,
+                    resources_str=resources_str,
+                    metadata=metadata)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+                        request))
+                job_id = response.job_id
+                log_dir = response.log_dir
+                return job_id, log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.add_job(
+                job_name=job_name,
+                username=common_utils.get_user_hash(),
+                run_timestamp=self.run_timestamp,
+                resources_str=resources_str,
+                metadata=metadata)
+            returncode, result_str, stderr = self.run_on_head(
+                handle,
+                code,
+                stream_logs=False,
+                require_outputs=True,
+                separate_stderr=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # adding a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+            # retry for this, after we figure out the reason.
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to fetch job id.',
+                                               stderr)
+            try:
+                job_id_match = _JOB_ID_PATTERN.search(result_str)
+                if job_id_match is not None:
+                    job_id = int(job_id_match.group(1))
+                else:
+                    # For backward compatibility.
+                    job_id = int(result_str)
+                log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+                if log_dir_match is not None:
+                    log_dir = log_dir_match.group(1).strip()
+                else:
+                    # For backward compatibility, use the same log dir as local.
+                    log_dir = self.log_dir
+            except ValueError as e:
+                logger.error(stderr)
+                raise ValueError(f'Failed to parse job id: {result_str}; '
+                                 f'Returncode: {returncode}') from e
         return job_id, log_dir
 
     def _execute(
@@ -4126,6 +4452,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             job_ids: Optional[List[int]] = None,
             stream_logs: bool = True
     ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_job_status(request))
+                statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+                    job_id: job_lib.JobStatus.from_protobuf(proto_status)
+                    for job_id, proto_status in response.job_statuses.items()
+                }
+                return statuses
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+
         code = job_lib.JobLibCodeGen.get_job_status(job_ids)
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       code,
@@ -4146,16 +4486,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+                                                       cancel_all=cancel_all,
+                                                       user_hash=user_hash)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+                        request))
+                cancelled_ids = response.cancelled_job_ids
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+                                                     user_hash)
+            returncode, stdout, _ = self.run_on_head(handle,
+                                                     code,
+                                                     stream_logs=False,
+                                                     require_outputs=True)
+            subprocess_utils.handle_returncode(
+                returncode, code,
+                f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+                stdout)
+            cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4172,20 +4528,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-
-
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
                                                           separate_stderr=True)
-
-
-
-
-
-
-
-
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
 
         job_ids = list(job_to_dir.keys())
         dirs = list(job_to_dir.values())
@@ -4462,11 +4846,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                 exist_ok=True)
                     log_file = os.path.join(local_log_dir, 'run.log')
 
-                    code = managed_jobs.ManagedJobCodeGen.stream_logs(
-
-
-
-
+                    code = managed_jobs.ManagedJobCodeGen.stream_logs(
+                        job_name=None,
+                        job_id=int(job_id),
+                        follow=False,
+                        controller=False)
                     # With the stdin=subprocess.DEVNULL, the ctrl-c will not
                     # kill the process, so we need to handle it manually here.
                     if threading.current_thread() is threading.main_thread():
@@ -4974,9 +5358,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
                     down=down,
                 )
-                backend_utils.invoke_skylet_with_retries(
-                    handle
-                    set_autostop(request))
+                backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                    handle.get_grpc_channel()).set_autostop(request))
             else:
                 code = autostop_lib.AutostopCodeGen.set_autostop(
                     idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5398,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             try:
                 request = autostopv1_pb2.IsAutostoppingRequest()
                 response = backend_utils.invoke_skylet_with_retries(
-
-
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).is_autostopping(request))
                 return response.is_autostopping
             except Exception as e:  # pylint: disable=broad-except
                 # The cluster may have been terminated, causing the gRPC call
|