skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (105)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
@@ -7,9 +7,11 @@ import json
  import math
  import os
  import pathlib
+ import random
  import re
  import shlex
  import signal
+ import socket
  import subprocess
  import sys
  import tempfile
@@ -48,6 +50,7 @@ from sky.provision import common as provision_common
  from sky.provision import instance_setup
  from sky.provision import metadata_utils
  from sky.provision import provisioner
+ from sky.provision.kubernetes import config as config_lib
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
@@ -85,13 +88,22 @@ if typing.TYPE_CHECKING:
  from sky import dag
  from sky.schemas.generated import autostopv1_pb2
  from sky.schemas.generated import autostopv1_pb2_grpc
+ from sky.schemas.generated import jobsv1_pb2
+ from sky.schemas.generated import jobsv1_pb2_grpc
  else:
  # To avoid requiring grpcio to be installed on the client side.
- grpc = adaptors_common.LazyImport('grpc')
+ grpc = adaptors_common.LazyImport(
+ 'grpc',
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+ if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
  autostopv1_pb2 = adaptors_common.LazyImport(
  'sky.schemas.generated.autostopv1_pb2')
  autostopv1_pb2_grpc = adaptors_common.LazyImport(
  'sky.schemas.generated.autostopv1_pb2_grpc')
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+ jobsv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.jobsv1_pb2_grpc')

  Path = str

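Note: the LazyImport change above suppresses gRPC's console logging (see grpc/grpc#37642) by setting GRPC_VERBOSITY=NONE when debug output is not requested. A minimal standalone sketch of the same idea, assuming a plain environment flag in place of env_options.Options.SHOW_DEBUG_INFO:

    import os

    def quiet_grpc_env(show_debug: bool) -> None:
        # Silence gRPC log spam in the console unless debugging was requested.
        if not show_debug:
            os.environ['GRPC_VERBOSITY'] = 'NONE'

    # 'SKYPILOT_DEBUG' is used here only as an illustrative debug flag.
    quiet_grpc_env(show_debug=os.environ.get('SKYPILOT_DEBUG', '0') == '1')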
@@ -218,7 +230,8 @@ def _get_cluster_config_template(cloud):
  clouds.Vast: 'vast-ray.yml.j2',
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
  clouds.Nebius: 'nebius-ray.yml.j2',
- clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+ clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+ clouds.Seeweb: 'seeweb-ray.yml.j2'
  }
  return cloud_to_template[type(cloud)]

@@ -330,6 +343,8 @@ class RayCodeGen:

  SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}

+ CANCELLED_RETURN_CODE = 137
+
  kwargs = dict()
  # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
  # the directory exists for backward compatibility for the VM
@@ -345,8 +360,10 @@ class RayCodeGen:
  def get_or_fail(futures, pg) -> List[int]:
  \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
  if not futures:
- return []
+ return [], []
  returncodes = [1] * len(futures)
+ pids = [None] * len(futures)
+ failed = False
  # Wait for 1 task to be ready.
  ready = []
  # Keep invoking ray.wait if ready is empty. This is because
@@ -355,12 +372,22 @@ class RayCodeGen:
  # before becoming ready.
  # (Such tasks are common in serving jobs.)
  # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+ def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+ nonlocal returncodes, pids, failed
+ for task in tasks:
+ idx = futures.index(task)
+ res = ray.get(task)
+ returncodes[idx] = res['return_code']
+ pids[idx] = res['pid']
+ if res['return_code'] != 0:
+ failed = True
+
  while not ready:
  ready, unready = ray.wait(futures)
- idx = futures.index(ready[0])
- returncodes[idx] = ray.get(ready[0])
+ handle_ready_tasks(ready)
  while unready:
- if returncodes[idx] != 0:
+ if failed:
  for task in unready:
  # ray.cancel without force fails to kill tasks.
  # We use force=True to kill unready tasks.
@@ -368,17 +395,16 @@ class RayCodeGen:
  # Use SIGKILL=128+9 to indicate the task is forcely
  # killed.
  idx = futures.index(task)
- returncodes[idx] = 137
+ returncodes[idx] = CANCELLED_RETURN_CODE
  break
  ready, unready = ray.wait(unready)
- idx = futures.index(ready[0])
- returncodes[idx] = ray.get(ready[0])
+ handle_ready_tasks(ready)
  # Remove the placement group after all tasks are done, so that
  # the next job can be scheduled on the released resources
  # immediately.
  ray_util.remove_placement_group(pg)
  sys.stdout.flush()
- return returncodes
+ return returncodes, pids

  run_fn = None
  futures = []
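Note: the hunks above change get_or_fail from returning a flat list of return codes to returning (returncodes, pids), with each remote task now resolving to a dict holding 'return_code' and 'pid', and force-cancelled tasks reported as CANCELLED_RETURN_CODE (128 + SIGKILL = 137). A dependency-free sketch of that caller-side contract, with plain dicts standing in for Ray task results:

    CANCELLED_RETURN_CODE = 137  # 128 + SIGKILL(9), used for force-cancelled tasks

    def summarize_results(results):
        # Mirror the new (returncodes, pids) return shape of get_or_fail.
        returncodes = [res['return_code'] for res in results]
        pids = [res['pid'] for res in results]
        return returncodes, pids

    returncodes, pids = summarize_results([
        {'return_code': 0, 'pid': 4242},                      # finished task
        {'return_code': CANCELLED_RETURN_CODE, 'pid': 4243},  # cancelled task
    ])
    failed = any(rc not in (0, CANCELLED_RETURN_CODE) for rc in returncodes)
    print(returncodes, pids, failed)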
@@ -394,7 +420,10 @@ class RayCodeGen:
  inspect.getsource(log_lib.make_task_bash_script),
  inspect.getsource(log_lib.add_ray_env_vars),
  inspect.getsource(log_lib.run_bash_command_with_log),
- 'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
+ inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+ 'run_bash_command_with_log = run_bash_command_with_log',
+ 'run_bash_command_with_log_and_return_pid = \
+ ray.remote(run_bash_command_with_log_and_return_pid)',
  ]
  # Currently, the codegen program is/can only be submitted to the head
  # node, due to using job_lib for updating job statuses, and using
@@ -499,7 +528,7 @@ class RayCodeGen:
  total_num_nodes = len(ray.nodes())
  setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
  setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
- setup_workers = [run_bash_command_with_log \\
+ setup_workers = [run_bash_command_with_log_and_return_pid \\
  .options(
  name='setup',
  num_cpus=_SETUP_CPUS,
@@ -514,15 +543,25 @@ class RayCodeGen:
  stream_logs=True,
  with_ray=True,
  ) for i in range(total_num_nodes)]
- setup_returncodes = get_or_fail(setup_workers, setup_pg)
- if sum(setup_returncodes) != 0:
+ setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+ success = True
+ failed_workers_and_returncodes = []
+ for i in range(len(setup_returncodes)):
+ returncode = setup_returncodes[i]
+ pid = setup_pids[i]
+ if pid == None:
+ pid = os.getpid()
+ if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+ success = False
+ failed_workers_and_returncodes.append((pid, returncode))
+ if not success:
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+ msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+ msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
  # This waits for all streaming logs to finish.
  time.sleep(1)
- print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
- 'return code list:{colorama.Style.RESET_ALL}',
- setup_returncodes,
- flush=True)
  # Need this to set the job status in ray job to be FAILED.
  sys.exit(1)
  """)
@@ -695,7 +734,7 @@ class RayCodeGen:

  sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}

- futures.append(run_bash_command_with_log \\
+ futures.append(run_bash_command_with_log_and_return_pid \\
  .options(name=name_str, {options_str}) \\
  .remote(
  script,
@@ -714,7 +753,7 @@ class RayCodeGen:

  self._code += [
  textwrap.dedent(f"""\
- returncodes = get_or_fail(futures, pg)
+ returncodes, _ = get_or_fail(futures, pg)
  if sum(returncodes) != 0:
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
  # Schedule the next pending job immediately to make the job
@@ -1340,6 +1379,34 @@ class RetryingVmProvisioner(object):
  zones = [clouds.Zone(name=to_provision.zone)]
  yield zones

+ def _insufficient_resources_msg(
+ self,
+ to_provision: resources_lib.Resources,
+ requested_resources: Set[resources_lib.Resources],
+ insufficient_resources: Optional[List[str]],
+ ) -> str:
+ insufficent_resource_msg = ('' if insufficient_resources is None else
+ f' ({", ".join(insufficient_resources)})')
+ message = f'Failed to acquire resources{insufficent_resource_msg} '
+ if to_provision.zone is not None:
+ message += (f'in {to_provision.zone} for {requested_resources}. ')
+ elif to_provision.region is not None and to_provision.cloud is not None:
+ # For public clouds, provision.region is always set.
+ if clouds.SSH().is_same_cloud(to_provision.cloud):
+ message += (
+ f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+ f'for {requested_resources}. The SSH Node Pool may not '
+ 'have enough resources.')
+ elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+ message += (f'in context {to_provision.region} for '
+ f'{requested_resources}. ')
+ else:
+ message += (f'in all zones in {to_provision.region} for '
+ f'{requested_resources}. ')
+ else:
+ message += (f'{to_provision.cloud} for {requested_resources}. ')
+ return message
+
  def _retry_zones(
  self,
  to_provision: resources_lib.Resources,
@@ -1418,6 +1485,7 @@ class RetryingVmProvisioner(object):
  f'To request quotas, check the instruction: '
  f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')

+ insufficient_resources = None
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
  prev_cluster_status,
  prev_cluster_ever_up):
@@ -1630,6 +1698,24 @@ class RetryingVmProvisioner(object):
  # No teardown happens for this error.
  with ux_utils.print_exception_no_traceback():
  raise
+ except config_lib.KubernetesError as e:
+ if e.insufficent_resources:
+ insufficient_resources = e.insufficent_resources
+ # NOTE: We try to cleanup the cluster even if the previous
+ # cluster does not exist. Also we are fast at
+ # cleaning up clusters now if there is no existing node.
+ CloudVmRayBackend().post_teardown_cleanup(
+ handle,
+ terminate=not prev_cluster_ever_up,
+ remove_from_db=False,
+ failover=True,
+ )
+ # TODO(suquark): other clouds may have different zone
+ # blocking strategy. See '_update_blocklist_on_error'
+ # for details.
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+ self._blocked_resources, to_provision, region, zones, e)
+ continue
  except Exception as e: # pylint: disable=broad-except
  # NOTE: We try to cleanup the cluster even if the previous
  # cluster does not exist. Also we are fast at
@@ -1760,26 +1846,9 @@ class RetryingVmProvisioner(object):
  terminate=terminate_or_stop,
  remove_from_db=False)

- if to_provision.zone is not None:
- message = (
- f'Failed to acquire resources in {to_provision.zone} for '
- f'{requested_resources}. ')
- elif to_provision.region is not None:
- # For public clouds, provision.region is always set.
- if clouds.SSH().is_same_cloud(to_provision.cloud):
- message = ('Failed to acquire resources in SSH Node Pool '
- f'({to_provision.region.lstrip("ssh-")}) for '
- f'{requested_resources}. The SSH Node Pool may not '
- 'have enough resources.')
- elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
- message = ('Failed to acquire resources in context '
- f'{to_provision.region} for {requested_resources}. ')
- else:
- message = ('Failed to acquire resources in all zones in '
- f'{to_provision.region} for {requested_resources}. ')
- else:
- message = (f'Failed to acquire resources in {to_provision.cloud} '
- f'for {requested_resources}. ')
+ message = self._insufficient_resources_msg(to_provision,
+ requested_resources,
+ insufficient_resources)
  # Do not failover to other locations if the cluster was ever up, since
  # the user can have some data on the cluster.
  raise exceptions.ResourcesUnavailableError(
@@ -2261,8 +2330,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  - (optional) Skylet SSH tunnel info.
  """
  # Bump if any fields get added/removed/changed, and add backward
- # compaitibility logic in __setstate__.
- _VERSION = 11
+ # compatibility logic in __setstate__ and/or __getstate__.
+ _VERSION = 12

  def __init__(
  self,
@@ -2296,7 +2365,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  self.launched_resources = launched_resources
  self.docker_user: Optional[str] = None
  self.is_grpc_enabled = True
- self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None

  def __repr__(self):
  return (f'ResourceHandle('
@@ -2313,8 +2381,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  f'{self.launched_resources}, '
  f'\n\tdocker_user={self.docker_user},'
  f'\n\tssh_user={self.ssh_user},'
- f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
- f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+ f'\n\tis_grpc_enabled={self.is_grpc_enabled},')

  def get_cluster_name(self):
  return self.cluster_name
@@ -2643,11 +2710,74 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  cluster_config_file)
  self.docker_user = docker_user

+ def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+ metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+ self.cluster_name)
+ if metadata is None:
+ return None
+ return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+ def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+ global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+ self.cluster_name,
+ (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
  def get_grpc_channel(self) -> 'grpc.Channel':
- if self.skylet_ssh_tunnel is None:
- self.open_and_update_skylet_tunnel()
- assert self.skylet_ssh_tunnel is not None
- return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
+ # It's fine to not grab the lock here, as we're only reading,
+ # and writes are very rare.
+ # It's acceptable to read while another process is opening a tunnel,
+ # because it will only happen on:
+ # 1. A new cluster who has no tunnel yet, or
+ # 2. A cluster with an unhealthy tunnel
+ # For (2), for processes that read the "stale" tunnel, it will fail
+ # and on the next retry, it will call get_grpc_channel again
+ # and get the new tunnel.
+ tunnel = self._get_skylet_ssh_tunnel()
+ if tunnel is not None:
+ try:
+ # Check if the tunnel is open.
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.settimeout(0.5)
+ s.connect(('localhost', tunnel.port))
+ return grpc.insecure_channel(f'localhost:{tunnel.port}')
+ except socket.error as e:
+ logger.warning(
+ 'Failed to connect to SSH tunnel for cluster '
+ f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+ 'acquiring lock')
+ pass
+ lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+ lock_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+ lock = locks.get_lock(lock_id, lock_timeout)
+ try:
+ with lock.acquire(blocking=True):
+ # Re-read the tunnel from the DB.
+ tunnel = self._get_skylet_ssh_tunnel()
+ if tunnel is None:
+ logger.debug('No SSH tunnel found for cluster '
+ f'{self.cluster_name!r}, '
+ 'opening the tunnel')
+ tunnel = self._open_and_update_skylet_tunnel()
+ return grpc.insecure_channel(f'localhost:{tunnel.port}')
+ try:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.settimeout(0.5)
+ s.connect(('localhost', tunnel.port))
+ return grpc.insecure_channel(f'localhost:{tunnel.port}')
+ except socket.error as e:
+ logger.warning(
+ 'Failed to connect to SSH tunnel for cluster '
+ f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+ 'opening new tunnel')
+ tunnel = self._open_and_update_skylet_tunnel()
+ return grpc.insecure_channel(f'localhost:{tunnel.port}')
+ except locks.LockTimeout as e:
+ raise RuntimeError(
+ 'Failed to get gRPC channel for cluster '
+ f'{self.cluster_name!r} due to a timeout when waiting for the '
+ 'SSH tunnel to be opened. Please try again or manually remove '
+ f'the lock at {lock_id}. '
+ f'{common_utils.format_exception(e)}') from e

  def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
  """Clean up an SSH tunnel by terminating the process."""
@@ -2668,31 +2798,48 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  logger.warning(
  f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')

- def open_and_update_skylet_tunnel(self) -> None:
+ def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
  """Opens an SSH tunnel to the Skylet on the head node,
  updates the cluster handle, and persists it to the database."""
- local_port = common_utils.find_free_port(10000)
- runners = self.get_command_runners()
- head_runner = runners[0]
- if isinstance(head_runner, command_runner.SSHCommandRunner):
- # Disabling ControlMaster makes things easier to reason about
- # with respect to resource management/ownership,
- # as killing the process will close the tunnel too.
- head_runner.disable_control_master = True
-
- cmd = head_runner.port_forward_command([(local_port,
- constants.SKYLET_GRPC_PORT)])
- ssh_tunnel_proc = subprocess.Popen(cmd)
- tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
+ max_attempts = 3
+ # There could be a race condition here, as multiple processes may
+ # attempt to open the same port at the same time.
+ for attempt in range(max_attempts):
+ runners = self.get_command_runners()
+ head_runner = runners[0]
+ local_port = random.randint(10000, 65535)
+ try:
+ ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+ head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+ except exceptions.CommandError as e:
+ # Don't retry if the error is due to timeout,
+ # connection refused, Kubernetes pods not found,
+ # or an in-progress termination.
+ if (e.detailed_reason is not None and
+ (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+ e.detailed_reason) or
+ backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+ e.detailed_reason) or attempt == max_attempts - 1)):
+ raise e
+ logger.warning(
+ f'Failed to open SSH tunnel on port {local_port} '
+ f'({attempt + 1}/{max_attempts}). '
+ f'{e.error_msg}\n{e.detailed_reason}')
+ continue
+ tunnel_info = SSHTunnelInfo(port=local_port,
+ pid=ssh_tunnel_proc.pid)
+ break
+
  try:
  grpc.channel_ready_future(
  grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
  timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
  # Clean up existing tunnel before setting up the new one.
- if self.skylet_ssh_tunnel is not None:
- self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
- self.skylet_ssh_tunnel = tunnel_info
- global_user_state.update_cluster_handle(self.cluster_name, self)
+ old_tunnel = self._get_skylet_ssh_tunnel()
+ if old_tunnel is not None:
+ self._cleanup_ssh_tunnel(old_tunnel)
+ self._set_skylet_ssh_tunnel(tunnel_info)
+ return tunnel_info
  except grpc.FutureTimeoutError as e:
  self._cleanup_ssh_tunnel(tunnel_info)
  logger.warning(
@@ -2752,6 +2899,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  """Returns whether this handle has gRPC enabled and gRPC flag is set."""
  return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled

+ def __getstate__(self):
+ state = self.__dict__.copy()
+ # For backwards compatibility. Refer to
+ # https://github.com/skypilot-org/skypilot/pull/7133
+ state.setdefault('skylet_ssh_tunnel', None)
+ return state
+
  def __setstate__(self, state):
  self._version = self._VERSION

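Note: the new __getstate__ keeps handles pickled by this version loadable by older releases that still expect a skylet_ssh_tunnel attribute, while __setstate__ (next hunk) drops the field for version 12 and later. A self-contained sketch of the same pattern, with illustrative class and field names:

    import pickle

    class Handle:
        _VERSION = 2  # bumped whenever pickled fields change

        def __init__(self):
            self._version = self._VERSION
            self.cluster_name = 'my-cluster'

        def __getstate__(self):
            state = self.__dict__.copy()
            # Older readers still expect 'legacy_field'; give them a default.
            state.setdefault('legacy_field', None)
            return state

        def __setstate__(self, state):
            if state.get('_version', 1) >= 2:
                # Newer readers drop the deprecated field entirely.
                state.pop('legacy_field', None)
            state['_version'] = self._VERSION
            self.__dict__.update(state)

    restored = pickle.loads(pickle.dumps(Handle()))
    print(restored.cluster_name)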
@@ -2809,6 +2963,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  state['is_grpc_enabled'] = False
  state['skylet_ssh_tunnel'] = None

+ if version >= 12:
+ # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+ state.pop('skylet_ssh_tunnel', None)
+
  self.__dict__.update(state)

  # Because the update_cluster_ips and update_ssh_ports
@@ -2886,21 +3044,93 @@ class SkyletClient:

  def __init__(self, channel: 'grpc.Channel'):
  self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)

  def set_autostop(
  self,
  request: 'autostopv1_pb2.SetAutostopRequest',
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
  ) -> 'autostopv1_pb2.SetAutostopResponse':
  return self._autostop_stub.SetAutostop(request, timeout=timeout)

  def is_autostopping(
  self,
  request: 'autostopv1_pb2.IsAutostoppingRequest',
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
  ) -> 'autostopv1_pb2.IsAutostoppingResponse':
  return self._autostop_stub.IsAutostopping(request, timeout=timeout)

+ def add_job(
+ self,
+ request: 'jobsv1_pb2.AddJobRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.AddJobResponse':
+ return self._jobs_stub.AddJob(request, timeout=timeout)
+
+ def queue_job(
+ self,
+ request: 'jobsv1_pb2.QueueJobRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.QueueJobResponse':
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+ def update_status(
+ self,
+ request: 'jobsv1_pb2.UpdateStatusRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+ def get_job_queue(
+ self,
+ request: 'jobsv1_pb2.GetJobQueueRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+ def cancel_jobs(
+ self,
+ request: 'jobsv1_pb2.CancelJobsRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+ def fail_all_in_progress_jobs(
+ self,
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+ def get_job_status(
+ self,
+ request: 'jobsv1_pb2.GetJobStatusRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+ def get_job_submitted_timestamp(
+ self,
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
+ timeout=timeout)
+
+ def get_job_ended_timestamp(
+ self,
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+ def get_log_dirs_for_jobs(
+ self,
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+

  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
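Note: SkyletClient above is a thin wrapper that holds one generated stub per Skylet service and forwards each call with a default timeout. A dependency-free sketch of that shape (the stub below is a stand-in, not the generated jobsv1_pb2_grpc class):

    from typing import Optional

    DEFAULT_TIMEOUT = 10.0  # stands in for constants.SKYLET_GRPC_TIMEOUT_SECONDS

    class FakeJobsStub:
        # Stand-in for jobsv1_pb2_grpc.JobsServiceStub(channel).
        def AddJob(self, request: dict, timeout: Optional[float] = None) -> dict:
            return {'job_id': 1, 'log_dir': '~/sky_logs/sky-job-1'}

    class Client:
        def __init__(self, jobs_stub: FakeJobsStub):
            self._jobs_stub = jobs_stub

        def add_job(self, request: dict,
                    timeout: Optional[float] = DEFAULT_TIMEOUT) -> dict:
            return self._jobs_stub.AddJob(request, timeout=timeout)

    print(Client(FakeJobsStub()).add_job({'job_name': 'train'}))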
@@ -3115,7 +3345,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  colorama.Style.RESET_ALL +
  colorama.Style.DIM +
  'Check concurrent requests: ' +
- 'sky api status '))
+ 'sky api status -v | grep '
+ f'{cluster_name}'))

  def _locked_provision(
  self,
@@ -3406,16 +3637,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # update_status will query the ray job status for all INIT /
  # PENDING / RUNNING jobs for the real status, since we do not
  # know the actual previous status of the cluster.
- cmd = job_lib.JobLibCodeGen.update_status()
  logger.debug('Update job queue on remote cluster.')
  with rich_utils.safe_status(
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
- returncode, _, stderr = self.run_on_head(handle,
- cmd,
- require_outputs=True)
- subprocess_utils.handle_returncode(returncode, cmd,
- 'Failed to update job status.',
- stderr)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.UpdateStatusRequest()
+ backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).update_status(request))
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ cmd = job_lib.JobLibCodeGen.update_status()
+ returncode, _, stderr = self.run_on_head(
+ handle, cmd, require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, cmd, 'Failed to update job status.', stderr)
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
  # Safely set all the previous jobs to FAILED since the cluster
  # is restarted
@@ -3423,14 +3664,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # 1. A job finishes RUNNING, but right before it update itself
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
  # 2. On next `sky start`, it gets reset to FAILED.
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
- returncode, stdout, stderr = self.run_on_head(handle,
- cmd,
- require_outputs=True)
- subprocess_utils.handle_returncode(
- returncode, cmd,
- 'Failed to set previously in-progress jobs to FAILED',
- stdout + stderr)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+ backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel(
+ )).fail_all_in_progress_jobs(fail_request))
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+ returncode, stdout, stderr = self.run_on_head(
+ handle, cmd, require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, cmd,
+ 'Failed to set previously in-progress jobs to FAILED',
+ stdout + stderr)

  prev_ports = None
  if prev_handle is not None:
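Note: the pattern in the two hunks above, repeated through the rest of this file, is: if the handle has gRPC enabled, try the Skylet RPC first; if the remote Skylet predates the method, fall back to the legacy codegen-over-SSH path. A dependency-free sketch of that control flow, with stand-in names for the exception and the two call paths:

    class MethodNotImplemented(Exception):
        # Stand-in for exceptions.SkyletMethodNotImplementedError.
        pass

    def update_status(grpc_enabled: bool, grpc_call, legacy_call):
        use_legacy = not grpc_enabled
        if grpc_enabled:
            try:
                return grpc_call()
            except MethodNotImplemented:
                use_legacy = True  # remote Skylet is older than this client
        if use_legacy:
            return legacy_call()

    def old_skylet_call():
        raise MethodNotImplemented()

    # Older remote: the gRPC attempt raises, so the legacy path runs.
    print(update_status(True, old_skylet_call, lambda: 'ran legacy codegen'))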
@@ -3789,109 +4041,161 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  remote_log_dir: Optional[str] = None,
  ) -> None:
  """Executes generated code on the head node."""
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
+ use_legacy = not handle.is_grpc_enabled_with_flag
+ file_name = f'sky_job_{job_id}'
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
  if remote_log_dir is None:
  remote_log_dir = self.log_dir
  remote_log_path = os.path.join(remote_log_dir, 'run.log')

- cd = f'cd {SKY_REMOTE_WORKDIR}'
+ def _dump_code_to_file(codegen: str,
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+ runners = handle.get_command_runners()
+ head_runner = runners[0]
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+ fp.write(codegen)
+ fp.flush()
+ script_path = os.path.join(target_dir, file_name)
+ # We choose to sync code + exec, because the alternative of
+ # 'ray submit' may not work as it may use system python
+ # (python2) to execute the script. Happens for AWS.
+ head_runner.rsync(source=fp.name,
+ target=script_path,
+ up=True,
+ stream_logs=False)

+ cd = f'cd {SKY_REMOTE_WORKDIR}'
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
  f'touch {remote_log_path}')
  encoded_script = shlex.quote(codegen)
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
  job_submit_cmd = (
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
- # with pid is the same driver process.
+ # JOB_CMD_IDENTIFIER is used for identifying the process
+ # retrieved with pid is the same driver process.
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
  # Do not use &>, which is not POSIX and may not work.
  # Note that the order of ">filename 2>&1" matters.
  f'> {remote_log_path} 2>&1')
-
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

- def _dump_code_to_file(codegen: str,
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
- runners = handle.get_command_runners()
- head_runner = runners[0]
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
- fp.write(codegen)
- fp.flush()
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
- # We choose to sync code + exec, because the alternative of 'ray
- # submit' may not work as it may use system python (python2) to
- # execute the script. Happens for AWS.
- head_runner.rsync(source=fp.name,
- target=script_path,
- up=True,
- stream_logs=False)
-
  # Should also be ealier than _is_command_length_over_limit
  # Same reason as in _setup
  if self._dump_final_script:
  _dump_code_to_file(job_submit_cmd,
  constants.PERSISTENT_RUN_SCRIPT_DIR)

- if _is_command_length_over_limit(job_submit_cmd):
- _dump_code_to_file(codegen)
- job_submit_cmd = f'{mkdir_code} && {code}'
-
- def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
- if managed_job_dag is not None:
- # Add the managed job to job queue database.
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
- managed_job_code = managed_job_codegen.set_pending(
- job_id,
- managed_job_dag,
- skypilot_config.get_active_workspace(
- force_user_workspace=True),
- entrypoint=common_utils.get_current_command())
- # Set the managed job to PENDING state to make sure that this
- # managed job appears in the `sky jobs queue`, even if it needs
- # to wait to be submitted.
- # We cannot set the managed job to PENDING state in the job
- # template (jobs-controller.yaml.j2), as it may need to wait for
- # the run commands to be scheduled on the job controller in
- # high-load cases.
- job_submit_cmd += ' && ' + managed_job_code
- return job_submit_cmd
-
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+ if managed_job_dag is not None:
+ workspace = skypilot_config.get_active_workspace(
+ force_user_workspace=True)
+ entrypoint = common_utils.get_current_command()
+
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+ for task_id, task in enumerate(managed_job_dag.tasks):
+ resources_str = backend_utils.get_task_resources_str(
+ task, is_managed_job=True)
+ managed_job_tasks.append(
+ jobsv1_pb2.ManagedJobTask(
+ task_id=task_id,
+ name=task.name,
+ resources_str=resources_str,
+ metadata_json=task.metadata_json))
+
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
+ name=managed_job_dag.name,
+ pool=managed_job_dag.pool,
+ workspace=workspace,
+ entrypoint=entrypoint,
+ tasks=managed_job_tasks)
+
+ if _is_command_length_over_limit(codegen):
+ _dump_code_to_file(codegen)
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
+ job_id=job_id,
+ # codegen not set - server assumes script uploaded
+ remote_log_dir=remote_log_dir,
+ managed_job=managed_job_info,
+ script_path=script_path)
+ else:
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
+ job_id=job_id,
+ codegen=codegen,
+ remote_log_dir=remote_log_dir,
+ managed_job=managed_job_info,
+ script_path=script_path)
+
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+ handle.get_grpc_channel()).queue_job(queue_job_request))
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ if _is_command_length_over_limit(job_submit_cmd):
+ _dump_code_to_file(codegen)
+ job_submit_cmd = f'{mkdir_code} && {code}'
+
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+ if managed_job_dag is not None:
+ # Add the managed job to job queue database.
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+ managed_job_code = managed_job_codegen.set_pending(
+ job_id,
+ managed_job_dag,
+ skypilot_config.get_active_workspace(
+ force_user_workspace=True),
+ entrypoint=common_utils.get_current_command())
+ # Set the managed job to PENDING state to make sure that
+ # this managed job appears in the `sky jobs queue`, even
+ # if it needs to wait to be submitted.
+ # We cannot set the managed job to PENDING state in the
+ # job template (jobs-controller.yaml.j2), as it may need
+ # to wait for the run commands to be scheduled on the job
+ # controller in high-load cases.
+ job_submit_cmd += ' && ' + managed_job_code
+ return job_submit_cmd

- returncode, stdout, stderr = self.run_on_head(handle,
- job_submit_cmd,
- stream_logs=False,
- require_outputs=True)
- # Happens when someone calls `sky exec` but remote is outdated for
- # running a job. Necessitating calling `sky launch`.
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
- handle.cluster_name)
- output = stdout + stderr
- if ((returncode == 255 and 'too long' in output.lower()) or
- (returncode == 1 and 'request-uri too large' in output.lower())):
- # If the generated script is too long, we retry it with dumping
- # the script to a file and running it with SSH. We use a general
- # length limit check before but it could be inaccurate on some
- # systems.
- # When there is a cloudflare proxy in front of the remote, it could
- # cause `414 Request-URI Too Large` error.
- logger.debug('Failed to submit job due to command length limit. '
- 'Dumping job to file and running it with SSH. '
- f'Output: {output}')
- _dump_code_to_file(codegen)
- job_submit_cmd = f'{mkdir_code} && {code}'
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+
  returncode, stdout, stderr = self.run_on_head(handle,
  job_submit_cmd,
  stream_logs=False,
  require_outputs=True)
+ # Happens when someone calls `sky exec` but remote is outdated for
+ # running a job. Necessitating calling `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
+ output = stdout + stderr
+ if ((returncode == 255 and 'too long' in output.lower()) or
+ (returncode == 1 and
+ 'request-uri too large' in output.lower())):
+ # If the generated script is too long, we retry it with dumping
+ # the script to a file and running it with SSH. We use a general
+ # length limit check before but it could be inaccurate on some
+ # systems.
+ # When there is a cloudflare proxy in front of the remote, it
+ # could cause `414 Request-URI Too Large` error.
+ logger.debug(
+ 'Failed to submit job due to command length limit. '
+ 'Dumping job to file and running it with SSH. '
+ f'Output: {output}')
+ _dump_code_to_file(codegen)
+ job_submit_cmd = f'{mkdir_code} && {code}'
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+ returncode, stdout, stderr = self.run_on_head(
+ handle,
+ job_submit_cmd,
+ stream_logs=False,
+ require_outputs=True)

- subprocess_utils.handle_returncode(returncode,
- job_submit_cmd,
- f'Failed to submit job {job_id}.',
- stderr=stdout + stderr)
+ subprocess_utils.handle_returncode(
+ returncode,
+ job_submit_cmd,
+ f'Failed to submit job {job_id}.',
+ stderr=stdout + stderr)

  controller = controller_utils.Controllers.from_name(handle.cluster_name)
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -3912,42 +4216,64 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  def _add_job(self, handle: CloudVmRayResourceHandle,
  job_name: Optional[str], resources_str: str,
  metadata: str) -> Tuple[int, str]:
- code = job_lib.JobLibCodeGen.add_job(
- job_name=job_name,
- username=common_utils.get_user_hash(),
- run_timestamp=self.run_timestamp,
- resources_str=resources_str,
- metadata=metadata)
- returncode, result_str, stderr = self.run_on_head(handle,
- code,
- stream_logs=False,
- require_outputs=True,
- separate_stderr=True)
- # Happens when someone calls `sky exec` but remote is outdated for
- # adding a job. Necessitating calling `sky launch`.
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
- handle.cluster_name)
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
- # retry for this, after we figure out the reason.
- subprocess_utils.handle_returncode(returncode, code,
- 'Failed to fetch job id.', stderr)
- try:
- job_id_match = _JOB_ID_PATTERN.search(result_str)
- if job_id_match is not None:
- job_id = int(job_id_match.group(1))
- else:
- # For backward compatibility.
- job_id = int(result_str)
- log_dir_match = _LOG_DIR_PATTERN.search(result_str)
- if log_dir_match is not None:
- log_dir = log_dir_match.group(1).strip()
- else:
- # For backward compatibility, use the same log dir as local.
- log_dir = self.log_dir
- except ValueError as e:
- logger.error(stderr)
- raise ValueError(f'Failed to parse job id: {result_str}; '
- f'Returncode: {returncode}') from e
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.AddJobRequest(
+ job_name=job_name,
+ username=common_utils.get_user_hash(),
+ run_timestamp=self.run_timestamp,
+ resources_str=resources_str,
+ metadata=metadata)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+ request))
+ job_id = response.job_id
+ log_dir = response.log_dir
+ return job_id, log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.add_job(
+ job_name=job_name,
+ username=common_utils.get_user_hash(),
+ run_timestamp=self.run_timestamp,
+ resources_str=resources_str,
+ metadata=metadata)
+ returncode, result_str, stderr = self.run_on_head(
+ handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
+ # Happens when someone calls `sky exec` but remote is outdated for
+ # adding a job. Necessitating calling `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+ # retry for this, after we figure out the reason.
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to fetch job id.',
+ stderr)
+ try:
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
+ if job_id_match is not None:
+ job_id = int(job_id_match.group(1))
+ else:
+ # For backward compatibility.
+ job_id = int(result_str)
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+ if log_dir_match is not None:
+ log_dir = log_dir_match.group(1).strip()
+ else:
+ # For backward compatibility, use the same log dir as local.
+ log_dir = self.log_dir
+ except ValueError as e:
+ logger.error(stderr)
+ raise ValueError(f'Failed to parse job id: {result_str}; '
+ f'Returncode: {returncode}') from e
  return job_id, log_dir

  def _execute(
@@ -4126,6 +4452,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  job_ids: Optional[List[int]] = None,
  stream_logs: bool = True
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).get_job_status(request))
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
+ for job_id, proto_status in response.job_statuses.items()
+ }
+ return statuses
+ except exceptions.SkyletMethodNotImplementedError:
+ pass
+
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
  returncode, stdout, stderr = self.run_on_head(handle,
  code,
@@ -4146,16 +4486,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
  """
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
- returncode, stdout, _ = self.run_on_head(handle,
- code,
- stream_logs=False,
- require_outputs=True)
- subprocess_utils.handle_returncode(
- returncode, code,
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
-
- cancelled_ids = message_utils.decode_payload(stdout)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+ cancel_all=cancel_all,
+ user_hash=user_hash)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+ request))
+ cancelled_ids = response.cancelled_job_ids
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+ user_hash)
+ returncode, stdout, _ = self.run_on_head(handle,
+ code,
+ stream_logs=False,
+ require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, code,
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+ stdout)
+ cancelled_ids = message_utils.decode_payload(stdout)
  if cancelled_ids:
  logger.info(
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4172,20 +4528,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Returns:
  A dictionary mapping job_id to log path.
  """
- code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
- returncode, job_to_dir, stderr = self.run_on_head(handle,
+ job_to_dir: Dict[str, str] = {}
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ int_job_ids = []
+ if job_ids:
+ for str_job_id in job_ids:
+ if str_job_id.isdigit():
+ int_job_ids.append(int(str_job_id))
+ request = jobsv1_pb2.GetLogDirsForJobsRequest(
+ job_ids=int_job_ids)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).get_log_dirs_for_jobs(request))
+ job_log_dirs = response.job_log_dirs
+ if not job_log_dirs:
+ logger.info(f'{colorama.Fore.YELLOW}'
+ 'No matching log directories found'
+ f'{colorama.Style.RESET_ALL}')
+ return {}
+ for job_id, log_dir in job_log_dirs.items():
+ # Convert to string for backwards compatibility
+ job_to_dir[str(job_id)] = log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+ returncode, stdout, stderr = self.run_on_head(handle,
  code,
  stream_logs=False,
  require_outputs=True,
  separate_stderr=True)
- subprocess_utils.handle_returncode(returncode, code,
- 'Failed to sync logs.', stderr)
- job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
- if not job_to_dir:
- logger.info(f'{colorama.Fore.YELLOW}'
- 'No matching log directories found'
- f'{colorama.Style.RESET_ALL}')
- return {}
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to sync logs.', stderr)
+ job_to_dir = message_utils.decode_payload(stdout)
+ if not job_to_dir:
+ logger.info(f'{colorama.Fore.YELLOW}'
+ 'No matching log directories found'
+ f'{colorama.Style.RESET_ALL}')
+ return {}

  job_ids = list(job_to_dir.keys())
  dirs = list(job_to_dir.values())
@@ -4462,11 +4846,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  exist_ok=True)
  log_file = os.path.join(local_log_dir, 'run.log')

- code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
- job_id=job_id,
- follow=False,
- controller=False)
-
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(
+ job_name=None,
+ job_id=int(job_id),
+ follow=False,
+ controller=False)
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not
  # kill the process, so we need to handle it manually here.
  if threading.current_thread() is threading.main_thread():
@@ -4974,9 +5358,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
  down=down,
  )
- backend_utils.invoke_skylet_with_retries(
- handle, lambda: SkyletClient(handle.get_grpc_channel()).
- set_autostop(request))
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+ handle.get_grpc_channel()).set_autostop(request))
  else:
  code = autostop_lib.AutostopCodeGen.set_autostop(
  idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5398,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  try:
  request = autostopv1_pb2.IsAutostoppingRequest()
  response = backend_utils.invoke_skylet_with_retries(
- handle, lambda: SkyletClient(handle.get_grpc_channel()).
- is_autostopping(request))
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).is_autostopping(request))
  return response.is_autostopping
  except Exception as e: # pylint: disable=broad-except
  # The cluster may have been terminated, causing the gRPC call