skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (151)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/catalog/kubernetes_catalog.py +8 -0
  4. sky/catalog/nebius_catalog.py +0 -1
  5. sky/check.py +11 -1
  6. sky/client/cli/command.py +234 -100
  7. sky/client/sdk.py +30 -9
  8. sky/client/sdk_async.py +815 -0
  9. sky/clouds/kubernetes.py +6 -1
  10. sky/clouds/nebius.py +1 -4
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3698-7874720877646365.js → 3850-ff4a9a69d978632b.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  26. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9025.7937c16bc8623516.js → 9025.a1bef12d672bb66d.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  30. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-1e6de35d15a8d432.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{config-8620d099cbef8608.js → config-dfb9bf07b13045f4.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  49. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  50. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  51. sky/dashboard/out/clusters/[cluster].html +1 -1
  52. sky/dashboard/out/clusters.html +1 -1
  53. sky/dashboard/out/config.html +1 -1
  54. sky/dashboard/out/index.html +1 -1
  55. sky/dashboard/out/infra/[context].html +1 -1
  56. sky/dashboard/out/infra.html +1 -1
  57. sky/dashboard/out/jobs/[job].html +1 -1
  58. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -1
  61. sky/dashboard/out/volumes.html +1 -1
  62. sky/dashboard/out/workspace/new.html +1 -1
  63. sky/dashboard/out/workspaces/[name].html +1 -1
  64. sky/dashboard/out/workspaces.html +1 -1
  65. sky/global_user_state.py +14 -2
  66. sky/jobs/__init__.py +2 -0
  67. sky/jobs/client/sdk.py +43 -2
  68. sky/jobs/client/sdk_async.py +135 -0
  69. sky/jobs/server/core.py +48 -1
  70. sky/jobs/server/server.py +52 -3
  71. sky/jobs/state.py +5 -1
  72. sky/jobs/utils.py +3 -1
  73. sky/provision/kubernetes/utils.py +30 -4
  74. sky/provision/nebius/instance.py +1 -0
  75. sky/provision/nebius/utils.py +9 -1
  76. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  77. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  78. sky/serve/client/impl.py +85 -1
  79. sky/serve/client/sdk.py +16 -47
  80. sky/serve/client/sdk_async.py +130 -0
  81. sky/serve/constants.py +3 -1
  82. sky/serve/controller.py +6 -3
  83. sky/serve/load_balancer.py +3 -1
  84. sky/serve/serve_state.py +93 -5
  85. sky/serve/serve_utils.py +200 -67
  86. sky/serve/server/core.py +13 -197
  87. sky/serve/server/impl.py +261 -23
  88. sky/serve/service.py +15 -3
  89. sky/server/auth/__init__.py +0 -0
  90. sky/server/auth/authn.py +46 -0
  91. sky/server/auth/oauth2_proxy.py +185 -0
  92. sky/server/common.py +119 -21
  93. sky/server/constants.py +1 -1
  94. sky/server/daemons.py +60 -11
  95. sky/server/requests/executor.py +5 -3
  96. sky/server/requests/payloads.py +19 -0
  97. sky/server/rest.py +114 -0
  98. sky/server/server.py +44 -40
  99. sky/setup_files/dependencies.py +2 -0
  100. sky/skylet/constants.py +1 -1
  101. sky/skylet/events.py +5 -1
  102. sky/skylet/skylet.py +3 -1
  103. sky/task.py +61 -21
  104. sky/templates/kubernetes-ray.yml.j2 +9 -0
  105. sky/templates/nebius-ray.yml.j2 +1 -0
  106. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  107. sky/usage/usage_lib.py +8 -6
  108. sky/utils/annotations.py +8 -3
  109. sky/utils/common_utils.py +11 -1
  110. sky/utils/controller_utils.py +7 -0
  111. sky/utils/db/migration_utils.py +2 -2
  112. sky/utils/rich_utils.py +120 -0
  113. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +22 -13
  114. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +120 -112
  115. sky/client/sdk.pyi +0 -300
  116. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  119. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  126. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  138. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  145. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  146. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  147. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-85426374db04811e.js} +0 -0
  148. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  149. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  150. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  151. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '641a8e762a1d86274420f7a9d9cea6657b5be5b4'
+_SKYPILOT_COMMIT_SHA = 'a167cba8230b0ffda6baa0c825fa0eb5d5ab4aa4'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250804'
+__version__ = '1.0.0.dev20250807'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -168,6 +168,9 @@ _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
 _RESOURCES_UNAVAILABLE_LOG = (
     'Reasons for provision failures (for details, please check the log above):')
 
+# Number of seconds to wait locking the cluster before communicating with user.
+_CLUSTER_LOCK_TIMEOUT = 5.0
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -2917,10 +2920,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # exceptions.ClusterOwnerIdentityMismatchError
         backend_utils.check_owner_identity(cluster_name)
         lock_id = backend_utils.cluster_status_lock_id(cluster_name)
-        with timeline.DistributedLockEvent(lock_id):
-            # Try to launch the exiting cluster first. If no existing cluster,
-            # this function will create a to_provision_config with required
-            # resources.
+        communicated_with_user = False
+
+        while True:
+            try:
+                return self._locked_provision(lock_id, task, to_provision,
+                                              dryrun, stream_logs, cluster_name,
+                                              retry_until_up,
+                                              skip_unnecessary_provisioning)
+            except locks.LockTimeout:
+                if not communicated_with_user:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                f'Launching delayed, check concurrent tasks: '
+                                f'sky api status')
+                    communicated_with_user = True
+
+    def _locked_provision(
+        self,
+        lock_id: str,
+        task: task_lib.Task,
+        to_provision: Optional[resources_lib.Resources],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: str,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
+    ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
+        with timeline.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Try to launch the exiting cluster first. If no existing
+            # cluster, this function will create a to_provision_config
+            # with required resources.
             to_provision_config = self._check_existing_cluster(
                 task, to_provision, cluster_name, dryrun)
            assert to_provision_config.resources is not None, (
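
The hunk above replaces an unbounded lock wait with a bounded acquire-and-retry loop that warns at most once when the cluster lock is contended. A minimal sketch of that pattern, with an in-process threading.Lock standing in for SkyPilot's distributed lock and a hand-rolled LockTimeout in place of the locks module:

```python
import threading

_CLUSTER_LOCK_TIMEOUT = 5.0  # seconds, mirroring the new constant
_lock = threading.Lock()     # stand-in for the distributed cluster lock


class LockTimeout(Exception):
    """Raised when the lock is not acquired within the timeout."""


def _locked_provision() -> str:
    # Bounded wait: give up quickly so the caller can inform the user.
    if not _lock.acquire(timeout=_CLUSTER_LOCK_TIMEOUT):
        raise LockTimeout()
    try:
        return 'provisioned'
    finally:
        _lock.release()


def provision() -> str:
    communicated_with_user = False
    while True:
        try:
            return _locked_provision()
        except LockTimeout:
            if not communicated_with_user:
                print('Launching delayed, check concurrent tasks: '
                      'sky api status')
                communicated_with_user = True
```

Failing the acquire after a few seconds is what lets the loop surface the "launching delayed" hint instead of blocking silently on a held lock.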
sky/catalog/kubernetes_catalog.py CHANGED
@@ -255,6 +255,14 @@ def _list_accelerators(
                 # Get all the pods running on the node
                 if (pod.spec.node_name == node.metadata.name and
                         pod.status.phase in ['Running', 'Pending']):
+                    # Skip pods that should not count against GPU count
+                    if (kubernetes_utils.
+                            should_exclude_pod_from_gpu_allocation(pod)):
+                        logger.debug(
+                            f'Excluding pod '
+                            f'{pod.metadata.name} from GPU count '
+                            f'calculations on node {node.metadata.name}')
+                        continue
                     # Iterate over all the containers in the pod and sum
                     # the GPU requests
                     for container in pod.spec.containers:
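
This hunk adds an exclusion check while summing per-node GPU requests. A simplified sketch of the counting logic, using plain dicts and a hypothetical predicate in place of the Kubernetes client objects and kubernetes_utils.should_exclude_pod_from_gpu_allocation:

```python
from typing import Callable, Dict, List


def allocated_gpus(pods: List[Dict], node_name: str,
                   should_exclude: Callable[[Dict], bool]) -> int:
    """Sums GPU requests of Running/Pending pods on one node."""
    total = 0
    for pod in pods:
        if pod['node'] != node_name or pod['phase'] not in ('Running',
                                                            'Pending'):
            continue
        if should_exclude(pod):
            # Pods matched by the predicate no longer inflate the
            # allocated-GPU total for the node.
            continue
        for container in pod['containers']:
            total += int(container.get('nvidia.com/gpu', 0))
    return total


pods = [{'node': 'n1', 'phase': 'Running',
         'containers': [{'nvidia.com/gpu': 2}]}]
assert allocated_gpus(pods, 'n1', lambda p: False) == 2
assert allocated_gpus(pods, 'n1', lambda p: True) == 0
```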
sky/catalog/nebius_catalog.py CHANGED
@@ -38,7 +38,6 @@ def get_hourly_cost(instance_type: str,
                     region: Optional[str] = None,
                     zone: Optional[str] = None) -> float:
     """Returns the cost, or the cheapest cost among all zones for spot."""
-    assert not use_spot, 'Nebius does not support spot.'
     if zone is not None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Nebius does not support zones.')
sky/check.py CHANGED
@@ -467,8 +467,18 @@ def _print_checked_cloud(
         if ok:
             enabled_capabilities.append(capability)
         # `dict` reasons for K8s and SSH will be printed in detail in
-        # _format_enabled_cloud. Skip here.
+        # _format_enabled_cloud. Skip here unless the cloud is disabled.
         if not isinstance(reason, str):
+            if not ok and isinstance(cloud_tuple[1],
+                                     (sky_clouds.SSH, sky_clouds.Kubernetes)):
+                if reason is not None:
+                    reason_str = _format_context_details(cloud_tuple[1],
+                                                         show_details=True,
+                                                         ctx2text=reason)
+                    reason_str = '\n'.join(
+                        ' ' + line for line in reason_str.splitlines())
+                    reasons_to_capabilities.setdefault(reason_str,
+                                                       []).append(capability)
             continue
         if ok:
             if reason is not None:
sky/client/cli/command.py CHANGED
@@ -556,17 +556,17 @@ def _parse_override_params(
     """Parses the override parameters into a dictionary."""
     override_params: Dict[str, Any] = {}
     if cloud is not None:
-        if cloud.lower() == 'none':
+        if cloud.lower() == 'none' or cloud == '*':
             override_params['cloud'] = None
         else:
             override_params['cloud'] = registry.CLOUD_REGISTRY.from_str(cloud)
     if region is not None:
-        if region.lower() == 'none':
+        if region.lower() == 'none' or region == '*':
             override_params['region'] = None
         else:
             override_params['region'] = region
     if zone is not None:
-        if zone.lower() == 'none':
+        if zone.lower() == 'none' or zone == '*':
             override_params['zone'] = None
         else:
             override_params['zone'] = zone
@@ -964,9 +964,10 @@ def _handle_infra_cloud_region_zone_options(infra: Optional[str],
 
     if infra is not None:
         infra_info = infra_utils.InfraInfo.from_str(infra)
-        cloud = infra_info.cloud
-        region = infra_info.region
-        zone = infra_info.zone
+        # Convert None to '*' to ensure proper override behavior
+        cloud = infra_info.cloud if infra_info.cloud is not None else '*'
+        region = infra_info.region if infra_info.region is not None else '*'
+        zone = infra_info.zone if infra_info.zone is not None else '*'
     return cloud, region, zone
 
 
@@ -3462,6 +3463,17 @@ def show_gpus(
                                                 region,
                                                 zone=None)
 
+    # cloud and region could be '*' from _handle_infra_cloud_region_zone_options
+    # which normally indicates to
+    # _make_task_or_dag_from_entrypoint_with_overrides -> _parse_override_params
+    # to disregard the cloud and region from the YAML.
+    # In show_gpus, there is no YAML, so we need to handle the '*' value
+    # directly here. We should use None instead to indicate "any".
+    if cloud == '*':
+        cloud = None
+    if region == '*':
+        region = None
+
     # validation for the --region flag
     if region is not None and cloud is None:
         raise click.UsageError(
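
Together with the _parse_override_params hunk above, these changes establish a small convention: an absent CLI flag means "keep the YAML value", while 'none' or the new '*' placeholder means "clear it". A self-contained sketch of that convention (the helper name here is illustrative, not the CLI's actual function):

```python
from typing import Any, Dict, Optional


def parse_cloud_override(cloud: Optional[str]) -> Dict[str, Any]:
    if cloud is None:
        return {}                  # flag not given: keep the YAML value
    if cloud.lower() == 'none' or cloud == '*':
        return {'cloud': None}     # wildcard: disregard the YAML value
    return {'cloud': cloud}        # explicit override


assert parse_cloud_override(None) == {}
assert parse_cloud_override('*') == {'cloud': None}
assert parse_cloud_override('aws') == {'cloud': 'aws'}
```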
@@ -3502,7 +3514,14 @@ def show_gpus(
                     (cloud_name is None or cloud_is_ssh))
 
     def _list_to_str(lst):
-        return ', '.join([str(e) for e in lst])
+
+        def format_number(n):
+            # If it's a float that's a whole number, display as int
+            if isinstance(n, float) and n.is_integer():
+                return str(int(n))
+            return str(n)
+
+        return ', '.join([format_number(n) for n in lst])
 
     # TODO(zhwu,romilb): We should move most of these kubernetes related
     # queries into the backend, especially behind the server.
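
The behavior of the new format_number helper on sample accelerator counts, shown as a quick check (illustrative only):

```python
def format_number(n):
    # Whole-number floats display as ints: 1.0 -> '1', but 2.5 -> '2.5'.
    if isinstance(n, float) and n.is_integer():
        return str(int(n))
    return str(n)


assert ', '.join(format_number(n) for n in [1.0, 2.5, 8]) == '1, 2.5, 8'
```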
@@ -4953,6 +4972,205 @@ def jobs_pool_down(
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
 
 
+def _handle_serve_logs(
+    service_name: str,
+    follow: bool,
+    controller: bool,
+    load_balancer: bool,
+    replica_ids: Tuple[int, ...],
+    sync_down: bool,
+    tail: Optional[int],
+    pool: bool,  # pylint: disable=redefined-outer-name
+):
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    repnoun = 'worker' if pool else 'replica'
+    if tail is not None:
+        if tail < 0:
+            raise click.UsageError('--tail must be a non-negative integer.')
+        # TODO(arda): We could add ability to tail and follow logs together.
+        if follow:
+            follow = False
+            logger.warning(
+                f'{colorama.Fore.YELLOW}'
+                '--tail and --follow cannot be used together. '
+                f'Changed the mode to --no-follow.{colorama.Style.RESET_ALL}')
+
+    chosen_components: Set[serve_lib.ServiceComponent] = set()
+    if controller:
+        chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
+    if load_balancer:
+        assert not pool, 'Load balancer is not supported for pools.'
+        chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
+    # replica_ids contains the specific replica IDs provided by the user.
+    # If it's not empty, it implies the user wants replica logs.
+    if replica_ids:
+        chosen_components.add(serve_lib.ServiceComponent.REPLICA)
+
+    if sync_down:
+        # For sync-down, multiple targets are allowed.
+        # If no specific components/replicas are mentioned, sync all.
+        # Note: Multiple replicas or targets can only be specified when
+        # using --sync-down.
+        targets_to_sync = list(chosen_components)
+        if not targets_to_sync and not replica_ids:
+            # Default to all components if nothing specific is requested
+            targets_to_sync = [
+                serve_lib.ServiceComponent.CONTROLLER,
+                serve_lib.ServiceComponent.REPLICA,
+            ]
+            if not pool:
+                targets_to_sync.append(serve_lib.ServiceComponent.LOAD_BALANCER)
+
+        timestamp = sky_logging.get_run_timestamp()
+        log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / noun /
+                   f'{service_name}_{timestamp}').expanduser()
+        log_dir.mkdir(parents=True, exist_ok=True)
+
+        with rich_utils.client_status(
+                ux_utils.spinner_message(f'Downloading {noun} logs...')):
+            if pool:
+                managed_jobs.pool_sync_down_logs(service_name,
+                                                 str(log_dir),
+                                                 targets=targets_to_sync,
+                                                 worker_ids=list(replica_ids),
+                                                 tail=tail)
+            else:
+                serve_lib.sync_down_logs(service_name,
+                                         str(log_dir),
+                                         targets=targets_to_sync,
+                                         replica_ids=list(replica_ids),
+                                         tail=tail)
+        style = colorama.Style
+        fore = colorama.Fore
+        logger.info(f'{fore.CYAN}{capnoun} {service_name} logs: '
+                    f'{log_dir}{style.RESET_ALL}')
+        return
+
+    # Tailing requires exactly one target.
+    num_targets = len(chosen_components)
+    # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
+    if serve_lib.ServiceComponent.REPLICA in chosen_components:
+        if len(replica_ids) != 1:
+            raise click.UsageError(
+                f'Can only tail logs from a single {repnoun} at a time. '
+                f'Provide exactly one {repnoun.upper()}_ID or use --sync-down '
+                f'to download logs from multiple {repnoun}s.')
+        # If replica is chosen and len is 1, num_targets effectively counts it.
+        # We need to ensure no other component (controller/LB) is selected.
+        if num_targets > 1:
+            raise click.UsageError(
+                'Can only tail logs from one target at a time (controller, '
+                f'load balancer, or a single {repnoun}). Use --sync-down '
+                'to download logs from multiple sources.')
+    elif num_targets == 0:
+        raise click.UsageError(
+            'Specify a target to tail: --controller, --load-balancer, or '
+            f'a {repnoun.upper()}_ID.')
+    elif num_targets > 1:
+        raise click.UsageError(
+            'Can only tail logs from one target at a time. Use --sync-down '
+            'to download logs from multiple sources.')
+
+    # At this point, we have exactly one target for tailing.
+    assert len(chosen_components) == 1
+    assert len(replica_ids) in [0, 1]
+    target_component = chosen_components.pop()
+    target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
+
+    try:
+        if pool:
+            managed_jobs.pool_tail_logs(service_name,
+                                        target=target_component,
+                                        worker_id=target_replica_id,
+                                        follow=follow,
+                                        tail=tail)
+        else:
+            serve_lib.tail_logs(service_name,
+                                target=target_component,
+                                replica_id=target_replica_id,
+                                follow=follow,
+                                tail=tail)
+    except exceptions.ClusterNotUpError:
+        with ux_utils.print_exception_no_traceback():
+            raise
+
+
+@pool.command('logs', cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@click.option(
+    '--follow/--no-follow',
+    is_flag=True,
+    default=True,
+    help=('Follow the logs of the job. [default: --follow] '
+          'If --no-follow is specified, print the log so far and exit.'))
+@click.option('--controller',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Show the controller logs of this pool.')
+@click.option('--sync-down',
+              '-s',
+              is_flag=True,
+              default=False,
+              help='Sync down logs to the local machine. Can be combined with '
+              '--controller or worker ID to narrow scope.')
+@click.option(
+    '--tail',
+    default=None,
+    type=int,
+    help='The number of lines to display from the end of the log file. '
+    'Default is None, which means print all lines.')
+@click.argument('pool_name', required=True, type=str)
+@click.argument('worker_ids', required=False, type=int, nargs=-1)
+@usage_lib.entrypoint
+# TODO(tian): Add default argument for this CLI if none of the flags are
+# specified.
+def pool_logs(
+    pool_name: str,
+    follow: bool,
+    controller: bool,
+    worker_ids: Tuple[int, ...],
+    sync_down: bool,
+    tail: Optional[int],
+):
+    """Tail or sync down logs of a pool.
+
+    Logs can be tailed from one target (controller, or a single worker) or
+    synced down from multiple targets simultaneously.
+
+    Example:
+
+    .. code-block:: bash
+
+        # Tail the controller logs of a pool
+        sky pool logs --controller [POOL_NAME]
+        \b
+        # Print the worker logs so far and exit
+        sky pool logs --no-follow [POOL_NAME]
+        \b
+        # Tail the logs of worker 1
+        sky pool logs [POOL_NAME] 1
+        \b
+        # Show the last 100 lines of the controller logs
+        sky pool logs --controller --tail 100 [POOL_NAME]
+        \b
+        # Sync down all logs of the pool (controller, all workers)
+        sky pool logs [POOL_NAME] --sync-down
+        \b
+        # Sync down controller logs and logs for workers 1 and 3
+        sky pool logs [POOL_NAME] 1 3 --controller --sync-down
+    """
+    _handle_serve_logs(pool_name,
+                       follow=follow,
+                       controller=controller,
+                       load_balancer=False,
+                       replica_ids=worker_ids,
+                       sync_down=sync_down,
+                       tail=tail,
+                       pool=True)
+
+
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
@@ -5536,6 +5754,7 @@ def serve_down(
               show_default=True)
 
     if replica_id_is_defined:
+        assert replica_id is not None
         request_id = serve_lib.terminate_replica(service_names[0], replica_id,
                                                  purge)
     else:
@@ -5616,99 +5835,14 @@ def serve_logs(
         # Sync down controller logs and logs for replicas 1 and 3
         sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
     """
-    if tail is not None:
-        if tail < 0:
-            raise click.UsageError('--tail must be a non-negative integer.')
-        # TODO(arda): We could add ability to tail and follow logs together.
-        if follow:
-            follow = False
-            logger.warning(
-                f'{colorama.Fore.YELLOW}'
-                '--tail and --follow cannot be used together. '
-                f'Changed the mode to --no-follow.{colorama.Style.RESET_ALL}')
-
-    chosen_components: Set[serve_lib.ServiceComponent] = set()
-    if controller:
-        chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
-    if load_balancer:
-        chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
-    # replica_ids contains the specific replica IDs provided by the user.
-    # If it's not empty, it implies the user wants replica logs.
-    if replica_ids:
-        chosen_components.add(serve_lib.ServiceComponent.REPLICA)
-
-    if sync_down:
-        # For sync-down, multiple targets are allowed.
-        # If no specific components/replicas are mentioned, sync all.
-        # Note: Multiple replicas or targets can only be specified when
-        # using --sync-down.
-        targets_to_sync = list(chosen_components)
-        if not targets_to_sync and not replica_ids:
-            # Default to all components if nothing specific is requested
-            targets_to_sync = [
-                serve_lib.ServiceComponent.CONTROLLER,
-                serve_lib.ServiceComponent.LOAD_BALANCER,
-                serve_lib.ServiceComponent.REPLICA,
-            ]
-
-        timestamp = sky_logging.get_run_timestamp()
-        log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
-                   f'{service_name}_{timestamp}').expanduser()
-        log_dir.mkdir(parents=True, exist_ok=True)
-
-        with rich_utils.client_status(
-                ux_utils.spinner_message('Downloading service logs...')):
-            serve_lib.sync_down_logs(service_name,
-                                     local_dir=str(log_dir),
-                                     targets=targets_to_sync,
-                                     replica_ids=list(replica_ids),
-                                     tail=tail)
-        style = colorama.Style
-        fore = colorama.Fore
-        logger.info(f'{fore.CYAN}Service {service_name} logs: '
-                    f'{log_dir}{style.RESET_ALL}')
-        return
-
-    # Tailing requires exactly one target.
-    num_targets = len(chosen_components)
-    # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
-    if serve_lib.ServiceComponent.REPLICA in chosen_components:
-        if len(replica_ids) != 1:
-            raise click.UsageError(
-                'Can only tail logs from a single replica at a time. '
-                'Provide exactly one REPLICA_ID or use --sync-down '
-                'to download logs from multiple replicas.')
-        # If replica is chosen and len is 1, num_targets effectively counts it.
-        # We need to ensure no other component (controller/LB) is selected.
-        if num_targets > 1:
-            raise click.UsageError(
-                'Can only tail logs from one target at a time (controller, '
-                'load balancer, or a single replica). Use --sync-down '
-                'to download logs from multiple sources.')
-    elif num_targets == 0:
-        raise click.UsageError(
-            'Specify a target to tail: --controller, --load-balancer, or '
-            'a REPLICA_ID.')
-    elif num_targets > 1:
-        raise click.UsageError(
-            'Can only tail logs from one target at a time. Use --sync-down '
-            'to download logs from multiple sources.')
-
-    # At this point, we have exactly one target for tailing.
-    assert len(chosen_components) == 1
-    assert len(replica_ids) in [0, 1]
-    target_component = chosen_components.pop()
-    target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
-
-    try:
-        serve_lib.tail_logs(service_name,
-                            target=target_component,
-                            replica_id=target_replica_id,
-                            follow=follow,
-                            tail=tail)
-    except exceptions.ClusterNotUpError:
-        with ux_utils.print_exception_no_traceback():
-            raise
+    _handle_serve_logs(service_name,
+                       follow=follow,
+                       controller=controller,
+                       load_balancer=load_balancer,
+                       replica_ids=replica_ids,
+                       sync_down=sync_down,
+                       tail=tail,
+                       pool=False)
 
 
 @cli.group(cls=_NaturalOrderGroup, hidden=True)
sky/client/sdk.py CHANGED
@@ -88,14 +88,17 @@ def reload_config() -> None:
     skypilot_config.safe_reload_config()
 
 
-def stream_response(request_id: Optional[str],
+def stream_response(request_id: Optional[server_common.RequestId],
                     response: 'requests.Response',
                     output_stream: Optional['io.TextIOBase'] = None,
                     resumable: bool = False) -> Any:
     """Streams the response to the console.
 
     Args:
-        request_id: The request ID.
+        request_id: The request ID of the request to stream. May be a full
+            request ID or a prefix.
+            If None, the latest request submitted to the API server is streamed.
+            Using None request_id is not recommended in multi-user environments.
         response: The HTTP response.
         output_stream: The output stream to write to. If None, print to the
             console.
@@ -349,7 +352,14 @@ def validate(
         validation. This is only required when a admin policy is in use,
         see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
     """
+    remote_api_version = versions.get_remote_api_version()
+    # TODO(kevin): remove this in v0.13.0
+    omit_user_specified_yaml = (remote_api_version is None or
+                                remote_api_version < 15)
     for task in dag.tasks:
+        if omit_user_specified_yaml:
+            # pylint: disable=protected-access
+            task._user_specified_yaml = None
         task.expand_and_validate_workdir()
         if not workdir_only:
             task.expand_and_validate_file_mounts()
@@ -1756,7 +1766,7 @@ def status_kubernetes() -> server_common.RequestId:
 # === API request APIs ===
 @usage_lib.entrypoint
 @annotations.client_api
-def get(request_id: str) -> Any:
+def get(request_id: server_common.RequestId) -> Any:
     """Waits for and gets the result of a request.
 
     This function will not check the server health since /api/get is typically
@@ -1764,7 +1774,8 @@ def get(request_id: str) -> Any:
     may cause GET /api/get being sent to a restarted API server.
 
     Args:
-        request_id: The request ID of the request to get.
+        request_id: The request ID of the request to get. May be a full request
+            ID or a prefix.
 
     Returns:
         The ``Request Returns`` of the specified request. See the documentation
@@ -1818,7 +1829,7 @@
 @server_common.check_server_healthy_or_start
 @annotations.client_api
 def stream_and_get(
-    request_id: Optional[str] = None,
+    request_id: Optional[server_common.RequestId] = None,
     log_path: Optional[str] = None,
     tail: Optional[int] = None,
     follow: bool = True,
@@ -1830,7 +1841,10 @@ def stream_and_get(
     prefix of the full request id.
 
     Args:
-        request_id: The prefix of the request ID of the request to stream.
+        request_id: The request ID of the request to stream. May be a full
+            request ID or a prefix.
+            If None, the latest request submitted to the API server is streamed.
+            Using None request_id is not recommended in multi-user environments.
         log_path: The path to the log file to stream.
         tail: The number of lines to show from the end of the logs.
             If None, show all logs.
@@ -1867,13 +1881,18 @@
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'Failed to stream logs: {detail}')
     elif response.status_code != 200:
+        # TODO(syang): handle the case where the requestID is not provided
+        # see https://github.com/skypilot-org/skypilot/issues/6549
+        if request_id is None:
+            return None
         return get(request_id)
     return stream_response(request_id, response, output_stream)
 
 
 @usage_lib.entrypoint
 @annotations.client_api
-def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
+def api_cancel(request_ids: Optional[Union[
+        server_common.RequestId, List[server_common.RequestId]]] = None,
                all_users: bool = False,
                silent: bool = False) -> server_common.RequestId:
     """Aborts a request or all requests.
@@ -1938,7 +1957,7 @@ def _local_api_server_running(kill: bool = False) -> bool:
 @usage_lib.entrypoint
 @annotations.client_api
 def api_status(
-    request_ids: Optional[List[str]] = None,
+    request_ids: Optional[List[server_common.RequestId]] = None,
     # pylint: disable=redefined-builtin
     all_status: bool = False
 ) -> List[payloads.RequestPayload]:
@@ -2412,7 +2431,9 @@ def api_login(endpoint: Optional[str] = None,
     _save_config_updates(endpoint=endpoint)
     dashboard_url = server_common.get_dashboard_url(endpoint)
 
-    server_common.get_api_server_status.cache_clear()
+    # see https://github.com/python/mypy/issues/5107 on why
+    # typing is disabled on this line
+    server_common.get_api_server_status.cache_clear()  # type: ignore
     # After successful authentication, check server health again to get user
     # identity
     server_status, final_api_server_info = server_common.check_server_healthy(
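
A hedged usage sketch of the request-ID changes above: async SDK calls return a request ID, get() accepts a full ID or a unique prefix, and stream_and_get() with no ID falls back to the latest request (the call sequence is illustrative, assuming a reachable API server):

```python
import sky

# Async SDK calls return a request ID immediately.
request_id = sky.status()

# Wait for the result; a unique prefix of the full ID is also accepted.
clusters = sky.get(request_id)

# With no ID, streams the latest request submitted to the API server;
# as the docstrings above note, avoid this in multi-user environments.
sky.stream_and_get()
```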