skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (151)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/catalog/kubernetes_catalog.py +8 -0
  4. sky/catalog/nebius_catalog.py +0 -1
  5. sky/check.py +11 -1
  6. sky/client/cli/command.py +234 -100
  7. sky/client/sdk.py +30 -9
  8. sky/client/sdk_async.py +815 -0
  9. sky/clouds/kubernetes.py +6 -1
  10. sky/clouds/nebius.py +1 -4
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3698-7874720877646365.js → 3850-ff4a9a69d978632b.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  26. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9025.7937c16bc8623516.js → 9025.a1bef12d672bb66d.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  30. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-1e6de35d15a8d432.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{config-8620d099cbef8608.js → config-dfb9bf07b13045f4.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  49. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  50. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  51. sky/dashboard/out/clusters/[cluster].html +1 -1
  52. sky/dashboard/out/clusters.html +1 -1
  53. sky/dashboard/out/config.html +1 -1
  54. sky/dashboard/out/index.html +1 -1
  55. sky/dashboard/out/infra/[context].html +1 -1
  56. sky/dashboard/out/infra.html +1 -1
  57. sky/dashboard/out/jobs/[job].html +1 -1
  58. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -1
  61. sky/dashboard/out/volumes.html +1 -1
  62. sky/dashboard/out/workspace/new.html +1 -1
  63. sky/dashboard/out/workspaces/[name].html +1 -1
  64. sky/dashboard/out/workspaces.html +1 -1
  65. sky/global_user_state.py +14 -2
  66. sky/jobs/__init__.py +2 -0
  67. sky/jobs/client/sdk.py +43 -2
  68. sky/jobs/client/sdk_async.py +135 -0
  69. sky/jobs/server/core.py +48 -1
  70. sky/jobs/server/server.py +52 -3
  71. sky/jobs/state.py +5 -1
  72. sky/jobs/utils.py +3 -1
  73. sky/provision/kubernetes/utils.py +30 -4
  74. sky/provision/nebius/instance.py +1 -0
  75. sky/provision/nebius/utils.py +9 -1
  76. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  77. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  78. sky/serve/client/impl.py +85 -1
  79. sky/serve/client/sdk.py +16 -47
  80. sky/serve/client/sdk_async.py +130 -0
  81. sky/serve/constants.py +3 -1
  82. sky/serve/controller.py +6 -3
  83. sky/serve/load_balancer.py +3 -1
  84. sky/serve/serve_state.py +93 -5
  85. sky/serve/serve_utils.py +200 -67
  86. sky/serve/server/core.py +13 -197
  87. sky/serve/server/impl.py +261 -23
  88. sky/serve/service.py +15 -3
  89. sky/server/auth/__init__.py +0 -0
  90. sky/server/auth/authn.py +46 -0
  91. sky/server/auth/oauth2_proxy.py +185 -0
  92. sky/server/common.py +119 -21
  93. sky/server/constants.py +1 -1
  94. sky/server/daemons.py +60 -11
  95. sky/server/requests/executor.py +5 -3
  96. sky/server/requests/payloads.py +19 -0
  97. sky/server/rest.py +114 -0
  98. sky/server/server.py +44 -40
  99. sky/setup_files/dependencies.py +2 -0
  100. sky/skylet/constants.py +1 -1
  101. sky/skylet/events.py +5 -1
  102. sky/skylet/skylet.py +3 -1
  103. sky/task.py +61 -21
  104. sky/templates/kubernetes-ray.yml.j2 +9 -0
  105. sky/templates/nebius-ray.yml.j2 +1 -0
  106. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  107. sky/usage/usage_lib.py +8 -6
  108. sky/utils/annotations.py +8 -3
  109. sky/utils/common_utils.py +11 -1
  110. sky/utils/controller_utils.py +7 -0
  111. sky/utils/db/migration_utils.py +2 -2
  112. sky/utils/rich_utils.py +120 -0
  113. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +22 -13
  114. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +120 -112
  115. sky/client/sdk.pyi +0 -300
  116. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  119. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  126. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  138. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  145. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  146. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  147. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-85426374db04811e.js} +0 -0
  148. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  149. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  150. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  151. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py CHANGED
@@ -2032,9 +2032,7 @@ class KubernetesInstanceType:
         accelerator_type = match.group('accelerator_type')
         if accelerator_count:
             accelerator_count = int(accelerator_count)
-            # This is to revert the accelerator types with spaces back to
-            # the original format.
-            accelerator_type = str(accelerator_type).replace('_', ' ')
+            accelerator_type = str(accelerator_type)
         else:
             accelerator_count = None
             accelerator_type = None
@@ -2047,7 +2045,7 @@ class KubernetesInstanceType:
             accelerator_type = prev_match.group('accelerator_type')
             if accelerator_count:
                 accelerator_count = int(accelerator_count)
-                accelerator_type = str(accelerator_type).replace('_', ' ')
+                accelerator_type = str(accelerator_type)
             else:
                 accelerator_count = None
                 accelerator_type = None
@@ -2998,6 +2996,13 @@ def get_kubernetes_node_info(
             # Get all the pods running on the node
             if (pod.spec.node_name == node.metadata.name and
                     pod.status.phase in ['Running', 'Pending']):
+                # Skip pods that should not count against GPU count
+                if should_exclude_pod_from_gpu_allocation(pod):
+                    logger.debug(
+                        f'Excluding low priority pod '
+                        f'{pod.metadata.name} from GPU allocation '
+                        f'calculations on node {node.metadata.name}')
+                    continue
                 # Iterate over all the containers in the pod and sum the
                 # GPU requests
                 for container in pod.spec.containers:
@@ -3596,3 +3601,24 @@ def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
                 time.sleep(retry_delay)
             else:
                 raise
+
+
+def should_exclude_pod_from_gpu_allocation(pod) -> bool:
+    """Check if a pod should be excluded from GPU count calculations.
+
+    Some cloud providers run low priority test/verification pods that request
+    GPUs but should not count against real GPU availability since they are
+    designed to be evicted when higher priority workloads need resources.
+
+    Args:
+        pod: Kubernetes pod object
+
+    Returns:
+        bool: True if the pod should be excluded from GPU count calculations.
+    """
+    # CoreWeave HPC verification pods - identified by namespace
+    if (hasattr(pod.metadata, 'namespace') and
+            pod.metadata.namespace == 'cw-hpc-verification'):
+        return True
+
+    return False
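The new hook is just a predicate consulted while summing per-node GPU requests: pods in known evictable namespaces are dropped before their requests are counted. A minimal, self-contained sketch of that counting logic (the dataclasses below are illustrative stand-ins for the Kubernetes client's pod objects, not SkyPilot's internals):

# Sketch: exclude evictable verification pods when summing GPU requests.
from dataclasses import dataclass, field
from typing import Dict, List

EXCLUDED_NAMESPACES = {'cw-hpc-verification'}

@dataclass
class Container:
    requests: Dict[str, str] = field(default_factory=dict)

@dataclass
class Pod:
    namespace: str
    containers: List[Container] = field(default_factory=list)

def allocated_gpus(pods: List[Pod]) -> int:
    total = 0
    for pod in pods:
        if pod.namespace in EXCLUDED_NAMESPACES:
            continue  # evictable verification pods do not count
        for container in pod.containers:
            total += int(container.requests.get('nvidia.com/gpu', 0))
    return total

pods = [
    Pod('default', [Container({'nvidia.com/gpu': '2'})]),
    Pod('cw-hpc-verification', [Container({'nvidia.com/gpu': '8'})]),
]
assert allocated_gpus(pods) == 2  # the verification pod's 8 GPUs are ignored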
sky/provision/nebius/instance.py CHANGED
@@ -134,6 +134,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
         image_family=config.node_config['ImageId'],
         disk_size=config.node_config['DiskSize'],
         user_data=config.node_config['UserData'],
+        use_spot=config.node_config['use_spot'],
         associate_public_ip_address=(
             not config.provider_config['use_internal_ips']),
         filesystems=config.node_config.get('filesystems', []),
sky/provision/nebius/utils.py CHANGED
@@ -168,6 +168,7 @@ def launch(cluster_name_on_cloud: str,
            user_data: str,
            associate_public_ip_address: bool,
            filesystems: List[Dict[str, Any]],
+           use_spot: bool = False,
            network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
     # Each node must have a unique name to avoid conflicts between
     # multiple worker VMs. To ensure uniqueness, a UUID is appended
@@ -281,7 +282,14 @@ def launch(cluster_name_on_cloud: str,
                 public_ip_address=nebius.compute().PublicIPAddress()
                 if associate_public_ip_address else None,
             )
-        ]))).wait()
+        ],
+        recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+        if use_spot else None,
+        preemptible=nebius.compute().PreemptibleSpec(
+            priority=1,
+            on_preemption=nebius.compute(
+            ).PreemptibleSpec.PreemptionPolicy.STOP) if use_spot else None,
+    ))).wait()
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
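The spot support follows a conditional-spec pattern: the recovery policy and preemptible spec are attached only when use_spot is set, and stay None for on-demand launches. A generic sketch of that pattern, with a hypothetical PreemptibleSpec standing in for the Nebius SDK type:

# Sketch: gate optional spec fields on a flag (PreemptibleSpec is a stand-in).
from typing import Any, Dict

class PreemptibleSpec:
    def __init__(self, priority: int, on_preemption: str):
        self.priority = priority
        self.on_preemption = on_preemption

def build_instance_spec(use_spot: bool) -> Dict[str, Any]:
    spec: Dict[str, Any] = {'name': 'worker-0'}
    # Spot instances carry a recovery policy and a preemptible spec;
    # on-demand launches leave both as None.
    spec['recovery_policy'] = 'FAIL' if use_spot else None
    spec['preemptible'] = (PreemptibleSpec(priority=1, on_preemption='STOP')
                           if use_spot else None)
    return spec

assert build_instance_spec(use_spot=False)['preemptible'] is None
assert build_instance_spec(use_spot=True)['recovery_policy'] == 'FAIL'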
sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py ADDED
@@ -0,0 +1,35 @@
+"""add workspace column to cluster_history table
+
+Revision ID: 002
+Revises: 001
+Create Date: 2025-08-06
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '002'
+down_revision: Union[str, Sequence[str], None] = '001'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'workspace',
+                                             sa.Text(),
+                                             server_default=None)
+    pass
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    pass
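Both new migrations (this one and 003 below) go through a shared db_utils.add_column_to_table_alembic helper rather than calling op.add_column directly. The helper's body is not part of this diff; presumably it makes the column addition idempotent so reruns are harmless. A hedged sketch of what such a helper might look like, using only the SQLAlchemy inspector API (the name add_column_if_missing is hypothetical):

# Hypothetical sketch; the real db_utils helper is not shown in this diff.
import sqlalchemy as sa

def add_column_if_missing(bind: sa.engine.Connection, table: str,
                          column: sa.Column) -> None:
    # Skip the ALTER TABLE if the column already exists, so the
    # migration stays safe to re-run.
    inspector = sa.inspect(bind)
    existing = {c['name'] for c in inspector.get_columns(table)}
    if column.name not in existing:
        ddl = (f'ALTER TABLE {table} ADD COLUMN {column.name} '
               f'{column.type.compile(bind.dialect)}')
        bind.execute(sa.text(ddl))

# e.g. add_column_if_missing(op.get_bind(), 'cluster_history',
#                            sa.Column('workspace', sa.Text()))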
sky/schemas/db/spot_jobs/003_pool_hash.py ADDED
@@ -0,0 +1,34 @@
+"""Adding a hash column for pool.
+
+Revision ID: 003
+Revises: 002
+Create Date: 2025-07-18
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '003'
+down_revision: Union[str, Sequence[str], None] = '002'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns for pool hash."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'pool_hash',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """Remove columns for pool hash."""
+    pass
sky/serve/client/impl.py CHANGED
@@ -1,7 +1,7 @@
 """Implementation of SDK for SkyServe."""
 import json
 import typing
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union

 import click

@@ -12,6 +12,8 @@ from sky.utils import admin_policy_utils
 from sky.utils import dag_utils

 if typing.TYPE_CHECKING:
+    import io
+
     import sky
     from sky.serve import serve_utils

@@ -186,3 +188,85 @@ def status(
         json=json.loads(body.model_dump_json()),
         timeout=(5, None))
     return server_common.get_request_id(response)
+
+
+def tail_logs(service_name: str,
+              target: Union[str, 'serve_utils.ServiceComponent'],
+              replica_id: Optional[int] = None,
+              follow: bool = True,
+              output_stream: Optional['io.TextIOBase'] = None,
+              tail: Optional[int] = None,
+              pool: bool = False) -> None:
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    if pool:
+        body = payloads.JobsPoolLogsBody(
+            pool_name=service_name,
+            target=target,
+            worker_id=replica_id,
+            follow=follow,
+            tail=tail,
+        )
+    else:
+        body = payloads.ServeLogsBody(
+            service_name=service_name,
+            target=target,
+            replica_id=replica_id,
+            follow=follow,
+            tail=tail,
+        )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_logs' if pool else '/serve/logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None),
+        stream=True)
+    request_id = server_common.get_request_id(response)
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=True)
+
+
+def sync_down_logs(service_name: str,
+                   local_dir: str,
+                   *,
+                   targets: Optional[Union[
+                       str, 'serve_utils.ServiceComponent',
+                       Sequence[Union[str,
+                                      'serve_utils.ServiceComponent']]]] = None,
+                   replica_ids: Optional[List[int]] = None,
+                   tail: Optional[int] = None,
+                   pool: bool = False) -> None:
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    if pool:
+        body = payloads.JobsPoolDownloadLogsBody(
+            pool_name=service_name,
+            local_dir=local_dir,
+            targets=targets,
+            worker_ids=replica_ids,
+            tail=tail,
+        )
+    else:
+        body = payloads.ServeDownloadLogsBody(
+            service_name=service_name,
+            # No need to set here, since the server will override it
+            # to a directory on the API server.
+            local_dir=local_dir,
+            targets=targets,
+            replica_ids=replica_ids,
+            tail=tail,
+        )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/jobs/pool_sync-down-logs' if pool else '/serve/sync-down-logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None))
+    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
+
+    # Download from API server paths to the client's local_dir
+    client_common.download_logs_from_api_server([remote_dir], remote_dir,
+                                                local_dir)
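The new pool flag is what lets the jobs and serve SDKs share one implementation: the same call shape is routed to either the /jobs/pool_* or /serve/* endpoints with the matching payload type (note that replica_id is forwarded as worker_id for pools). Illustrative caller-side usage; the service, pool, and target names below are made up:

# Illustrative calls into the shared impl; values are made up.
from sky.serve.client import impl

# Tail the controller logs of a serve service (hits /serve/logs).
impl.tail_logs('my-service', target='controller', follow=True, pool=False)

# Tail worker 1's logs in a cluster pool (hits /jobs/pool_logs, with
# replica_id forwarded as worker_id in the payload).
impl.tail_logs('my-pool', target='worker', replica_id=1, pool=True)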
sky/serve/client/sdk.py CHANGED
@@ -1,9 +1,8 @@
 """SDK for SkyServe."""
 import json
 import typing
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union

-from sky.client import common as client_common
 from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest
@@ -290,27 +289,13 @@ def tail_logs(service_name: str,
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-    # Avoid circular import.
-    from sky.client import sdk  # pylint: disable=import-outside-toplevel
-
-    body = payloads.ServeLogsBody(
-        service_name=service_name,
-        target=target,
-        replica_id=replica_id,
-        follow=follow,
-        tail=tail,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/logs',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None),
-        stream=True)
-    request_id = server_common.get_request_id(response)
-    return sdk.stream_response(request_id=request_id,
-                               response=response,
-                               output_stream=output_stream,
-                               resumable=True)
+    return impl.tail_logs(service_name,
+                          target,
+                          replica_id,
+                          follow,
+                          output_stream,
+                          tail,
+                          pool=False)


 @usage_lib.entrypoint
@@ -320,8 +305,8 @@ def sync_down_logs(service_name: str,
                    *,
                    targets: Optional[Union[
                        str, 'serve_utils.ServiceComponent',
-                       List[Union[str,
-                                  'serve_utils.ServiceComponent']]]] = None,
+                       Sequence[Union[str,
+                                      'serve_utils.ServiceComponent']]]] = None,
                    replica_ids: Optional[List[int]] = None,
                    tail: Optional[int] = None) -> None:
     """Sync down logs from the service components to a local directory.
@@ -352,25 +337,9 @@ def sync_down_logs(service_name: str,
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-    # Avoid circular import.
-    from sky.client import sdk  # pylint: disable=import-outside-toplevel
-
-    body = payloads.ServeDownloadLogsBody(
-        service_name=service_name,
-        # No need to set here, since the server will override it
-        # to a directory on the API server.
-        local_dir=local_dir,
-        targets=targets,
-        replica_ids=replica_ids,
-        tail=tail,
-    )
-    response = server_common.make_authenticated_request(
-        'POST',
-        '/serve/sync-down-logs',
-        json=json.loads(body.model_dump_json()),
-        timeout=(5, None))
-    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
-
-    # Download from API server paths to the client's local_dir
-    client_common.download_logs_from_api_server([remote_dir], remote_dir,
-                                                local_dir)
+    return impl.sync_down_logs(service_name,
+                               local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
sky/serve/client/sdk_async.py ADDED
@@ -0,0 +1,130 @@
+"""Async SDK for SkyServe."""
+import typing
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from sky.client import sdk_async
+from sky.serve.client import sdk
+from sky.usage import usage_lib
+from sky.utils import context_utils
+
+if typing.TYPE_CHECKING:
+    import io
+
+    import sky
+    from sky.serve import serve_utils
+
+
+@usage_lib.entrypoint
+async def up(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> Tuple[str, str]:
+    """Async version of up() that spins up a service."""
+    request_id = await context_utils.to_thread(sdk.up, task, service_name,
+                                               _need_confirmation)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def update(
+    task: Union['sky.Task', 'sky.Dag'],
+    service_name: str,
+    mode: 'serve_utils.UpdateMode',
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> None:
+    """Async version of update() that updates an existing service."""
+    request_id = await context_utils.to_thread(sdk.update, task, service_name,
+                                               mode, _need_confirmation)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def down(
+    service_names: Optional[Union[str, List[str]]],
+    all: bool = False,  # pylint: disable=redefined-builtin
+    purge: bool = False,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> None:
+    """Async version of down() that tears down a service."""
+    request_id = await context_utils.to_thread(sdk.down, service_names, all,
+                                               purge)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def terminate_replica(
+    service_name: str,
+    replica_id: int,
+    purge: bool,
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> None:
+    """Async version of terminate_replica() that tears down a specific
+    replica."""
+    request_id = await context_utils.to_thread(sdk.terminate_replica,
+                                               service_name, replica_id, purge)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def status(
+    service_names: Optional[Union[str, List[str]]],
+    stream_logs: Optional[
+        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
+) -> List[Dict[str, Any]]:
+    """Async version of status() that gets service statuses."""
+    request_id = await context_utils.to_thread(sdk.status, service_names)
+    if stream_logs is not None:
+        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
+    else:
+        return await sdk_async.get(request_id)
+
+
+@usage_lib.entrypoint
+async def tail_logs(service_name: str,
+                    target: Union[str, 'serve_utils.ServiceComponent'],
+                    replica_id: Optional[int] = None,
+                    follow: bool = True,
+                    output_stream: Optional['io.TextIOBase'] = None) -> None:
+    """Async version of tail_logs() that tails logs for a service."""
+    return await context_utils.to_thread(sdk.tail_logs, service_name, target,
+                                         replica_id, follow, output_stream)
+
+
+@usage_lib.entrypoint
+async def sync_down_logs(service_name: str,
+                         local_dir: str,
+                         *,
+                         targets: Optional[Union[
+                             str, 'serve_utils.ServiceComponent', List[Union[
+                                 str, 'serve_utils.ServiceComponent']]]] = None,
+                         replica_ids: Optional[List[int]] = None) -> None:
+    """Async version of sync_down_logs() that syncs down logs from service
+    components."""
+    return await context_utils.to_thread(sdk.sync_down_logs,
+                                         service_name,
+                                         local_dir,
+                                         targets=targets,
+                                         replica_ids=replica_ids)
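Every wrapper in the new async SDK follows the same recipe: run the blocking SDK call in a worker thread, then either stream logs or fetch the finished result depending on stream_logs. A minimal sketch of that sync-to-async shape in plain asyncio, with stand-in functions in place of the SkyPilot SDK calls:

# Sketch of the sync-to-async wrapper pattern used throughout sdk_async.py;
# SkyPilot-specific names are replaced with stand-ins.
import asyncio
from typing import Any, Callable, Optional

async def _to_thread(fn: Callable[..., Any], *args: Any) -> Any:
    # asyncio.to_thread plays the role of context_utils.to_thread here.
    return await asyncio.to_thread(fn, *args)

def submit_request(name: str) -> str:         # stand-in for sdk.status(...)
    return f'request-{name}'

def wait_for_result(request_id: str) -> str:  # stand-in for sdk.get(...)
    return f'result of {request_id}'

async def status_async(name: str, stream_logs: Optional[object] = None) -> str:
    request_id = await _to_thread(submit_request, name)
    # The real wrappers stream the request's logs here when stream_logs
    # is not None; this sketch just fetches the result.
    return await _to_thread(wait_for_result, request_id)

print(asyncio.run(status_async('my-service')))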
sky/serve/constants.py CHANGED
@@ -105,7 +105,9 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
 # v1.0 - Introduce rolling update.
 # v2.0 - Added template-replica feature.
 # v3.0 - Added cluster pool.
-SERVE_VERSION = 3
+# v4.0 - Added pool argument to wait_service_registration.
+# v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
+SERVE_VERSION = 5

 TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
     'The version of service is outdated and does not support manually '
sky/serve/controller.py CHANGED
@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
 """
 import contextlib
 import logging
+import os
 import threading
 import time
 import traceback
@@ -26,11 +27,12 @@ from sky.utils import ux_utils
 logger = sky_logging.init_logger(__name__)


-class SuppressSuccessGetAccessLogsFilter(logging.Filter):
+class AutoscalerInfoFilter(logging.Filter):

     def filter(self, record: logging.LogRecord) -> bool:
         message = record.getMessage()
-        return not ('GET' in message and '200' in message)
+        return not ('GET' in message and '200' in message and
+                    '/autoscaler/info' in message)


 class SkyServeController:
@@ -60,6 +62,7 @@ class SkyServeController:
         uvicorn_access_logger = logging.getLogger('uvicorn.access')
         for handler in uvicorn_access_logger.handlers:
             handler.setFormatter(sky_logging.FORMATTER)
+            handler.addFilter(AutoscalerInfoFilter())
         yield

     def _run_autoscaler(self):
@@ -242,7 +245,7 @@ class SkyServeController:
         threading.Thread(target=self._run_autoscaler).start()

         logger.info('SkyServe Controller started on '
-                    f'http://{self._host}:{self._port}')
+                    f'http://{self._host}:{self._port}. PID: {os.getpid()}')

         uvicorn.run(self._app, host=self._host, port=self._port)

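The renamed filter narrows suppression from all successful GET requests to only the autoscaler-info polling endpoint, so other 200 responses reappear in the access log. The mechanics are a plain logging.Filter attached to a handler; a self-contained demonstration:

# Demo: suppress one endpoint's access-log lines with logging.Filter,
# mirroring AutoscalerInfoFilter above.
import logging

class AutoscalerInfoFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        message = record.getMessage()
        # Drop only successful GETs to /autoscaler/info.
        return not ('GET' in message and '200' in message and
                    '/autoscaler/info' in message)

handler = logging.StreamHandler()
handler.addFilter(AutoscalerInfoFilter())
logger = logging.getLogger('access')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info('GET /autoscaler/info 200')  # filtered out
logger.info('GET /-/health 200')         # still printed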
sky/serve/load_balancer.py CHANGED
@@ -1,6 +1,7 @@
 """LoadBalancer: Distribute any incoming request to all ready replicas."""
 import asyncio
 import logging
+import os
 import threading
 import traceback
 from typing import Dict, List, Optional, Union
@@ -254,7 +255,8 @@ class SkyServeLoadBalancer:
         protocol = 'https' if self._tls_credential is not None else 'http'

         logger.info('SkyServe Load Balancer started on '
-                    f'{protocol}://0.0.0.0:{self._load_balancer_port}')
+                    f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
+                    f'PID: {os.getpid()}')

         uvicorn.run(self._app,
                     host='0.0.0.0',