skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +452 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/data_utils.py +21 -1
  26. sky/data/storage.py +12 -0
  27. sky/jobs/__init__.py +3 -0
  28. sky/jobs/client/sdk.py +80 -3
  29. sky/jobs/controller.py +76 -25
  30. sky/jobs/recovery_strategy.py +80 -34
  31. sky/jobs/scheduler.py +68 -20
  32. sky/jobs/server/core.py +228 -136
  33. sky/jobs/server/server.py +40 -0
  34. sky/jobs/state.py +129 -24
  35. sky/jobs/utils.py +109 -51
  36. sky/provision/nebius/constants.py +3 -0
  37. sky/provision/runpod/utils.py +27 -12
  38. sky/py.typed +0 -0
  39. sky/resources.py +16 -12
  40. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  41. sky/serve/autoscalers.py +8 -0
  42. sky/serve/client/impl.py +188 -0
  43. sky/serve/client/sdk.py +12 -82
  44. sky/serve/constants.py +5 -1
  45. sky/serve/controller.py +5 -0
  46. sky/serve/replica_managers.py +112 -37
  47. sky/serve/serve_state.py +16 -6
  48. sky/serve/serve_utils.py +274 -77
  49. sky/serve/server/core.py +8 -525
  50. sky/serve/server/impl.py +709 -0
  51. sky/serve/service.py +13 -9
  52. sky/serve/service_spec.py +74 -4
  53. sky/server/constants.py +1 -1
  54. sky/server/daemons.py +164 -0
  55. sky/server/requests/payloads.py +33 -0
  56. sky/server/requests/requests.py +2 -107
  57. sky/server/requests/serializers/decoders.py +12 -3
  58. sky/server/requests/serializers/encoders.py +13 -2
  59. sky/server/server.py +2 -1
  60. sky/server/uvicorn.py +2 -1
  61. sky/sky_logging.py +30 -0
  62. sky/skylet/constants.py +2 -1
  63. sky/skylet/events.py +9 -0
  64. sky/skypilot_config.py +24 -21
  65. sky/task.py +41 -11
  66. sky/templates/jobs-controller.yaml.j2 +3 -0
  67. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  68. sky/users/server.py +1 -1
  69. sky/utils/command_runner.py +4 -2
  70. sky/utils/controller_utils.py +14 -10
  71. sky/utils/dag_utils.py +4 -2
  72. sky/utils/db/migration_utils.py +2 -4
  73. sky/utils/schemas.py +47 -19
  74. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
  75. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
  76. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/resources.py CHANGED
@@ -19,6 +19,7 @@ from sky.clouds import cloud as sky_cloud
19
19
  from sky.provision import docker_utils
20
20
  from sky.provision.gcp import constants as gcp_constants
21
21
  from sky.provision.kubernetes import utils as kubernetes_utils
22
+ from sky.provision.nebius import constants as nebius_constants
22
23
  from sky.skylet import constants
23
24
  from sky.utils import accelerator_registry
24
25
  from sky.utils import annotations
@@ -1260,15 +1261,19 @@ class Resources:
1260
1261
  ValueError: if the attribute is invalid.
1261
1262
  """
1262
1263
 
1263
- if (self._network_tier == resources_utils.NetworkTier.BEST and
1264
- isinstance(self._cloud, clouds.GCP)):
1265
- # Handle GPU Direct TCPX requirement for docker images
1266
- if self._image_id is None:
1267
- # No custom image specified - use the default GPU Direct image
1268
- self._image_id = {
1269
- self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
1270
- }
1271
- else:
1264
+ if self._network_tier == resources_utils.NetworkTier.BEST:
1265
+ if isinstance(self._cloud, clouds.GCP):
1266
+ # Handle GPU Direct TCPX requirement for docker images
1267
+ if self._image_id is None:
1268
+ self._image_id = {
1269
+ self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
1270
+ }
1271
+ elif isinstance(self._cloud, clouds.Nebius):
1272
+ if self._image_id is None:
1273
+ self._image_id = {
1274
+ self._region: nebius_constants.INFINIBAND_IMAGE_ID
1275
+ }
1276
+ elif self._image_id:
1272
1277
  # Custom image specified - validate it's a docker image
1273
1278
  # Check if any of the specified images are not docker images
1274
1279
  non_docker_images = []
@@ -1280,14 +1285,13 @@ class Resources:
1280
1285
  if non_docker_images:
1281
1286
  with ux_utils.print_exception_no_traceback():
1282
1287
  raise ValueError(
1283
- f'When using network_tier=BEST on GCP, image_id '
1288
+ f'When using network_tier=BEST, image_id '
1284
1289
  f'must be a docker image. '
1285
1290
  f'Found non-docker images: '
1286
1291
  f'{", ".join(non_docker_images)}. '
1287
1292
  f'Please either: (1) use a docker image '
1288
1293
  f'(prefix with "docker:"), or '
1289
- f'(2) leave image_id empty to use the default '
1290
- f'GPU Direct TCPX image.')
1294
+ f'(2) leave image_id empty to use the default')
1291
1295
 
1292
1296
  if self._image_id is None:
1293
1297
  return
@@ -0,0 +1,42 @@
1
+ """Columns for cluster pool.
2
+
3
+ Revision ID: 002
4
+ Revises: 001
5
+ Create Date: 2025-07-18
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '002'
18
+ down_revision: Union[str, Sequence[str], None] = '001'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Add columns for cluster pool."""
25
+ with op.get_context().autocommit_block():
26
+ db_utils.add_column_to_table_alembic('job_info',
27
+ 'pool',
28
+ sa.Text(),
29
+ server_default=None)
30
+ db_utils.add_column_to_table_alembic('job_info',
31
+ 'current_cluster_name',
32
+ sa.Text(),
33
+ server_default=None)
34
+ db_utils.add_column_to_table_alembic('job_info',
35
+ 'job_id_on_pool_cluster',
36
+ sa.Integer(),
37
+ server_default=None)
38
+
39
+
40
+ def downgrade():
41
+ """Remove columns for cluster pool."""
42
+ pass
sky/serve/autoscalers.py CHANGED
@@ -175,6 +175,14 @@ class Autoscaler:
175
175
  """Collect request information from aggregator for autoscaling."""
176
176
  raise NotImplementedError
177
177
 
178
+ def info(self) -> Dict[str, Any]:
179
+ """Get information about the autoscaler."""
180
+ return {
181
+ 'target_num_replicas': self.target_num_replicas,
182
+ 'min_replicas': self.min_replicas,
183
+ 'max_replicas': self.max_replicas,
184
+ }
185
+
178
186
  def _generate_scaling_decisions(
179
187
  self,
180
188
  replica_infos: List['replica_managers.ReplicaInfo'],
@@ -0,0 +1,188 @@
1
+ """Implementation of SDK for SkyServe."""
2
+ import json
3
+ import typing
4
+ from typing import List, Optional, Union
5
+
6
+ import click
7
+
8
+ from sky.client import common as client_common
9
+ from sky.server import common as server_common
10
+ from sky.server.requests import payloads
11
+ from sky.utils import admin_policy_utils
12
+ from sky.utils import dag_utils
13
+
14
+ if typing.TYPE_CHECKING:
15
+ import sky
16
+ from sky.serve import serve_utils
17
+
18
+
19
+ def up(
20
+ task: Union['sky.Task', 'sky.Dag'],
21
+ service_name: str,
22
+ pool: bool = False,
23
+ # Internal only:
24
+ # pylint: disable=invalid-name
25
+ _need_confirmation: bool = False
26
+ ) -> server_common.RequestId:
27
+ assert not pool, 'Command `up` is not supported for pool.'
28
+ # Avoid circular import.
29
+ from sky.client import sdk # pylint: disable=import-outside-toplevel
30
+
31
+ dag = dag_utils.convert_entrypoint_to_dag(task)
32
+ with admin_policy_utils.apply_and_use_config_in_current_request(
33
+ dag, at_client_side=True) as dag:
34
+ sdk.validate(dag)
35
+ request_id = sdk.optimize(dag)
36
+ sdk.stream_and_get(request_id)
37
+ if _need_confirmation:
38
+ noun = 'pool' if pool else 'service'
39
+ prompt = f'Launching a new {noun} {service_name!r}. Proceed?'
40
+ if prompt is not None:
41
+ click.confirm(prompt,
42
+ default=True,
43
+ abort=True,
44
+ show_default=True)
45
+
46
+ dag = client_common.upload_mounts_to_api_server(dag)
47
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
48
+
49
+ body = payloads.ServeUpBody(
50
+ task=dag_str,
51
+ service_name=service_name,
52
+ )
53
+
54
+ response = server_common.make_authenticated_request(
55
+ 'POST',
56
+ '/serve/up',
57
+ json=json.loads(body.model_dump_json()),
58
+ timeout=(5, None))
59
+ return server_common.get_request_id(response)
60
+
61
+
62
+ def update(
63
+ task: Union['sky.Task', 'sky.Dag'],
64
+ service_name: str,
65
+ mode: 'serve_utils.UpdateMode',
66
+ pool: bool = False,
67
+ # Internal only:
68
+ # pylint: disable=invalid-name
69
+ _need_confirmation: bool = False
70
+ ) -> server_common.RequestId:
71
+ assert not pool, 'Command `update` is not supported for pool.'
72
+ # Avoid circular import.
73
+ from sky.client import sdk # pylint: disable=import-outside-toplevel
74
+ noun = 'pool' if pool else 'service'
75
+
76
+ dag = dag_utils.convert_entrypoint_to_dag(task)
77
+ with admin_policy_utils.apply_and_use_config_in_current_request(
78
+ dag, at_client_side=True) as dag:
79
+ sdk.validate(dag)
80
+ request_id = sdk.optimize(dag)
81
+ sdk.stream_and_get(request_id)
82
+ if _need_confirmation:
83
+ click.confirm(f'Updating {noun} {service_name!r}. Proceed?',
84
+ default=True,
85
+ abort=True,
86
+ show_default=True)
87
+
88
+ dag = client_common.upload_mounts_to_api_server(dag)
89
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
90
+
91
+ body = payloads.ServeUpdateBody(
92
+ task=dag_str,
93
+ service_name=service_name,
94
+ mode=mode,
95
+ )
96
+
97
+ response = server_common.make_authenticated_request(
98
+ 'POST',
99
+ '/serve/update',
100
+ json=json.loads(body.model_dump_json()),
101
+ timeout=(5, None))
102
+ return server_common.get_request_id(response)
103
+
104
+
105
+ def apply(
106
+ task: Union['sky.Task', 'sky.Dag'],
107
+ service_name: str,
108
+ mode: 'serve_utils.UpdateMode',
109
+ pool: bool = False,
110
+ # Internal only:
111
+ # pylint: disable=invalid-name
112
+ _need_confirmation: bool = False
113
+ ) -> server_common.RequestId:
114
+ assert pool, 'Command `apply` is only supported for pool.'
115
+ # Avoid circular import.
116
+ from sky.client import sdk # pylint: disable=import-outside-toplevel
117
+
118
+ dag = dag_utils.convert_entrypoint_to_dag(task)
119
+ with admin_policy_utils.apply_and_use_config_in_current_request(
120
+ dag, at_client_side=True) as dag:
121
+ sdk.validate(dag)
122
+ request_id = sdk.optimize(dag)
123
+ sdk.stream_and_get(request_id)
124
+ if _need_confirmation:
125
+ noun = 'pool' if pool else 'service'
126
+ prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
127
+ if prompt is not None:
128
+ click.confirm(prompt,
129
+ default=True,
130
+ abort=True,
131
+ show_default=True)
132
+
133
+ dag = client_common.upload_mounts_to_api_server(dag)
134
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
135
+
136
+ body = payloads.JobsPoolApplyBody(
137
+ task=dag_str,
138
+ pool_name=service_name,
139
+ mode=mode,
140
+ )
141
+ response = server_common.make_authenticated_request(
142
+ 'POST',
143
+ '/jobs/pool_apply',
144
+ json=json.loads(body.model_dump_json()),
145
+ timeout=(5, None))
146
+ return server_common.get_request_id(response)
147
+
148
+
149
+ def down(
150
+ service_names: Optional[Union[str, List[str]]],
151
+ all: bool = False, # pylint: disable=redefined-builtin
152
+ purge: bool = False,
153
+ pool: bool = False,
154
+ ) -> server_common.RequestId:
155
+ if pool:
156
+ body = payloads.JobsPoolDownBody(
157
+ pool_names=service_names,
158
+ all=all,
159
+ purge=purge,
160
+ )
161
+ else:
162
+ body = payloads.ServeDownBody(
163
+ service_names=service_names,
164
+ all=all,
165
+ purge=purge,
166
+ )
167
+ response = server_common.make_authenticated_request(
168
+ 'POST',
169
+ '/jobs/pool_down' if pool else '/serve/down',
170
+ json=json.loads(body.model_dump_json()),
171
+ timeout=(5, None))
172
+ return server_common.get_request_id(response)
173
+
174
+
175
+ def status(
176
+ service_names: Optional[Union[str, List[str]]],
177
+ pool: bool = False,
178
+ ) -> server_common.RequestId:
179
+ if pool:
180
+ body = payloads.JobsPoolStatusBody(pool_names=service_names)
181
+ else:
182
+ body = payloads.ServeStatusBody(service_names=service_names)
183
+ response = server_common.make_authenticated_request(
184
+ 'POST',
185
+ '/jobs/pool_status' if pool else '/serve/status',
186
+ json=json.loads(body.model_dump_json()),
187
+ timeout=(5, None))
188
+ return server_common.get_request_id(response)
sky/serve/client/sdk.py CHANGED
@@ -3,16 +3,13 @@ import json
3
3
  import typing
4
4
  from typing import List, Optional, Union
5
5
 
6
- import click
7
-
8
6
  from sky.client import common as client_common
7
+ from sky.serve.client import impl
9
8
  from sky.server import common as server_common
10
9
  from sky.server import rest
11
10
  from sky.server.requests import payloads
12
11
  from sky.usage import usage_lib
13
- from sky.utils import admin_policy_utils
14
12
  from sky.utils import context
15
- from sky.utils import dag_utils
16
13
 
17
14
  if typing.TYPE_CHECKING:
18
15
  import io
@@ -49,37 +46,10 @@ def up(
49
46
  argument.
50
47
  endpoint (str): The service endpoint.
51
48
  """
52
-
53
- # Avoid circular import.
54
- from sky.client import sdk # pylint: disable=import-outside-toplevel
55
-
56
- dag = dag_utils.convert_entrypoint_to_dag(task)
57
- with admin_policy_utils.apply_and_use_config_in_current_request(
58
- dag, at_client_side=True) as dag:
59
- sdk.validate(dag)
60
- request_id = sdk.optimize(dag)
61
- sdk.stream_and_get(request_id)
62
- if _need_confirmation:
63
- prompt = f'Launching a new service {service_name!r}. Proceed?'
64
- if prompt is not None:
65
- click.confirm(prompt,
66
- default=True,
67
- abort=True,
68
- show_default=True)
69
-
70
- dag = client_common.upload_mounts_to_api_server(dag)
71
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
72
-
73
- body = payloads.ServeUpBody(
74
- task=dag_str,
75
- service_name=service_name,
76
- )
77
- response = server_common.make_authenticated_request(
78
- 'POST',
79
- '/serve/up',
80
- json=json.loads(body.model_dump_json()),
81
- timeout=(5, None))
82
- return server_common.get_request_id(response)
49
+ return impl.up(task,
50
+ service_name,
51
+ pool=False,
52
+ _need_confirmation=_need_confirmation)
83
53
 
84
54
 
85
55
  @context.contextual
@@ -112,35 +82,11 @@ def update(
112
82
  Request Returns:
113
83
  None
114
84
  """
115
- # Avoid circular import.
116
- from sky.client import sdk # pylint: disable=import-outside-toplevel
117
-
118
- dag = dag_utils.convert_entrypoint_to_dag(task)
119
- with admin_policy_utils.apply_and_use_config_in_current_request(
120
- dag, at_client_side=True) as dag:
121
- sdk.validate(dag)
122
- request_id = sdk.optimize(dag)
123
- sdk.stream_and_get(request_id)
124
- if _need_confirmation:
125
- click.confirm(f'Updating service {service_name!r}. Proceed?',
126
- default=True,
127
- abort=True,
128
- show_default=True)
129
-
130
- dag = client_common.upload_mounts_to_api_server(dag)
131
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
132
- body = payloads.ServeUpdateBody(
133
- task=dag_str,
134
- service_name=service_name,
135
- mode=mode,
136
- )
137
-
138
- response = server_common.make_authenticated_request(
139
- 'POST',
140
- '/serve/update',
141
- json=json.loads(body.model_dump_json()),
142
- timeout=(5, None))
143
- return server_common.get_request_id(response)
85
+ return impl.update(task,
86
+ service_name,
87
+ mode,
88
+ pool=False,
89
+ _need_confirmation=_need_confirmation)
144
90
 
145
91
 
146
92
  @usage_lib.entrypoint
@@ -171,17 +117,7 @@ def down(
171
117
  ValueError: if the arguments are invalid.
172
118
  RuntimeError: if failed to terminate the service.
173
119
  """
174
- body = payloads.ServeDownBody(
175
- service_names=service_names,
176
- all=all,
177
- purge=purge,
178
- )
179
- response = server_common.make_authenticated_request(
180
- 'POST',
181
- '/serve/down',
182
- json=json.loads(body.model_dump_json()),
183
- timeout=(5, None))
184
- return server_common.get_request_id(response)
120
+ return impl.down(service_names, all, purge, pool=False)
185
121
 
186
122
 
187
123
  @usage_lib.entrypoint
@@ -281,13 +217,7 @@ def status(
281
217
  RuntimeError: if failed to get the service status.
282
218
  exceptions.ClusterNotUpError: if the sky serve controller is not up.
283
219
  """
284
- body = payloads.ServeStatusBody(service_names=service_names,)
285
- response = server_common.make_authenticated_request(
286
- 'POST',
287
- '/serve/status',
288
- json=json.loads(body.model_dump_json()),
289
- timeout=(5, None))
290
- return server_common.get_request_id(response)
220
+ return impl.status(service_names, pool=False)
291
221
 
292
222
 
293
223
  @usage_lib.entrypoint
sky/serve/constants.py CHANGED
@@ -104,8 +104,12 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
104
104
  # Changelog:
105
105
  # v1.0 - Introduce rolling update.
106
106
  # v2.0 - Added template-replica feature.
107
- SERVE_VERSION = 2
107
+ # v3.0 - Added cluster pool.
108
+ SERVE_VERSION = 3
108
109
 
109
110
  TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
110
111
  'The version of service is outdated and does not support manually '
111
112
  'terminating replicas. Please terminate the service and spin up again.')
113
+
114
+ # Dummy run command for cluster pool.
115
+ POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
sky/serve/controller.py CHANGED
@@ -100,6 +100,11 @@ class SkyServeController:
100
100
 
101
101
  def run(self) -> None:
102
102
 
103
+ @self._app.get('/autoscaler/info')
104
+ async def get_autoscaler_info() -> fastapi.Response:
105
+ return responses.JSONResponse(content=self._autoscaler.info(),
106
+ status_code=200)
107
+
103
108
  @self._app.post('/controller/load_balancer_sync')
104
109
  async def load_balancer_sync(
105
110
  request: fastapi.Request) -> fastapi.Response: