skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250808__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +20 -1
- sky/backends/cloud_vm_ray_backend.py +42 -6
- sky/check.py +11 -1
- sky/client/cli/command.py +248 -119
- sky/client/sdk.py +146 -66
- sky/client/sdk_async.py +5 -1
- sky/core.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/-DXZksWqf2waNHeU9YTQe/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +6 -4
- sky/global_user_state.py +22 -3
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +67 -19
- sky/jobs/controller.py +2 -1
- sky/jobs/server/core.py +48 -1
- sky/jobs/server/server.py +52 -3
- sky/jobs/state.py +5 -1
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/serve/client/impl.py +93 -6
- sky/serve/client/sdk.py +22 -53
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +4 -2
- sky/serve/serve_state.py +444 -324
- sky/serve/serve_utils.py +77 -46
- sky/serve/server/core.py +13 -197
- sky/serve/server/impl.py +239 -2
- sky/serve/service.py +8 -3
- sky/server/common.py +18 -7
- sky/server/constants.py +1 -1
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +19 -0
- sky/setup_files/alembic.ini +4 -0
- sky/task.py +18 -11
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/templates/sky-serve-controller.yaml.j2 +1 -0
- sky/usage/usage_lib.py +8 -6
- sky/utils/annotations.py +8 -3
- sky/utils/cli_utils/status_utils.py +1 -1
- sky/utils/common_utils.py +11 -1
- sky/utils/db/db_utils.py +31 -0
- sky/utils/db/migration_utils.py +6 -2
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +19 -14
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +109 -103
- sky/client/sdk.pyi +0 -301
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
- /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/jobs/server/server.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""REST API for managed jobs."""
|
|
2
2
|
|
|
3
|
+
import pathlib
|
|
4
|
+
|
|
3
5
|
import fastapi
|
|
4
6
|
|
|
5
7
|
from sky import sky_logging
|
|
@@ -117,7 +119,7 @@ async def pool_apply(request: fastapi.Request,
|
|
|
117
119
|
request_body=jobs_pool_apply_body,
|
|
118
120
|
func=core.pool_apply,
|
|
119
121
|
schedule_type=api_requests.ScheduleType.LONG,
|
|
120
|
-
request_cluster_name=common.
|
|
122
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
121
123
|
)
|
|
122
124
|
|
|
123
125
|
|
|
@@ -130,7 +132,7 @@ async def pool_down(request: fastapi.Request,
|
|
|
130
132
|
request_body=jobs_pool_down_body,
|
|
131
133
|
func=core.pool_down,
|
|
132
134
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
133
|
-
request_cluster_name=common.
|
|
135
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
134
136
|
)
|
|
135
137
|
|
|
136
138
|
|
|
@@ -144,5 +146,52 @@ async def pool_status(
|
|
|
144
146
|
request_body=jobs_pool_status_body,
|
|
145
147
|
func=core.pool_status,
|
|
146
148
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
147
|
-
request_cluster_name=common.
|
|
149
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@router.post('/pool_logs')
|
|
154
|
+
async def pool_tail_logs(
|
|
155
|
+
request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
|
|
156
|
+
background_tasks: fastapi.BackgroundTasks
|
|
157
|
+
) -> fastapi.responses.StreamingResponse:
|
|
158
|
+
executor.schedule_request(
|
|
159
|
+
request_id=request.state.request_id,
|
|
160
|
+
request_name='jobs.pool_logs',
|
|
161
|
+
request_body=log_body,
|
|
162
|
+
func=core.pool_tail_logs,
|
|
163
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
|
164
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
request_task = api_requests.get_request(request.state.request_id)
|
|
168
|
+
|
|
169
|
+
return stream_utils.stream_response(
|
|
170
|
+
request_id=request_task.request_id,
|
|
171
|
+
logs_path=request_task.log_path,
|
|
172
|
+
background_tasks=background_tasks,
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@router.post('/pool_sync-down-logs')
|
|
177
|
+
async def pool_download_logs(
|
|
178
|
+
request: fastapi.Request,
|
|
179
|
+
download_logs_body: payloads.JobsPoolDownloadLogsBody,
|
|
180
|
+
) -> None:
|
|
181
|
+
user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
|
|
182
|
+
timestamp = sky_logging.get_run_timestamp()
|
|
183
|
+
logs_dir_on_api_server = (
|
|
184
|
+
pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
|
|
185
|
+
'pool' / f'{download_logs_body.pool_name}_{timestamp}')
|
|
186
|
+
logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
|
|
187
|
+
# We should reuse the original request body, so that the env vars, such as
|
|
188
|
+
# user hash, are kept the same.
|
|
189
|
+
download_logs_body.local_dir = str(logs_dir_on_api_server)
|
|
190
|
+
executor.schedule_request(
|
|
191
|
+
request_id=request.state.request_id,
|
|
192
|
+
request_name='jobs.pool_sync_down_logs',
|
|
193
|
+
request_body=download_logs_body,
|
|
194
|
+
func=core.pool_sync_down_logs,
|
|
195
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
|
196
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
148
197
|
)
|
sky/jobs/state.py
CHANGED
|
@@ -107,6 +107,7 @@ job_info_table = sqlalchemy.Table(
|
|
|
107
107
|
sqlalchemy.Column('job_id_on_pool_cluster',
|
|
108
108
|
sqlalchemy.Integer,
|
|
109
109
|
server_default=None),
|
|
110
|
+
sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
|
|
110
111
|
)
|
|
111
112
|
|
|
112
113
|
ha_recovery_script_table = sqlalchemy.Table(
|
|
@@ -225,6 +226,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
|
|
|
225
226
|
'pool': r['pool'],
|
|
226
227
|
'current_cluster_name': r['current_cluster_name'],
|
|
227
228
|
'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
|
|
229
|
+
'pool_hash': r['pool_hash'],
|
|
228
230
|
}
|
|
229
231
|
|
|
230
232
|
|
|
@@ -462,7 +464,8 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
|
|
|
462
464
|
|
|
463
465
|
@_init_db
|
|
464
466
|
def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
|
|
465
|
-
pool: Optional[str]
|
|
467
|
+
pool: Optional[str],
|
|
468
|
+
pool_hash: Optional[str]) -> int:
|
|
466
469
|
assert _SQLALCHEMY_ENGINE is not None
|
|
467
470
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
468
471
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -480,6 +483,7 @@ def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
|
|
|
480
483
|
workspace=workspace,
|
|
481
484
|
entrypoint=entrypoint,
|
|
482
485
|
pool=pool,
|
|
486
|
+
pool_hash=pool_hash,
|
|
483
487
|
)
|
|
484
488
|
|
|
485
489
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""add workspace column to cluster_history table
|
|
2
|
+
|
|
3
|
+
Revision ID: 002
|
|
4
|
+
Revises: 001
|
|
5
|
+
Create Date: 2025-08-06
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
from typing import Sequence, Union
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
|
|
14
|
+
from sky.utils.db import db_utils
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '002'
|
|
18
|
+
down_revision: Union[str, Sequence[str], None] = '001'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade() -> None:
|
|
24
|
+
"""Upgrade schema."""
|
|
25
|
+
with op.get_context().autocommit_block():
|
|
26
|
+
db_utils.add_column_to_table_alembic('cluster_history',
|
|
27
|
+
'workspace',
|
|
28
|
+
sa.Text(),
|
|
29
|
+
server_default=None)
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def downgrade() -> None:
|
|
34
|
+
"""Downgrade schema."""
|
|
35
|
+
pass
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""fix initial revision
|
|
2
|
+
|
|
3
|
+
Revision ID: 003
|
|
4
|
+
Revises: 002
|
|
5
|
+
Create Date: 2025-08-07
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
from typing import Sequence, Union
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
|
|
14
|
+
from sky.utils.db import db_utils
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '003'
|
|
18
|
+
down_revision: Union[str, Sequence[str], None] = '002'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade() -> None:
|
|
24
|
+
"""Upgrade schema."""
|
|
25
|
+
with op.get_context().autocommit_block():
|
|
26
|
+
# add missing columns to clusters table
|
|
27
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
28
|
+
'storage_mounts_metadata',
|
|
29
|
+
sa.LargeBinary(),
|
|
30
|
+
server_default=None)
|
|
31
|
+
# Set the value to replace existing entries to 1 so that all the
|
|
32
|
+
# existing clusters before #2977 are considered as ever up, i.e:
|
|
33
|
+
# existing cluster's default (null) -> 1;
|
|
34
|
+
# new cluster's default -> 0;
|
|
35
|
+
# This is conservative for the existing clusters: even if some INIT
|
|
36
|
+
# clusters were never really UP, setting it to 1 means they won't be
|
|
37
|
+
# auto-deleted during any failover.
|
|
38
|
+
db_utils.add_column_to_table_alembic(
|
|
39
|
+
'clusters',
|
|
40
|
+
'cluster_ever_up',
|
|
41
|
+
sa.Integer(),
|
|
42
|
+
server_default='0',
|
|
43
|
+
value_to_replace_existing_entries=1)
|
|
44
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
45
|
+
'status_updated_at',
|
|
46
|
+
sa.Integer(),
|
|
47
|
+
server_default=None)
|
|
48
|
+
|
|
49
|
+
# remove mistakenly added columns
|
|
50
|
+
db_utils.drop_column_from_table_alembic('clusters', 'launched_nodes')
|
|
51
|
+
db_utils.drop_column_from_table_alembic('clusters', 'disk_tier')
|
|
52
|
+
db_utils.drop_column_from_table_alembic('clusters',
|
|
53
|
+
'config_hash_locked')
|
|
54
|
+
db_utils.drop_column_from_table_alembic('clusters', 'handle_locked')
|
|
55
|
+
db_utils.drop_column_from_table_alembic('clusters', 'num_failures')
|
|
56
|
+
db_utils.drop_column_from_table_alembic('clusters', 'configs')
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def downgrade() -> None:
|
|
60
|
+
"""Downgrade schema."""
|
|
61
|
+
pass
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Columns for whether the cluster is managed.
|
|
2
|
+
|
|
3
|
+
Revision ID: 004
|
|
4
|
+
Revises: 003
|
|
5
|
+
Create Date: 2025-08-07
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
from typing import Sequence, Union
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
|
|
14
|
+
from sky.utils.db import db_utils
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '004'
|
|
18
|
+
down_revision: Union[str, Sequence[str], None] = '003'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade():
|
|
24
|
+
"""Add columns for whether the cluster is managed."""
|
|
25
|
+
with op.get_context().autocommit_block():
|
|
26
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
27
|
+
'is_managed',
|
|
28
|
+
sa.Integer(),
|
|
29
|
+
server_default='0')
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def downgrade():
|
|
33
|
+
"""Remove columns for whether the cluster is managed."""
|
|
34
|
+
pass
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Initial schema for sky serve state database with backwards compatibility
|
|
2
|
+
|
|
3
|
+
Revision ID: 001
|
|
4
|
+
Revises:
|
|
5
|
+
Create Date: 2024-01-01 12:00:00.000000
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
import json
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
|
|
14
|
+
from sky.serve import constants
|
|
15
|
+
from sky.serve.serve_state import Base
|
|
16
|
+
from sky.utils.db import db_utils
|
|
17
|
+
|
|
18
|
+
# revision identifiers, used by Alembic.
|
|
19
|
+
revision = '001'
|
|
20
|
+
down_revision = None
|
|
21
|
+
branch_labels = None
|
|
22
|
+
depends_on = None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def upgrade():
|
|
26
|
+
"""Create initial schema and add all backwards compatibility columns"""
|
|
27
|
+
with op.get_context().autocommit_block():
|
|
28
|
+
# Create all tables with their current schema
|
|
29
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
|
|
30
|
+
|
|
31
|
+
# Add backwards compatibility columns using helper function that matches
|
|
32
|
+
# original add_column_to_table_sqlalchemy behavior exactly
|
|
33
|
+
db_utils.add_column_to_table_alembic('services',
|
|
34
|
+
'requested_resources_str',
|
|
35
|
+
sa.Text())
|
|
36
|
+
db_utils.add_column_to_table_alembic(
|
|
37
|
+
'services',
|
|
38
|
+
'current_version',
|
|
39
|
+
sa.Integer(),
|
|
40
|
+
server_default=f'{constants.INITIAL_VERSION}')
|
|
41
|
+
db_utils.add_column_to_table_alembic('services',
|
|
42
|
+
'active_versions',
|
|
43
|
+
sa.Text(),
|
|
44
|
+
server_default=json.dumps([]))
|
|
45
|
+
db_utils.add_column_to_table_alembic('services',
|
|
46
|
+
'load_balancing_policy', sa.Text())
|
|
47
|
+
db_utils.add_column_to_table_alembic('services',
|
|
48
|
+
'tls_encrypted',
|
|
49
|
+
sa.Integer(),
|
|
50
|
+
server_default='0')
|
|
51
|
+
db_utils.add_column_to_table_alembic('services',
|
|
52
|
+
'pool',
|
|
53
|
+
sa.Integer(),
|
|
54
|
+
server_default='0')
|
|
55
|
+
db_utils.add_column_to_table_alembic(
|
|
56
|
+
'services',
|
|
57
|
+
'controller_pid',
|
|
58
|
+
sa.Integer(),
|
|
59
|
+
value_to_replace_existing_entries=-1)
|
|
60
|
+
db_utils.add_column_to_table_alembic('services', 'hash', sa.Text())
|
|
61
|
+
db_utils.add_column_to_table_alembic('services', 'entrypoint',
|
|
62
|
+
sa.Text())
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def downgrade():
|
|
66
|
+
"""Drop all tables"""
|
|
67
|
+
Base.metadata.drop_all(bind=op.get_bind())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Adding a hash column for pool.
|
|
2
|
+
|
|
3
|
+
Revision ID: 003
|
|
4
|
+
Revises: 002
|
|
5
|
+
Create Date: 2025-07-18
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
from typing import Sequence, Union
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
|
|
14
|
+
from sky.utils.db import db_utils
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '003'
|
|
18
|
+
down_revision: Union[str, Sequence[str], None] = '002'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade():
|
|
24
|
+
"""Add columns for pool hash."""
|
|
25
|
+
with op.get_context().autocommit_block():
|
|
26
|
+
db_utils.add_column_to_table_alembic('job_info',
|
|
27
|
+
'pool_hash',
|
|
28
|
+
sa.Text(),
|
|
29
|
+
server_default=None)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def downgrade():
|
|
33
|
+
"""Remove columns for pool hash."""
|
|
34
|
+
pass
|
sky/serve/client/impl.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Implementation of SDK for SkyServe."""
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
|
-
from typing import List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
7
|
|
|
@@ -12,6 +12,8 @@ from sky.utils import admin_policy_utils
|
|
|
12
12
|
from sky.utils import dag_utils
|
|
13
13
|
|
|
14
14
|
if typing.TYPE_CHECKING:
|
|
15
|
+
import io
|
|
16
|
+
|
|
15
17
|
import sky
|
|
16
18
|
from sky.serve import serve_utils
|
|
17
19
|
|
|
@@ -23,7 +25,7 @@ def up(
|
|
|
23
25
|
# Internal only:
|
|
24
26
|
# pylint: disable=invalid-name
|
|
25
27
|
_need_confirmation: bool = False
|
|
26
|
-
) -> server_common.RequestId:
|
|
28
|
+
) -> server_common.RequestId[Tuple[str, str]]:
|
|
27
29
|
assert not pool, 'Command `up` is not supported for pool.'
|
|
28
30
|
# Avoid circular import.
|
|
29
31
|
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
|
@@ -67,7 +69,7 @@ def update(
|
|
|
67
69
|
# Internal only:
|
|
68
70
|
# pylint: disable=invalid-name
|
|
69
71
|
_need_confirmation: bool = False
|
|
70
|
-
) -> server_common.RequestId:
|
|
72
|
+
) -> server_common.RequestId[None]:
|
|
71
73
|
assert not pool, 'Command `update` is not supported for pool.'
|
|
72
74
|
# Avoid circular import.
|
|
73
75
|
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
|
@@ -110,7 +112,7 @@ def apply(
|
|
|
110
112
|
# Internal only:
|
|
111
113
|
# pylint: disable=invalid-name
|
|
112
114
|
_need_confirmation: bool = False
|
|
113
|
-
) -> server_common.RequestId:
|
|
115
|
+
) -> server_common.RequestId[None]:
|
|
114
116
|
assert pool, 'Command `apply` is only supported for pool.'
|
|
115
117
|
# Avoid circular import.
|
|
116
118
|
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
|
@@ -151,7 +153,7 @@ def down(
|
|
|
151
153
|
all: bool = False, # pylint: disable=redefined-builtin
|
|
152
154
|
purge: bool = False,
|
|
153
155
|
pool: bool = False,
|
|
154
|
-
) -> server_common.RequestId:
|
|
156
|
+
) -> server_common.RequestId[None]:
|
|
155
157
|
if pool:
|
|
156
158
|
body = payloads.JobsPoolDownBody(
|
|
157
159
|
pool_names=service_names,
|
|
@@ -175,7 +177,7 @@ def down(
|
|
|
175
177
|
def status(
|
|
176
178
|
service_names: Optional[Union[str, List[str]]],
|
|
177
179
|
pool: bool = False,
|
|
178
|
-
) -> server_common.RequestId:
|
|
180
|
+
) -> server_common.RequestId[List[Dict[str, Any]]]:
|
|
179
181
|
if pool:
|
|
180
182
|
body = payloads.JobsPoolStatusBody(pool_names=service_names)
|
|
181
183
|
else:
|
|
@@ -186,3 +188,88 @@ def status(
|
|
|
186
188
|
json=json.loads(body.model_dump_json()),
|
|
187
189
|
timeout=(5, None))
|
|
188
190
|
return server_common.get_request_id(response)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def tail_logs(service_name: str,
|
|
194
|
+
target: Union[str, 'serve_utils.ServiceComponent'],
|
|
195
|
+
replica_id: Optional[int] = None,
|
|
196
|
+
follow: bool = True,
|
|
197
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
|
198
|
+
tail: Optional[int] = None,
|
|
199
|
+
pool: bool = False) -> None:
|
|
200
|
+
# Avoid circular import.
|
|
201
|
+
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
|
202
|
+
|
|
203
|
+
if pool:
|
|
204
|
+
body = payloads.JobsPoolLogsBody(
|
|
205
|
+
pool_name=service_name,
|
|
206
|
+
target=target,
|
|
207
|
+
worker_id=replica_id,
|
|
208
|
+
follow=follow,
|
|
209
|
+
tail=tail,
|
|
210
|
+
)
|
|
211
|
+
else:
|
|
212
|
+
body = payloads.ServeLogsBody(
|
|
213
|
+
service_name=service_name,
|
|
214
|
+
target=target,
|
|
215
|
+
replica_id=replica_id,
|
|
216
|
+
follow=follow,
|
|
217
|
+
tail=tail,
|
|
218
|
+
)
|
|
219
|
+
response = server_common.make_authenticated_request(
|
|
220
|
+
'POST',
|
|
221
|
+
'/jobs/pool_logs' if pool else '/serve/logs',
|
|
222
|
+
json=json.loads(body.model_dump_json()),
|
|
223
|
+
timeout=(5, None),
|
|
224
|
+
stream=True)
|
|
225
|
+
request_id: server_common.RequestId[None] = server_common.get_request_id(
|
|
226
|
+
response)
|
|
227
|
+
return sdk.stream_response(request_id=request_id,
|
|
228
|
+
response=response,
|
|
229
|
+
output_stream=output_stream,
|
|
230
|
+
resumable=True)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def sync_down_logs(service_name: str,
|
|
234
|
+
local_dir: str,
|
|
235
|
+
*,
|
|
236
|
+
targets: Optional[Union[
|
|
237
|
+
str, 'serve_utils.ServiceComponent',
|
|
238
|
+
Sequence[Union[str,
|
|
239
|
+
'serve_utils.ServiceComponent']]]] = None,
|
|
240
|
+
replica_ids: Optional[List[int]] = None,
|
|
241
|
+
tail: Optional[int] = None,
|
|
242
|
+
pool: bool = False) -> None:
|
|
243
|
+
# Avoid circular import.
|
|
244
|
+
from sky.client import sdk # pylint: disable=import-outside-toplevel
|
|
245
|
+
|
|
246
|
+
if pool:
|
|
247
|
+
body = payloads.JobsPoolDownloadLogsBody(
|
|
248
|
+
pool_name=service_name,
|
|
249
|
+
local_dir=local_dir,
|
|
250
|
+
targets=targets,
|
|
251
|
+
worker_ids=replica_ids,
|
|
252
|
+
tail=tail,
|
|
253
|
+
)
|
|
254
|
+
else:
|
|
255
|
+
body = payloads.ServeDownloadLogsBody(
|
|
256
|
+
service_name=service_name,
|
|
257
|
+
# No need to set here, since the server will override it
|
|
258
|
+
# to a directory on the API server.
|
|
259
|
+
local_dir=local_dir,
|
|
260
|
+
targets=targets,
|
|
261
|
+
replica_ids=replica_ids,
|
|
262
|
+
tail=tail,
|
|
263
|
+
)
|
|
264
|
+
response = server_common.make_authenticated_request(
|
|
265
|
+
'POST',
|
|
266
|
+
'/jobs/pool_sync-down-logs' if pool else '/serve/sync-down-logs',
|
|
267
|
+
json=json.loads(body.model_dump_json()),
|
|
268
|
+
timeout=(5, None))
|
|
269
|
+
request_id: server_common.RequestId[str] = server_common.get_request_id(
|
|
270
|
+
response)
|
|
271
|
+
remote_dir = sdk.stream_and_get(request_id)
|
|
272
|
+
|
|
273
|
+
# Download from API server paths to the client's local_dir
|
|
274
|
+
client_common.download_logs_from_api_server([remote_dir], remote_dir,
|
|
275
|
+
local_dir)
|
sky/serve/client/sdk.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
"""SDK for SkyServe."""
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
|
-
from typing import List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
5
5
|
|
|
6
|
-
from sky.client import common as client_common
|
|
7
6
|
from sky.serve.client import impl
|
|
8
7
|
from sky.server import common as server_common
|
|
9
8
|
from sky.server import rest
|
|
@@ -27,7 +26,7 @@ def up(
|
|
|
27
26
|
# Internal only:
|
|
28
27
|
# pylint: disable=invalid-name
|
|
29
28
|
_need_confirmation: bool = False
|
|
30
|
-
) -> server_common.RequestId:
|
|
29
|
+
) -> server_common.RequestId[Tuple[str, str]]:
|
|
31
30
|
"""Spins up a service.
|
|
32
31
|
|
|
33
32
|
Please refer to the sky.cli.serve_up for the document.
|
|
@@ -62,7 +61,7 @@ def update(
|
|
|
62
61
|
# Internal only:
|
|
63
62
|
# pylint: disable=invalid-name
|
|
64
63
|
_need_confirmation: bool = False
|
|
65
|
-
) -> server_common.RequestId:
|
|
64
|
+
) -> server_common.RequestId[None]:
|
|
66
65
|
"""Updates an existing service.
|
|
67
66
|
|
|
68
67
|
Please refer to the sky.cli.serve_update for the document.
|
|
@@ -95,7 +94,7 @@ def down(
|
|
|
95
94
|
service_names: Optional[Union[str, List[str]]],
|
|
96
95
|
all: bool = False, # pylint: disable=redefined-builtin
|
|
97
96
|
purge: bool = False
|
|
98
|
-
) -> server_common.RequestId:
|
|
97
|
+
) -> server_common.RequestId[None]:
|
|
99
98
|
"""Tears down a service.
|
|
100
99
|
|
|
101
100
|
Please refer to the sky.cli.serve_down for the docs.
|
|
@@ -123,7 +122,7 @@ def down(
|
|
|
123
122
|
@usage_lib.entrypoint
|
|
124
123
|
@server_common.check_server_healthy_or_start
|
|
125
124
|
def terminate_replica(service_name: str, replica_id: int,
|
|
126
|
-
purge: bool) -> server_common.RequestId:
|
|
125
|
+
purge: bool) -> server_common.RequestId[None]:
|
|
127
126
|
"""Tears down a specific replica for the given service.
|
|
128
127
|
|
|
129
128
|
Args:
|
|
@@ -157,8 +156,8 @@ def terminate_replica(service_name: str, replica_id: int,
|
|
|
157
156
|
@usage_lib.entrypoint
|
|
158
157
|
@server_common.check_server_healthy_or_start
|
|
159
158
|
def status(
|
|
160
|
-
|
|
161
|
-
|
|
159
|
+
service_names: Optional[Union[str, List[str]]]
|
|
160
|
+
) -> server_common.RequestId[List[Dict[str, Any]]]:
|
|
162
161
|
"""Gets service statuses.
|
|
163
162
|
|
|
164
163
|
If service_names is given, return those services. Otherwise, return all
|
|
@@ -290,27 +289,13 @@ def tail_logs(service_name: str,
|
|
|
290
289
|
sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
|
|
291
290
|
ValueError: arguments not valid, or failed to tail the logs.
|
|
292
291
|
"""
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
follow=follow,
|
|
301
|
-
tail=tail,
|
|
302
|
-
)
|
|
303
|
-
response = server_common.make_authenticated_request(
|
|
304
|
-
'POST',
|
|
305
|
-
'/serve/logs',
|
|
306
|
-
json=json.loads(body.model_dump_json()),
|
|
307
|
-
timeout=(5, None),
|
|
308
|
-
stream=True)
|
|
309
|
-
request_id = server_common.get_request_id(response)
|
|
310
|
-
return sdk.stream_response(request_id=request_id,
|
|
311
|
-
response=response,
|
|
312
|
-
output_stream=output_stream,
|
|
313
|
-
resumable=True)
|
|
292
|
+
return impl.tail_logs(service_name,
|
|
293
|
+
target,
|
|
294
|
+
replica_id,
|
|
295
|
+
follow,
|
|
296
|
+
output_stream,
|
|
297
|
+
tail,
|
|
298
|
+
pool=False)
|
|
314
299
|
|
|
315
300
|
|
|
316
301
|
@usage_lib.entrypoint
|
|
@@ -320,8 +305,8 @@ def sync_down_logs(service_name: str,
|
|
|
320
305
|
*,
|
|
321
306
|
targets: Optional[Union[
|
|
322
307
|
str, 'serve_utils.ServiceComponent',
|
|
323
|
-
|
|
324
|
-
|
|
308
|
+
Sequence[Union[str,
|
|
309
|
+
'serve_utils.ServiceComponent']]]] = None,
|
|
325
310
|
replica_ids: Optional[List[int]] = None,
|
|
326
311
|
tail: Optional[int] = None) -> None:
|
|
327
312
|
"""Sync down logs from the service components to a local directory.
|
|
@@ -352,25 +337,9 @@ def sync_down_logs(service_name: str,
|
|
|
352
337
|
sky.exceptions.ClusterNotUpError: If the controller is not up.
|
|
353
338
|
ValueError: Arguments not valid.
|
|
354
339
|
"""
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
# to a directory on the API server.
|
|
362
|
-
local_dir=local_dir,
|
|
363
|
-
targets=targets,
|
|
364
|
-
replica_ids=replica_ids,
|
|
365
|
-
tail=tail,
|
|
366
|
-
)
|
|
367
|
-
response = server_common.make_authenticated_request(
|
|
368
|
-
'POST',
|
|
369
|
-
'/serve/sync-down-logs',
|
|
370
|
-
json=json.loads(body.model_dump_json()),
|
|
371
|
-
timeout=(5, None))
|
|
372
|
-
remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
|
|
373
|
-
|
|
374
|
-
# Download from API server paths to the client's local_dir
|
|
375
|
-
client_common.download_logs_from_api_server([remote_dir], remote_dir,
|
|
376
|
-
local_dir)
|
|
340
|
+
return impl.sync_down_logs(service_name,
|
|
341
|
+
local_dir,
|
|
342
|
+
targets=targets,
|
|
343
|
+
replica_ids=replica_ids,
|
|
344
|
+
tail=tail,
|
|
345
|
+
pool=False)
|
sky/serve/constants.py
CHANGED
|
@@ -106,7 +106,8 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
|
|
|
106
106
|
# v2.0 - Added template-replica feature.
|
|
107
107
|
# v3.0 - Added cluster pool.
|
|
108
108
|
# v4.0 - Added pool argument to wait_service_registration.
|
|
109
|
-
|
|
109
|
+
# v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
|
|
110
|
+
SERVE_VERSION = 5
|
|
110
111
|
|
|
111
112
|
TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
|
|
112
113
|
'The version of service is outdated and does not support manually '
|
sky/serve/controller.py
CHANGED
|
@@ -27,11 +27,12 @@ from sky.utils import ux_utils
|
|
|
27
27
|
logger = sky_logging.init_logger(__name__)
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
class
|
|
30
|
+
class AutoscalerInfoFilter(logging.Filter):
|
|
31
31
|
|
|
32
32
|
def filter(self, record: logging.LogRecord) -> bool:
|
|
33
33
|
message = record.getMessage()
|
|
34
|
-
return not ('GET' in message and '200' in message
|
|
34
|
+
return not ('GET' in message and '200' in message and
|
|
35
|
+
'/autoscaler/info' in message)
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
class SkyServeController:
|
|
@@ -61,6 +62,7 @@ class SkyServeController:
|
|
|
61
62
|
uvicorn_access_logger = logging.getLogger('uvicorn.access')
|
|
62
63
|
for handler in uvicorn_access_logger.handlers:
|
|
63
64
|
handler.setFormatter(sky_logging.FORMATTER)
|
|
65
|
+
handler.addFilter(AutoscalerInfoFilter())
|
|
64
66
|
yield
|
|
65
67
|
|
|
66
68
|
def _run_autoscaler(self):
|