skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +6 -2
- sky/backends/cloud_vm_ray_backend.py +13 -4
- sky/client/cli/command.py +22 -8
- sky/client/sdk.py +50 -0
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +13 -10
- sky/global_user_state.py +128 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -8
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +44 -2
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +99 -98
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/schemas/db/global_user_state/006_provision_log.py
ADDED
@@ -0,0 +1,41 @@
+"""Add provision_log_path to clusters and cluster_history.
+
+Revision ID: 006
+Revises: 005
+Create Date: 2025-08-12
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '006'
+down_revision: Union[str, Sequence[str], None] = '005'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add provision_log_path columns."""
+    with op.get_context().autocommit_block():
+        # clusters.provision_log_path
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+        # cluster_history.provision_log_path
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
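Note: for readers unfamiliar with SkyPilot's db_utils.add_column_to_table_alembic helper, a rough plain-Alembic equivalent of the upgrade above is sketched below. This is an assumption-laden illustration, not the released code; the real helper may add existence checks that are omitted here.

# Hypothetical plain-Alembic sketch of the same migration; the released
# package uses db_utils.add_column_to_table_alembic instead.
from alembic import op
import sqlalchemy as sa


def upgrade():
    for table in ('clusters', 'cluster_history'):
        op.add_column(
            table,
            sa.Column('provision_log_path', sa.Text(), server_default=None))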
sky/serve/replica_managers.py
CHANGED
@@ -1,7 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
-import collections
 import dataclasses
-import enum
 import functools
 import multiprocessing
 from multiprocessing import pool as mp_pool
@@ -199,6 +197,12 @@ def _should_use_spot(service_task_yaml_path: str,
     return len(spot_use_resources) == len(task.resources)


+# Every function that calls serve_state.add_or_update_replica should acquire
+# this lock. It is to prevent race condition when the replica status is updated
+# by multiple threads at the same time. The modification of replica info is
+# 2 database calls: read the whole replica info object, unpickle it, and modify
+# corresponding fields. Then it is write back to the database. We need to ensure
+# the read-modify-write operation is atomic.
 def with_lock(func):

     @functools.wraps(func)
@@ -209,22 +213,6 @@ def with_lock(func):
     return wrapper


-class ProcessStatus(enum.Enum):
-    """Process status."""
-
-    # The process is running
-    RUNNING = 'RUNNING'
-
-    # The process is finished and succeeded
-    SUCCEEDED = 'SUCCEEDED'
-
-    # The process is interrupted
-    INTERRUPTED = 'INTERRUPTED'
-
-    # The process failed
-    FAILED = 'FAILED'
-
-
 @dataclasses.dataclass
 class ReplicaStatusProperty:
     """Some properties that determine replica status.
@@ -236,15 +224,16 @@ class ReplicaStatusProperty:
         first_ready_time: The first time the service is ready.
         sky_down_status: Process status of sky.down.
     """
-    #
-    sky_launch_status:
+    # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+    sky_launch_status: common_utils.ProcessStatus = (
+        common_utils.ProcessStatus.SCHEDULED)
     user_app_failed: bool = False
     service_ready_now: bool = False
     # None means readiness probe is not succeeded yet;
     # -1 means the initial delay seconds is exceeded.
     first_ready_time: Optional[float] = None
     # None means sky.down is not called yet.
-    sky_down_status: Optional[ProcessStatus] = None
+    sky_down_status: Optional[common_utils.ProcessStatus] = None
     # Whether the termination is caused by autoscaler's decision
     is_scale_down: bool = False
     # The replica's spot instance was preempted.
@@ -299,7 +288,7 @@ class ReplicaStatusProperty:
        (1) Job status;
        (2) Readiness probe.
        """
-        if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+        if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
            return False
        if self.sky_down_status is not None:
            return False
@@ -313,37 +302,43 @@ class ReplicaStatusProperty:

    def to_replica_status(self) -> serve_state.ReplicaStatus:
        """Convert status property to human-readable replica status."""
-
+        # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+        # we use None to represent sky.launch is not called yet.
+        if (self.sky_launch_status is None or
+                self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
            # Pending to launch
            return serve_state.ReplicaStatus.PENDING
-        if self.sky_launch_status == ProcessStatus.RUNNING:
-            if self.sky_down_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                return serve_state.ReplicaStatus.FAILED_CLEANUP
-            if self.sky_down_status == ProcessStatus.SUCCEEDED:
+            if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                # This indicate it is a scale_down with correct teardown.
                # Should have been cleaned from the replica table.
                return serve_state.ReplicaStatus.UNKNOWN
            # Still launching
            return serve_state.ReplicaStatus.PROVISIONING
-        if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+        if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
            # sky.down is running and a scale down interrupted sky.launch
            return serve_state.ReplicaStatus.SHUTTING_DOWN
        if self.sky_down_status is not None:
            if self.preempted:
                # Replica (spot) is preempted
                return serve_state.ReplicaStatus.PREEMPTED
-            if self.sky_down_status == ProcessStatus.
+            if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                # sky.down is scheduled to run, but not started yet.
+                return serve_state.ReplicaStatus.SHUTTING_DOWN
+            if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                # sky.down is running
                return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+            if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_down_status == ProcessStatus.FAILED:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                # sky.down failed
                return serve_state.ReplicaStatus.FAILED_CLEANUP
        if self.user_app_failed:
            # Failed on user setup/run
            return serve_state.ReplicaStatus.FAILED
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
            # sky.launch failed
            return serve_state.ReplicaStatus.FAILED_PROVISION
        if self.first_ready_time is None:
@@ -359,7 +354,7 @@ class ReplicaStatusProperty:
            # This indicate it is a scale_down with correct teardown.
            # Should have been cleaned from the replica table.
            return serve_state.ReplicaStatus.UNKNOWN
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
            # sky.launch failed
            # The down process has not been started if it reaches here,
            # due to the `if self.sky_down_status is not None`` check above.
@@ -703,6 +698,7 @@ class SkyPilotReplicaManager(ReplicaManager):

        self._recover_replica_operations()

+    @with_lock
    def _recover_replica_operations(self):
        """Let's see are there something to do for ReplicaManager in a
        recovery run"""
@@ -743,9 +739,8 @@ class SkyPilotReplicaManager(ReplicaManager):
    # Replica management functions #
    ################################

-    #
-    #
-    @with_lock
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _launch_replica(
        self,
        replica_id: int,
@@ -801,11 +796,61 @@ class SkyPilotReplicaManager(ReplicaManager):
        # to avoid too many sky.launch running at the same time.
        self._launch_process_pool[replica_id] = p

+    @with_lock
    def scale_up(self,
                 resources_override: Optional[Dict[str, Any]] = None) -> None:
        self._launch_replica(self._next_replica_id, resources_override)
        self._next_replica_id += 1

+    def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
+        if exitcode != 0:
+            logger.error(f'Down process for replica {info.replica_id} '
+                         f'exited abnormally with code {exitcode}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # Failed replica still count as a replica. In our current design, we
+        # want to fail early if user code have any error. This will prevent
+        # infinite loop of teardown and re-provision. However, there is a
+        # special case that if the replica is UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restart replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep failed record for version mismatch replicas,
+        # since user should fixed the error before update.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+        serve_state.add_or_update_replica(self._service_name,
+                                          info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _terminate_replica(self,
                           replica_id: int,
                           sync_down_logs: bool,
@@ -823,7 +868,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            info = serve_state.get_replica_info_from_id(self._service_name,
                                                        replica_id)
            assert info is not None
-            info.status_property.sky_launch_status =
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
            serve_state.add_or_update_replica(self._service_name, replica_id,
                                              info)
            launch_process = self._launch_process_pool[replica_id]
@@ -895,18 +941,30 @@ class SkyPilotReplicaManager(ReplicaManager):

        logger.info(f'preempted: {info.status_property.preempted}, '
                    f'replica_id: {replica_id}')
+        info.status_property.is_scale_down = is_scale_down
+        info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # exists (e.g., the cluster is scaled down before it gets a chance to
+        # provision) or the cluster is preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down process to save
+        # controller resources.
+        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+            self._handle_sky_down_finish(info, exitcode=0)
+            return
+
+        # Otherwise, start the process to terminate the cluster.
        p = multiprocessing.Process(
            target=ux_utils.RedirectOutputForProcess(terminate_cluster,
                                                     log_file_name, 'a').run,
            args=(info.cluster_name, replica_drain_delay_seconds),
        )
-        info.status_property.sky_down_status =
-
-        info.status_property.purged = purge
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
        serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        p.start()
        self._down_process_pool[replica_id] = p

+    @with_lock
    def scale_down(self, replica_id: int, purge: bool = False) -> None:
        self._terminate_replica(
            replica_id,
@@ -915,6 +973,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            is_scale_down=True,
            purge=purge)

+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _handle_preemption(self, info: ReplicaInfo) -> bool:
        """Handle preemption of the replica if any error happened.

@@ -990,7 +1050,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                if controller_utils.can_provision():
                    p.start()
                    info.status_property.sky_launch_status = (
-                        ProcessStatus.RUNNING)
+                        common_utils.ProcessStatus.RUNNING)
            else:
                # sky.launch finished
                # TODO(tian): Try-catch in process, and have an enum return
@@ -1007,11 +1067,11 @@ class SkyPilotReplicaManager(ReplicaManager):
                        f'exited abnormally with code {p.exitcode}.'
                        ' Terminating...')
                    info.status_property.sky_launch_status = (
-                        ProcessStatus.FAILED)
+                        common_utils.ProcessStatus.FAILED)
                    error_in_sky_launch = True
                else:
                    info.status_property.sky_launch_status = (
-                        ProcessStatus.SUCCEEDED)
+                        common_utils.ProcessStatus.SUCCEEDED)
                    schedule_next_jobs = True
                    if self._spot_placer is not None and info.is_spot:
                        # TODO(tian): Currently, we set the location to
@@ -1033,8 +1093,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
                if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs(
-                        pool=self._service_name)
+                    jobs_scheduler.maybe_schedule_next_jobs()
                if error_in_sky_launch:
                    # Teardown after update replica info since
                    # _terminate_replica will update the replica info too.
@@ -1045,59 +1104,25 @@ class SkyPilotReplicaManager(ReplicaManager):
            jobs_scheduler.maybe_schedule_next_jobs()
        down_process_pool_snapshot = list(self._down_process_pool.items())
        for replica_id, p in down_process_pool_snapshot:
-            if
-
-
-
-
-
-
-
-
-
-                info.status_property.sky_down_status = (
-                    ProcessStatus.FAILED)
-            else:
+            if p.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
+                                                        replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate():
+                    p.start()
                    info.status_property.sky_down_status = (
-                        ProcessStatus.
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                        common_utils.ProcessStatus.RUNNING)
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
-
-
-
-
+            else:
+                logger.info(
+                    f'Terminate process for replica {replica_id} finished.')
+                del self._down_process_pool[replica_id]
+                self._handle_sky_down_finish(info, exitcode=p.exitcode)

        # Clean old version
        replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1393,12 +1418,9 @@ class SkyPilotReplicaManager(ReplicaManager):
            old_config_any_of = old_config.get('resources',
                                               {}).pop('any_of', [])

-
-
-
-
-            if (normalize_dict_list(old_config_any_of) !=
-                    normalize_dict_list(new_config_any_of)):
+            if (resources_utils.normalize_any_of_resources_config(
+                    old_config_any_of) != resources_utils.
+                    normalize_any_of_resources_config(new_config_any_of)):
                logger.info('Replica config changed (any_of), skipping. '
                            f'old: {old_config_any_of}, '
                            f'new: {new_config_any_of}')
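Note: the comment added above with_lock states the invariant this diff enforces: any read-unpickle-modify-write of a replica record must happen under a single lock. The standalone sketch below only illustrates that pattern; the storage dict and update_status are hypothetical stand-ins, not SkyPilot's actual implementation.

# Illustrative sketch of the read-modify-write locking described above.
# '_db' and 'update_status' are hypothetical, not SkyPilot APIs.
import functools
import pickle
import threading

_replica_lock = threading.Lock()
_db = {1: pickle.dumps({'status': 'PENDING'})}  # replica_id -> pickled info


def with_lock(func):

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with _replica_lock:
            return func(*args, **kwargs)

    return wrapper


@with_lock
def update_status(replica_id: int, new_status: str) -> None:
    # Read the whole pickled object, change one field, write it back.
    # Without the lock, two concurrent updates could drop each other's write.
    info = pickle.loads(_db[replica_id])
    info['status'] = new_status
    _db[replica_id] = pickle.dumps(info)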
sky/serve/serve_state.py
CHANGED
@@ -670,6 +670,38 @@ def total_number_provisioning_replicas() -> int:
    return provisioning_count


+@init_db
+def total_number_terminating_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.RUNNING):
+                terminating_count += 1
+        return terminating_count
+
+
+@init_db
+def total_number_scheduled_to_terminate_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                terminating_count += 1
+        return terminating_count
+
+
 def get_replicas_at_status(
        service_name: str,
        status: ReplicaStatus,
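Note: both new counters follow the same scan-and-unpickle pattern and differ only in the ProcessStatus they compare against. A hypothetical shared helper (not part of the package; replicas_table is assumed to be the module-level table defined in serve_state.py) would make the pattern explicit:

# Hypothetical refactor of the two counters above into one parameterized scan.
import pickle

import sqlalchemy


def _count_replicas_with_down_status(session, target_status) -> int:
    rows = session.execute(
        sqlalchemy.select(replicas_table.c.replica_info)).fetchall()
    count = 0
    for row in rows:
        replica_info = pickle.loads(row[0])
        if replica_info.status_property.sky_down_status == target_status:
            count += 1
    return count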
sky/serve/serve_utils.py
CHANGED
@@ -63,7 +63,10 @@ _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 # when changing UX as this assumption is used to expand some log files while
 # ignoring others.
 _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
-
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
 _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

 # TODO(tian): Find all existing replica id and print here.
@@ -1114,31 +1117,49 @@ def _process_line(line: str,
            return False
        return cluster_record['status'] == status_lib.ClusterStatus.UP

-
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)

-
-        log_path = provision_log_prompt.group(1)
-        nested_log_path = pathlib.Path(
-            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
-                log_path).resolve()
-
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
        try:
-            with open(
-                #
-                # to avoid any internal bug that causes the launch to fail
-                # while cluster status remains INIT.
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
                yield from log_utils.follow_logs(f,
                                                 should_stop=cluster_is_up,
                                                 stop_on_eof=stop_on_eof,
                                                 idle_timeout_seconds=10)
        except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
            yield line
-
            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                   f'Try to expand log file {
-                   f'
-
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+            return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
        return

    if log_prompt is not None:
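Note: a quick illustration of what the two new hint patterns match. The regexes are copied from the diff above; the log lines and cluster name are invented for the example.

# Small demo of the two hint regexes added above; the input lines are made up.
import re

_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'

api_line = 'To see details: sky api logs -l sky-2025-08-15/provision.log'
cmd_line = 'To see provision logs: sky logs --provision my-cluster'

m = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, api_line)
print(m.group(1))  # sky-2025-08-15/provision.log
m = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN, cmd_line)
print(m.group(1))  # my-cluster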
sky/serve/service.py
CHANGED
@@ -113,6 +113,9 @@ def cleanup_storage(task_yaml: str) -> bool:
    return not failed


+# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
+# because we killed all the processes (controller & replica manager) before
+# calling this function.
 def _cleanup(service_name: str) -> bool:
    """Clean up all service related resources, i.e. replicas and storage."""
    # Cleanup the HA recovery script first as it is possible that some error
@@ -135,28 +138,59 @@ def _cleanup(service_name: str) -> bool:
            continue
        p = multiprocessing.Process(target=replica_managers.terminate_cluster,
                                    args=(info.cluster_name,))
-        p.start()
        info2proc[info] = p
        # Set replica status to `SHUTTING_DOWN`
        info.status_property.sky_launch_status = (
-            replica_managers.ProcessStatus.SUCCEEDED)
+            replica_managers.common_utils.ProcessStatus.SUCCEEDED)
        info.status_property.sky_down_status = (
-            replica_managers.ProcessStatus.
+            replica_managers.common_utils.ProcessStatus.SCHEDULED)
        serve_state.add_or_update_replica(service_name, info.replica_id, info)
-        logger.info(f'
-
-
-
-
-
-
-
-
-
-
-
-
-
+        logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
+
+    def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
+        nonlocal failed
+        # Set replica status to `FAILED_CLEANUP`
+        info.status_property.sky_down_status = (
+            replica_managers.common_utils.ProcessStatus.FAILED)
+        serve_state.add_or_update_replica(service_name, info.replica_id, info)
+        failed = True
+        logger.error(f'Replica {info.replica_id} failed to terminate.')
+
+    # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
+    # TODO(tian): Refactor to use the same logic and code.
+    while info2proc:
+        snapshot = list(info2proc.items())
+        for info, p in snapshot:
+            if p.is_alive():
+                continue
+            if (info.status_property.sky_down_status ==
+                    replica_managers.common_utils.ProcessStatus.SCHEDULED):
+                if controller_utils.can_terminate():
+                    try:
+                        p.start()
+                    except Exception as e:  # pylint: disable=broad-except
+                        _set_to_failed_cleanup(info)
+                        logger.error(f'Failed to start process for replica '
+                                     f'{info.replica_id}: {e}')
+                        del info2proc[info]
+                    else:
+                        info.status_property.sky_down_status = (
+                            common_utils.ProcessStatus.RUNNING)
+                        serve_state.add_or_update_replica(
+                            service_name, info.replica_id, info)
+            else:
+                logger.info('Terminate process for replica '
+                            f'{info.replica_id} finished.')
+                p.join()
+                del info2proc[info]
+                if p.exitcode == 0:
+                    serve_state.remove_replica(service_name, info.replica_id)
+                    logger.info(
+                        f'Replica {info.replica_id} terminated successfully.')
+                else:
+                    _set_to_failed_cleanup(info)
+        time.sleep(3)
+
    versions = serve_state.get_service_versions(service_name)
    serve_state.remove_service_versions(service_name)

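Note: the new cleanup loop is essentially a small polling state machine over worker processes: start a process only when capacity allows, then reap it once it exits. The stripped-down sketch below only shows that shape; can_start() and run_all() are placeholders, not SkyPilot APIs (the real code uses controller_utils.can_terminate and per-replica state updates).

# Minimal sketch of the scheduled -> running -> finished polling loop above.
import multiprocessing
import time

MAX_CONCURRENT = 4


def can_start(running):
    # Stand-in for a capacity check such as controller_utils.can_terminate().
    return len(running) < MAX_CONCURRENT


def run_all(targets):
    scheduled = [multiprocessing.Process(target=t) for t in targets]
    running = []
    while scheduled or running:
        # Reap finished processes first.
        for p in list(running):
            if not p.is_alive():
                p.join()
                running.remove(p)
        # Start more work only while there is capacity.
        while scheduled and can_start(running):
            p = scheduled.pop()
            p.start()
            running.append(p)
        time.sleep(3)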
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 17

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/payloads.py
CHANGED
@@ -497,6 +497,12 @@ class JobsQueueBody(RequestBody):
    skip_finished: bool = False
    all_users: bool = False
    job_ids: Optional[List[int]] = None
+    user_match: Optional[str] = None
+    workspace_match: Optional[str] = None
+    name_match: Optional[str] = None
+    pool_match: Optional[str] = None
+    page: Optional[int] = None
+    limit: Optional[int] = None


 class JobsCancelBody(RequestBody):