skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -60
- sky/client/common.py +12 -9
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +164 -31
- sky/jobs/utils.py +144 -68
- sky/logs/aws.py +4 -2
- sky/provision/kubernetes/utils.py +6 -4
- sky/provision/nebius/constants.py +3 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/py.typed +0 -0
- sky/resources.py +24 -14
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +6 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
- sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -11,6 +11,7 @@ import json
 import os
 import pickle
 import re
+import threading
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.GLOBAL_USER_STATE_VERSION)


+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE

     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('state')

-
-
+        # run migrations if needed
+        create_table(engine)

-
-
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE


 def _init_db(func):
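The new _SQLALCHEMY_ENGINE_LOCK turns engine initialization into a double-checked locking singleton: the fast path returns the cached engine without locking, and the lock plus re-check guarantees only one thread ever constructs the engine. A minimal standalone sketch of the pattern (illustrative names, not SkyPilot's API):

import threading
from typing import Optional

_ENGINE: Optional[object] = None
_ENGINE_LOCK = threading.Lock()


def _create_engine() -> object:
    # Stand-in for the expensive, one-time engine construction.
    return object()


def get_engine() -> object:
    """Return a process-wide singleton, created at most once across threads."""
    global _ENGINE
    if _ENGINE is not None:  # Fast path: no locking once initialized.
        return _ENGINE
    with _ENGINE_LOCK:
        # Re-check inside the lock: another thread may have initialized
        # the engine while we were waiting to acquire it.
        if _ENGINE is not None:
            return _ENGINE
        _ENGINE = _create_engine()  # Only one thread ever reaches this line.
        return _ENGINE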
sky/jobs/__init__.py
CHANGED
@@ -5,6 +5,9 @@ from sky.jobs.client.sdk import cancel
 from sky.jobs.client.sdk import dashboard
 from sky.jobs.client.sdk import download_logs
 from sky.jobs.client.sdk import launch
+from sky.jobs.client.sdk import pool_apply
+from sky.jobs.client.sdk import pool_down
+from sky.jobs.client.sdk import pool_status
 from sky.jobs.client.sdk import queue
 from sky.jobs.client.sdk import tail_logs
 from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
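With these exports, sky.jobs now exposes the pool lifecycle alongside launch. A hedged usage sketch based only on the signatures shown in this diff; the YAML path and pool name are hypothetical, and UpdateMode.ROLLING is assumed to be a valid member of serve_utils.UpdateMode:

import sky
from sky import jobs
from sky.client import sdk
from sky.serve import serve_utils

task = sky.Task.from_yaml('train.yaml')  # hypothetical task definition

# Create or update a pool of clusters named 'my-pool'.
request_id = jobs.pool_apply(task, 'my-pool',
                             mode=serve_utils.UpdateMode.ROLLING)
sdk.stream_and_get(request_id)

# Submit several managed jobs onto the pool instead of fresh clusters.
jobs.launch(task, name='sweep', pool='my-pool', num_jobs=4)

# Inspect the pool, then tear it down when done.
print(sdk.get(jobs.pool_status('my-pool')))
sdk.stream_and_get(jobs.pool_down('my-pool'))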
sky/jobs/client/sdk.py
CHANGED
@@ -9,8 +9,10 @@ import click
 from sky import sky_logging
 from sky.client import common as client_common
 from sky.client import sdk
+from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest
+from sky.server import versions
 from sky.server.requests import payloads
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -23,6 +25,7 @@ if typing.TYPE_CHECKING:
     import io

     import sky
+    from sky.serve import serve_utils

 logger = sky_logging.init_logger(__name__)

@@ -33,6 +36,8 @@ logger = sky_logging.init_logger(__name__)
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     # Internal only:
     # pylint: disable=invalid-name
     _need_confirmation: bool = False,
@@ -61,15 +66,35 @@ def launch(
             chain dag.
         sky.exceptions.NotSupportedError: the feature is not supported.
     """
+    remote_api_version = versions.get_remote_api_version()
+    if (pool is not None and
+            (remote_api_version is None or remote_api_version < 12)):
+        raise click.UsageError('Pools are not supported in your API server. '
+                               'Please upgrade to a newer API server to use '
+                               'pools.')
+    if pool is None and num_jobs is not None:
+        raise click.UsageError('Cannot specify num_jobs without pool.')

     dag = dag_utils.convert_entrypoint_to_dag(task)
     with admin_policy_utils.apply_and_use_config_in_current_request(
             dag, at_client_side=True) as dag:
         sdk.validate(dag)
         if _need_confirmation:
-
-
-
+            job_identity = 'a managed job'
+            if pool is None:
+                request_id = sdk.optimize(dag)
+                sdk.stream_and_get(request_id)
+            else:
+                request_id = pool_status(pool)
+                pool_statuses = sdk.get(request_id)
+                if not pool_statuses:
+                    raise click.UsageError(f'Pool {pool!r} not found.')
+                resources = pool_statuses[0]['requested_resources_str']
+                click.secho(f'Use resources from pool {pool!r}: {resources}.',
+                            fg='green')
+            if num_jobs is not None:
+                job_identity = f'{num_jobs} managed jobs'
+            prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
             if prompt is not None:
                 click.confirm(prompt,
                               default=True,
@@ -81,6 +106,8 @@ def launch(
         body = payloads.JobsLaunchBody(
             task=dag_str,
             name=name,
+            pool=pool,
+            num_jobs=num_jobs,
         )
         response = server_common.make_authenticated_request(
             'POST',
@@ -158,6 +185,7 @@ def cancel(
     job_ids: Optional[List[int]] = None,
     all: bool = False,  # pylint: disable=redefined-builtin
     all_users: bool = False,
+    pool: Optional[str] = None,
 ) -> server_common.RequestId:
     """Cancels managed jobs.

@@ -168,6 +196,7 @@ def cancel(
         job_ids: IDs of the managed jobs to cancel.
         all: Whether to cancel all managed jobs.
         all_users: Whether to cancel all managed jobs from all users.
+        pool: Pool name to cancel.

     Returns:
         The request ID of the cancel request.
@@ -176,11 +205,18 @@ def cancel(
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
         RuntimeError: failed to cancel the job.
     """
+    remote_api_version = versions.get_remote_api_version()
+    if (pool is not None and
+            (remote_api_version is None or remote_api_version < 12)):
+        raise click.UsageError('Pools are not supported in your API server. '
+                               'Please upgrade to a newer API server to use '
+                               'pools.')
     body = payloads.JobsCancelBody(
         name=name,
         job_ids=job_ids,
         all=all,
         all_users=all_users,
+        pool=pool,
     )
     response = server_common.make_authenticated_request(
         'POST',
@@ -327,3 +363,44 @@ def dashboard() -> None:
     url = f'{api_server_url}/jobs/dashboard?{params}'
     logger.info(f'Opening dashboard in browser: {url}')
     webbrowser.open(url)
+
+
+@context.contextual
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_apply(
+    task: Union['sky.Task', 'sky.Dag'],
+    pool_name: str,
+    mode: 'serve_utils.UpdateMode',
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False
+) -> server_common.RequestId:
+    """Apply a config to a pool."""
+    return impl.apply(task,
+                      pool_name,
+                      mode,
+                      pool=True,
+                      _need_confirmation=_need_confirmation)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]],
+    all: bool = False,  # pylint: disable=redefined-builtin
+    purge: bool = False,
+) -> server_common.RequestId:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_status(
+    pool_names: Optional[Union[str, List[str]]],) -> server_common.RequestId:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
sky/jobs/controller.py
CHANGED
@@ -30,6 +30,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,12 +61,13 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""

-    def __init__(self, job_id: int, dag_yaml: str) -> None:
+    def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        self._pool = pool

         # pylint: disable=line-too-long
         # Add a unique identifier to the task environment variables, so that
@@ -99,8 +101,10 @@ class JobsController:
         task.update_envs(task_envs)

     def _download_log_and_stream(
-        self,
-
+        self,
+        task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+        job_id_on_pool_cluster: Optional[int],
     ) -> None:
         """Downloads and streams the logs of the current job with given task ID.

@@ -113,9 +117,14 @@ class JobsController:
                 'Skipping downloading and streaming the logs.')
             return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                            'managed_jobs')
-
-
+                                            'managed_jobs',
+                                            f'job-id-{self._job_id}')
+        log_file = controller_utils.download_and_stream_job_log(
+            self._backend,
+            handle,
+            managed_job_logs_dir,
+            job_ids=[str(job_id_on_pool_cluster)]
+            if job_id_on_pool_cluster is not None else None)
         if log_file is not None:
             # Set the path of the log file for the current task, so it can be
             # accessed even after the job is finished
@@ -123,6 +132,12 @@ class JobsController:
                 log_file)
             logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

+    def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+        if cluster_name is None:
+            return
+        if self._pool is None:
+            managed_job_utils.terminate_cluster(cluster_name)
+
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.

@@ -193,10 +208,14 @@ class JobsController:
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
         assert task.name is not None, task
+        # Set the cluster name to None if the job is submitted
+        # to a pool. This will be updated when we later calls the `launch`
+        # or `recover` function from the strategy executor.
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
+            task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id, task_id)
+            cluster_name, self._backend, task, self._job_id, task_id,
+            self._pool)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -226,6 +245,13 @@ class JobsController:
         if not is_resume:
             remote_job_submitted_at = self._strategy_executor.launch()
             assert remote_job_submitted_at is not None, remote_job_submitted_at
+        if self._pool is None:
+            job_id_on_pool_cluster = None
+        else:
+            # Update the cluster name when using cluster pool.
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(self._job_id))
+            assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

         if not is_resume:
             managed_job_state.set_started(job_id=self._job_id,
@@ -279,7 +305,9 @@ class JobsController:
             if not force_transit_to_recovering:
                 try:
                     job_status = managed_job_utils.get_job_status(
-                        self._backend,
+                        self._backend,
+                        cluster_name,
+                        job_id=job_id_on_pool_cluster)
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
@@ -288,7 +316,7 @@ class JobsController:

             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
@@ -299,6 +327,8 @@ class JobsController:
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
+                    logger.info(f'Downloading logs on cluster {cluster_name} '
+                                f'and job id {job_id_on_pool_cluster}.')
                     clusters = backend_utils.get_clusters(
                         cluster_names=[cluster_name],
                         refresh=common.StatusRefreshMode.NONE,
@@ -307,7 +337,8 @@ class JobsController:
                     assert len(clusters) == 1, (clusters, cluster_name)
                     handle = clusters[0].get('handle')
                     # Best effort to download and stream the logs.
-                    self._download_log_and_stream(task_id, handle)
+                    self._download_log_and_stream(task_id, handle,
+                                                  job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
                     logger.warning(
@@ -316,7 +347,7 @@ class JobsController:
                         exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-
+                self._cleanup_cluster(cluster_name)
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -364,13 +395,14 @@ class JobsController:
                   job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 logger.info(
                     f'The user job failed ({job_status}). Please check the '
                     'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')

-                self._download_log_and_stream(task_id, handle)
+                self._download_log_and_stream(task_id, handle,
+                                              job_id_on_pool_cluster)

                 failure_reason = (
                     'To see the details, run: '
@@ -457,7 +489,7 @@ class JobsController:
            # those clusters again may fail.
            logger.info('Cleaning up the preempted or failed cluster'
                        '...')
-
+            self._cleanup_cluster(cluster_name)

            # Try to recover the managed jobs, when the cluster is preempted or
            # failed or the job status is failed to be fetched.
@@ -467,6 +499,10 @@ class JobsController:
                force_transit_to_recovering=force_transit_to_recovering,
                callback_func=callback_func)
            recovered_time = self._strategy_executor.recover()
+            if self._pool is not None:
+                cluster_name, job_id_on_pool_cluster = (
+                    managed_job_state.get_pool_submit_info(self._job_id))
+                assert cluster_name is not None
            managed_job_state.set_recovered(self._job_id,
                                            task_id,
                                            recovered_time=recovered_time,
@@ -541,11 +577,11 @@ class JobsController:
                             task=self._dag.tasks[task_id]))


-def _run_controller(job_id: int, dag_yaml: str):
+def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
+    jobs_controller = JobsController(job_id, dag_yaml, pool)
     jobs_controller.run()


@@ -577,7 +613,7 @@ def _handle_signal(job_id):
                 f'User sent {user_signal.value} signal.')


-def _cleanup(job_id: int, dag_yaml: str):
+def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Clean up the cluster(s) and storages.

     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -595,9 +631,18 @@ def _cleanup(job_id: int, dag_yaml: str):
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
         assert task.name is not None, task
-
-
-
+        if pool is None:
+            cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+                task.name, job_id)
+            managed_job_utils.terminate_cluster(cluster_name)
+        else:
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(job_id))
+            if cluster_name is not None:
+                if job_id_on_pool_cluster is not None:
+                    core.cancel(cluster_name=cluster_name,
+                                job_ids=[job_id_on_pool_cluster],
+                                _try_cancel_if_cluster_is_init=True)

         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
@@ -629,7 +674,7 @@ def _cleanup(job_id: int, dag_yaml: str):
                     f'Failed to clean up file mount {file_mount}: {e}')


-def start(job_id, dag_yaml):
+def start(job_id, dag_yaml, pool):
     """Start the controller."""
     controller_process = None
     cancelling = False
@@ -643,7 +688,8 @@ def start(job_id, dag_yaml):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                     args=(job_id, dag_yaml))
+                                                     args=(job_id, dag_yaml,
+                                                           pool))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -679,7 +725,7 @@ def start(job_id, dag_yaml):
        # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
        # But anyway, a clean solution is killing the controller process
        # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
+        _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
        logger.info(f'Cluster of managed job {job_id} has been cleaned up.')

        if cancelling:
@@ -717,8 +763,13 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--pool',
+                        required=False,
+                        default=None,
+                        type=str,
+                        help='The pool to use for the controller job.')
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    start(args.job_id, args.dag_yaml, args.pool)