skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -24,6 +24,8 @@ from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
+from sky.serve import serve_utils
+from sky.serve.server import impl
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -90,7 +92,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts


-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
+                              num_jobs: Optional[int]) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.

     In normal mode the managed job submission is done in the ray job submission.
@@ -104,18 +107,29 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:

     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
-[12 removed lines not shown in the source diff view]
+    job_ids = []
+    for _ in range(num_jobs if num_jobs is not None else 1):
+        # TODO(tian): We should have a separate name for each job when
+        # submitting multiple jobs. Current blocker is that we are sharing
+        # the same dag object for all jobs. Maybe we can do copy.copy() for
+        # each job and then give it a unique name (e.g. append job id after
+        # the task name). The name of the dag also needs to be aligned with
+        # the task name.
+        consolidation_mode_job_id = (
+            managed_job_state.set_job_info_without_job_id(
+                dag.name,
+                workspace=skypilot_config.get_active_workspace(
+                    force_user_workspace=True),
+                entrypoint=common_utils.get_current_command(),
+                pool=pool))
+        for task_id, task in enumerate(dag.tasks):
+            resources_str = backend_utils.get_task_resources_str(
+                task, is_managed_job=True)
+            managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                          task.name, resources_str,
+                                          task.metadata_json)
+        job_ids.append(consolidation_mode_job_id)
+    return job_ids


 @timeline.event
@@ -123,8 +137,10 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     stream_logs: bool = True,
-) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+) -> Tuple[Optional[Union[int, List[int]]], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Launches a managed job.

@@ -149,6 +165,9 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
+    if pool is not None and not managed_job_utils.is_consolidation_mode():
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -178,8 +197,8 @@
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()

-[1 removed line not shown in the source diff view]
-        dag,
+    user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
+        dag, use_user_specified_yaml=True)

     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
@@ -262,122 +281,159 @@
                 f'Reason: {common_utils.format_exception(e)}')

     local_to_controller_file_mounts = _upload_files_to_controller(dag)
-[13 removed lines not shown in the source diff view]
+    controller = controller_utils.Controllers.JOBS_CONTROLLER
+    controller_name = controller.value.cluster_name
+    prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
+    controller_resources = controller_utils.get_controller_resources(
+        controller=controller,
+        task_resources=sum([list(t.resources) for t in dag.tasks], []))
+
+    consolidation_mode_job_ids = _maybe_submit_job_locally(
+        prefix, dag, pool, num_jobs)
+
+    # This is only needed for non-consolidation mode. For consolidation
+    # mode, the controller uses the same catalog as API server.
+    modified_catalogs = {} if consolidation_mode_job_ids is not None else (
+        service_catalog_common.get_modified_catalog_file_mounts())
+
+    def _submit_one(
+        consolidation_mode_job_id: Optional[int] = None,
+        job_rank: Optional[int] = None,
+    ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+        rank_suffix = '' if job_rank is None else f'-{job_rank}'
         remote_original_user_yaml_path = (
-            f'{prefix}/{dag.name}-{dag_uuid}.original_user_yaml')
-        remote_user_yaml_path =
-[100 removed lines not shown in the source diff view]
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.original_user_yaml')
+        remote_user_yaml_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.yaml')
+        remote_user_config_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.config_yaml')
+        remote_env_file_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.env')
+        with tempfile.NamedTemporaryFile(
+                prefix=f'managed-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as f, tempfile.NamedTemporaryFile(
+                prefix=f'managed-user-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as original_user_yaml_path:
+            original_user_yaml_path.write(user_dag_str_user_specified)
+            original_user_yaml_path.flush()
+            for task_ in dag.tasks:
+                if job_rank is not None:
+                    task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+
+            dag_utils.dump_chain_dag_to_yaml(dag, f.name)
+
+            vars_to_fill = {
+                'remote_original_user_yaml_path':
+                    (remote_original_user_yaml_path),
+                'original_user_dag_path': original_user_yaml_path.name,
+                'remote_user_yaml_path': remote_user_yaml_path,
+                'user_yaml_path': f.name,
+                'local_to_controller_file_mounts':
+                    (local_to_controller_file_mounts),
+                'jobs_controller': controller_name,
+                # Note: actual cluster name will be <task.name>-<managed job ID>
+                'dag_name': dag.name,
+                'remote_user_config_path': remote_user_config_path,
+                'remote_env_file_path': remote_env_file_path,
+                'modified_catalogs': modified_catalogs,
+                'priority': priority,
+                'consolidation_mode_job_id': consolidation_mode_job_id,
+                'pool': pool,
+                **controller_utils.shared_controller_vars_to_fill(
+                    controller,
+                    remote_user_config_path=remote_user_config_path,
+                    # TODO(aylei): the mutated config will not be updated
+                    # afterwards without recreate the controller. Need to
+                    # revisit this.
+                    local_user_config=mutated_user_config,
+                ),
+            }
+
+            yaml_path = os.path.join(
+                managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
+            common_utils.fill_template(
+                managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
+                vars_to_fill,
+                output_path=yaml_path)
+            controller_task = task_lib.Task.from_yaml(yaml_path)
+            controller_task.set_resources(controller_resources)
+
+            controller_task.managed_job_dag = dag
+            # pylint: disable=protected-access
+            controller_task._metadata = metadata
+
+            job_identity = ''
+            if consolidation_mode_job_id is not None:
+                job_identity = f' (Job ID: {consolidation_mode_job_id})'
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        f'Launching managed job {dag.name!r}{job_identity} '
+                        f'from jobs controller...{colorama.Style.RESET_ALL}')
+
+            # Launch with the api server's user hash, so that sky status does
+            # not show the owner of the controller as whatever user launched
+            # it first.
+            with common.with_server_user():
+                # Always launch the controller in the default workspace.
+                with skypilot_config.local_active_workspace_ctx(
+                        skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+                    # TODO(zhwu): the buckets need to be correctly handled for
+                    # a specific workspace. For example, if a job is launched in
+                    # workspace A, but the controller is in workspace B, the
+                    # intermediate bucket and newly created bucket should be in
+                    # workspace A.
+                    if consolidation_mode_job_id is None:
+                        return execution.launch(task=controller_task,
+                                                cluster_name=controller_name,
+                                                stream_logs=stream_logs,
+                                                retry_until_up=True,
+                                                fast=True,
+                                                _disable_controller_check=True)
+                    # Manually launch the scheduler in consolidation mode.
+                    local_handle = backend_utils.is_controller_accessible(
+                        controller=controller, stopped_message='')
+                    backend = backend_utils.get_backend_from_handle(
+                        local_handle)
+                    assert isinstance(backend, backends.CloudVmRayBackend)
+                    with sky_logging.silent():
+                        backend.sync_file_mounts(
+                            handle=local_handle,
+                            all_file_mounts=controller_task.file_mounts,
+                            storage_mounts=controller_task.storage_mounts)
+                    run_script = controller_task.run
+                    assert isinstance(run_script, str)
+                    # Manually add the env variables to the run script.
+                    # Originally this is done in ray jobs submission but now we
+                    # have to do it manually because there is no ray runtime on
+                    # the API server.
+                    env_cmds = [
+                        f'export {k}={v!r}'
+                        for k, v in controller_task.envs.items()
+                    ]
+                    run_script = '\n'.join(env_cmds + [run_script])
+                    # Dump script for high availability recovery.
+                    if controller_utils.high_availability_specified(
+                            controller_name):
+                        managed_job_state.set_ha_recovery_script(
+                            consolidation_mode_job_id, run_script)
+                    backend.run_on_head(local_handle, run_script)
+                    return consolidation_mode_job_id, local_handle
+
+    if consolidation_mode_job_ids is None:
+        return _submit_one()
+    if pool is None:
+        assert len(consolidation_mode_job_ids) == 1
+        return _submit_one(consolidation_mode_job_ids[0])
+    ids = []
+    all_handle = None
+    for job_rank, job_id in enumerate(consolidation_mode_job_ids):
+        jid, handle = _submit_one(job_id, job_rank)
+        assert jid is not None, (job_id, handle)
+        ids.append(jid)
+        all_handle = handle
+    return ids, all_handle


 def queue_from_kubernetes_pod(
@@ -590,7 +646,8 @@ def queue(refresh: bool,
 def cancel(name: Optional[str] = None,
            job_ids: Optional[List[int]] = None,
            all: bool = False,
-           all_users: bool = False) -> None:
+           all_users: bool = False,
+           pool: Optional[str] = None) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancels managed jobs.

@@ -608,15 +665,19 @@
             stopped_message='All managed jobs should have finished.')

     job_id_str = ','.join(map(str, job_ids))
-    if sum([
+    if sum([
+            bool(job_ids), name is not None, pool is not None, all or
+            all_users
+    ]) != 1:
         arguments = []
         arguments += [f'job_ids={job_id_str}'] if job_ids else []
         arguments += [f'name={name}'] if name is not None else []
+        arguments += [f'pool={pool}'] if pool is not None else []
         arguments += ['all'] if all else []
         arguments += ['all_users'] if all_users else []
         with ux_utils.print_exception_no_traceback():
             raise ValueError(
-                'Can only specify one of JOB_IDS, name, or all/'
+                'Can only specify one of JOB_IDS, name, pool, or all/'
                 f'all_users. Provided {" ".join(arguments)!r}.')

     backend = backend_utils.get_backend_from_handle(handle)
@@ -629,9 +690,11 @@
     elif job_ids:
         code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
            job_ids)
-[1 removed line not shown in the source diff view]
-        assert name is not None, (job_ids, name, all)
+    elif name is not None:
         code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
+    else:
+        assert pool is not None, (job_ids, name, pool, all)
+        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(pool)
     # The stderr is redirected to stdout
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
@@ -751,3 +814,32 @@ def download_logs(
         job_name=name,
         controller=controller,
         local_dir=local_dir)
+
+
+@usage_lib.entrypoint
+def pool_apply(
+    task: 'sky.Task',
+    pool_name: str,
+    mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+) -> None:
+    """Apply a config to a pool."""
+    return impl.apply(task, pool_name, mode, pool=True)
+
+
+@usage_lib.entrypoint
+# pylint: disable=redefined-builtin
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]] = None,
+    all: bool = False,
+    purge: bool = False,
+) -> None:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+def pool_status(
+    pool_names: Optional[Union[str,
+                               List[str]]] = None,) -> List[Dict[str, Any]]:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
sky/jobs/server/server.py
CHANGED
@@ -106,3 +106,43 @@ async def download_logs(
         if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
+
+
+@router.post('/pool_apply')
+async def pool_apply(request: fastapi.Request,
+                     jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='jobs.pool_apply',
+        request_body=jobs_pool_apply_body,
+        func=core.pool_apply,
+        schedule_type=api_requests.ScheduleType.LONG,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )
+
+
+@router.post('/pool_down')
+async def pool_down(request: fastapi.Request,
+                    jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='jobs.pool_down',
+        request_body=jobs_pool_down_body,
+        func=core.pool_down,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )
+
+
+@router.post('/pool_status')
+async def pool_status(
+        request: fastapi.Request,
+        jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='jobs.pool_status',
+        request_body=jobs_pool_status_body,
+        func=core.pool_status,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )