skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -111,6 +111,24 @@ an autogenerated name."""
 # command.
 _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
 _NUM_MANAGED_JOBS_TO_SHOW = 50
+_NUM_REQUESTS_TO_SHOW = 50
+_DEFAULT_REQUEST_FIELDS_TO_SHOW = [
+    'request_id', 'name', 'user_id', 'status', 'created_at'
+]
+_VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
+    'cluster_name'
+]
+_DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+_VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
+    'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
+    'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
+    'failure_reason', 'metadata'
+]
+_USER_NAME_FIELD = ['user_name']
+_USER_HASH_FIELD = ['user_hash']
 
 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
     '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -151,12 +169,17 @@ def _get_cluster_records_and_set_ssh_config(
     # Update the SSH config for all clusters
     for record in cluster_records:
         handle = record['handle']
-
+        name = record['name']
         if not (handle is not None and handle.cached_external_ips is not None
                 and 'credentials' in record):
             # If the cluster is not UP or does not have credentials available,
             # we need to remove the cluster from the SSH config.
-            cluster_utils.SSHConfigHelper.remove_cluster(
+            cluster_utils.SSHConfigHelper.remove_cluster(name)
+            continue
+        if not record['credentials']:
+            # The credential is missing for some reason, continue.
+            logger.debug(
+                f'Client did not receive SSH credential for cluster {name}')
             continue
 
         # During the failover, even though a cluster does not exist, the handle
@@ -1321,14 +1344,22 @@ def exec(
 
 
 def _handle_jobs_queue_request(
-
-
-
-
-
+        request_id: server_common.RequestId[Union[
+            List[responses.ManagedJobRecord],
+            Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
+        show_all: bool,
+        show_user: bool,
+        max_num_jobs_to_show: Optional[int],
+        pool_status_request_id: Optional[server_common.RequestId[List[Dict[
+            str, Any]]]] = None,
+        is_called_by_user: bool = False,
+        only_in_progress: bool = False,
+) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.
 
     Args:
+        request_id: The request ID for managed jobs.
+        pool_status_request_id: The request ID for pool status, or None.
         show_all: Show all information of each job (e.g., region, price).
         show_user: Show the user who submitted the job.
         max_num_jobs_to_show: If not None, limit the number of jobs to show to
@@ -1336,6 +1367,7 @@ def _handle_jobs_queue_request(
            and `sky jobs queue`.
        is_called_by_user: If this function is called by user directly, or an
            internal call.
+        only_in_progress: If True, only return the number of in-progress jobs.
 
     Returns:
        A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
@@ -1346,11 +1378,47 @@ def _handle_jobs_queue_request(
     # TODO(SKY-980): remove unnecessary fallbacks on the client side.
     num_in_progress_jobs = None
     msg = ''
+    status_counts: Optional[Dict[str, int]] = None
+    pool_status_result = None
     try:
         if not is_called_by_user:
             usage_lib.messages.usage.set_internal()
-
-
+        # Call both stream_and_get functions in parallel
+        def get_jobs_queue_result():
+            return sdk.stream_and_get(request_id)
+
+        def get_pool_status_result():
+            if pool_status_request_id is not None:
+                try:
+                    return sdk.stream_and_get(pool_status_request_id)
+                except Exception:  # pylint: disable=broad-except
+                    # If getting pool status fails, just continue without it
+                    return None
+            return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            jobs_future = executor.submit(get_jobs_queue_result)
+            pool_status_future = executor.submit(get_pool_status_result)
+
+            result = jobs_future.result()
+            pool_status_result = pool_status_future.result()
+
+        if isinstance(result, tuple):
+            managed_jobs_, total, status_counts, _ = result
+            if only_in_progress:
+                num_in_progress_jobs = 0
+                if status_counts:
+                    for status_value, count in status_counts.items():
+                        status_enum = managed_jobs.ManagedJobStatus(
+                            status_value)
+                        if not status_enum.is_terminal():
+                            num_in_progress_jobs += count
+            else:
+                num_in_progress_jobs = total
+        else:
+            managed_jobs_ = result
+            num_in_progress_jobs = len(
+                set(job['job_id'] for job in managed_jobs_))
     except exceptions.ClusterNotUpError as e:
         controller_status = e.cluster_status
         msg = str(e)
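When the server returns per-status counts, the in-progress figure above is simply the sum of counts for non-terminal statuses. A self-contained sketch of that reduction, using a hypothetical set of terminal status names in place of SkyPilot's ManagedJobStatus enum:

from typing import Dict

# Hypothetical terminal statuses for illustration only; the hunk above uses
# managed_jobs.ManagedJobStatus(status_value).is_terminal() instead.
_TERMINAL_STATUSES = {'SUCCEEDED', 'FAILED', 'CANCELLED'}

def count_in_progress(status_counts: Dict[str, int]) -> int:
    # Sum the counts of every status that is not terminal.
    return sum(count
               for status, count in status_counts.items()
               if status not in _TERMINAL_STATUSES)

assert count_in_progress({'RUNNING': 3, 'PENDING': 1, 'SUCCEEDED': 5}) == 4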
@@ -1394,10 +1462,14 @@ def _handle_jobs_queue_request(
         msg += ('Failed to query managed jobs: '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        msg = table_utils.format_job_table(
-
-
-
+        msg = table_utils.format_job_table(
+            managed_jobs_,
+            pool_status=pool_status_result,
+            show_all=show_all,
+            show_user=show_user,
+            max_jobs=max_num_jobs_to_show,
+            status_counts=status_counts,
+        )
     return num_in_progress_jobs, msg
 
 
@@ -1786,9 +1858,16 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
 
     # Phase 2: Parallel submission of all API requests
     def submit_managed_jobs():
-
-
-
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+        return managed_jobs.queue(
+            refresh=False,
+            skip_finished=True,
+            all_users=all_users,
+            fields=fields,
+            limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
+        )
 
     def submit_services(
     ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
@@ -1861,7 +1940,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     controllers = []
     for cluster_record in cluster_records:
         cluster_name = cluster_record['name']
-        controller = controller_utils.Controllers.from_name(
+        controller = controller_utils.Controllers.from_name(
+            cluster_name, expect_exact_match=False)
         if controller is not None:
             controllers.append(cluster_record)
         else:
@@ -1890,10 +1970,12 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     try:
         num_in_progress_jobs, msg = _handle_jobs_queue_request(
             managed_jobs_queue_request_id,
+            pool_status_request_id=pool_status_request_id,
             show_all=False,
             show_user=all_users,
             max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
-            is_called_by_user=False
+            is_called_by_user=False,
+            only_in_progress=True)
     except KeyboardInterrupt:
         sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
         managed_jobs_query_interrupted = True
@@ -2027,7 +2109,8 @@ def cost_report(all: bool, days: int): # pylint: disable=redefined-builtin
     for cluster_record in cluster_records:
         cluster_name = cluster_record['name']
         try:
-            controller = controller_utils.Controllers.from_name(
+            controller = controller_utils.Controllers.from_name(
+                cluster_name, expect_exact_match=False)
         except AssertionError:
             # There could be some old controller clusters from previous
             # versions that we should not show in the cost report.
@@ -2136,6 +2219,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
               is_flag=True,
               default=False,
               help='Stream the cluster provisioning logs (provision.log).')
+@click.option('--worker',
+              '-w',
+              default=None,
+              type=int,
+              help='The worker ID to stream the logs from. '
+              'If not set, stream the logs of the head node.')
 @click.option(
     '--sync-down',
     '-s',
@@ -2173,6 +2262,7 @@ def logs(
     cluster: str,
     job_ids: Tuple[str, ...],
     provision: bool,
+    worker: Optional[int],
     sync_down: bool,
     status: bool,  # pylint: disable=redefined-outer-name
     follow: bool,
@@ -2202,6 +2292,13 @@ def logs(
    4. If the job fails or fetching the logs fails, the command will exit with
       a non-zero return code.
    """
+    if worker is not None:
+        if not provision:
+            raise click.UsageError(
+                '--worker can only be used with --provision.')
+        if worker < 1:
+            raise click.UsageError('--worker must be a positive integer.')
+
     if provision and (sync_down or status or job_ids):
         raise click.UsageError(
             '--provision cannot be combined with job log options '
@@ -2221,7 +2318,11 @@ def logs(
 
     if provision:
         # Stream provision logs
-        sys.exit(
+        sys.exit(
+            sdk.tail_provision_logs(cluster_name=cluster,
+                                    worker=worker,
+                                    follow=follow,
+                                    tail=tail))
 
     if sync_down:
         with rich_utils.client_status(
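The same provision-log streaming can be reached from the Python client used by the CLI above. A minimal sketch, assuming a hypothetical cluster name and that the keyword arguments behave as forwarded by the CLI (worker=None streams the head node's provision.log, per the new --worker help text; the CLI passes the return value to sys.exit()):

from sky.client import sdk

# Hypothetical cluster name; mirrors `sky logs my-cluster --provision -w 2`.
# `tail` is assumed to be the trailing-line count forwarded from the CLI.
exit_code = sdk.tail_provision_logs(cluster_name='my-cluster',
                                    worker=2,
                                    follow=False,
                                    tail=100)
print('tail_provision_logs returned:', exit_code)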
@@ -2399,7 +2500,8 @@ def cancel(
                                  job_ids=job_ids_to_cancel)
         _async_call_or_wait(request_id, async_call, 'sky.cancel')
     except exceptions.NotSupportedError as e:
-        controller = controller_utils.Controllers.from_name(
+        controller = controller_utils.Controllers.from_name(
+            cluster, expect_exact_match=False)
         assert controller is not None, cluster
         with ux_utils.print_exception_no_traceback():
             raise click.UsageError(
@@ -2700,7 +2802,8 @@ def start(
         # Get all clusters that are not controllers.
         cluster_records = [
             cluster for cluster in all_clusters
-            if controller_utils.Controllers.from_name(
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
     if cluster_records is None:
         # Get GLOB cluster names
@@ -2762,7 +2865,8 @@ def start(
     # Checks for controller clusters (jobs controller / sky serve controller).
     controllers, normal_clusters = [], []
     for name in to_start:
-        if controller_utils.Controllers.from_name(
+        if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None:
             controllers.append(name)
         else:
             normal_clusters.append(name)
@@ -2898,16 +3002,26 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
            to be torn down (e.g., because it has jobs running or
            it is in init state)
    """
-    controller = controller_utils.Controllers.from_name(
+    controller = controller_utils.Controllers.from_name(
+        controller_name, expect_exact_match=False)
     assert controller is not None, controller_name
 
+    status_counts: Optional[Dict[str, int]] = None
     with rich_utils.client_status(
             '[bold cyan]Checking for in-progress managed jobs and pools[/]'):
         try:
-
-
-
-
+            fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
+            request_id = managed_jobs.queue(
+                refresh=False,
+                skip_finished=True,
+                all_users=True,
+                fields=fields,
+            )
+            result = sdk.stream_and_get(request_id)
+            if isinstance(result, tuple):
+                managed_jobs_, _, status_counts, _ = result
+            else:
+                managed_jobs_ = result
             request_id_pools = managed_jobs.pool_status(pool_names=None)
             pools_ = sdk.stream_and_get(request_id_pools)
         except exceptions.ClusterNotUpError as e:
@@ -2938,10 +3052,17 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
            }}):
            # Check again with the consolidation mode disabled. This is to
            # make sure there is no in-progress managed jobs.
-            request_id = managed_jobs.queue(
-
-
-
+            request_id = managed_jobs.queue(
+                refresh=False,
+                skip_finished=True,
+                all_users=True,
+                fields=fields,
+            )
+            result = sdk.stream_and_get(request_id)
+            if isinstance(result, tuple):
+                managed_jobs_, _, status_counts, _ = result
+            else:
+                managed_jobs_ = result
             request_id_pools = managed_jobs.pool_status(pool_names=None)
             pools_ = sdk.stream_and_get(request_id_pools)
 
@@ -2952,9 +3073,12 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
                'jobs (output of `sky jobs queue`) will be lost.')
        click.echo(msg)
        if managed_jobs_:
-            job_table = table_utils.format_job_table(
-
-
+            job_table = table_utils.format_job_table(
+                managed_jobs_,
+                show_all=False,
+                show_user=True,
+                status_counts=status_counts,
+            )
            msg = controller.value.decline_down_for_dirty_controller_hint
            # Add prefix to each line to align with the bullet point.
            msg += '\n'.join(
@@ -2997,7 +3121,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
            to be torn down (e.g., because it has services running or
            it is in init state)
    """
-    controller = controller_utils.Controllers.from_name(
+    controller = controller_utils.Controllers.from_name(
+        controller_name, expect_exact_match=False)
     assert controller is not None, controller_name
     with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
         try:
@@ -3108,14 +3233,15 @@ def _down_or_stop_clusters(
     names = list(names)
     if names:
         controllers = [
-            name for name in names
-
+            name for name in names if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None
         ]
         controllers_str = ', '.join(map(repr, controllers))
         names = [
             cluster['name']
             for cluster in _get_cluster_records_and_set_ssh_config(names)
-            if controller_utils.Controllers.from_name(
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
 
     # Make sure the controllers are explicitly specified without other
@@ -3140,7 +3266,7 @@ def _down_or_stop_clusters(
                    f'{controllers_str} is currently not supported.')
            else:
                controller = controller_utils.Controllers.from_name(
-                    controller_name)
+                    controller_name, expect_exact_match=False)
                assert controller is not None
                hint_or_raise = _controller_to_hint_or_raise(controller)
                try:
@@ -3188,9 +3314,10 @@ def _down_or_stop_clusters(
        names = [
            record['name']
            for record in all_clusters
-            if controller_utils.Controllers.from_name(
-
-
+            if controller_utils.Controllers.from_name(
+                record['name'], expect_exact_match=False) is None and
+            (down or idle_minutes_to_autostop is not None or
+             record['status'] != status_lib.ClusterStatus.STOPPED)
        ]
 
        clusters = names
@@ -3220,6 +3347,9 @@ def _down_or_stop_clusters(
 
     request_ids = []
 
+    successes: List[str] = []
+    failures: List[Tuple[str, str]] = []
+
     def _down_or_stop(name: str):
         success_progress = False
         if idle_minutes_to_autostop is not None:
@@ -3227,16 +3357,20 @@ def _down_or_stop_clusters(
             request_id = sdk.autostop(name, idle_minutes_to_autostop,
                                       wait_for, down)
             request_ids.append(request_id)
+            progress.stop()
             _async_call_or_wait(
                 request_id, async_call,
                 server_constants.REQUEST_NAME_PREFIX + operation)
-
-
+            progress.start()
+        except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
+                exceptions.CloudError) as e:
             message = str(e)
+            failures.append((name, str(e)))
         else:  # no exception raised
             success_progress = True
             message = (f'{colorama.Fore.GREEN}{operation} '
                        f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
+            successes.append(name)
             if idle_minutes_to_autostop >= 0:
                 option_str = 'down' if down else 'stop'
                 passive_str = 'downed' if down else 'stopped'
@@ -3256,9 +3390,11 @@ def _down_or_stop_clusters(
             else:
                 request_id = sdk.stop(name, purge=purge)
             request_ids.append(request_id)
+            progress.stop()
             _async_call_or_wait(
                 request_id, async_call,
                 server_constants.REQUEST_NAME_PREFIX + operation)
+            progress.start()
             if not async_call:
                 # Remove the cluster from the SSH config file as soon as it
                 # is stopped or downed.
@@ -3268,13 +3404,17 @@ def _down_or_stop_clusters(
                 f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                 f'{colorama.Style.RESET_ALL}'
                 f'\nReason: {common_utils.format_exception(e)}.')
+            failures.append((name, str(e)))
         except (exceptions.NotSupportedError,
-                exceptions.ClusterOwnerIdentityMismatchError
+                exceptions.ClusterOwnerIdentityMismatchError,
+                exceptions.CloudError) as e:
             message = str(e)
+            failures.append((name, str(e)))
         else:  # no exception raised
             message = (
                 f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
                 f'{colorama.Style.RESET_ALL}')
+            successes.append(name)
             if not down:
                 message += ('\n To restart the cluster, run: '
                             f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3288,6 +3428,10 @@ def _down_or_stop_clusters(
     progress.start()
 
     with progress:
+        # we write a new line here to avoid the "Waiting for 'sky.down'
+        # request to be scheduled" message from being printed on the same line
+        # as the "Terminating <num> clusters..." message
+        click.echo('')
         subprocess_utils.run_in_parallel(_down_or_stop, clusters)
         progress.live.transient = False
         # Make sure the progress bar not mess up the terminal.
@@ -3297,6 +3441,31 @@ def _down_or_stop_clusters(
        click.secho(f'{operation} requests are sent. Check the requests\' '
                    'status with `sky request get <request_id>`.')
 
+    show_summary = len(clusters) > 1
+
+    if show_summary:
+        click.echo('\nSummary:')
+        if successes:
+            # Preserve the original order of clusters as provided by user.
+            click.echo('  ✓ Succeeded: ' + ', '.join(successes))
+        if failures:
+            # Format failures: if one failure, keep on same line. If multiple,
+            # indent each failed cluster on its own line for readability.
+            if len(failures) == 1:
+                name, reason = failures[0]
+                first = reason.strip().splitlines()[0]
+                first = first if len(first) <= 120 else first[:120] + '…'
+                click.echo(f'  ✗ Failed: {name} ({first})')
+            else:
+                click.echo('  ✗ Failed:')
+                for name, reason in failures:
+                    first = reason.strip().splitlines()[0]
+                    first = first if len(first) <= 120 else first[:120] + '…'
+                    click.echo(f'    {name} ({first})')
+
+    if failures:
+        click.echo('Cluster(s) failed. See details above.')
+
 
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
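The summary block repeats a small "first line, truncated to 120 characters" transformation for each failure reason. A distilled sketch of that formatting (the guard for an empty reason is added here for safety and is not in the hunk above):

def _first_line(reason: str, max_len: int = 120) -> str:
    # Keep only the first line of a possibly multi-line failure reason and
    # truncate it with an ellipsis beyond max_len characters.
    stripped = reason.strip()
    line = stripped.splitlines()[0] if stripped else ''
    return line if len(line) <= max_len else line[:max_len] + '…'

assert _first_line('Cluster not found.\nTraceback: ...') == 'Cluster not found.'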
@@ -4096,6 +4265,10 @@ def volumes():
     pass
 
 
+# Add 'volume' as an alias for 'volumes'
+cli.add_command(volumes, name='volume')
+
+
 @volumes.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('entrypoint',
@@ -4492,21 +4665,6 @@ def jobs_launch(
 
     job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
                                                int) else job_id_handle[0]
-    if pool:
-        # Display the worker assignment for the jobs.
-        logger.debug(f'Getting service records for pool: {pool}')
-        records_request_id = managed_jobs.pool_status(pool_names=pool)
-        service_records = _async_call_or_wait(records_request_id, async_call,
-                                              'sky.jobs.pool_status')
-        logger.debug(f'Pool status: {service_records}')
-        replica_infos = service_records[0]['replica_info']
-        for replica_info in replica_infos:
-            job_id = replica_info.get('used_by', None)
-            if job_id in job_ids:
-                worker_id = replica_info['replica_id']
-                version = replica_info['version']
-                logger.info(f'Job ID: {job_id} assigned to pool {pool} '
-                            f'(worker: {worker_id}, version: {version})')
 
     if not detach_run:
         if len(job_ids) == 1:
@@ -4519,7 +4677,8 @@ def jobs_launch(
         else:
             # TODO(tian): This can be very long. Considering have a "group id"
             # and query all job ids with the same group id.
-
+            # Sort job ids to ensure consistent ordering.
+            job_ids_str = ','.join(map(str, sorted(job_ids)))
             click.secho(
                 f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
                 f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4538,6 +4697,14 @@ def jobs_launch(
 @jobs.command('queue', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_MANAGED_JOBS_TO_SHOW,
+    type=int,
+    required=False,
+    help=(f'Number of jobs to show, default is {_NUM_MANAGED_JOBS_TO_SHOW},'
+          f' use "-a/--all" to show all jobs.'))
 @click.option(
     '--refresh',
     '-r',
@@ -4557,7 +4724,7 @@ def jobs_launch(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool, all: bool):
+               all_users: bool, all: bool, limit: int):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4608,14 +4775,48 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
 
       watch -n60 sky jobs queue
 
+    (Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
+
+    .. code-block:: bash
+
+      sky jobs queue -l 10
+
     """
     click.secho('Fetching managed job statuses...', fg='cyan')
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
-
-
-
+        max_num_jobs_to_show = (limit if not all else None)
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if verbose:
+            fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+            if verbose:
+                fields = fields + _USER_HASH_FIELD
+        # Call both managed_jobs.queue and managed_jobs.pool_status in parallel
+        def get_managed_jobs_queue():
+            return managed_jobs.queue(refresh=refresh,
+                                      skip_finished=skip_finished,
+                                      all_users=all_users,
+                                      limit=max_num_jobs_to_show,
+                                      fields=fields)
+
+        def get_pool_status():
+            try:
+                return managed_jobs.pool_status(pool_names=None)
+            except Exception:  # pylint: disable=broad-except
+                # If pool_status fails, we'll just skip the worker information
+                return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            managed_jobs_future = executor.submit(get_managed_jobs_queue)
+            pool_status_future = executor.submit(get_pool_status)
+
+            managed_jobs_request_id = managed_jobs_future.result()
+            pool_status_request_id = pool_status_future.result()
+
         num_jobs, msg = _handle_jobs_queue_request(
             managed_jobs_request_id,
+            pool_status_request_id=pool_status_request_id,
             show_all=verbose,
             show_user=all_users,
             max_num_jobs_to_show=max_num_jobs_to_show,
@@ -4632,7 +4833,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
            f'{colorama.Fore.CYAN}'
            f'Only showing the latest {max_num_jobs_to_show} '
            f'managed jobs'
-            f'(use --
+            f'(use --limit to show more managed jobs or '
+            f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -5212,7 +5414,15 @@ def jobs_pool_logs(
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
 def dashboard() -> None:
-    """
+    """Opens the SkyPilot dashboard."""
+    sdk.dashboard()
+
+
+@cli.command(cls=_DocumentedCodeCommand, hidden=True)
+@flags.config_option(expose_value=False)
+@usage_lib.entrypoint
+def ui() -> None:
+    """Opens the SkyPilot dashboard."""
     sdk.dashboard()
 
 
@@ -6120,20 +6330,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
              **_get_shell_complete_args(_complete_api_request))
 @flags.all_option('Cancel all your requests.')
 @flags.all_users_option('Cancel all requests from all users.')
+@flags.yes_option()
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool
+def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
+               yes: bool):
     """Cancel a request running on SkyPilot API server."""
     if all or all_users:
-
-
-
-
-
-
-
-
-
+        if not yes:
+            keyword = 'ALL USERS\'' if all_users else 'YOUR'
+            user_input = click.prompt(
+                f'This will cancel all {keyword} requests.\n'
+                f'To proceed, please type {colorama.Style.BRIGHT}'
+                f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
+                type=str)
+            if user_input != 'cancel all requests':
+                raise click.Abort()
         request_ids = None
     cancelled_request_ids = sdk.get(
         sdk.api_cancel(request_ids=request_ids, all_users=all_users))
@@ -6147,9 +6359,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
             fg='green')
 
 
+class IntOrNone(click.ParamType):
+    """Int or None"""
+    name = 'int-or-none'
+
+    def convert(self, value, param, ctx):
+        if isinstance(value, int):
+            return value
+        if isinstance(value, str) and value.lower() in ('none', 'all'):
+            return None
+        try:
+            return int(value)
+        except ValueError:
+            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
+                      param, ctx)
+
+
+INT_OR_NONE = IntOrNone()
+
+
 @api.command('status', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('
+@click.argument('request_id_prefixes',
                 required=False,
                 type=str,
                 nargs=-1,
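The new IntOrNone parameter type accepts a plain integer as well as the strings "none"/"all" (case-insensitive), which disable the limit. A standalone sketch of the expected conversions, re-declaring the class from the hunk above so it can run on its own (param and ctx may be None when calling convert directly):

import click

class IntOrNone(click.ParamType):
    """Accept an int, or 'none'/'all' to mean no limit (None)."""
    name = 'int-or-none'

    def convert(self, value, param, ctx):
        if isinstance(value, int):
            return value
        if isinstance(value, str) and value.lower() in ('none', 'all'):
            return None
        try:
            return int(value)
        except ValueError:
            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
                      param, ctx)

assert IntOrNone().convert('25', None, None) == 25
assert IntOrNone().convert('ALL', None, None) is None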
@@ -6159,16 +6390,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
              is_flag=True,
              default=False,
              required=False,
-              help='Show requests of all statuses
+              help=('Show requests of all statuses, including finished ones '
+                    '(SUCCEEDED, FAILED, CANCELLED). By default, only active '
+                    'requests (PENDING, RUNNING) are shown.'))
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_REQUESTS_TO_SHOW,
+    type=INT_OR_NONE,
+    required=False,
+    help=(f'Number of requests to show, default is {_NUM_REQUESTS_TO_SHOW},'
+          f' set to "none" or "all" to show all requests.'))
 @flags.verbose_option('Show more details.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_status(
-               verbose: bool):
+def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
+               verbose: bool, limit: Optional[int]):
     """List requests on SkyPilot API server."""
-    if not
-
-
+    if not request_id_prefixes:
+        request_id_prefixes = None
+    fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
+    if verbose:
+        fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
+    request_list = sdk.api_status(request_id_prefixes, all_status, limit,
+                                  fields)
     columns = ['ID', 'User', 'Name']
     if verbose:
         columns.append('Cluster')
@@ -6194,8 +6439,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
        if verbose:
            dummy_row.append('-')
        table.add_row(dummy_row)
-    click.echo()
     click.echo(table)
+    if limit and len(request_list) >= limit:
+        click.echo()
+        click.echo(
+            f'Showing {limit} requests. Use "-l none" or "-l all" to show'
+            f' all requests.')
 
 
 @api.command('login', cls=_DocumentedCodeCommand)