skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
|
@@ -6,9 +6,8 @@ ManagedJobCodeGen.
|
|
|
6
6
|
"""
|
|
7
7
|
import asyncio
|
|
8
8
|
import collections
|
|
9
|
-
import datetime
|
|
9
|
+
from datetime import datetime
|
|
10
10
|
import enum
|
|
11
|
-
import logging
|
|
12
11
|
import os
|
|
13
12
|
import pathlib
|
|
14
13
|
import re
|
|
@@ -84,6 +83,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
|
|
|
84
83
|
|
|
85
84
|
_JOB_STATUS_FETCH_MAX_RETRIES = 3
|
|
86
85
|
_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
|
|
86
|
+
_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
|
|
87
87
|
|
|
88
88
|
_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
|
|
89
89
|
'Waiting for task to start[/]'
|
|
@@ -101,6 +101,28 @@ _JOB_CANCELLED_MESSAGE = (
|
|
|
101
101
|
# update the state.
|
|
102
102
|
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
|
|
103
103
|
|
|
104
|
+
# After enabling consolidation mode, we need to restart the API server to get
|
|
105
|
+
# the jobs refresh daemon and correct number of executors. We use this file to
|
|
106
|
+
# indicate that the API server has been restarted after enabling consolidation
|
|
107
|
+
# mode.
|
|
108
|
+
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
|
|
109
|
+
'~/.sky/.jobs_controller_consolidation_reloaded_signal')
|
|
110
|
+
|
|
111
|
+
# The response fields for managed jobs that require cluster handle
|
|
112
|
+
_CLUSTER_HANDLE_FIELDS = [
|
|
113
|
+
'cluster_resources',
|
|
114
|
+
'cluster_resources_full',
|
|
115
|
+
'cloud',
|
|
116
|
+
'region',
|
|
117
|
+
'zone',
|
|
118
|
+
'infra',
|
|
119
|
+
'accelerators',
|
|
120
|
+
]
|
|
121
|
+
|
|
122
|
+
# The response fields for managed jobs that are not stored in the database
|
|
123
|
+
# These fields will be mapped to the DB fields in the `_update_fields`.
|
|
124
|
+
_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
|
|
125
|
+
|
|
104
126
|
|
|
105
127
|
class ManagedJobQueueResultType(enum.Enum):
|
|
106
128
|
"""The type of the managed job queue result."""
|
|
@@ -117,9 +139,8 @@ class UserSignal(enum.Enum):
|
|
|
117
139
|
|
|
118
140
|
# ====== internal functions ======
|
|
119
141
|
def terminate_cluster(
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
_logger: logging.Logger = logger, # pylint: disable=invalid-name
|
|
142
|
+
cluster_name: str,
|
|
143
|
+
max_retry: int = 6,
|
|
123
144
|
) -> None:
|
|
124
145
|
"""Terminate the cluster."""
|
|
125
146
|
from sky import core # pylint: disable=import-outside-toplevel
|
|
@@ -143,18 +164,18 @@ def terminate_cluster(
|
|
|
143
164
|
return
|
|
144
165
|
except exceptions.ClusterDoesNotExist:
|
|
145
166
|
# The cluster is already down.
|
|
146
|
-
|
|
167
|
+
logger.debug(f'The cluster {cluster_name} is already down.')
|
|
147
168
|
return
|
|
148
169
|
except Exception as e: # pylint: disable=broad-except
|
|
149
170
|
retry_cnt += 1
|
|
150
171
|
if retry_cnt >= max_retry:
|
|
151
172
|
raise RuntimeError(
|
|
152
173
|
f'Failed to terminate the cluster {cluster_name}.') from e
|
|
153
|
-
|
|
174
|
+
logger.error(
|
|
154
175
|
f'Failed to terminate the cluster {cluster_name}. Retrying.'
|
|
155
176
|
f'Details: {common_utils.format_exception(e)}')
|
|
156
177
|
with ux_utils.enable_traceback():
|
|
157
|
-
|
|
178
|
+
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
158
179
|
time.sleep(backoff.current_backoff())
|
|
159
180
|
|
|
160
181
|
|
|
@@ -174,8 +195,8 @@ def _validate_consolidation_mode_config(
|
|
|
174
195
|
'terminate the controller cluster first.'
|
|
175
196
|
f'{colorama.Style.RESET_ALL}')
|
|
176
197
|
else:
|
|
177
|
-
|
|
178
|
-
if
|
|
198
|
+
total_jobs = managed_job_state.get_managed_jobs_total()
|
|
199
|
+
if total_jobs > 0:
|
|
179
200
|
nonterminal_jobs = (
|
|
180
201
|
managed_job_state.get_nonterminal_job_ids_by_name(
|
|
181
202
|
None, None, all_users=True))
|
|
@@ -190,7 +211,7 @@ def _validate_consolidation_mode_config(
|
|
|
190
211
|
else:
|
|
191
212
|
logger.warning(
|
|
192
213
|
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
|
|
193
|
-
f'but there are {
|
|
214
|
+
f'but there are {total_jobs} jobs from previous '
|
|
194
215
|
'consolidation mode. Reset the `jobs.controller.'
|
|
195
216
|
'consolidation_mode` to `true` and run `sky jobs queue` '
|
|
196
217
|
'to see those jobs. Switching to normal mode will '
|
|
@@ -202,13 +223,39 @@ def _validate_consolidation_mode_config(
|
|
|
202
223
|
# API Server. Under the hood, we submit the job monitoring logic as processes
|
|
203
224
|
# directly in the API Server.
|
|
204
225
|
# Use LRU Cache so that the check is only done once.
|
|
205
|
-
@annotations.lru_cache(scope='request', maxsize=
|
|
206
|
-
def is_consolidation_mode() -> bool:
|
|
226
|
+
@annotations.lru_cache(scope='request', maxsize=2)
|
|
227
|
+
def is_consolidation_mode(on_api_restart: bool = False) -> bool:
|
|
207
228
|
if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
|
|
208
229
|
return True
|
|
209
230
|
|
|
210
|
-
|
|
231
|
+
config_consolidation_mode = skypilot_config.get_nested(
|
|
211
232
|
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
|
233
|
+
|
|
234
|
+
signal_file = pathlib.Path(
|
|
235
|
+
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
|
|
236
|
+
|
|
237
|
+
restart_signal_file_exists = signal_file.exists()
|
|
238
|
+
consolidation_mode = (config_consolidation_mode and
|
|
239
|
+
restart_signal_file_exists)
|
|
240
|
+
|
|
241
|
+
if on_api_restart:
|
|
242
|
+
if config_consolidation_mode:
|
|
243
|
+
signal_file.touch()
|
|
244
|
+
else:
|
|
245
|
+
if not restart_signal_file_exists:
|
|
246
|
+
if config_consolidation_mode:
|
|
247
|
+
logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
|
|
248
|
+
'managed jobs is enabled in the server config, '
|
|
249
|
+
'but the API server has not been restarted yet. '
|
|
250
|
+
'Please restart the API server to enable it.'
|
|
251
|
+
f'{colorama.Style.RESET_ALL}')
|
|
252
|
+
return False
|
|
253
|
+
elif not config_consolidation_mode:
|
|
254
|
+
# Cleanup the signal file if the consolidation mode is disabled in
|
|
255
|
+
# the config. This allows the user to disable the consolidation mode
|
|
256
|
+
# without restarting the API server.
|
|
257
|
+
signal_file.unlink()
|
|
258
|
+
|
|
212
259
|
# We should only do this check on API server, as the controller will not
|
|
213
260
|
# have related config and will always seemingly disabled for consolidation
|
|
214
261
|
# mode. Check #6611 for more details.
|
|
@@ -219,6 +266,12 @@ def is_consolidation_mode() -> bool:
|
|
|
219
266
|
|
|
220
267
|
def ha_recovery_for_consolidation_mode():
|
|
221
268
|
"""Recovery logic for HA mode."""
|
|
269
|
+
# Touch the signal file here to avoid conflict with
|
|
270
|
+
# update_managed_jobs_statuses. Although we run this first and then start
|
|
271
|
+
# the daemon, this function is also called in cancel_jobs_by_id.
|
|
272
|
+
signal_file = pathlib.Path(
|
|
273
|
+
constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
|
|
274
|
+
signal_file.touch()
|
|
222
275
|
# No setup recovery is needed in consolidation mode, as the API server
|
|
223
276
|
# already has all runtime installed. Directly start jobs recovery here.
|
|
224
277
|
# Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
|
|
@@ -229,7 +282,9 @@ def ha_recovery_for_consolidation_mode():
|
|
|
229
282
|
encoding='utf-8') as f:
|
|
230
283
|
start = time.time()
|
|
231
284
|
f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
|
|
232
|
-
|
|
285
|
+
jobs, _ = managed_job_state.get_managed_jobs_with_filters(
|
|
286
|
+
fields=['job_id', 'controller_pid', 'schedule_state', 'status'])
|
|
287
|
+
for job in jobs:
|
|
233
288
|
job_id = job['job_id']
|
|
234
289
|
controller_pid = job['controller_pid']
|
|
235
290
|
|
|
@@ -265,12 +320,12 @@ def ha_recovery_for_consolidation_mode():
|
|
|
265
320
|
f'{datetime.datetime.now()}\n')
|
|
266
321
|
f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
|
|
267
322
|
f.write(f'Total recovery time: {time.time() - start} seconds\n')
|
|
323
|
+
signal_file.unlink()
|
|
268
324
|
|
|
269
325
|
|
|
270
326
|
async def get_job_status(
|
|
271
327
|
backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
272
|
-
job_id: Optional[int]
|
|
273
|
-
job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
|
|
328
|
+
job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
|
|
274
329
|
"""Check the status of the job running on a managed job cluster.
|
|
275
330
|
|
|
276
331
|
It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
|
|
@@ -282,26 +337,28 @@ async def get_job_status(
|
|
|
282
337
|
if handle is None:
|
|
283
338
|
# This can happen if the cluster was preempted and background status
|
|
284
339
|
# refresh already noticed and cleaned it up.
|
|
285
|
-
|
|
340
|
+
logger.info(f'Cluster {cluster_name} not found.')
|
|
286
341
|
return None
|
|
287
342
|
assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
|
|
288
343
|
job_ids = None if job_id is None else [job_id]
|
|
289
344
|
for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
|
|
290
345
|
try:
|
|
291
|
-
|
|
292
|
-
statuses = await
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
346
|
+
logger.info('=== Checking the job status... ===')
|
|
347
|
+
statuses = await asyncio.wait_for(
|
|
348
|
+
context_utils.to_thread(backend.get_job_status,
|
|
349
|
+
handle,
|
|
350
|
+
job_ids=job_ids,
|
|
351
|
+
stream_logs=False),
|
|
352
|
+
timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
|
|
296
353
|
status = list(statuses.values())[0]
|
|
297
354
|
if status is None:
|
|
298
|
-
|
|
355
|
+
logger.info('No job found.')
|
|
299
356
|
else:
|
|
300
|
-
|
|
301
|
-
|
|
357
|
+
logger.info(f'Job status: {status}')
|
|
358
|
+
logger.info('=' * 34)
|
|
302
359
|
return status
|
|
303
360
|
except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
|
|
304
|
-
ValueError, TypeError) as e:
|
|
361
|
+
ValueError, TypeError, asyncio.TimeoutError) as e:
|
|
305
362
|
# Note: Each of these exceptions has some additional conditions to
|
|
306
363
|
# limit how we handle it and whether or not we catch it.
|
|
307
364
|
# Retry on k8s transient network errors. This is useful when using
|
|
@@ -322,6 +379,9 @@ async def get_job_status(
|
|
|
322
379
|
is_transient_error = True
|
|
323
380
|
elif isinstance(e, grpc.FutureTimeoutError):
|
|
324
381
|
detailed_reason = 'Timeout'
|
|
382
|
+
elif isinstance(e, asyncio.TimeoutError):
|
|
383
|
+
detailed_reason = ('Job status check timed out after '
|
|
384
|
+
f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
|
|
325
385
|
# TODO(cooperc): Gracefully handle these exceptions in the backend.
|
|
326
386
|
elif isinstance(e, ValueError):
|
|
327
387
|
# If the cluster yaml is deleted in the middle of getting the
|
|
@@ -405,7 +465,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
405
465
|
"""
|
|
406
466
|
managed_job_state.remove_ha_recovery_script(job_id)
|
|
407
467
|
error_msg = None
|
|
408
|
-
tasks = managed_job_state.
|
|
468
|
+
tasks = managed_job_state.get_managed_job_tasks(job_id)
|
|
409
469
|
for task in tasks:
|
|
410
470
|
pool = task.get('pool', None)
|
|
411
471
|
if pool is None:
|
|
@@ -474,7 +534,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
474
534
|
|
|
475
535
|
for job_id in job_ids:
|
|
476
536
|
assert job_id is not None
|
|
477
|
-
tasks = managed_job_state.
|
|
537
|
+
tasks = managed_job_state.get_managed_job_tasks(job_id)
|
|
478
538
|
# Note: controller_pid and schedule_state are in the job_info table
|
|
479
539
|
# which is joined to the spot table, so all tasks with the same job_id
|
|
480
540
|
# will have the same value for these columns. This is what lets us just
|
|
@@ -494,9 +554,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
494
554
|
if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
|
|
495
555
|
# There are two cases where we could get a job that is DONE.
|
|
496
556
|
# 1. At query time (get_jobs_to_check_status), the job was not yet
|
|
497
|
-
# DONE, but since then (before
|
|
498
|
-
# hit a terminal status, marked itself done, and exited.
|
|
499
|
-
# fine.
|
|
557
|
+
# DONE, but since then (before get_managed_job_tasks is called)
|
|
558
|
+
# it has hit a terminal status, marked itself done, and exited.
|
|
559
|
+
# This is fine.
|
|
500
560
|
# 2. The job is DONE, but in a non-terminal status. This is
|
|
501
561
|
# unexpected. For instance, the task status is RUNNING, but the
|
|
502
562
|
# job schedule_state is DONE.
|
|
@@ -850,6 +910,14 @@ def cancel_jobs_by_pool(pool_name: str,
|
|
|
850
910
|
return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
|
|
851
911
|
|
|
852
912
|
|
|
913
|
+
def controller_log_file_for_job(job_id: int,
|
|
914
|
+
create_if_not_exists: bool = False) -> str:
|
|
915
|
+
log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
|
916
|
+
if create_if_not_exists:
|
|
917
|
+
os.makedirs(log_dir, exist_ok=True)
|
|
918
|
+
return os.path.join(log_dir, f'{job_id}.log')
|
|
919
|
+
|
|
920
|
+
|
|
853
921
|
def stream_logs_by_id(job_id: int,
|
|
854
922
|
follow: bool = True,
|
|
855
923
|
tail: Optional[int] = None) -> Tuple[str, int]:
|
|
@@ -882,13 +950,20 @@ def stream_logs_by_id(job_id: int,
|
|
|
882
950
|
if managed_job_status.is_failed():
|
|
883
951
|
job_msg = ('\nFailure reason: '
|
|
884
952
|
f'{managed_job_state.get_failure_reason(job_id)}')
|
|
885
|
-
|
|
953
|
+
log_file_ever_existed = False
|
|
886
954
|
task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
|
|
887
955
|
job_id)
|
|
888
956
|
num_tasks = len(task_info)
|
|
889
|
-
for task_id, task_name, task_status, log_file
|
|
957
|
+
for (task_id, task_name, task_status, log_file,
|
|
958
|
+
logs_cleaned_at) in task_info:
|
|
890
959
|
if log_file:
|
|
891
|
-
|
|
960
|
+
log_file_ever_existed = True
|
|
961
|
+
if logs_cleaned_at is not None:
|
|
962
|
+
ts_str = datetime.fromtimestamp(
|
|
963
|
+
logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
|
|
964
|
+
print(f'Task {task_name}({task_id}) log has been '
|
|
965
|
+
f'cleaned at {ts_str}.')
|
|
966
|
+
continue
|
|
892
967
|
task_str = (f'Task {task_name}({task_id})'
|
|
893
968
|
if task_name else f'Task {task_id}')
|
|
894
969
|
if num_tasks > 1:
|
|
@@ -923,7 +998,7 @@ def stream_logs_by_id(job_id: int,
|
|
|
923
998
|
f'{task_str} finished '
|
|
924
999
|
f'(status: {task_status.value}).'),
|
|
925
1000
|
flush=True)
|
|
926
|
-
if
|
|
1001
|
+
if log_file_ever_existed:
|
|
927
1002
|
# Add the "Job finished" message for terminal states
|
|
928
1003
|
if managed_job_status.is_terminal():
|
|
929
1004
|
print(ux_utils.finishing_message(
|
|
@@ -1151,7 +1226,8 @@ def stream_logs(job_id: Optional[int],
|
|
|
1151
1226
|
if controller:
|
|
1152
1227
|
if job_id is None:
|
|
1153
1228
|
assert job_name is not None
|
|
1154
|
-
managed_jobs = managed_job_state.
|
|
1229
|
+
managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
|
|
1230
|
+
name_match=job_name, fields=['job_id', 'job_name', 'status'])
|
|
1155
1231
|
# We manually filter the jobs by name, instead of using
|
|
1156
1232
|
# get_nonterminal_job_ids_by_name, as with `controller=True`, we
|
|
1157
1233
|
# should be able to show the logs for jobs in terminal states.
|
|
@@ -1174,9 +1250,7 @@ def stream_logs(job_id: Optional[int],
|
|
|
1174
1250
|
job_id = managed_job_ids.pop()
|
|
1175
1251
|
assert job_id is not None, (job_id, job_name)
|
|
1176
1252
|
|
|
1177
|
-
controller_log_path =
|
|
1178
|
-
os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
|
|
1179
|
-
f'{job_id}.log')
|
|
1253
|
+
controller_log_path = controller_log_file_for_job(job_id)
|
|
1180
1254
|
job_status = None
|
|
1181
1255
|
|
|
1182
1256
|
# Wait for the log file to be written
|
|
@@ -1277,11 +1351,87 @@ def dump_managed_job_queue(
|
|
|
1277
1351
|
limit: Optional[int] = None,
|
|
1278
1352
|
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1279
1353
|
statuses: Optional[List[str]] = None,
|
|
1354
|
+
fields: Optional[List[str]] = None,
|
|
1280
1355
|
) -> str:
|
|
1281
1356
|
return message_utils.encode_payload(
|
|
1282
1357
|
get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
|
|
1283
1358
|
workspace_match, name_match, pool_match, page,
|
|
1284
|
-
limit, user_hashes, statuses))
|
|
1359
|
+
limit, user_hashes, statuses, fields))
|
|
1360
|
+
|
|
1361
|
+
|
|
1362
|
+
def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
|
|
1363
|
+
"""Update the fields list to include the necessary fields.
|
|
1364
|
+
|
|
1365
|
+
Args:
|
|
1366
|
+
fields: The fields to update.
|
|
1367
|
+
|
|
1368
|
+
It will:
|
|
1369
|
+
- Add the necessary dependent fields to the list.
|
|
1370
|
+
- Remove the fields that are not in the DB.
|
|
1371
|
+
- Determine if cluster handle is required.
|
|
1372
|
+
|
|
1373
|
+
Returns:
|
|
1374
|
+
A tuple containing the updated fields and a boolean indicating if
|
|
1375
|
+
cluster handle is required.
|
|
1376
|
+
"""
|
|
1377
|
+
cluster_handle_required = True
|
|
1378
|
+
if _cluster_handle_not_required(fields):
|
|
1379
|
+
cluster_handle_required = False
|
|
1380
|
+
# Copy the list to avoid modifying the original list
|
|
1381
|
+
new_fields = fields.copy()
|
|
1382
|
+
# status and job_id are always included
|
|
1383
|
+
if 'status' not in new_fields:
|
|
1384
|
+
new_fields.append('status')
|
|
1385
|
+
if 'job_id' not in new_fields:
|
|
1386
|
+
new_fields.append('job_id')
|
|
1387
|
+
# user_hash is required if user_name is present
|
|
1388
|
+
if 'user_name' in new_fields and 'user_hash' not in new_fields:
|
|
1389
|
+
new_fields.append('user_hash')
|
|
1390
|
+
if 'job_duration' in new_fields:
|
|
1391
|
+
if 'last_recovered_at' not in new_fields:
|
|
1392
|
+
new_fields.append('last_recovered_at')
|
|
1393
|
+
if 'end_at' not in new_fields:
|
|
1394
|
+
new_fields.append('end_at')
|
|
1395
|
+
if 'job_name' in new_fields and 'task_name' not in new_fields:
|
|
1396
|
+
new_fields.append('task_name')
|
|
1397
|
+
if 'details' in new_fields:
|
|
1398
|
+
if 'schedule_state' not in new_fields:
|
|
1399
|
+
new_fields.append('schedule_state')
|
|
1400
|
+
if 'priority' not in new_fields:
|
|
1401
|
+
new_fields.append('priority')
|
|
1402
|
+
if 'failure_reason' not in new_fields:
|
|
1403
|
+
new_fields.append('failure_reason')
|
|
1404
|
+
if 'user_yaml' in new_fields:
|
|
1405
|
+
if 'original_user_yaml_path' not in new_fields:
|
|
1406
|
+
new_fields.append('original_user_yaml_path')
|
|
1407
|
+
if 'original_user_yaml_content' not in new_fields:
|
|
1408
|
+
new_fields.append('original_user_yaml_content')
|
|
1409
|
+
if cluster_handle_required:
|
|
1410
|
+
if 'task_name' not in new_fields:
|
|
1411
|
+
new_fields.append('task_name')
|
|
1412
|
+
if 'current_cluster_name' not in new_fields:
|
|
1413
|
+
new_fields.append('current_cluster_name')
|
|
1414
|
+
# Remove _NON_DB_FIELDS
|
|
1415
|
+
# These fields have been mapped to the DB fields in the above code, so we
|
|
1416
|
+
# don't need to include them in the updated fields.
|
|
1417
|
+
for field in _NON_DB_FIELDS:
|
|
1418
|
+
if field in new_fields:
|
|
1419
|
+
new_fields.remove(field)
|
|
1420
|
+
return new_fields, cluster_handle_required
|
|
1421
|
+
|
|
1422
|
+
|
|
1423
|
+
def _cluster_handle_not_required(fields: List[str]) -> bool:
|
|
1424
|
+
"""Determine if cluster handle is not required.
|
|
1425
|
+
|
|
1426
|
+
Args:
|
|
1427
|
+
fields: The fields to check if they contain any of the cluster handle
|
|
1428
|
+
fields.
|
|
1429
|
+
|
|
1430
|
+
Returns:
|
|
1431
|
+
True if the fields do not contain any of the cluster handle fields,
|
|
1432
|
+
False otherwise.
|
|
1433
|
+
"""
|
|
1434
|
+
return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
|
|
1285
1435
|
|
|
1286
1436
|
|
|
1287
1437
|
def get_managed_job_queue(
|
|
@@ -1295,146 +1445,153 @@ def get_managed_job_queue(
|
|
|
1295
1445
|
limit: Optional[int] = None,
|
|
1296
1446
|
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1297
1447
|
statuses: Optional[List[str]] = None,
|
|
1448
|
+
fields: Optional[List[str]] = None,
|
|
1298
1449
|
) -> Dict[str, Any]:
|
|
1299
|
-
|
|
1300
|
-
# detection) requires a full view of the jobs table.
|
|
1301
|
-
jobs = managed_job_state.get_managed_jobs()
|
|
1450
|
+
"""Get the managed job queue.
|
|
1302
1451
|
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
managed_job_state.ManagedJobScheduleState.WAITING,
|
|
1316
|
-
managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
|
|
1317
|
-
# This job will not block others.
|
|
1318
|
-
continue
|
|
1319
|
-
|
|
1320
|
-
priority = job.get('priority')
|
|
1321
|
-
if priority is not None and priority > highest_blocking_priority:
|
|
1322
|
-
highest_blocking_priority = priority
|
|
1452
|
+
Args:
|
|
1453
|
+
skip_finished: Whether to skip finished jobs.
|
|
1454
|
+
accessible_workspaces: The accessible workspaces.
|
|
1455
|
+
job_ids: The job ids.
|
|
1456
|
+
workspace_match: The workspace name to match.
|
|
1457
|
+
name_match: The job name to match.
|
|
1458
|
+
pool_match: The pool name to match.
|
|
1459
|
+
page: The page number.
|
|
1460
|
+
limit: The limit number.
|
|
1461
|
+
user_hashes: The user hashes.
|
|
1462
|
+
statuses: The statuses.
|
|
1463
|
+
fields: The fields to include in the response.
|
|
1323
1464
|
|
|
1324
|
-
|
|
1465
|
+
Returns:
|
|
1466
|
+
A dictionary containing the managed job queue.
|
|
1467
|
+
"""
|
|
1468
|
+
cluster_handle_required = True
|
|
1469
|
+
updated_fields = None
|
|
1470
|
+
# The caller only need to specify the fields in the
|
|
1471
|
+
# `class ManagedJobRecord` in `response.py`, and the `_update_fields`
|
|
1472
|
+
# function will add the necessary dependent fields to the list, for
|
|
1473
|
+
# example, if the caller specifies `['user_name']`, the `_update_fields`
|
|
1474
|
+
# function will add `['user_hash']` to the list.
|
|
1475
|
+
if fields:
|
|
1476
|
+
updated_fields, cluster_handle_required = _update_fields(fields)
|
|
1477
|
+
|
|
1478
|
+
total_no_filter = managed_job_state.get_managed_jobs_total()
|
|
1479
|
+
|
|
1480
|
+
status_counts = managed_job_state.get_status_count_with_filters(
|
|
1481
|
+
fields=fields,
|
|
1482
|
+
job_ids=job_ids,
|
|
1483
|
+
accessible_workspaces=accessible_workspaces,
|
|
1484
|
+
workspace_match=workspace_match,
|
|
1485
|
+
name_match=name_match,
|
|
1486
|
+
pool_match=pool_match,
|
|
1487
|
+
user_hashes=user_hashes,
|
|
1488
|
+
skip_finished=skip_finished,
|
|
1489
|
+
)
|
|
1490
|
+
|
|
1491
|
+
jobs, total = managed_job_state.get_managed_jobs_with_filters(
|
|
1492
|
+
fields=updated_fields,
|
|
1493
|
+
job_ids=job_ids,
|
|
1494
|
+
accessible_workspaces=accessible_workspaces,
|
|
1495
|
+
workspace_match=workspace_match,
|
|
1496
|
+
name_match=name_match,
|
|
1497
|
+
pool_match=pool_match,
|
|
1498
|
+
user_hashes=user_hashes,
|
|
1499
|
+
statuses=statuses,
|
|
1500
|
+
skip_finished=skip_finished,
|
|
1501
|
+
page=page,
|
|
1502
|
+
limit=limit,
|
|
1503
|
+
)
|
|
1504
|
+
|
|
1505
|
+
if cluster_handle_required:
|
|
1506
|
+
# Fetch the cluster name to handle map for managed clusters only.
|
|
1507
|
+
cluster_name_to_handle = (
|
|
1508
|
+
global_user_state.get_cluster_name_to_handle_map(is_managed=True))
|
|
1325
1509
|
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
|
|
1334
|
-
accessible_workspaces
|
|
1335
|
-
]
|
|
1336
|
-
if skip_finished:
|
|
1337
|
-
# Filter out the finished jobs. If a multi-task job is partially
|
|
1338
|
-
# finished, we will include all its tasks.
|
|
1339
|
-
non_finished_tasks = list(
|
|
1340
|
-
filter(
|
|
1341
|
-
lambda job: not managed_job_state.ManagedJobStatus(job[
|
|
1342
|
-
'status']).is_terminal(), jobs))
|
|
1343
|
-
non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
|
|
1344
|
-
jobs = list(
|
|
1345
|
-
filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
|
|
1346
|
-
if job_ids:
|
|
1347
|
-
jobs = [job for job in jobs if job['job_id'] in job_ids]
|
|
1348
|
-
|
|
1349
|
-
jobs, total, status_counts = filter_jobs(jobs,
|
|
1350
|
-
workspace_match,
|
|
1351
|
-
name_match,
|
|
1352
|
-
pool_match,
|
|
1353
|
-
page,
|
|
1354
|
-
limit,
|
|
1355
|
-
statuses=statuses)
|
|
1356
|
-
|
|
1357
|
-
job_ids = set(job['job_id'] for job in jobs)
|
|
1358
|
-
job_id_to_pool_info = (
|
|
1359
|
-
managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
|
|
1360
|
-
cluster_names: Dict[int, str] = {}
|
|
1361
|
-
for job in jobs:
|
|
1362
|
-
# pool info is (pool, cluster_name, job_id_on_pool_cluster)
|
|
1363
|
-
pool_info = job_id_to_pool_info.get(job['job_id'], None)
|
|
1364
|
-
if pool_info and pool_info[0]:
|
|
1365
|
-
cluster_name = pool_info[1]
|
|
1366
|
-
else:
|
|
1367
|
-
cluster_name = generate_managed_job_cluster_name(
|
|
1368
|
-
job['task_name'], job['job_id'])
|
|
1369
|
-
cluster_names[job['job_id']] = cluster_name
|
|
1370
|
-
cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
|
|
1371
|
-
set(cluster_names.values()))
|
|
1510
|
+
highest_blocking_priority = constants.MIN_PRIORITY
|
|
1511
|
+
if not fields or 'details' in fields:
|
|
1512
|
+
# Figure out what the highest priority blocking job is. We need to know
|
|
1513
|
+
# in order to determine if other jobs are blocked by a higher priority
|
|
1514
|
+
# job, or just by the limited controller resources.
|
|
1515
|
+
highest_blocking_priority = (
|
|
1516
|
+
managed_job_state.get_managed_jobs_highest_priority())
|
|
1372
1517
|
|
|
1373
1518
|
for job in jobs:
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
end_at
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1519
|
+
if not fields or 'job_duration' in fields:
|
|
1520
|
+
end_at = job['end_at']
|
|
1521
|
+
if end_at is None:
|
|
1522
|
+
end_at = time.time()
|
|
1523
|
+
|
|
1524
|
+
job_submitted_at = job['last_recovered_at'] - job['job_duration']
|
|
1525
|
+
if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
|
|
1526
|
+
# When job is recovering, the duration is exact
|
|
1527
|
+
# job['job_duration']
|
|
1528
|
+
job_duration = job['job_duration']
|
|
1529
|
+
elif job_submitted_at > 0:
|
|
1530
|
+
job_duration = end_at - job_submitted_at
|
|
1531
|
+
else:
|
|
1532
|
+
# When job_start_at <= 0, that means the last_recovered_at
|
|
1533
|
+
# is not set yet, i.e. the job is not started.
|
|
1534
|
+
job_duration = 0
|
|
1535
|
+
job['job_duration'] = job_duration
|
|
1389
1536
|
job['status'] = job['status'].value
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
cluster_name = cluster_names[job['job_id']]
|
|
1393
|
-
handle = cluster_name_to_handles.get(cluster_name, None)
|
|
1394
|
-
if isinstance(handle, backends.CloudVmRayResourceHandle):
|
|
1395
|
-
resources_str = resources_utils.get_readable_resources_repr(
|
|
1396
|
-
handle, simplify=True)
|
|
1397
|
-
resources_str_full = resources_utils.get_readable_resources_repr(
|
|
1398
|
-
handle, simplify=False)
|
|
1399
|
-
job['cluster_resources'] = resources_str
|
|
1400
|
-
job['cluster_resources_full'] = resources_str_full
|
|
1401
|
-
job['cloud'] = str(handle.launched_resources.cloud)
|
|
1402
|
-
job['region'] = handle.launched_resources.region
|
|
1403
|
-
job['zone'] = handle.launched_resources.zone
|
|
1404
|
-
job['infra'] = infra_utils.InfraInfo(
|
|
1405
|
-
str(handle.launched_resources.cloud),
|
|
1406
|
-
handle.launched_resources.region,
|
|
1407
|
-
handle.launched_resources.zone).formatted_str()
|
|
1408
|
-
job['accelerators'] = handle.launched_resources.accelerators
|
|
1537
|
+
if not fields or 'schedule_state' in fields:
|
|
1538
|
+
job['schedule_state'] = job['schedule_state'].value
|
|
1409
1539
|
else:
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1540
|
+
job['schedule_state'] = None
|
|
1541
|
+
|
|
1542
|
+
if cluster_handle_required:
|
|
1543
|
+
cluster_name = job.get('current_cluster_name', None)
|
|
1544
|
+
if cluster_name is None:
|
|
1545
|
+
cluster_name = generate_managed_job_cluster_name(
|
|
1546
|
+
job['task_name'], job['job_id'])
|
|
1547
|
+
handle = cluster_name_to_handle.get(
|
|
1548
|
+
cluster_name, None) if cluster_name is not None else None
|
|
1549
|
+
if isinstance(handle, backends.CloudVmRayResourceHandle):
|
|
1550
|
+
resources_str_simple, resources_str_full = (
|
|
1551
|
+
resources_utils.get_readable_resources_repr(
|
|
1552
|
+
handle, simplified_only=False))
|
|
1553
|
+
assert resources_str_full is not None
|
|
1554
|
+
job['cluster_resources'] = resources_str_simple
|
|
1555
|
+
job['cluster_resources_full'] = resources_str_full
|
|
1556
|
+
job['cloud'] = str(handle.launched_resources.cloud)
|
|
1557
|
+
job['region'] = handle.launched_resources.region
|
|
1558
|
+
job['zone'] = handle.launched_resources.zone
|
|
1559
|
+
job['infra'] = infra_utils.InfraInfo(
|
|
1560
|
+
str(handle.launched_resources.cloud),
|
|
1561
|
+
handle.launched_resources.region,
|
|
1562
|
+
handle.launched_resources.zone).formatted_str()
|
|
1563
|
+
job['accelerators'] = handle.launched_resources.accelerators
|
|
1427
1564
|
else:
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1565
|
+
# FIXME(zongheng): display the last cached values for these.
|
|
1566
|
+
job['cluster_resources'] = '-'
|
|
1567
|
+
job['cluster_resources_full'] = '-'
|
|
1568
|
+
job['cloud'] = '-'
|
|
1569
|
+
job['region'] = '-'
|
|
1570
|
+
job['zone'] = '-'
|
|
1571
|
+
job['infra'] = '-'
|
|
1572
|
+
|
|
1573
|
+
if not fields or 'details' in fields:
|
|
1574
|
+
# Add details about schedule state / backoff.
|
|
1575
|
+
state_details = None
|
|
1576
|
+
if job['schedule_state'] == 'ALIVE_BACKOFF':
|
|
1577
|
+
state_details = 'In backoff, waiting for resources'
|
|
1578
|
+
elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
|
|
1579
|
+
priority = job.get('priority')
|
|
1580
|
+
if (priority is not None and
|
|
1581
|
+
priority < highest_blocking_priority):
|
|
1582
|
+
# Job is lower priority than some other blocking job.
|
|
1583
|
+
state_details = 'Waiting for higher priority jobs to launch'
|
|
1584
|
+
else:
|
|
1585
|
+
state_details = 'Waiting for other jobs to launch'
|
|
1586
|
+
|
|
1587
|
+
if state_details and job['failure_reason']:
|
|
1588
|
+
job['details'] = f'{state_details} - {job["failure_reason"]}'
|
|
1589
|
+
elif state_details:
|
|
1590
|
+
job['details'] = state_details
|
|
1591
|
+
elif job['failure_reason']:
|
|
1592
|
+
job['details'] = f'Failure: {job["failure_reason"]}'
|
|
1593
|
+
else:
|
|
1594
|
+
job['details'] = None
|
|
1438
1595
|
|
|
1439
1596
|
return {
|
|
1440
1597
|
'jobs': jobs,
|
|
@@ -1545,21 +1702,14 @@ def load_managed_job_queue(
|
|
|
1545
1702
|
total_no_filter = total
|
|
1546
1703
|
result_type = ManagedJobQueueResultType.LIST
|
|
1547
1704
|
|
|
1548
|
-
|
|
1705
|
+
all_users = global_user_state.get_all_users()
|
|
1706
|
+
all_users_map = {user.id: user.name for user in all_users}
|
|
1549
1707
|
for job in jobs:
|
|
1708
|
+
job['status'] = managed_job_state.ManagedJobStatus(job['status'])
|
|
1550
1709
|
if 'user_hash' in job and job['user_hash'] is not None:
|
|
1551
1710
|
# Skip jobs that do not have user_hash info.
|
|
1552
1711
|
# TODO(cooperc): Remove check before 0.12.0.
|
|
1553
|
-
|
|
1554
|
-
user_hash_to_user = global_user_state.get_users(
|
|
1555
|
-
job_id_to_user_hash.values())
|
|
1556
|
-
|
|
1557
|
-
for job in jobs:
|
|
1558
|
-
job['status'] = managed_job_state.ManagedJobStatus(job['status'])
|
|
1559
|
-
if job['job_id'] in job_id_to_user_hash:
|
|
1560
|
-
user_hash = job_id_to_user_hash[job['job_id']]
|
|
1561
|
-
user = user_hash_to_user.get(user_hash, None)
|
|
1562
|
-
job['user_name'] = user.name if user is not None else None
|
|
1712
|
+
job['user_name'] = all_users_map.get(job['user_hash'])
|
|
1563
1713
|
return jobs, total, result_type, total_no_filter, status_counts
|
|
1564
1714
|
|
|
1565
1715
|
|
|
@@ -1584,29 +1734,40 @@ def _get_job_status_from_tasks(
|
|
|
1584
1734
|
|
|
1585
1735
|
|
|
1586
1736
|
@typing.overload
|
|
1587
|
-
def format_job_table(
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1737
|
+
def format_job_table(
|
|
1738
|
+
tasks: List[Dict[str, Any]],
|
|
1739
|
+
show_all: bool,
|
|
1740
|
+
show_user: bool,
|
|
1741
|
+
return_rows: Literal[False] = False,
|
|
1742
|
+
pool_status: Optional[List[Dict[str, Any]]] = None,
|
|
1743
|
+
max_jobs: Optional[int] = None,
|
|
1744
|
+
job_status_counts: Optional[Dict[str, int]] = None,
|
|
1745
|
+
) -> str:
|
|
1592
1746
|
...
|
|
1593
1747
|
|
|
1594
1748
|
|
|
1595
1749
|
@typing.overload
|
|
1596
|
-
def format_job_table(
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1750
|
+
def format_job_table(
|
|
1751
|
+
tasks: List[Dict[str, Any]],
|
|
1752
|
+
show_all: bool,
|
|
1753
|
+
show_user: bool,
|
|
1754
|
+
return_rows: Literal[True],
|
|
1755
|
+
pool_status: Optional[List[Dict[str, Any]]] = None,
|
|
1756
|
+
max_jobs: Optional[int] = None,
|
|
1757
|
+
job_status_counts: Optional[Dict[str, int]] = None,
|
|
1758
|
+
) -> List[List[str]]:
|
|
1601
1759
|
...
|
|
1602
1760
|
|
|
1603
1761
|
|
|
1604
1762
|
def format_job_table(
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1763
|
+
tasks: List[Dict[str, Any]],
|
|
1764
|
+
show_all: bool,
|
|
1765
|
+
show_user: bool,
|
|
1766
|
+
return_rows: bool = False,
|
|
1767
|
+
pool_status: Optional[List[Dict[str, Any]]] = None,
|
|
1768
|
+
max_jobs: Optional[int] = None,
|
|
1769
|
+
job_status_counts: Optional[Dict[str, int]] = None,
|
|
1770
|
+
) -> Union[str, List[List[str]]]:
|
|
1610
1771
|
"""Returns managed jobs as a formatted string.
|
|
1611
1772
|
|
|
1612
1773
|
Args:
|
|
@@ -1615,6 +1776,8 @@ def format_job_table(
|
|
|
1615
1776
|
max_jobs: The maximum number of jobs to show in the table.
|
|
1616
1777
|
return_rows: If True, return the rows as a list of strings instead of
|
|
1617
1778
|
all rows concatenated into a single string.
|
|
1779
|
+
pool_status: List of pool status dictionaries with replica_info.
|
|
1780
|
+
job_status_counts: The counts of each job status.
|
|
1618
1781
|
|
|
1619
1782
|
Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
|
|
1620
1783
|
a list of "rows" (each of which is a list of str).
|
|
@@ -1631,17 +1794,37 @@ def format_job_table(
|
|
|
1631
1794
|
return (task['user'], task['job_id'])
|
|
1632
1795
|
return task['job_id']
|
|
1633
1796
|
|
|
1797
|
+
def _get_job_id_to_worker_map(
|
|
1798
|
+
pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
|
|
1799
|
+
"""Create a mapping from job_id to worker replica_id.
|
|
1800
|
+
|
|
1801
|
+
Args:
|
|
1802
|
+
pool_status: List of pool status dictionaries with replica_info.
|
|
1803
|
+
|
|
1804
|
+
Returns:
|
|
1805
|
+
Dictionary mapping job_id to replica_id (worker ID).
|
|
1806
|
+
"""
|
|
1807
|
+
job_to_worker: Dict[int, int] = {}
|
|
1808
|
+
if pool_status is None:
|
|
1809
|
+
return job_to_worker
|
|
1810
|
+
for pool in pool_status:
|
|
1811
|
+
replica_info = pool.get('replica_info', [])
|
|
1812
|
+
for replica in replica_info:
|
|
1813
|
+
used_by = replica.get('used_by')
|
|
1814
|
+
if used_by is not None:
|
|
1815
|
+
job_to_worker[used_by] = replica.get('replica_id')
|
|
1816
|
+
return job_to_worker
|
|
1817
|
+
|
|
1818
|
+
# Create mapping from job_id to worker replica_id
|
|
1819
|
+
job_to_worker = _get_job_id_to_worker_map(pool_status)
|
|
1820
|
+
|
|
1634
1821
|
for task in tasks:
|
|
1635
1822
|
# The tasks within the same job_id are already sorted
|
|
1636
1823
|
# by the task_id.
|
|
1637
1824
|
jobs[get_hash(task)].append(task)
|
|
1638
1825
|
|
|
1639
|
-
status_counts: Dict[str, int] = collections.defaultdict(int)
|
|
1640
1826
|
workspaces = set()
|
|
1641
1827
|
for job_tasks in jobs.values():
|
|
1642
|
-
managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
|
|
1643
|
-
if not managed_job_status.is_terminal():
|
|
1644
|
-
status_counts[managed_job_status.value] += 1
|
|
1645
1828
|
workspaces.add(job_tasks[0].get('workspace',
|
|
1646
1829
|
constants.SKYPILOT_DEFAULT_WORKSPACE))
|
|
1647
1830
|
|
|
@@ -1684,9 +1867,15 @@ def format_job_table(
|
|
|
1684
1867
|
job_table = log_utils.create_table(columns)
|
|
1685
1868
|
|
|
1686
1869
|
status_counts: Dict[str, int] = collections.defaultdict(int)
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1870
|
+
if job_status_counts:
|
|
1871
|
+
for status_value, count in job_status_counts.items():
|
|
1872
|
+
status = managed_job_state.ManagedJobStatus(status_value)
|
|
1873
|
+
if not status.is_terminal():
|
|
1874
|
+
status_counts[status_value] = count
|
|
1875
|
+
else:
|
|
1876
|
+
for task in tasks:
|
|
1877
|
+
if not task['status'].is_terminal():
|
|
1878
|
+
status_counts[task['status'].value] += 1
|
|
1690
1879
|
|
|
1691
1880
|
all_tasks = tasks
|
|
1692
1881
|
if max_jobs is not None:
|
|
@@ -1772,7 +1961,12 @@ def format_job_table(
|
|
|
1772
1961
|
if pool is None:
|
|
1773
1962
|
pool = '-'
|
|
1774
1963
|
|
|
1964
|
+
# Add worker information if job is assigned to a worker
|
|
1775
1965
|
job_id = job_hash[1] if tasks_have_k8s_user else job_hash
|
|
1966
|
+
# job_id is now always an integer, use it to look up worker
|
|
1967
|
+
if job_id in job_to_worker and pool != '-':
|
|
1968
|
+
pool = f'{pool} (worker={job_to_worker[job_id]})'
|
|
1969
|
+
|
|
1776
1970
|
job_values = [
|
|
1777
1971
|
job_id,
|
|
1778
1972
|
'',
|
|
@@ -1815,6 +2009,12 @@ def format_job_table(
|
|
|
1815
2009
|
pool = task.get('pool')
|
|
1816
2010
|
if pool is None:
|
|
1817
2011
|
pool = '-'
|
|
2012
|
+
|
|
2013
|
+
# Add worker information if task is assigned to a worker
|
|
2014
|
+
task_job_id = task['job_id']
|
|
2015
|
+
if task_job_id in job_to_worker and pool != '-':
|
|
2016
|
+
pool = f'{pool} (worker={job_to_worker[task_job_id]})'
|
|
2017
|
+
|
|
1818
2018
|
values = [
|
|
1819
2019
|
task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
|
|
1820
2020
|
task['task_id'] if len(job_tasks) > 1 else '-',
|
|
@@ -1934,7 +2134,8 @@ def _job_proto_to_dict(
|
|
|
1934
2134
|
# and Protobuf encodes int64 as decimal strings in JSON,
|
|
1935
2135
|
# so we need to convert them back to ints.
|
|
1936
2136
|
# https://protobuf.dev/programming-guides/json/#field-representation
|
|
1937
|
-
if field.type == descriptor.FieldDescriptor.TYPE_INT64
|
|
2137
|
+
if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
|
|
2138
|
+
job_dict.get(field.name) is not None):
|
|
1938
2139
|
job_dict[field.name] = int(job_dict[field.name])
|
|
1939
2140
|
job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
|
|
1940
2141
|
job_dict['status'])
|
|
@@ -1978,6 +2179,7 @@ class ManagedJobCodeGen:
|
|
|
1978
2179
|
limit: Optional[int] = None,
|
|
1979
2180
|
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1980
2181
|
statuses: Optional[List[str]] = None,
|
|
2182
|
+
fields: Optional[List[str]] = None,
|
|
1981
2183
|
) -> str:
|
|
1982
2184
|
code = textwrap.dedent(f"""\
|
|
1983
2185
|
if managed_job_version < 9:
|
|
@@ -1996,7 +2198,7 @@ class ManagedJobCodeGen:
|
|
|
1996
2198
|
page={page!r},
|
|
1997
2199
|
limit={limit!r},
|
|
1998
2200
|
user_hashes={user_hashes!r})
|
|
1999
|
-
|
|
2201
|
+
elif managed_job_version < 12:
|
|
2000
2202
|
job_table = utils.dump_managed_job_queue(
|
|
2001
2203
|
skip_finished={skip_finished},
|
|
2002
2204
|
accessible_workspaces={accessible_workspaces!r},
|
|
@@ -2008,6 +2210,19 @@ class ManagedJobCodeGen:
|
|
|
2008
2210
|
limit={limit!r},
|
|
2009
2211
|
user_hashes={user_hashes!r},
|
|
2010
2212
|
statuses={statuses!r})
|
|
2213
|
+
else:
|
|
2214
|
+
job_table = utils.dump_managed_job_queue(
|
|
2215
|
+
skip_finished={skip_finished},
|
|
2216
|
+
accessible_workspaces={accessible_workspaces!r},
|
|
2217
|
+
job_ids={job_ids!r},
|
|
2218
|
+
workspace_match={workspace_match!r},
|
|
2219
|
+
name_match={name_match!r},
|
|
2220
|
+
pool_match={pool_match!r},
|
|
2221
|
+
page={page!r},
|
|
2222
|
+
limit={limit!r},
|
|
2223
|
+
user_hashes={user_hashes!r},
|
|
2224
|
+
statuses={statuses!r},
|
|
2225
|
+
fields={fields!r})
|
|
2011
2226
|
print(job_table, flush=True)
|
|
2012
2227
|
""")
|
|
2013
2228
|
return cls._build(code)
|
|
@@ -2075,6 +2290,18 @@ class ManagedJobCodeGen:
|
|
|
2075
2290
|
""")
|
|
2076
2291
|
return cls._build(code)
|
|
2077
2292
|
|
|
2293
|
+
@classmethod
|
|
2294
|
+
def get_version(cls) -> str:
|
|
2295
|
+
"""Generate code to get controller version."""
|
|
2296
|
+
code = textwrap.dedent("""\
|
|
2297
|
+
from sky.skylet import constants as controller_constants
|
|
2298
|
+
|
|
2299
|
+
# Get controller version
|
|
2300
|
+
controller_version = controller_constants.SKYLET_VERSION
|
|
2301
|
+
print(f"controller_version:{controller_version}", flush=True)
|
|
2302
|
+
""")
|
|
2303
|
+
return cls._build(code)
|
|
2304
|
+
|
|
2078
2305
|
@classmethod
|
|
2079
2306
|
def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
|
|
2080
2307
|
code = textwrap.dedent(f"""\
|
|
@@ -2112,8 +2339,12 @@ class ManagedJobCodeGen:
|
|
|
2112
2339
|
return cls._build(code)
|
|
2113
2340
|
|
|
2114
2341
|
@classmethod
|
|
2115
|
-
def set_pending(cls,
|
|
2116
|
-
|
|
2342
|
+
def set_pending(cls,
|
|
2343
|
+
job_id: int,
|
|
2344
|
+
managed_job_dag: 'dag_lib.Dag',
|
|
2345
|
+
workspace: str,
|
|
2346
|
+
entrypoint: str,
|
|
2347
|
+
user_hash: Optional[str] = None) -> str:
|
|
2117
2348
|
dag_name = managed_job_dag.name
|
|
2118
2349
|
pool = managed_job_dag.pool
|
|
2119
2350
|
# Add the managed job to queue table.
|
|
@@ -2130,6 +2361,8 @@ class ManagedJobCodeGen:
|
|
|
2130
2361
|
pool_hash = serve_state.get_service_hash({pool!r})
|
|
2131
2362
|
set_job_info_kwargs['pool'] = {pool!r}
|
|
2132
2363
|
set_job_info_kwargs['pool_hash'] = pool_hash
|
|
2364
|
+
if managed_job_version >= 11:
|
|
2365
|
+
set_job_info_kwargs['user_hash'] = {user_hash!r}
|
|
2133
2366
|
managed_job_state.set_job_info(
|
|
2134
2367
|
{job_id}, {dag_name!r}, **set_job_info_kwargs)
|
|
2135
2368
|
""")
|