skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -64,7 +64,6 @@ from sky.jobs import utils as managed_job_utils
 from sky.server import config as server_config
 from sky.skylet import constants
 from sky.utils import annotations
-from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
@@ -168,11 +167,12 @@ def start_controller() -> None:
     logs_dir = os.path.expanduser(
         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
     os.makedirs(logs_dir, exist_ok=True)
-
+    controller_uuid = str(uuid.uuid4())
+    log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')
 
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     run_controller_cmd = (f'{sys.executable} -u -m'
-                          'sky.jobs.controller')
+                          f'sky.jobs.controller {controller_uuid}')
 
     run_cmd = (f'{activate_python_env_cmd}'
                f'{run_controller_cmd}')
@@ -263,6 +263,7 @@ def maybe_start_controllers(from_scheduler: bool = False) -> None:
 
         if started > 0:
             logger.info(f'Started {started} controllers')
+
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
@@ -289,9 +290,20 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
         maybe_start_controllers(from_scheduler=True)
         return
 
-
-
-
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
         run = (f'source {env_file_path} && '
@@ -309,7 +321,6 @@ async def scheduled_launch(
     starting: Set[int],
     starting_lock: asyncio.Lock,
     starting_signal: asyncio.Condition,
-    job_logger: 'logging.Logger',
 ):
     """Launch as part of an ongoing job.
 
@@ -347,10 +358,10 @@ async def scheduled_launch(
             starting_count = len(starting)
             if starting_count < LAUNCHES_PER_WORKER:
                 break
-
+            logger.info('Too many jobs starting, waiting for a slot')
             await starting_signal.wait()
 
-
+    logger.info(f'Starting job {job_id}')
 
     async with starting_lock:
         starting.add(job_id)
sky/jobs/server/core.py
CHANGED
@@ -1,4 +1,6 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
 import ipaddress
 import os
 import pathlib
@@ -33,6 +35,7 @@ from sky.schemas.api import responses
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.serve.server import impl
+from sky.server.requests import request_names
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -60,6 +63,35 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
 
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -142,7 +174,8 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
             force_user_workspace=True),
         entrypoint=common_utils.get_current_command(),
         pool=pool,
-        pool_hash=pool_hash
+        pool_hash=pool_hash,
+        user_hash=common_utils.get_user_hash()))
     for task_id, task in enumerate(dag.tasks):
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
@@ -205,7 +238,8 @@ def launch(
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
    # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(
+    dag, mutated_user_config = admin_policy_utils.apply(
+        dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
     dag.resolve_and_validate_volumes()
     if not dag.is_chain():
         with ux_utils.print_exception_no_traceback():
@@ -336,6 +370,7 @@ def launch(
     def _submit_one(
         consolidation_mode_job_id: Optional[int] = None,
         job_rank: Optional[int] = None,
+        num_jobs: Optional[int] = None,
     ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
         rank_suffix = '' if job_rank is None else f'-{job_rank}'
         remote_original_user_yaml_path = (
@@ -355,11 +390,15 @@ def launch(
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+                    task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
-            dag_utils.dump_chain_dag_to_yaml(
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
 
             vars_to_fill = {
                 'remote_original_user_yaml_path':
@@ -392,7 +431,8 @@ def launch(
 
         yaml_path = os.path.join(
             managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-            f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml'
+            f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+        )
         common_utils.fill_template(
             managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
             vars_to_fill,
@@ -400,7 +440,7 @@ def launch(
         controller_task = task_lib.Task.from_yaml(yaml_path)
         controller_task.set_resources(controller_resources)
 
-        controller_task.managed_job_dag =
+        controller_task.managed_job_dag = dag_copy
         # pylint: disable=protected-access
         controller_task._metadata = metadata
 
@@ -427,12 +467,15 @@ def launch(
         # intermediate bucket and newly created bucket should be in
         # workspace A.
         if consolidation_mode_job_id is None:
-            return execution.launch(
-
-
-
-
-
+            return execution.launch(
+                task=controller_task,
+                cluster_name=controller_name,
+                stream_logs=stream_logs,
+                retry_until_up=True,
+                fast=True,
+                _request_name=request_names.AdminPolicyRequestName.
+                JOBS_LAUNCH_CONTROLLER,
+                _disable_controller_check=True)
         # Manually launch the scheduler in consolidation mode.
         local_handle = backend_utils.is_controller_accessible(
             controller=controller, stopped_message='')
@@ -469,15 +512,49 @@ def launch(
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
 
-    ids = []
-    all_handle = None
-
-
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-        jid, handle = _submit_one(job_id,
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+            # Extract job IDs in order
+            for res in results:
+                if res is not None:
+                    ids.append(res[0])
+
     return ids, all_handle
 
 
@@ -530,7 +607,8 @@ def queue_from_kubernetes_pod(
         'kubernetes', cluster_info)[0]
 
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -643,8 +721,7 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids
-                             None, None, None, None, None, None)
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
 
     return jobs
 
@@ -662,12 +739,13 @@ def queue_v2_api(
     page: Optional[int] = None,
     limit: Optional[int] = None,
    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
     """Gets statuses of managed jobs and parse the
     jobs to responses.ManagedJobRecord."""
     jobs, total, status_counts, total_no_filter = queue_v2(
         refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
-        name_match, pool_match, page, limit, statuses)
+        name_match, pool_match, page, limit, statuses, fields)
     return [responses.ManagedJobRecord(**job) for job in jobs
            ], total, status_counts, total_no_filter
 
@@ -685,6 +763,7 @@ def queue_v2(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs with filtering.
@@ -759,7 +838,8 @@ def queue_v2(
         try:
             request = managed_jobsv1_pb2.GetJobTableRequest(
                 skip_finished=skip_finished,
-                accessible_workspaces=
+                accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                    workspaces=accessible_workspaces)),
                 job_ids=managed_jobsv1_pb2.JobIds(
                     ids=job_ids) if job_ids is not None else None,
                 workspace_match=workspace_match,
@@ -775,6 +855,8 @@ def queue_v2(
                 ]) if user_hashes is not None else None,
                 statuses=managed_jobsv1_pb2.Statuses(
                     statuses=statuses) if statuses is not None else None,
+                fields=managed_jobsv1_pb2.Fields(
+                    fields=fields) if fields is not None else None,
                 show_jobs_without_user_hash=show_jobs_without_user_hash,
             )
             response = backend_utils.invoke_skylet_with_retries(
@@ -789,7 +871,7 @@ def queue_v2(
     with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
         code = managed_job_utils.ManagedJobCodeGen.get_job_table(
             skip_finished, accessible_workspaces, job_ids, workspace_match,
-            name_match, pool_match, page, limit, user_hashes, statuses)
+            name_match, pool_match, page, limit, user_hashes, statuses, fields)
     with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
         returncode, job_table_payload, stderr = backend.run_on_head(
            handle,
sky/jobs/server/server.py
CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -35,9 +36,9 @@ async def launch(request: fastapi.Request,
     consolidation_mode = managed_jobs_utils.is_consolidation_mode()
     schedule_type = (api_requests.ScheduleType.SHORT
                      if consolidation_mode else api_requests.ScheduleType.LONG)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LAUNCH,
         request_body=jobs_launch_body,
         func=core.launch,
         schedule_type=schedule_type,
@@ -50,9 +51,9 @@ async def launch(request: fastapi.Request,
 @router.post('/queue')
 async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE,
         request_body=jobs_queue_body,
         func=core.queue,
         schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -64,9 +65,9 @@ async def queue(request: fastapi.Request,
 @router.post('/queue/v2')
 async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE_V2,
         request_body=jobs_queue_body_v2,
         func=core.queue_v2_api,
         schedule_type=(api_requests.ScheduleType.LONG
@@ -79,9 +80,9 @@ async def queue_v2(request: fastapi.Request,
 @router.post('/cancel')
 async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_CANCEL,
         request_body=jobs_cancel_body,
         func=core.cancel,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -99,27 +100,34 @@ async def logs(
         # When refresh is specified, the job controller might be restarted,
         # which takes longer time to finish. We schedule it to long executor.
         schedule_type = api_requests.ScheduleType.LONG
-
+    if schedule_type == api_requests.ScheduleType.SHORT:
+        executor.check_request_thread_executor_available()
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LOGS,
         request_body=jobs_logs_body,
         func=core.tail_logs,
         schedule_type=schedule_type,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
-
-
-    else:
+    kill_request_on_disconnect = False
+    if schedule_type == api_requests.ScheduleType.SHORT:
         # For short request, run in the coroutine to avoid blocking
         # short workers.
         task = executor.execute_request_in_coroutine(request_task)
         # Cancel the coroutine after the request is done or client disconnects
         background_tasks.add_task(task.cancel)
+    else:
+        executor.schedule_prepared_request(request_task)
+        # When runs in long executor process, we should kill the request on
+        # disconnect to cancel the running routine.
+        kill_request_on_disconnect = True
 
-    return stream_utils.
+    return stream_utils.stream_response_for_long_request(
         request_id=request_task.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=kill_request_on_disconnect,
     )
 
 
@@ -134,9 +142,9 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
         request_body=jobs_download_logs_body,
         func=core.download_logs,
         schedule_type=api_requests.ScheduleType.LONG
@@ -148,9 +156,9 @@ async def download_logs(
 @router.post('/pool_apply')
 async def pool_apply(request: fastapi.Request,
                      jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_APPLY,
         request_body=jobs_pool_apply_body,
         func=core.pool_apply,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -161,9 +169,9 @@ async def pool_apply(request: fastapi.Request,
 @router.post('/pool_down')
 async def pool_down(request: fastapi.Request,
                     jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_DOWN,
         request_body=jobs_pool_down_body,
         func=core.pool_down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -175,9 +183,9 @@ async def pool_down(request: fastapi.Request,
 async def pool_status(
         request: fastapi.Request,
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_STATUS,
         request_body=jobs_pool_status_body,
         func=core.pool_status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -190,21 +198,25 @@ async def pool_tail_logs(
         request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
         background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_LOGS,
         request_body=log_body,
         func=core.pool_tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
 
-    request_task = api_requests.
+    request_task = await api_requests.get_request_async(
+        request.state.request_id, fields=['request_id'])
 
-    return stream_utils.
+    return stream_utils.stream_response_for_long_request(
         request_id=request_task.request_id,
+        # req.log_path is derived from request_id,
+        # so it's ok to just grab the request_id in the above query.
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=True,
     )
 
 
@@ -222,9 +234,9 @@ async def pool_download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.pool_sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
sky/jobs/server/utils.py
CHANGED
@@ -19,6 +19,11 @@ else:
     managed_jobsv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.managed_jobsv1_pb2')
 
+_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         )).get_managed_job_controller_version(version_request))
         controller_version = version_response.controller_version
 
-        job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+        job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+            skip_finished=True,
+            fields=managed_jobsv1_pb2.Fields(
+                fields=_MANAGED_JOB_FIELDS_TO_GET),
+        )
         job_table_response = backend_utils.invoke_skylet_with_retries(
             lambda: cloud_vm_ray_backend.SkyletClient(
                 handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
     if use_legacy:
         # Get controller version and raw job table
-        code = managed_job_utils.ManagedJobCodeGen.
+        code = managed_job_utils.ManagedJobCodeGen.get_version()
 
         returncode, output, stderr = backend.run_on_head(handle,
                                                          code,
@@ -72,7 +81,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
         if returncode != 0:
             logger.error(output + stderr)
-            raise ValueError('Failed to check controller version
+            raise ValueError('Failed to check controller version with '
                              f'returncode: {returncode}.\n{output + stderr}')
 
         # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         output_parts = output.strip().split('\n', 1)
 
         # Extract controller version from first line
-        if
-        'controller_version:'):
+        if not output_parts[0].startswith('controller_version:'):
             raise ValueError(
                 f'Expected controller version in first line, got: {output}')
 
         controller_version = output_parts[0].split(':', 1)[1]
 
-
-
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(job_table_payload + stderr)
+            raise ValueError('Failed to fetch managed jobs with returncode: '
+                             f'{returncode}.\n{job_table_payload + stderr}')
 
-
-
-            job_table_payload)
+        jobs, _, _, _, _ = (
+            managed_job_utils.load_managed_job_queue(job_table_payload))
 
     # Process locally: check version match and filter non-terminal jobs
     version_matches = (controller_version == local_version or
@@ -103,7 +121,10 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     if not version_matches and has_non_terminal_jobs:
         # Format job table locally using the same method as queue()
         formatted_job_table = managed_job_utils.format_job_table(
-            non_terminal_jobs,
+            non_terminal_jobs,
+            pool_status=None,
+            show_all=False,
+            show_user=False)
 
         error_msg = (
             f'Controller SKYLET_VERSION ({controller_version}) does not match '