skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of skypilot-nightly has been flagged as a potentially problematic release.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -1,4 +1,6 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
 import ipaddress
 import os
 import pathlib
@@ -60,6 +62,35 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
 
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -357,12 +388,15 @@ def launch(
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-            for task_ in dag.tasks:
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
                     task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
-            dag_utils.dump_chain_dag_to_yaml(dag, f.name)
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
 
             vars_to_fill = {
                 'remote_original_user_yaml_path':
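The `copy.deepcopy` above is the crux of this hunk: once submissions run on multiple threads (see the executor added further down), mutating a shared `dag` through `task_.update_envs(...)` would let ranks bleed into each other. A minimal, self-contained sketch of the hazard and the fix; `FakeTask` is illustrative, not SkyPilot's `Task` class:

```python
import copy
import threading

class FakeTask:
    """Illustrative stand-in for a DAG task; not SkyPilot's Task class."""

    def __init__(self):
        self.envs = {}

shared_tasks = [FakeTask()]
results = [None] * 4

def submit(rank: int) -> None:
    # Deep-copying first gives each thread a private task object; without
    # it, every thread would mutate the same FakeTask.envs dict and could
    # read back another thread's rank.
    tasks = copy.deepcopy(shared_tasks)
    tasks[0].envs['SKYPILOT_JOB_RANK'] = str(rank)
    results[rank] = tasks[0].envs['SKYPILOT_JOB_RANK']

threads = [threading.Thread(target=submit, args=(rank,)) for rank in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert results == ['0', '1', '2', '3']  # Each rank saw only its own copy.
```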
@@ -395,7 +429,8 @@ def launch(
 
         yaml_path = os.path.join(
             managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-            f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
+            f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+        )
         common_utils.fill_template(
             managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
             vars_to_fill,
@@ -403,7 +438,7 @@
         controller_task = task_lib.Task.from_yaml(yaml_path)
         controller_task.set_resources(controller_resources)
 
-        controller_task.managed_job_dag = dag
+        controller_task.managed_job_dag = dag_copy
         # pylint: disable=protected-access
         controller_task._metadata = metadata
 
@@ -472,15 +507,49 @@
             assert len(consolidation_mode_job_ids) == 1
             return _submit_one(consolidation_mode_job_ids[0])
 
-    ids = []
-    all_handle = None
-
-    job_id = (consolidation_mode_job_ids[0]
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-        jid, handle = _submit_one(job_id,
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+        # Extract job IDs in order
+        for res in results:
+            if res is not None:
+                ids.append(res[0])
 
     return ids, all_handle
 
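`concurrent.futures.as_completed` yields futures in completion order, not submission order, which is why the hunk above maps each future back to its rank and writes into a pre-sized `results` list instead of appending. A hedged sketch of the same pattern in isolation; `submit_one` below is a hypothetical stand-in for `_submit_one`:

```python
import concurrent.futures
import os
import random
import time
from typing import List, Optional, Tuple

def submit_one(rank: int) -> Tuple[int, str]:
    # Hypothetical stand-in for _submit_one; the random sleep shuffles
    # completion order the way real launches would.
    time.sleep(random.random() / 10)
    return 1000 + rank, f'handle-{rank}'

num_jobs = 5
results: List[Optional[Tuple[int, str]]] = [None] * num_jobs
with concurrent.futures.ThreadPoolExecutor(
        max_workers=min(num_jobs, os.cpu_count() or 1)) as executor:
    future_to_rank = {
        executor.submit(submit_one, rank): rank for rank in range(num_jobs)
    }
    for future in concurrent.futures.as_completed(future_to_rank):
        # Futures finish in arbitrary order; indexing by rank keeps the
        # returned job IDs aligned with submission order.
        results[future_to_rank[future]] = future.result()

ids = [res[0] for res in results if res is not None]
assert ids == [1000, 1001, 1002, 1003, 1004]
```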
@@ -533,7 +602,8 @@ def queue_from_kubernetes_pod(
         'kubernetes', cluster_info)[0]
 
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished)
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -646,8 +716,7 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids,
-                             None, None, None, None, None, None)
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
 
     return jobs
 
@@ -764,7 +833,8 @@ def queue_v2(
     try:
         request = managed_jobsv1_pb2.GetJobTableRequest(
             skip_finished=skip_finished,
-            accessible_workspaces=accessible_workspaces,
+            accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                workspaces=accessible_workspaces)),
             job_ids=managed_jobsv1_pb2.JobIds(
                 ids=job_ids) if job_ids is not None else None,
             workspace_match=workspace_match,
@@ -780,6 +850,8 @@
             ]) if user_hashes is not None else None,
             statuses=managed_jobsv1_pb2.Statuses(
                 statuses=statuses) if statuses is not None else None,
+            fields=managed_jobsv1_pb2.Fields(
+                fields=fields) if fields is not None else None,
             show_jobs_without_user_hash=show_jobs_without_user_hash,
         )
         response = backend_utils.invoke_skylet_with_retries(
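Wrapping repeated values in dedicated submessages (`Workspaces`, `JobIds`, `Statuses`, and the new `Fields`) lets `GetJobTableRequest` distinguish "filter not set" from "filter set to an empty list", which a bare repeated field cannot express. A sketch of that presence semantics using plain dataclasses in place of the generated protobuf classes (the real ones live in `sky.schemas.generated.managed_jobsv1_pb2`):

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Fields:
    """Stand-in for managed_jobsv1_pb2.Fields; illustrative only."""
    fields: List[str]

@dataclass
class GetJobTableRequest:
    """Stand-in for the generated request message."""
    skip_finished: bool = False
    # None means "no field filter: return every column"; Fields(fields=[])
    # would be an explicit (empty) filter. A bare repeated field could not
    # tell these two cases apart.
    fields: Optional[Fields] = None

def build_request(skip_finished: bool,
                  fields: Optional[List[str]]) -> GetJobTableRequest:
    # Mirrors the conditional-construction pattern used in queue_v2 above.
    return GetJobTableRequest(
        skip_finished=skip_finished,
        fields=Fields(fields=fields) if fields is not None else None)

assert build_request(True, None).fields is None
assert build_request(True, ['job_id', 'status']).fields.fields == [
    'job_id', 'status'
]
```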
sky/jobs/server/server.py
CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -37,7 +38,7 @@ async def launch(request: fastapi.Request,
         if consolidation_mode else api_requests.ScheduleType.LONG)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LAUNCH,
         request_body=jobs_launch_body,
         func=core.launch,
         schedule_type=schedule_type,
@@ -52,7 +53,7 @@ async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE,
         request_body=jobs_queue_body,
         func=core.queue,
         schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -66,7 +67,7 @@ async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE_V2,
         request_body=jobs_queue_body_v2,
         func=core.queue_v2_api,
         schedule_type=(api_requests.ScheduleType.LONG
@@ -81,7 +82,7 @@ async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_CANCEL,
         request_body=jobs_cancel_body,
         func=core.cancel,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -103,7 +104,7 @@ async def logs(
     executor.check_request_thread_executor_available()
     request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LOGS,
         request_body=jobs_logs_body,
         func=core.tail_logs,
         schedule_type=schedule_type,
@@ -143,7 +144,7 @@ async def download_logs(
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
         request_body=jobs_download_logs_body,
         func=core.download_logs,
         schedule_type=api_requests.ScheduleType.LONG
@@ -157,7 +158,7 @@ async def pool_apply(request: fastapi.Request,
                      jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_APPLY,
         request_body=jobs_pool_apply_body,
         func=core.pool_apply,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -170,7 +171,7 @@ async def pool_down(request: fastapi.Request,
                     jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_DOWN,
         request_body=jobs_pool_down_body,
         func=core.pool_down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -184,7 +185,7 @@ async def pool_status(
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_STATUS,
         request_body=jobs_pool_status_body,
         func=core.pool_status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -199,15 +200,15 @@ async def pool_tail_logs(
 ) -> fastapi.responses.StreamingResponse:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_LOGS,
         request_body=log_body,
         func=core.pool_tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
 
-    request_task = api_requests.
-
+    request_task = await api_requests.get_request_async(
+        request.state.request_id, fields=['request_id'])
 
     return stream_utils.stream_response_for_long_request(
         request_id=request_task.request_id,
@@ -235,7 +236,7 @@ async def pool_download_logs(
     download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.pool_sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
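Every handler now pulls its request name from the new `sky/server/requests/request_names.py` module (+80 lines in the file list above) instead of an inline literal, so names cannot drift between endpoints. The diff only shows the member names; a plausible sketch of such a registry follows, with the string values being guesses rather than the module's actual contents:

```python
import enum

class RequestName(str, enum.Enum):
    # Member names are taken from the diff; the string values are
    # illustrative guesses, not the real contents of request_names.py.
    JOBS_LAUNCH = 'jobs.launch'
    JOBS_QUEUE = 'jobs.queue'
    JOBS_QUEUE_V2 = 'jobs.queue_v2'
    JOBS_CANCEL = 'jobs.cancel'
    JOBS_LOGS = 'jobs.logs'
    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
    JOBS_POOL_APPLY = 'jobs.pool_apply'
    JOBS_POOL_DOWN = 'jobs.pool_down'
    JOBS_POOL_STATUS = 'jobs.pool_status'
    JOBS_POOL_LOGS = 'jobs.pool_logs'
    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'

# Subclassing str keeps any code that logs or compares plain strings
# working unchanged:
assert RequestName.JOBS_LAUNCH == 'jobs.launch'
```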
sky/jobs/server/utils.py
CHANGED
@@ -19,6 +19,11 @@ else:
     managed_jobsv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.managed_jobsv1_pb2')
 
+_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         )).get_managed_job_controller_version(version_request))
         controller_version = version_response.controller_version
 
-        job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+        job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+            skip_finished=True,
+            fields=managed_jobsv1_pb2.Fields(
+                fields=_MANAGED_JOB_FIELDS_TO_GET),
+        )
         job_table_response = backend_utils.invoke_skylet_with_retries(
             lambda: cloud_vm_ray_backend.SkyletClient(
                 handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
     if use_legacy:
         # Get controller version and raw job table
-        code = managed_job_utils.ManagedJobCodeGen.
+        code = managed_job_utils.ManagedJobCodeGen.get_version()
 
         returncode, output, stderr = backend.run_on_head(handle,
                                                          code,
@@ -72,7 +81,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
         if returncode != 0:
             logger.error(output + stderr)
-            raise ValueError('Failed to check controller version
+            raise ValueError('Failed to check controller version with '
                              f'returncode: {returncode}.\n{output + stderr}')
 
         # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         output_parts = output.strip().split('\n', 1)
 
         # Extract controller version from first line
-        if not output_parts[0].startswith(
-                'controller_version:'):
+        if not output_parts[0].startswith('controller_version:'):
             raise ValueError(
                 f'Expected controller version in first line, got: {output}')
 
         controller_version = output_parts[0].split(':', 1)[1]
 
-
-
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(job_table_payload + stderr)
+            raise ValueError('Failed to fetch managed jobs with returncode: '
+                             f'{returncode}.\n{job_table_payload + stderr}')
 
-
-
-            job_table_payload)
+        jobs, _, _, _, _ = (
+            managed_job_utils.load_managed_job_queue(job_table_payload))
 
         # Process locally: check version match and filter non-terminal jobs
         version_matches = (controller_version == local_version or
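On the legacy path the version probe comes back as plain stdout, so the code splits off the first line and requires a `controller_version:` prefix before fetching the job table. The parsing step in isolation, with a fabricated sample string for illustration:

```python
def parse_controller_version(output: str) -> str:
    # Mirrors the legacy parsing above: the first stdout line must carry
    # the version; anything after the first newline is other payload.
    output_parts = output.strip().split('\n', 1)
    if not output_parts[0].startswith('controller_version:'):
        raise ValueError(
            f'Expected controller version in first line, got: {output}')
    return output_parts[0].split(':', 1)[1]

# Sample output shape (illustrative, not captured from a real controller):
assert parse_controller_version('controller_version:42\npayload') == '42'
```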