skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -1,4 +1,6 @@
1
1
  """SDK functions for managed jobs."""
2
+ import concurrent.futures
3
+ import copy
2
4
  import ipaddress
3
5
  import os
4
6
  import pathlib
@@ -60,6 +62,35 @@ else:
60
62
 
61
63
  logger = sky_logging.init_logger(__name__)
62
64
 
65
+ _MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
66
+ 'job_id',
67
+ 'task_id',
68
+ 'workspace',
69
+ 'job_name',
70
+ 'task_name',
71
+ 'resources',
72
+ 'submitted_at',
73
+ 'end_at',
74
+ 'job_duration',
75
+ 'recovery_count',
76
+ 'status',
77
+ 'pool',
78
+ 'current_cluster_name',
79
+ 'job_id_on_pool_cluster',
80
+ 'start_at',
81
+ 'infra',
82
+ 'cloud',
83
+ 'region',
84
+ 'zone',
85
+ 'cluster_resources',
86
+ 'schedule_state',
87
+ 'details',
88
+ 'failure_reason',
89
+ 'metadata',
90
+ 'user_name',
91
+ 'user_hash',
92
+ ]
93
+
63
94
 
64
95
  def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
65
96
  """Upload files to the controller.
@@ -357,12 +388,15 @@ def launch(
357
388
  ) as original_user_yaml_path:
358
389
  original_user_yaml_path.write(user_dag_str_user_specified)
359
390
  original_user_yaml_path.flush()
360
- for task_ in dag.tasks:
391
+ # Copy tasks to avoid race conditions when multiple threads modify
392
+ # the same dag object concurrently. Each thread needs its own copy.
393
+ dag_copy = copy.deepcopy(dag)
394
+ for task_ in dag_copy.tasks:
361
395
  if job_rank is not None:
362
396
  task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
363
397
  task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
364
398
 
365
- dag_utils.dump_chain_dag_to_yaml(dag, f.name)
399
+ dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
366
400
 
367
401
  vars_to_fill = {
368
402
  'remote_original_user_yaml_path':
@@ -395,7 +429,8 @@ def launch(
395
429
 
396
430
  yaml_path = os.path.join(
397
431
  managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
398
- f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
432
+ f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
433
+ )
399
434
  common_utils.fill_template(
400
435
  managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
401
436
  vars_to_fill,
@@ -403,7 +438,7 @@ def launch(
403
438
  controller_task = task_lib.Task.from_yaml(yaml_path)
404
439
  controller_task.set_resources(controller_resources)
405
440
 
406
- controller_task.managed_job_dag = dag
441
+ controller_task.managed_job_dag = dag_copy
407
442
  # pylint: disable=protected-access
408
443
  controller_task._metadata = metadata
409
444
 
@@ -472,15 +507,49 @@ def launch(
472
507
  assert len(consolidation_mode_job_ids) == 1
473
508
  return _submit_one(consolidation_mode_job_ids[0])
474
509
 
475
- ids = []
476
- all_handle = None
477
- for job_rank in range(num_jobs):
478
- job_id = (consolidation_mode_job_ids[job_rank]
510
+ ids: List[int] = []
511
+ all_handle: Optional[backends.ResourceHandle] = None
512
+
513
+ if num_jobs == 1:
514
+ job_id = (consolidation_mode_job_ids[0]
479
515
  if consolidation_mode_job_ids is not None else None)
480
- jid, handle = _submit_one(job_id, job_rank, num_jobs=num_jobs)
516
+ jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
481
517
  assert jid is not None, (job_id, handle)
482
518
  ids.append(jid)
483
519
  all_handle = handle
520
+ else:
521
+ # Submit jobs in parallel using ThreadPoolExecutor
522
+ with concurrent.futures.ThreadPoolExecutor(
523
+ max_workers=min(num_jobs,
524
+ os.cpu_count() or 1)) as executor:
525
+ # Submit jobs concurrently
526
+ future_to_rank = {}
527
+ for job_rank in range(num_jobs):
528
+ job_id = (consolidation_mode_job_ids[job_rank]
529
+ if consolidation_mode_job_ids is not None else None)
530
+ future = executor.submit(_submit_one, job_id, job_rank,
531
+ num_jobs)
532
+ future_to_rank[future] = job_rank
533
+
534
+ # Collect results in order of job_rank to maintain consistent order.
535
+ results: List[Optional[Tuple[
536
+ int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
537
+ for future in concurrent.futures.as_completed(future_to_rank):
538
+ job_rank = future_to_rank[future]
539
+ try:
540
+ jid, handle = future.result()
541
+ assert jid is not None, (job_id, handle)
542
+ results[job_rank] = (jid, handle)
543
+ all_handle = handle # Keep the last handle.
544
+ except Exception as e:
545
+ logger.error(f'Error launching job {job_rank}: {e}')
546
+ raise e
547
+
548
+ # Extract job IDs in order
549
+ for res in results:
550
+ if res is not None:
551
+ ids.append(res[0])
552
+
484
553
  return ids, all_handle
485
554
 
486
555
 
@@ -533,7 +602,8 @@ def queue_from_kubernetes_pod(
533
602
  'kubernetes', cluster_info)[0]
534
603
 
535
604
  code = managed_job_utils.ManagedJobCodeGen.get_job_table(
536
- skip_finished=skip_finished)
605
+ skip_finished=skip_finished,
606
+ fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
537
607
  returncode, job_table_payload, stderr = managed_jobs_runner.run(
538
608
  code,
539
609
  require_outputs=True,
@@ -646,8 +716,7 @@ def queue(refresh: bool,
646
716
  does not exist.
647
717
  RuntimeError: if failed to get the managed jobs with ssh.
648
718
  """
649
- jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids, None,
650
- None, None, None, None, None, None)
719
+ jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
651
720
 
652
721
  return jobs
653
722
 
@@ -764,7 +833,8 @@ def queue_v2(
764
833
  try:
765
834
  request = managed_jobsv1_pb2.GetJobTableRequest(
766
835
  skip_finished=skip_finished,
767
- accessible_workspaces=accessible_workspaces,
836
+ accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
837
+ workspaces=accessible_workspaces)),
768
838
  job_ids=managed_jobsv1_pb2.JobIds(
769
839
  ids=job_ids) if job_ids is not None else None,
770
840
  workspace_match=workspace_match,
@@ -780,6 +850,8 @@ def queue_v2(
780
850
  ]) if user_hashes is not None else None,
781
851
  statuses=managed_jobsv1_pb2.Statuses(
782
852
  statuses=statuses) if statuses is not None else None,
853
+ fields=managed_jobsv1_pb2.Fields(
854
+ fields=fields) if fields is not None else None,
783
855
  show_jobs_without_user_hash=show_jobs_without_user_hash,
784
856
  )
785
857
  response = backend_utils.invoke_skylet_with_retries(
sky/jobs/server/server.py CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
11
11
  from sky.server import stream_utils
12
12
  from sky.server.requests import executor
13
13
  from sky.server.requests import payloads
14
+ from sky.server.requests import request_names
14
15
  from sky.server.requests import requests as api_requests
15
16
  from sky.skylet import constants
16
17
  from sky.utils import common
@@ -37,7 +38,7 @@ async def launch(request: fastapi.Request,
37
38
  if consolidation_mode else api_requests.ScheduleType.LONG)
38
39
  await executor.schedule_request_async(
39
40
  request_id=request.state.request_id,
40
- request_name='jobs.launch',
41
+ request_name=request_names.RequestName.JOBS_LAUNCH,
41
42
  request_body=jobs_launch_body,
42
43
  func=core.launch,
43
44
  schedule_type=schedule_type,
@@ -52,7 +53,7 @@ async def queue(request: fastapi.Request,
52
53
  jobs_queue_body: payloads.JobsQueueBody) -> None:
53
54
  await executor.schedule_request_async(
54
55
  request_id=request.state.request_id,
55
- request_name='jobs.queue',
56
+ request_name=request_names.RequestName.JOBS_QUEUE,
56
57
  request_body=jobs_queue_body,
57
58
  func=core.queue,
58
59
  schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -66,7 +67,7 @@ async def queue_v2(request: fastapi.Request,
66
67
  jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
67
68
  await executor.schedule_request_async(
68
69
  request_id=request.state.request_id,
69
- request_name='jobs.queue_v2',
70
+ request_name=request_names.RequestName.JOBS_QUEUE_V2,
70
71
  request_body=jobs_queue_body_v2,
71
72
  func=core.queue_v2_api,
72
73
  schedule_type=(api_requests.ScheduleType.LONG
@@ -81,7 +82,7 @@ async def cancel(request: fastapi.Request,
81
82
  jobs_cancel_body: payloads.JobsCancelBody) -> None:
82
83
  await executor.schedule_request_async(
83
84
  request_id=request.state.request_id,
84
- request_name='jobs.cancel',
85
+ request_name=request_names.RequestName.JOBS_CANCEL,
85
86
  request_body=jobs_cancel_body,
86
87
  func=core.cancel,
87
88
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -103,7 +104,7 @@ async def logs(
103
104
  executor.check_request_thread_executor_available()
104
105
  request_task = await executor.prepare_request_async(
105
106
  request_id=request.state.request_id,
106
- request_name='jobs.logs',
107
+ request_name=request_names.RequestName.JOBS_LOGS,
107
108
  request_body=jobs_logs_body,
108
109
  func=core.tail_logs,
109
110
  schedule_type=schedule_type,
@@ -143,7 +144,7 @@ async def download_logs(
143
144
  jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
144
145
  await executor.schedule_request_async(
145
146
  request_id=request.state.request_id,
146
- request_name='jobs.download_logs',
147
+ request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
147
148
  request_body=jobs_download_logs_body,
148
149
  func=core.download_logs,
149
150
  schedule_type=api_requests.ScheduleType.LONG
@@ -157,7 +158,7 @@ async def pool_apply(request: fastapi.Request,
157
158
  jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
158
159
  await executor.schedule_request_async(
159
160
  request_id=request.state.request_id,
160
- request_name='jobs.pool_apply',
161
+ request_name=request_names.RequestName.JOBS_POOL_APPLY,
161
162
  request_body=jobs_pool_apply_body,
162
163
  func=core.pool_apply,
163
164
  schedule_type=api_requests.ScheduleType.LONG,
@@ -170,7 +171,7 @@ async def pool_down(request: fastapi.Request,
170
171
  jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
171
172
  await executor.schedule_request_async(
172
173
  request_id=request.state.request_id,
173
- request_name='jobs.pool_down',
174
+ request_name=request_names.RequestName.JOBS_POOL_DOWN,
174
175
  request_body=jobs_pool_down_body,
175
176
  func=core.pool_down,
176
177
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -184,7 +185,7 @@ async def pool_status(
184
185
  jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
185
186
  await executor.schedule_request_async(
186
187
  request_id=request.state.request_id,
187
- request_name='jobs.pool_status',
188
+ request_name=request_names.RequestName.JOBS_POOL_STATUS,
188
189
  request_body=jobs_pool_status_body,
189
190
  func=core.pool_status,
190
191
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -199,15 +200,15 @@ async def pool_tail_logs(
199
200
  ) -> fastapi.responses.StreamingResponse:
200
201
  await executor.schedule_request_async(
201
202
  request_id=request.state.request_id,
202
- request_name='jobs.pool_logs',
203
+ request_name=request_names.RequestName.JOBS_POOL_LOGS,
203
204
  request_body=log_body,
204
205
  func=core.pool_tail_logs,
205
206
  schedule_type=api_requests.ScheduleType.SHORT,
206
207
  request_cluster_name=common.JOB_CONTROLLER_NAME,
207
208
  )
208
209
 
209
- request_task = api_requests.get_request(request.state.request_id,
210
- fields=['request_id'])
210
+ request_task = await api_requests.get_request_async(
211
+ request.state.request_id, fields=['request_id'])
211
212
 
212
213
  return stream_utils.stream_response_for_long_request(
213
214
  request_id=request_task.request_id,
@@ -235,7 +236,7 @@ async def pool_download_logs(
235
236
  download_logs_body.local_dir = str(logs_dir_on_api_server)
236
237
  await executor.schedule_request_async(
237
238
  request_id=request.state.request_id,
238
- request_name='jobs.pool_sync_down_logs',
239
+ request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
239
240
  request_body=download_logs_body,
240
241
  func=core.pool_sync_down_logs,
241
242
  schedule_type=api_requests.ScheduleType.SHORT,
sky/jobs/server/utils.py CHANGED
@@ -19,6 +19,11 @@ else:
19
19
  managed_jobsv1_pb2 = adaptors_common.LazyImport(
20
20
  'sky.schemas.generated.managed_jobsv1_pb2')
21
21
 
22
+ _MANAGED_JOB_FIELDS_TO_GET = [
23
+ 'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
24
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
25
+ ]
26
+
22
27
 
23
28
  def check_version_mismatch_and_non_terminal_jobs() -> None:
24
29
  """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
50
55
  )).get_managed_job_controller_version(version_request))
51
56
  controller_version = version_response.controller_version
52
57
 
53
- job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
58
+ job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
59
+ skip_finished=True,
60
+ fields=managed_jobsv1_pb2.Fields(
61
+ fields=_MANAGED_JOB_FIELDS_TO_GET),
62
+ )
54
63
  job_table_response = backend_utils.invoke_skylet_with_retries(
55
64
  lambda: cloud_vm_ray_backend.SkyletClient(
56
65
  handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
62
71
 
63
72
  if use_legacy:
64
73
  # Get controller version and raw job table
65
- code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
74
+ code = managed_job_utils.ManagedJobCodeGen.get_version()
66
75
 
67
76
  returncode, output, stderr = backend.run_on_head(handle,
68
77
  code,
@@ -72,7 +81,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
72
81
 
73
82
  if returncode != 0:
74
83
  logger.error(output + stderr)
75
- raise ValueError('Failed to check controller version and jobs with '
84
+ raise ValueError('Failed to check controller version with '
76
85
  f'returncode: {returncode}.\n{output + stderr}')
77
86
 
78
87
  # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
80
89
  output_parts = output.strip().split('\n', 1)
81
90
 
82
91
  # Extract controller version from first line
83
- if len(output_parts) < 2 or not output_parts[0].startswith(
84
- 'controller_version:'):
92
+ if not output_parts[0].startswith('controller_version:'):
85
93
  raise ValueError(
86
94
  f'Expected controller version in first line, got: {output}')
87
95
 
88
96
  controller_version = output_parts[0].split(':', 1)[1]
89
97
 
90
- # Rest is job table payload (preserving any newlines within it)
91
- job_table_payload = output_parts[1]
98
+ code = managed_job_utils.ManagedJobCodeGen.get_job_table(
99
+ skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
100
+ returncode, job_table_payload, stderr = backend.run_on_head(
101
+ handle,
102
+ code,
103
+ require_outputs=True,
104
+ stream_logs=False,
105
+ separate_stderr=True)
106
+
107
+ if returncode != 0:
108
+ logger.error(job_table_payload + stderr)
109
+ raise ValueError('Failed to fetch managed jobs with returncode: '
110
+ f'{returncode}.\n{job_table_payload + stderr}')
92
111
 
93
- # Load and filter jobs locally using existing method
94
- jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
95
- job_table_payload)
112
+ jobs, _, _, _, _ = (
113
+ managed_job_utils.load_managed_job_queue(job_table_payload))
96
114
 
97
115
  # Process locally: check version match and filter non-terminal jobs
98
116
  version_matches = (controller_version == local_version or