skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as a potentially problematic release.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -64,7 +64,6 @@ from sky.jobs import utils as managed_job_utils
  from sky.server import config as server_config
  from sky.skylet import constants
  from sky.utils import annotations
- from sky.utils import common_utils
  from sky.utils import controller_utils
  from sky.utils import subprocess_utils

@@ -168,11 +167,12 @@ def start_controller() -> None:
  logs_dir = os.path.expanduser(
  managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
  os.makedirs(logs_dir, exist_ok=True)
- log_path = os.path.join(logs_dir, f'controller_{uuid.uuid4()}.log')
+ controller_uuid = str(uuid.uuid4())
+ log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')

  activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
  run_controller_cmd = (f'{sys.executable} -u -m'
- 'sky.jobs.controller')
+ f'sky.jobs.controller {controller_uuid}')

  run_cmd = (f'{activate_python_env_cmd}'
  f'{run_controller_cmd}')
@@ -263,6 +263,7 @@ def maybe_start_controllers(from_scheduler: bool = False) -> None:

  if started > 0:
  logger.info(f'Started {started} controllers')
+
  except filelock.Timeout:
  # If we can't get the lock, just exit. The process holding the lock
  # should launch any pending jobs.
@@ -289,9 +290,20 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
  maybe_start_controllers(from_scheduler=True)
  return

- state.scheduler_set_waiting(job_id, dag_yaml_path,
- original_user_yaml_path, env_file_path,
- common_utils.get_user_hash(), priority)
+ with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+ dag_yaml_content = dag_file.read()
+ with open(original_user_yaml_path, 'r',
+ encoding='utf-8') as original_user_yaml_file:
+ original_user_yaml_content = original_user_yaml_file.read()
+ with open(env_file_path, 'r', encoding='utf-8') as env_file:
+ env_file_content = env_file.read()
+ logger.debug(f'Storing job {job_id} file contents in database '
+ f'(DAG bytes={len(dag_yaml_content)}, '
+ f'original user yaml bytes={len(original_user_yaml_content)}, '
+ f'env bytes={len(env_file_content)}).')
+ state.scheduler_set_waiting(job_id, dag_yaml_content,
+ original_user_yaml_content, env_file_content,
+ priority)
  if state.get_ha_recovery_script(job_id) is None:
  # the run command is just the command that called scheduler
  run = (f'source {env_file_path} && '
@@ -309,7 +321,6 @@ async def scheduled_launch(
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
- job_logger: 'logging.Logger',
  ):
  """Launch as part of an ongoing job.

@@ -347,10 +358,10 @@
  starting_count = len(starting)
  if starting_count < LAUNCHES_PER_WORKER:
  break
- job_logger.info('Too many jobs starting, waiting for a slot')
+ logger.info('Too many jobs starting, waiting for a slot')
  await starting_signal.wait()

- job_logger.info(f'Starting job {job_id}')
+ logger.info(f'Starting job {job_id}')

  async with starting_lock:
  starting.add(job_id)
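
For context on the submit_job change above: instead of handing file paths to state.scheduler_set_waiting, the scheduler now reads the DAG YAML, the original user YAML, and the env file and stores their contents, presumably so a restarted controller no longer depends on the original temporary files. A minimal sketch of that read-and-persist pattern, using a hypothetical sqlite3 table rather than SkyPilot's actual state module:

import sqlite3

def store_job_files(db_path: str, job_id: int, dag_yaml_path: str,
                    user_yaml_path: str, env_file_path: str) -> None:
    """Read the three job files and store their contents keyed by job_id."""
    contents = {}
    for key, path in (('dag_yaml', dag_yaml_path),
                      ('user_yaml', user_yaml_path),
                      ('env_file', env_file_path)):
        with open(path, 'r', encoding='utf-8') as f:
            contents[key] = f.read()
    with sqlite3.connect(db_path) as conn:
        # Illustrative schema only; the real schema lives in the
        # spot_jobs DB migrations shipped in this release.
        conn.execute('CREATE TABLE IF NOT EXISTS job_files ('
                     'job_id INTEGER PRIMARY KEY, dag_yaml TEXT, '
                     'user_yaml TEXT, env_file TEXT)')
        conn.execute('INSERT OR REPLACE INTO job_files VALUES (?, ?, ?, ?)',
                     (job_id, contents['dag_yaml'], contents['user_yaml'],
                      contents['env_file']))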
sky/jobs/server/core.py CHANGED
@@ -1,4 +1,6 @@
  """SDK functions for managed jobs."""
+ import concurrent.futures
+ import copy
  import ipaddress
  import os
  import pathlib
@@ -33,6 +35,7 @@ from sky.schemas.api import responses
  from sky.serve import serve_state
  from sky.serve import serve_utils
  from sky.serve.server import impl
+ from sky.server.requests import request_names
  from sky.skylet import constants as skylet_constants
  from sky.usage import usage_lib
  from sky.utils import admin_policy_utils
@@ -60,6 +63,35 @@ else:

  logger = sky_logging.init_logger(__name__)

+ _MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+ 'job_id',
+ 'task_id',
+ 'workspace',
+ 'job_name',
+ 'task_name',
+ 'resources',
+ 'submitted_at',
+ 'end_at',
+ 'job_duration',
+ 'recovery_count',
+ 'status',
+ 'pool',
+ 'current_cluster_name',
+ 'job_id_on_pool_cluster',
+ 'start_at',
+ 'infra',
+ 'cloud',
+ 'region',
+ 'zone',
+ 'cluster_resources',
+ 'schedule_state',
+ 'details',
+ 'failure_reason',
+ 'metadata',
+ 'user_name',
+ 'user_hash',
+ ]
+

  def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
  """Upload files to the controller.
@@ -142,7 +174,8 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
  force_user_workspace=True),
  entrypoint=common_utils.get_current_command(),
  pool=pool,
- pool_hash=pool_hash))
+ pool_hash=pool_hash,
+ user_hash=common_utils.get_user_hash()))
  for task_id, task in enumerate(dag.tasks):
  resources_str = backend_utils.get_task_resources_str(
  task, is_managed_job=True)
@@ -205,7 +238,8 @@ def launch(
  # Always apply the policy again here, even though it might have been applied
  # in the CLI. This is to ensure that we apply the policy to the final DAG
  # and get the mutated config.
- dag, mutated_user_config = admin_policy_utils.apply(dag)
+ dag, mutated_user_config = admin_policy_utils.apply(
+ dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
  dag.resolve_and_validate_volumes()
  if not dag.is_chain():
  with ux_utils.print_exception_no_traceback():
@@ -336,6 +370,7 @@ def launch(
  def _submit_one(
  consolidation_mode_job_id: Optional[int] = None,
  job_rank: Optional[int] = None,
+ num_jobs: Optional[int] = None,
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
  rank_suffix = '' if job_rank is None else f'-{job_rank}'
  remote_original_user_yaml_path = (
@@ -355,11 +390,15 @@ def launch(
  ) as original_user_yaml_path:
  original_user_yaml_path.write(user_dag_str_user_specified)
  original_user_yaml_path.flush()
- for task_ in dag.tasks:
+ # Copy tasks to avoid race conditions when multiple threads modify
+ # the same dag object concurrently. Each thread needs its own copy.
+ dag_copy = copy.deepcopy(dag)
+ for task_ in dag_copy.tasks:
  if job_rank is not None:
  task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+ task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})

- dag_utils.dump_chain_dag_to_yaml(dag, f.name)
+ dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)

  vars_to_fill = {
  'remote_original_user_yaml_path':
@@ -392,7 +431,8 @@ def launch(

  yaml_path = os.path.join(
  managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
- f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
+ f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+ )
  common_utils.fill_template(
  managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
  vars_to_fill,
@@ -400,7 +440,7 @@ def launch(
  controller_task = task_lib.Task.from_yaml(yaml_path)
  controller_task.set_resources(controller_resources)

- controller_task.managed_job_dag = dag
+ controller_task.managed_job_dag = dag_copy
  # pylint: disable=protected-access
  controller_task._metadata = metadata

@@ -427,12 +467,15 @@ def launch(
  # intermediate bucket and newly created bucket should be in
  # workspace A.
  if consolidation_mode_job_id is None:
- return execution.launch(task=controller_task,
- cluster_name=controller_name,
- stream_logs=stream_logs,
- retry_until_up=True,
- fast=True,
- _disable_controller_check=True)
+ return execution.launch(
+ task=controller_task,
+ cluster_name=controller_name,
+ stream_logs=stream_logs,
+ retry_until_up=True,
+ fast=True,
+ _request_name=request_names.AdminPolicyRequestName.
+ JOBS_LAUNCH_CONTROLLER,
+ _disable_controller_check=True)
  # Manually launch the scheduler in consolidation mode.
  local_handle = backend_utils.is_controller_accessible(
  controller=controller, stopped_message='')
@@ -469,15 +512,49 @@ def launch(
  assert len(consolidation_mode_job_ids) == 1
  return _submit_one(consolidation_mode_job_ids[0])

- ids = []
- all_handle = None
- for job_rank in range(num_jobs):
- job_id = (consolidation_mode_job_ids[job_rank]
+ ids: List[int] = []
+ all_handle: Optional[backends.ResourceHandle] = None
+
+ if num_jobs == 1:
+ job_id = (consolidation_mode_job_ids[0]
  if consolidation_mode_job_ids is not None else None)
- jid, handle = _submit_one(job_id, job_rank)
+ jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
  assert jid is not None, (job_id, handle)
  ids.append(jid)
  all_handle = handle
+ else:
+ # Submit jobs in parallel using ThreadPoolExecutor
+ with concurrent.futures.ThreadPoolExecutor(
+ max_workers=min(num_jobs,
+ os.cpu_count() or 1)) as executor:
+ # Submit jobs concurrently
+ future_to_rank = {}
+ for job_rank in range(num_jobs):
+ job_id = (consolidation_mode_job_ids[job_rank]
+ if consolidation_mode_job_ids is not None else None)
+ future = executor.submit(_submit_one, job_id, job_rank,
+ num_jobs)
+ future_to_rank[future] = job_rank
+
+ # Collect results in order of job_rank to maintain consistent order.
+ results: List[Optional[Tuple[
+ int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+ for future in concurrent.futures.as_completed(future_to_rank):
+ job_rank = future_to_rank[future]
+ try:
+ jid, handle = future.result()
+ assert jid is not None, (job_id, handle)
+ results[job_rank] = (jid, handle)
+ all_handle = handle  # Keep the last handle.
+ except Exception as e:
+ logger.error(f'Error launching job {job_rank}: {e}')
+ raise e
+
+ # Extract job IDs in order
+ for res in results:
+ if res is not None:
+ ids.append(res[0])
+
  return ids, all_handle


@@ -530,7 +607,8 @@ def queue_from_kubernetes_pod(
  'kubernetes', cluster_info)[0]

  code = managed_job_utils.ManagedJobCodeGen.get_job_table(
- skip_finished=skip_finished)
+ skip_finished=skip_finished,
+ fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
  returncode, job_table_payload, stderr = managed_jobs_runner.run(
  code,
  require_outputs=True,
@@ -643,8 +721,7 @@ def queue(refresh: bool,
  does not exist.
  RuntimeError: if failed to get the managed jobs with ssh.
  """
- jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids, None,
- None, None, None, None, None, None)
+ jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)

  return jobs

@@ -662,12 +739,13 @@ def queue_v2_api(
  page: Optional[int] = None,
  limit: Optional[int] = None,
  statuses: Optional[List[str]] = None,
+ fields: Optional[List[str]] = None,
  ) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
  """Gets statuses of managed jobs and parse the
  jobs to responses.ManagedJobRecord."""
  jobs, total, status_counts, total_no_filter = queue_v2(
  refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
- name_match, pool_match, page, limit, statuses)
+ name_match, pool_match, page, limit, statuses, fields)
  return [responses.ManagedJobRecord(**job) for job in jobs
  ], total, status_counts, total_no_filter

@@ -685,6 +763,7 @@ def queue_v2(
  page: Optional[int] = None,
  limit: Optional[int] = None,
  statuses: Optional[List[str]] = None,
+ fields: Optional[List[str]] = None,
  ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
  """Gets statuses of managed jobs with filtering.
@@ -759,7 +838,8 @@ def queue_v2(
  try:
  request = managed_jobsv1_pb2.GetJobTableRequest(
  skip_finished=skip_finished,
- accessible_workspaces=accessible_workspaces,
+ accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+ workspaces=accessible_workspaces)),
  job_ids=managed_jobsv1_pb2.JobIds(
  ids=job_ids) if job_ids is not None else None,
  workspace_match=workspace_match,
@@ -775,6 +855,8 @@ def queue_v2(
  ]) if user_hashes is not None else None,
  statuses=managed_jobsv1_pb2.Statuses(
  statuses=statuses) if statuses is not None else None,
+ fields=managed_jobsv1_pb2.Fields(
+ fields=fields) if fields is not None else None,
  show_jobs_without_user_hash=show_jobs_without_user_hash,
  )
  response = backend_utils.invoke_skylet_with_retries(
@@ -789,7 +871,7 @@ def queue_v2(
  with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
  code = managed_job_utils.ManagedJobCodeGen.get_job_table(
  skip_finished, accessible_workspaces, job_ids, workspace_match,
- name_match, pool_match, page, limit, user_hashes, statuses)
+ name_match, pool_match, page, limit, user_hashes, statuses, fields)
  with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
  returncode, job_table_payload, stderr = backend.run_on_head(
  handle,
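
The new multi-job path above deep-copies the DAG for each worker, fans out _submit_one over a thread pool, and reorders the results by rank once the futures complete. A simplified, self-contained sketch of that pattern (submit_all and the spec dict are illustrative stand-ins, not SkyPilot APIs):

import concurrent.futures
import copy
import os
from typing import Any, Dict, List, Optional, Tuple

def submit_all(spec: Dict[str, Any], num_jobs: int) -> List[Tuple[int, Any]]:
    """Submit num_jobs copies of `spec` in parallel, preserving rank order."""

    def _submit_one(rank: int) -> Tuple[int, Any]:
        # Deep-copy the shared spec so concurrent env updates do not race.
        spec_copy = copy.deepcopy(spec)
        spec_copy.setdefault('envs', {}).update({
            'SKYPILOT_JOB_RANK': str(rank),
            'SKYPILOT_NUM_JOBS': str(num_jobs),
        })
        # A real implementation would launch here and return (job_id, handle).
        return rank, spec_copy

    results: List[Optional[Tuple[int, Any]]] = [None] * num_jobs
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=min(num_jobs, os.cpu_count() or 1)) as pool:
        future_to_rank = {
            pool.submit(_submit_one, rank): rank for rank in range(num_jobs)
        }
        for future in concurrent.futures.as_completed(future_to_rank):
            rank = future_to_rank[future]
            # result() re-raises any exception from the worker thread.
            results[rank] = future.result()
    return [r for r in results if r is not None]

The rank-indexed results list is what keeps the returned job IDs in submission order even though as_completed() yields futures in whatever order they finish.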
sky/jobs/server/server.py CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
  from sky.server import stream_utils
  from sky.server.requests import executor
  from sky.server.requests import payloads
+ from sky.server.requests import request_names
  from sky.server.requests import requests as api_requests
  from sky.skylet import constants
  from sky.utils import common
@@ -35,9 +36,9 @@ async def launch(request: fastapi.Request,
  consolidation_mode = managed_jobs_utils.is_consolidation_mode()
  schedule_type = (api_requests.ScheduleType.SHORT
  if consolidation_mode else api_requests.ScheduleType.LONG)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.launch',
+ request_name=request_names.RequestName.JOBS_LAUNCH,
  request_body=jobs_launch_body,
  func=core.launch,
  schedule_type=schedule_type,
@@ -50,9 +51,9 @@
  @router.post('/queue')
  async def queue(request: fastapi.Request,
  jobs_queue_body: payloads.JobsQueueBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.queue',
+ request_name=request_names.RequestName.JOBS_QUEUE,
  request_body=jobs_queue_body,
  func=core.queue,
  schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -64,9 +65,9 @@ async def queue(request: fastapi.Request,
  @router.post('/queue/v2')
  async def queue_v2(request: fastapi.Request,
  jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.queue_v2',
+ request_name=request_names.RequestName.JOBS_QUEUE_V2,
  request_body=jobs_queue_body_v2,
  func=core.queue_v2_api,
  schedule_type=(api_requests.ScheduleType.LONG
@@ -79,9 +80,9 @@ async def queue_v2(request: fastapi.Request,
  @router.post('/cancel')
  async def cancel(request: fastapi.Request,
  jobs_cancel_body: payloads.JobsCancelBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.cancel',
+ request_name=request_names.RequestName.JOBS_CANCEL,
  request_body=jobs_cancel_body,
  func=core.cancel,
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -99,27 +100,34 @@ async def logs(
  # When refresh is specified, the job controller might be restarted,
  # which takes longer time to finish. We schedule it to long executor.
  schedule_type = api_requests.ScheduleType.LONG
- request_task = executor.prepare_request(
+ if schedule_type == api_requests.ScheduleType.SHORT:
+ executor.check_request_thread_executor_available()
+ request_task = await executor.prepare_request_async(
  request_id=request.state.request_id,
- request_name='jobs.logs',
+ request_name=request_names.RequestName.JOBS_LOGS,
  request_body=jobs_logs_body,
  func=core.tail_logs,
  schedule_type=schedule_type,
  request_cluster_name=common.JOB_CONTROLLER_NAME,
  )
- if schedule_type == api_requests.ScheduleType.LONG:
- executor.schedule_prepared_request(request_task)
- else:
+ kill_request_on_disconnect = False
+ if schedule_type == api_requests.ScheduleType.SHORT:
  # For short request, run in the coroutine to avoid blocking
  # short workers.
  task = executor.execute_request_in_coroutine(request_task)
  # Cancel the coroutine after the request is done or client disconnects
  background_tasks.add_task(task.cancel)
+ else:
+ executor.schedule_prepared_request(request_task)
+ # When runs in long executor process, we should kill the request on
+ # disconnect to cancel the running routine.
+ kill_request_on_disconnect = True

- return stream_utils.stream_response(
+ return stream_utils.stream_response_for_long_request(
  request_id=request_task.request_id,
  logs_path=request_task.log_path,
  background_tasks=background_tasks,
+ kill_request_on_disconnect=kill_request_on_disconnect,
  )


@@ -134,9 +142,9 @@ async def download_logs(
  # We should reuse the original request body, so that the env vars, such as
  # user hash, are kept the same.
  jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.download_logs',
+ request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
  request_body=jobs_download_logs_body,
  func=core.download_logs,
  schedule_type=api_requests.ScheduleType.LONG
@@ -148,9 +156,9 @@
  @router.post('/pool_apply')
  async def pool_apply(request: fastapi.Request,
  jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.pool_apply',
+ request_name=request_names.RequestName.JOBS_POOL_APPLY,
  request_body=jobs_pool_apply_body,
  func=core.pool_apply,
  schedule_type=api_requests.ScheduleType.LONG,
@@ -161,9 +169,9 @@ async def pool_apply(request: fastapi.Request,
  @router.post('/pool_down')
  async def pool_down(request: fastapi.Request,
  jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.pool_down',
+ request_name=request_names.RequestName.JOBS_POOL_DOWN,
  request_body=jobs_pool_down_body,
  func=core.pool_down,
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -175,9 +183,9 @@ async def pool_down(request: fastapi.Request,
  async def pool_status(
  request: fastapi.Request,
  jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.pool_status',
+ request_name=request_names.RequestName.JOBS_POOL_STATUS,
  request_body=jobs_pool_status_body,
  func=core.pool_status,
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -190,21 +198,25 @@ async def pool_tail_logs(
  request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
  background_tasks: fastapi.BackgroundTasks
  ) -> fastapi.responses.StreamingResponse:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.pool_logs',
+ request_name=request_names.RequestName.JOBS_POOL_LOGS,
  request_body=log_body,
  func=core.pool_tail_logs,
  schedule_type=api_requests.ScheduleType.SHORT,
  request_cluster_name=common.JOB_CONTROLLER_NAME,
  )

- request_task = api_requests.get_request(request.state.request_id)
+ request_task = await api_requests.get_request_async(
+ request.state.request_id, fields=['request_id'])

- return stream_utils.stream_response(
+ return stream_utils.stream_response_for_long_request(
  request_id=request_task.request_id,
+ # req.log_path is derived from request_id,
+ # so it's ok to just grab the request_id in the above query.
  logs_path=request_task.log_path,
  background_tasks=background_tasks,
+ kill_request_on_disconnect=True,
  )


@@ -222,9 +234,9 @@ async def pool_download_logs(
  # We should reuse the original request body, so that the env vars, such as
  # user hash, are kept the same.
  download_logs_body.local_dir = str(logs_dir_on_api_server)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='jobs.pool_sync_down_logs',
+ request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
  request_body=download_logs_body,
  func=core.pool_sync_down_logs,
  schedule_type=api_requests.ScheduleType.SHORT,
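
The endpoint changes above swap the synchronous executor.schedule_request for an awaited schedule_request_async and replace string request names with enum members from the new request_names module. A rough sketch of the shape of such an endpoint; the RequestName enum and the schedule_request_async stub below are stand-ins for illustration, not SkyPilot's actual executor API:

import asyncio
import enum

import fastapi

class RequestName(str, enum.Enum):
    JOBS_QUEUE = 'jobs.queue'

router = fastapi.APIRouter()

async def schedule_request_async(request_id: str, request_name: RequestName,
                                 payload: dict) -> None:
    # Stand-in for persisting the request and waking a worker without
    # blocking the event loop (e.g. an async DB write).
    await asyncio.sleep(0)

@router.post('/queue')
async def queue(request: fastapi.Request, payload: dict) -> None:
    # The handler awaits the scheduling call instead of blocking the event
    # loop, and passes an enum member rather than a bare string name.
    await schedule_request_async(
        request_id=request.headers.get('x-request-id', 'unknown'),
        request_name=RequestName.JOBS_QUEUE,
        payload=payload)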
sky/jobs/server/utils.py CHANGED
@@ -19,6 +19,11 @@ else:
  managed_jobsv1_pb2 = adaptors_common.LazyImport(
  'sky.schemas.generated.managed_jobsv1_pb2')

+ _MANAGED_JOB_FIELDS_TO_GET = [
+ 'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+ ]
+

  def check_version_mismatch_and_non_terminal_jobs() -> None:
  """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
  )).get_managed_job_controller_version(version_request))
  controller_version = version_response.controller_version

- job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
+ job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+ skip_finished=True,
+ fields=managed_jobsv1_pb2.Fields(
+ fields=_MANAGED_JOB_FIELDS_TO_GET),
+ )
  job_table_response = backend_utils.invoke_skylet_with_retries(
  lambda: cloud_vm_ray_backend.SkyletClient(
  handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:

  if use_legacy:
  # Get controller version and raw job table
- code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
+ code = managed_job_utils.ManagedJobCodeGen.get_version()

  returncode, output, stderr = backend.run_on_head(handle,
  code,
@@ -72,7 +81,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:

  if returncode != 0:
  logger.error(output + stderr)
- raise ValueError('Failed to check controller version and jobs with '
+ raise ValueError('Failed to check controller version with '
  f'returncode: {returncode}.\n{output + stderr}')

  # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
  output_parts = output.strip().split('\n', 1)

  # Extract controller version from first line
- if len(output_parts) < 2 or not output_parts[0].startswith(
- 'controller_version:'):
+ if not output_parts[0].startswith('controller_version:'):
  raise ValueError(
  f'Expected controller version in first line, got: {output}')

  controller_version = output_parts[0].split(':', 1)[1]

- # Rest is job table payload (preserving any newlines within it)
- job_table_payload = output_parts[1]
+ code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+ skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
+ returncode, job_table_payload, stderr = backend.run_on_head(
+ handle,
+ code,
+ require_outputs=True,
+ stream_logs=False,
+ separate_stderr=True)
+
+ if returncode != 0:
+ logger.error(job_table_payload + stderr)
+ raise ValueError('Failed to fetch managed jobs with returncode: '
+ f'{returncode}.\n{job_table_payload + stderr}')

- # Load and filter jobs locally using existing method
- jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
- job_table_payload)
+ jobs, _, _, _, _ = (
+ managed_job_utils.load_managed_job_queue(job_table_payload))

  # Process locally: check version match and filter non-terminal jobs
  version_matches = (controller_version == local_version or
@@ -103,7 +121,10 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
  if not version_matches and has_non_terminal_jobs:
  # Format job table locally using the same method as queue()
  formatted_job_table = managed_job_utils.format_job_table(
- non_terminal_jobs, show_all=False, show_user=False)
+ non_terminal_jobs,
+ pool_status=None,
+ show_all=False,
+ show_user=False)

  error_msg = (
  f'Controller SKYLET_VERSION ({controller_version}) does not match '
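
On the legacy path above, the controller version and the job table are now fetched by two separate remote commands, with the job table restricted to the columns in _MANAGED_JOB_FIELDS_TO_GET. A small sketch of the version parsing applied to the first command's output (parse_controller_version is a hypothetical helper name, not part of SkyPilot):

def parse_controller_version(output: str) -> str:
    """Extract the version from a leading 'controller_version:<value>' line."""
    first_line = output.strip().split('\n', 1)[0]
    if not first_line.startswith('controller_version:'):
        raise ValueError(
            f'Expected controller version in first line, got: {output}')
    return first_line.split(':', 1)[1]

assert parse_controller_version('controller_version:12\nother output') == '12'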