skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py CHANGED
@@ -10,7 +10,7 @@ import logging
 import os
 import traceback
 import typing
-from typing import Optional, Set
+from typing import List, Optional, Set
 
 from sky import backends
 from sky import dag as dag_lib
@@ -30,6 +30,7 @@ from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import env_options
+from sky.utils import instance_links as instance_links_utils
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -74,6 +75,7 @@ class StrategyExecutor:
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
+        recover_on_exit_codes: Optional[List[int]] = None,
     ) -> None:
         """Initialize the strategy executor.
 
@@ -87,6 +89,8 @@ class StrategyExecutor:
             starting: Set of job IDs that are currently starting.
             starting_lock: Lock to synchronize starting jobs.
             starting_signal: Condition to signal when a job can start.
+            recover_on_exit_codes: List of exit codes that should trigger
+                recovery regardless of max_restarts_on_errors limit.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -99,6 +103,7 @@ class StrategyExecutor:
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
+        self.recover_on_exit_codes = recover_on_exit_codes or []
         self.job_id = job_id
         self.task_id = task_id
         self.pool = pool
@@ -123,6 +128,9 @@ class StrategyExecutor:
     ) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
+        # TODO(cooperc): Consider defaulting to FAILOVER if using k8s with a
+        # single context, since there are not multiple clouds/regions to
+        # failover through.
         resource_list = list(task.resources)
         job_recovery = resource_list[0].job_recovery
         for resource in resource_list:
@@ -144,16 +152,26 @@ class StrategyExecutor:
             job_recovery_name: Optional[str] = name
             max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
                                                       0)
+            recover_exit_codes = job_recovery.pop('recover_on_exit_codes', None)
+            # Normalize single integer to list
+            recover_on_exit_codes: Optional[List[int]] = None
+            if isinstance(recover_exit_codes, int):
+                recover_on_exit_codes = [recover_exit_codes]
+            elif isinstance(recover_exit_codes, list):
+                recover_on_exit_codes = [
+                    int(code) for code in recover_exit_codes
+                ]
         else:
             job_recovery_name = job_recovery
             max_restarts_on_errors = 0
+            recover_on_exit_codes = None
         job_recovery_strategy = (registry.JOBS_RECOVERY_STRATEGY_REGISTRY.
                                  from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
                                      pool, starting, starting_lock,
-                                     starting_signal)
+                                     starting_signal, recover_on_exit_codes)
 
     async def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -275,19 +293,25 @@ class StrategyExecutor:
                 break
 
             try:
-                status = await managed_job_utils.get_job_status(
-                    self.backend,
-                    self.cluster_name,
-                    job_id=self.job_id_on_pool_cluster)
+                status, transient_error_reason = (
+                    await managed_job_utils.get_job_status(
+                        self.backend,
+                        self.cluster_name,
+                        job_id=self.job_id_on_pool_cluster))
             except Exception as e:  # pylint: disable=broad-except
+                transient_error_reason = common_utils.format_exception(e)
                 # If any unexpected error happens, retry the job checking
                 # loop.
                 # Note: the CommandError is already handled in the
                 # get_job_status, so it should not happen here.
                 # TODO(zhwu): log the unexpected error to usage collection
                 # for future debugging.
-                logger.info(f'Unexpected exception: {e}\nFailed to get the '
-                            'job status. Retrying.')
+                logger.info('Unexpected exception during fetching job status: '
+                            f'{common_utils.format_exception(e)}')
+                continue
+            if transient_error_reason is not None:
+                logger.info('Transient error when fetching the job status: '
+                            f'{transient_error_reason}')
                 continue
 
             # Check the job status until it is not in initialized status
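
After this change, `get_job_status` returns a `(status, transient_error_reason)` pair, and both unexpected exceptions and transient errors retry the polling loop rather than abort it. A hypothetical standalone sketch of the resulting loop shape (`fetch` stands in for the real `get_job_status` call; the sleep interval is an assumption, since the real loop polls on its own schedule):

    import asyncio

    async def poll_until_status(fetch):
        # fetch() is assumed to return (status, transient_error_reason).
        while True:
            try:
                status, transient_error_reason = await fetch()
            except Exception as e:  # pylint: disable=broad-except
                print(f'Unexpected exception while fetching job status: {e}')
                await asyncio.sleep(1)
                continue
            if transient_error_reason is not None:
                print(f'Transient error: {transient_error_reason}')
                await asyncio.sleep(1)
                continue
            return status
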
@@ -444,9 +468,16 @@ class StrategyExecutor:
                     raise
                 logger.info('Managed job cluster launched.')
             else:
+                # Get task resources from DAG for resource-aware
+                # scheduling.
+                task_resources = None
+                if self.dag.tasks:
+                    task = self.dag.tasks[self.task_id]
+                    task_resources = task.resources
+
                 self.cluster_name = await (context_utils.to_thread(
                     serve_utils.get_next_cluster_name, self.pool,
-                    self.job_id))
+                    self.job_id, task_resources))
                 if self.cluster_name is None:
                     raise exceptions.NoClusterLaunchedError(
                         'No cluster name found in the pool.')
@@ -537,6 +568,52 @@ class StrategyExecutor:
                 # At this point, a sky.launch() has succeeded. Cluster
                 # may be UP (no preemption since) or DOWN (newly
                 # preempted).
+                # Auto-populate instance links if cluster is on a real
+                # cloud.
+                if self.cluster_name is not None and self.pool is None:
+                    try:
+                        handle = await context_utils.to_thread(
+                            global_user_state.
+                            get_handle_from_cluster_name,
+                            self.cluster_name)
+                        if (handle is not None and hasattr(
+                                handle, 'cached_cluster_info') and
+                                handle.cached_cluster_info is not None):
+                            cluster_info = handle.cached_cluster_info
+                            instance_links = (instance_links_utils.
+                                              generate_instance_links(
+                                                  cluster_info,
+                                                  self.cluster_name))
+                            if instance_links:
+                                # Store instance links directly in
+                                # the database.
+                                await state.update_links_async(
+                                    self.job_id, self.task_id,
+                                    instance_links)
+                                logger.debug(
+                                    f'Auto-populated instance links: '
+                                    f'{instance_links}')
+                            else:
+                                logger.debug('Failed to generate '
+                                             'instance links')
+                        else:
+                            logger.debug(
+                                'Cluster handle not found or '
+                                'cached cluster info is None, so '
+                                'not populating instance links')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # Don't fail the launch if we can't generate
+                        # links.
+                        logger.debug(
+                            'Failed to auto-populate instance links: '
+                            f'{e}')
+                else:
+                    if self.pool:
+                        logger.debug('Not populating instance links '
+                                     'since the cluster is for a pool')
+                    else:
+                        logger.debug('Not populating instance links '
+                                     'since the cluster name is None')
                 job_submitted_at = await (
                     self._wait_until_job_starts_on_cluster())
                 if job_submitted_at is not None:
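
The instance-link population above is deliberately best-effort: every failure path is logged at debug level and none of them can fail the launch. A compact sketch of that guard pattern (the helper and its parameters are hypothetical):

    def populate_links_best_effort(generate, store, cluster_info, cluster_name):
        # Link generation must never fail the launch; swallow and log.
        try:
            links = generate(cluster_info, cluster_name)
            if links:
                store(links)
        except Exception as e:  # pylint: disable=broad-except
            print(f'Failed to auto-populate instance links: {e}')
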
@@ -589,15 +666,35 @@ class StrategyExecutor:
             # NoClusterLaunchedError.
             assert False, 'Unreachable'
 
-    def should_restart_on_failure(self) -> bool:
+    def should_restart_on_failure(self,
+                                  exit_codes: Optional[List[int]] = None
+                                 ) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
 
+        Args:
+            exit_codes: List of exit codes from the failed job. If any exit
+                code matches recover_on_exit_codes, recovery will be triggered
+                regardless of max_restarts_on_errors limit.
+
         Returns:
             True if the job should be restarted, otherwise False.
         """
+        # Check if any exit code matches the configured recover_on_exit_codes.
+        # This triggers recovery without incrementing the counter.
+        if exit_codes and self.recover_on_exit_codes:
+            for exit_code in exit_codes:
+                if exit_code in self.recover_on_exit_codes:
+                    logger.info(f'Exit code {exit_code} matched '
+                                'recover_on_exit_codes, triggering recovery')
+                    return True
+
+        # Otherwise, check the max_restarts_on_errors counter.
         self.restart_cnt_on_failure += 1
         if self.restart_cnt_on_failure > self.max_restarts_on_errors:
            return False
+        logger.info(f'Restart count {self.restart_cnt_on_failure} '
+                    'is less than max_restarts_on_errors, '
+                    'restarting job')
         return True
 
 
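The restart decision above has two tiers: an exit code listed in `recover_on_exit_codes` always triggers recovery without touching the counter, while any other failure consumes one unit of the `max_restarts_on_errors` budget. A hypothetical standalone model of that decision (the real method lives on `StrategyExecutor` and logs via `logger`):

    class RestartPolicy:
        def __init__(self, max_restarts_on_errors, recover_on_exit_codes=None):
            self.max_restarts_on_errors = max_restarts_on_errors
            self.recover_on_exit_codes = recover_on_exit_codes or []
            self.restart_cnt_on_failure = 0

        def should_restart(self, exit_codes=None):
            # A matching exit code recovers without incrementing the counter.
            if exit_codes and self.recover_on_exit_codes:
                if any(c in self.recover_on_exit_codes for c in exit_codes):
                    return True
            self.restart_cnt_on_failure += 1
            return self.restart_cnt_on_failure <= self.max_restarts_on_errors

    policy = RestartPolicy(max_restarts_on_errors=1, recover_on_exit_codes=[137])
    assert policy.should_restart([137])    # matched code: always recover
    assert policy.should_restart([1])      # first counted restart (1 <= 1)
    assert not policy.should_restart([1])  # restart budget exhausted
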
@@ -620,10 +717,11 @@ class FailoverStrategyExecutor(StrategyExecutor):
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
+        recover_on_exit_codes: Optional[List[int]] = None,
     ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
                          job_id, task_id, pool, starting, starting_lock,
-                         starting_signal)
+                         starting_signal, recover_on_exit_codes)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
sky/jobs/server/core.py CHANGED
@@ -25,6 +25,7 @@ from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
+from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
@@ -93,6 +94,51 @@ _MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
 ]
 
 
+def _warn_file_mounts_rolling_update(dag: 'sky.Dag') -> None:
+    """Warn if local file mounts or workdir may be lost during rolling update.
+
+    When rolling update is enabled with consolidation mode but no jobs bucket
+    is configured, local file mounts and workdirs are stored locally on the
+    API server pod and will be lost during a rolling update.
+    """
+    # If rolling update is not enabled, don't warn.
+    if os.environ.get(skylet_constants.SKYPILOT_ROLLING_UPDATE_ENABLED) is None:
+        return
+
+    # If consolidation mode is not enabled, don't warn.
+    if not managed_job_utils.is_consolidation_mode():
+        return
+
+    # If a jobs bucket is configured, don't warn.
+    if skypilot_config.get_nested(('jobs', 'bucket'), None) is not None:
+        return
+
+    # Check if any task has local file_mounts (not cloud store URLs) or workdir
+    has_local_file_mounts = False
+    has_local_workdir = False
+    for task_ in dag.tasks:
+        if task_.file_mounts:
+            for src in task_.file_mounts.values():
+                if not data_utils.is_cloud_store_url(src):
+                    has_local_file_mounts = True
+                    break
+        if task_.workdir and isinstance(task_.workdir, str):
+            has_local_workdir = True
+            break
+        if has_local_file_mounts:
+            break
+
+    if not has_local_file_mounts and not has_local_workdir:
+        return
+
+    logger.warning(
+        f'{colorama.Fore.YELLOW}WARNING: Local file mounts or workdir '
+        'detected with rolling update enabled for the API server. To persist '
+        'files across API server restarts/updates, use buckets, volumes, or '
+        'git for your file mounts; or configure a bucket in your SkyPilot '
+        f'config under `jobs.bucket`.{colorama.Style.RESET_ALL}')
+
+
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
 
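The scan above classifies each file-mount source with `data_utils.is_cloud_store_url`, so only local paths trigger the warning. A small illustration (the example mounts are made up; the exact set of recognized URL schemes lives in `sky.data.data_utils`):

    from sky.data import data_utils

    file_mounts = {
        '/remote/data': 's3://my-bucket/data',  # cloud source: no warning
        '/remote/code': './local-code',         # local source: warns
    }
    local_srcs = [
        src for src in file_mounts.values()
        if not data_utils.is_cloud_store_url(src)
    ]
    print(local_srcs)  # ['./local-code']
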
@@ -103,14 +149,21 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """
     local_to_controller_file_mounts: Dict[str, str] = {}
 
-    # For consolidation mode, we don't need to use cloud storage,
-    # as uploading to the controller is only a local copy.
+    # Check if the user has explicitly configured a bucket for jobs.
+    # If so, we should use cloud storage even in consolidation mode to persist
+    # files across rolling updates and pod restarts.
+    has_explicit_bucket = skypilot_config.get_nested(('jobs', 'bucket'),
+                                                     None) is not None
     storage_clouds = (
         storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
     force_disable_cloud_bucket = skypilot_config.get_nested(
         ('jobs', 'force_disable_cloud_bucket'), False)
-    if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
-            not force_disable_cloud_bucket):
+    # Use cloud storage if (1) not in consolidation mode, or (2) in
+    # consolidation mode but the user has an explicit bucket configured;
+    # in either case, storage clouds must be available and the cloud
+    # bucket must not be force-disabled.
+    if ((not managed_job_utils.is_consolidation_mode() or has_explicit_bucket)
+            and storage_clouds and not force_disable_cloud_bucket):
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
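
The rewritten condition is easiest to read as a small boolean function; a hedged restatement (the function name and flat argument list are illustrative only):

    def use_cloud_storage(consolidation_mode: bool, has_explicit_bucket: bool,
                          storage_clouds: list, force_disable: bool) -> bool:
        # Same boolean as the hunk above: an explicit `jobs.bucket` opts
        # consolidation mode back into cloud storage.
        return ((not consolidation_mode or has_explicit_bucket) and
                bool(storage_clouds) and not force_disable)

    assert use_cloud_storage(False, False, ['aws'], False)     # normal mode
    assert not use_cloud_storage(True, False, ['aws'], False)  # consolidation
    assert use_cloud_storage(True, True, ['aws'], False)       # explicit bucket
    assert not use_cloud_storage(True, True, ['aws'], True)    # force-disabled
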
@@ -346,6 +399,9 @@ def launch(
             f'with:\n\n`sky down {cluster_name} --purge`\n\n'
             f'Reason: {common_utils.format_exception(e)}')
 
+    # Warn if file mounts may be lost during a rolling update.
+    _warn_file_mounts_rolling_update(dag)
+
     local_to_controller_file_mounts = _upload_files_to_controller(dag)
     controller = controller_utils.Controllers.JOBS_CONTROLLER
     controller_name = controller.value.cluster_name
@@ -1216,3 +1272,24 @@ def pool_sync_down_logs(
         replica_ids=worker_ids,
         tail=tail,
         pool=True)
+
+
+@usage_lib.entrypoint
+def get_job_events(
+    job_id: int,
+    task_id: Optional[int] = None,
+    limit: Optional[int] = 10,
+) -> List[Dict[str, Any]]:
+    """Get task events for a managed job.
+
+    Args:
+        job_id: The job ID to get task events for.
+        task_id: Optional task ID to filter by.
+        limit: Optional limit on number of task events to return (default 10).
+
+    Returns:
+        List of task event records.
+    """
+    return managed_job_state.get_job_events(job_id=job_id,
+                                            task_id=task_id,
+                                            limit=limit)
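
A brief usage sketch of the new entrypoint, assuming it is called server-side where the managed-jobs state database is reachable (the job id is made up):

    from sky.jobs.server import core

    # Fetch the 5 most recent events for managed job 42.
    events = core.get_job_events(job_id=42, limit=5)
    for event in events:
        print(event)  # each record is a dict, per the return type above
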
sky/jobs/server/server.py CHANGED
@@ -242,3 +242,17 @@ async def pool_download_logs(
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
+
+
+@router.post('/events')
+async def events(request: fastapi.Request,
+                 body: payloads.GetJobEventsBody) -> None:
+    """Gets task events for a managed job."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.JOBS_EVENTS,
+        request_body=body,
+        func=core.get_job_events,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.JOB_CONTROLLER_NAME,
+    )
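
For completeness, a sketch of calling the new route directly. The URL is an assumption (a jobs router mounted under /jobs on a local API server; 46580 is SkyPilot's default local API server port), and like the sibling routes above the handler only schedules an asynchronous request, so the response must be polled for the actual event list (polling omitted here):

    import requests

    resp = requests.post('http://127.0.0.1:46580/jobs/events',
                         json={'job_id': 42, 'limit': 10})
    resp.raise_for_status()
    print(resp.status_code, resp.text)  # contains the scheduled request id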