skypilot-nightly 1.0.0.dev20250123__py3-none-any.whl → 1.0.0.dev20250125__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '5b0f80d40362e8761caae527ffd62d8b9360ad8e'
+ _SKYPILOT_COMMIT_SHA = '485b1cd4688d5ac984cc666f372b55009cb064b7'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250123'
+ __version__ = '1.0.0.dev20250125'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/check.py CHANGED
@@ -155,7 +155,8 @@ def check(
  # Pretty print for UX.
  if not quiet:
  enabled_clouds_str = '\n :heavy_check_mark: '.join(
- [''] + sorted(all_enabled_clouds))
+ [''] +
+ [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
  rich.print('\n[green]:tada: Enabled clouds :tada:'
  f'{enabled_clouds_str}[/green]')

@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
  r2_credential_mounts = cloudflare.get_credential_file_mounts()
  file_mounts.update(r2_credential_mounts)
  return file_mounts
+
+
+ def _format_enabled_cloud(cloud_name: str) -> str:
+ if cloud_name == repr(sky_clouds.Kubernetes()):
+ # Get enabled contexts for Kubernetes
+ existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+ if not existing_contexts:
+ return cloud_name
+
+ # Check if allowed_contexts is explicitly set in config
+ allowed_contexts = skypilot_config.get_nested(
+ ('kubernetes', 'allowed_contexts'), None)
+
+ # Format the context info with consistent styling
+ if allowed_contexts is not None:
+ contexts_formatted = []
+ for i, context in enumerate(existing_contexts):
+ # TODO: We should use ux_utils.INDENT_SYMBOL and
+ # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+ # here we are using rich. We should migrate this file to
+ # use colorama as we do in the rest of the codebase.
+ symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+ contexts_formatted.append(f'\n {symbol}{context}')
+ context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+ else:
+ context_info = f'Active context: {existing_contexts[0]}'
+
+ return f'{cloud_name}[/green][dim]\n └── {context_info}[/dim][green]'
+ return cloud_name
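Note: the new _format_enabled_cloud nests the Kubernetes context list under the cloud entry of the sky check output. A minimal standalone sketch of the tree-style formatting, with hypothetical context names and without the rich color tags:

import typing

def format_contexts(cloud_name: str, contexts: typing.List[str]) -> str:
    # Mirror the ├──/└── layout used above.
    lines = []
    for i, ctx in enumerate(contexts):
        symbol = '└── ' if i == len(contexts) - 1 else '├── '
        lines.append(f'\n      {symbol}{ctx}')
    return f'{cloud_name}\n  └── Allowed contexts:{"".join(lines)}'

print(format_contexts('Kubernetes',
                      ['gke_my-project_us-central1_my-cluster',  # hypothetical
                       'my-onprem-context']))                    # hypothetical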
sky/clouds/kubernetes.py CHANGED
@@ -131,7 +131,7 @@ class Kubernetes(clouds.Cloud):
  'Ignoring these contexts.')

  @classmethod
- def _existing_allowed_contexts(cls) -> List[str]:
+ def existing_allowed_contexts(cls) -> List[str]:
  """Get existing allowed contexts.

  If None is returned in the list, it means that we are running in a pod
@@ -175,7 +175,7 @@ class Kubernetes(clouds.Cloud):
  use_spot: bool, region: Optional[str],
  zone: Optional[str]) -> List[clouds.Region]:
  del accelerators, zone, use_spot # unused
- existing_contexts = cls._existing_allowed_contexts()
+ existing_contexts = cls.existing_allowed_contexts()

  regions = []
  for context in existing_contexts:
@@ -591,7 +591,7 @@ class Kubernetes(clouds.Cloud):
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
  # Test using python API
  try:
- existing_allowed_contexts = cls._existing_allowed_contexts()
+ existing_allowed_contexts = cls.existing_allowed_contexts()
  except ImportError as e:
  return (False,
  f'{common_utils.format_exception(e, use_bracket=True)}')
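Note: the leading underscore is dropped so that sky/check.py can call the classmethod from outside the class. A hedged usage sketch (the result depends on the local kubeconfig and any kubernetes.allowed_contexts setting):

from sky import clouds as sky_clouds

contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
print(contexts)  # e.g. ['gke_my-project_us-central1_my-cluster'] (hypothetical)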
sky/jobs/controller.py CHANGED
@@ -1,4 +1,7 @@
- """Controller: handles the life cycle of a managed job."""
+ """Controller: handles the life cycle of a managed job.
+
+ TODO(cooperc): Document lifecycle, and multiprocess layout.
+ """
  import argparse
  import multiprocessing
  import os
sky/jobs/state.py CHANGED
@@ -230,12 +230,12 @@ class ManagedJobStatus(enum.Enum):
  # RECOVERING: The cluster is preempted, and the controller process is
  # recovering the cluster (relaunching/failover).
  RECOVERING = 'RECOVERING'
- # Terminal statuses
- # SUCCEEDED: The job is finished successfully.
- SUCCEEDED = 'SUCCEEDED'
  # CANCELLING: The job is requested to be cancelled by the user, and the
  # controller is cleaning up the cluster.
  CANCELLING = 'CANCELLING'
+ # Terminal statuses
+ # SUCCEEDED: The job is finished successfully.
+ SUCCEEDED = 'SUCCEEDED'
  # CANCELLED: The job is cancelled by the user. When the managed job is in
  # CANCELLED status, the cluster has been cleaned up.
  CANCELLED = 'CANCELLED'
@@ -281,7 +281,6 @@ class ManagedJobStatus(enum.Enum):
  cls.FAILED_PRECHECKS,
  cls.FAILED_NO_RESOURCE,
  cls.FAILED_CONTROLLER,
- cls.CANCELLING,
  cls.CANCELLED,
  ]
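Note: with cls.CANCELLING removed from this list, CANCELLING is no longer reported as a terminal status. A hedged sketch of the resulting behavior, assuming is_terminal() is driven by terminal_statuses():

from sky.jobs.state import ManagedJobStatus

print(ManagedJobStatus.CANCELLING.is_terminal())  # expected: False after this change
print(ManagedJobStatus.CANCELLED.is_terminal())   # expected: True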
 
@@ -512,8 +511,12 @@ def set_failed(
  failure_reason: str,
  callback_func: Optional[CallbackType] = None,
  end_time: Optional[float] = None,
+ override_terminal: bool = False,
  ):
- """Set an entire job or task to failed, if they are in non-terminal states.
+ """Set an entire job or task to failed.
+
+ By default, don't override tasks that are already terminal (that is, for
+ which end_at is already set).

  Args:
  job_id: The job id.
@@ -522,12 +525,13 @@ def set_failed(
  failure_type: The failure type. One of ManagedJobStatus.FAILED_*.
  failure_reason: The failure reason.
  end_time: The end time. If None, the current time will be used.
+ override_terminal: If True, override the current status even if end_at
+ is already set.
  """
  assert failure_type.is_failed(), failure_type
  end_time = time.time() if end_time is None else end_time

- fields_to_set = {
- 'end_at': end_time,
+ fields_to_set: Dict[str, Any] = {
  'status': failure_type.value,
  'failure_reason': failure_reason,
  }
@@ -542,14 +546,31 @@ def set_failed(
  # affect the job duration calculation.
  fields_to_set['last_recovered_at'] = end_time
  set_str = ', '.join(f'{k}=(?)' for k in fields_to_set)
- task_str = '' if task_id is None else f' AND task_id={task_id}'
+ task_query_str = '' if task_id is None else 'AND task_id=(?)'
+ task_value = [] if task_id is None else [
+ task_id,
+ ]

- cursor.execute(
- f"""\
- UPDATE spot SET
- {set_str}
- WHERE spot_job_id=(?){task_str} AND end_at IS null""",
- (*list(fields_to_set.values()), job_id))
+ if override_terminal:
+ # Use COALESCE for end_at to avoid overriding the existing end_at if
+ # it's already set.
+ cursor.execute(
+ f"""\
+ UPDATE spot SET
+ end_at = COALESCE(end_at, ?),
+ {set_str}
+ WHERE spot_job_id=(?) {task_query_str}""",
+ (end_time, *list(fields_to_set.values()), job_id, *task_value))
+ else:
+ # Only set if end_at is null, i.e. the previous status is not
+ # terminal.
+ cursor.execute(
+ f"""\
+ UPDATE spot SET
+ end_at = (?),
+ {set_str}
+ WHERE spot_job_id=(?) {task_query_str} AND end_at IS null""",
+ (end_time, *list(fields_to_set.values()), job_id, *task_value))
  if callback_func:
  callback_func('FAILED')
  logger.info(failure_reason)
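Note: the override_terminal=True branch overwrites the status unconditionally but keeps an existing end_at via COALESCE. A minimal sketch of that SQL semantics on a throwaway table (not the real spot schema):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE t (id INTEGER, status TEXT, end_at REAL)')
conn.execute("INSERT INTO t VALUES (1, 'SUCCEEDED', 1000.0)")

# Override the status, but keep the original end_at if it is already set.
conn.execute(
    'UPDATE t SET end_at = COALESCE(end_at, ?), status = ? WHERE id = ?',
    (2000.0, 'FAILED_CONTROLLER', 1))
print(conn.execute('SELECT status, end_at FROM t').fetchone())
# ('FAILED_CONTROLLER', 1000.0) -- end_at preserved, status overridden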
@@ -677,6 +698,50 @@ def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
  return jobs


+ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
+ """Get jobs that need controller process checking.
+
+ Args:
+ job_id: Optional job ID to check. If None, checks all jobs.
+
+ Returns a list of job_ids, including the following:
+ - For jobs with schedule state: jobs that have schedule state not DONE
+ - For legacy jobs (no schedule state): jobs that are in non-terminal status
+ """
+ job_filter = '' if job_id is None else 'AND spot.spot_job_id=(?)'
+ job_value = () if job_id is None else (job_id,)
+
+ status_filter_str = ', '.join(['?'] *
+ len(ManagedJobStatus.terminal_statuses()))
+ terminal_status_values = [
+ status.value for status in ManagedJobStatus.terminal_statuses()
+ ]
+
+ # Get jobs that are either:
+ # 1. Have schedule state that is not DONE, or
+ # 2. Have no schedule state (legacy) AND are in non-terminal status
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
+ rows = cursor.execute(
+ f"""\
+ SELECT DISTINCT spot.spot_job_id
+ FROM spot
+ LEFT OUTER JOIN job_info
+ ON spot.spot_job_id=job_info.spot_job_id
+ WHERE (
+ (job_info.schedule_state IS NOT NULL AND
+ job_info.schedule_state IS NOT ?)
+ OR
+ (job_info.schedule_state IS NULL AND
+ status NOT IN ({status_filter_str}))
+ )
+ {job_filter}
+ ORDER BY spot.spot_job_id DESC""", [
+ ManagedJobScheduleState.DONE.value, *terminal_status_values,
+ *job_value
+ ]).fetchall()
+ return [row[0] for row in rows if row[0] is not None]
+
+
  def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
  """Get all job ids by name."""
  name_filter = ''
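Note: a hedged usage sketch of the new helper; callers pass a specific job id or None for all jobs (the job id below is hypothetical):

from sky.jobs import state as managed_job_state

# All jobs whose controller process should still be checked:
job_ids = managed_job_state.get_jobs_to_check_status()
# A single job, returned only if its schedule state is not DONE:
job_ids_for_one = managed_job_state.get_jobs_to_check_status(job_id=42)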
sky/jobs/utils.py CHANGED
@@ -159,7 +159,7 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
  return False


- def update_managed_job_status(job_id: Optional[int] = None):
+ def update_managed_jobs_statuses(job_id: Optional[int] = None):
  """Update managed job status if the controller process failed abnormally.

  Check the status of the controller process. If it is not running, it must
@@ -168,125 +168,175 @@ def update_managed_job_status(job_id: Optional[int] = None):
  when above happens, which could be not accurate based on the frequency this
  function is called.

- Note: we expect that job_id, if provided, refers to a nonterminal job.
+ Note: we expect that job_id, if provided, refers to a nonterminal job or a
+ job that has not completed its cleanup (schedule state not DONE).
  """

- if job_id is None:
- # Warning: it's totally possible for the managed job to transition to
- # a terminal status during the course of this function. The set_failed()
- # called below will not update the state for jobs that already have a
- # terminal status, so it should be fine.
- job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
- else:
- job_ids = [job_id]
- for job_id_ in job_ids:
-
- failure_reason = None
-
- tasks = managed_job_state.get_managed_jobs(job_id_)
- schedule_state = tasks[0]['schedule_state']
- if schedule_state is None:
- # Backwards compatibility: this job was submitted when ray was still
- # used for managing the parallelism of job controllers.
- # TODO(cooperc): Remove before 0.11.0.
- controller_status = job_lib.get_status(job_id_)
- if controller_status is None or controller_status.is_terminal():
- logger.error(f'Controller process for legacy job {job_id_} is '
- 'in an unexpected state.')
- failure_reason = 'Legacy job is in an unexpected state'
-
- # Continue to mark the job as failed.
- else:
- # Still running.
- continue
- else:
- pid = tasks[0]['controller_pid']
- if pid is None:
- if schedule_state in (
- managed_job_state.ManagedJobScheduleState.INACTIVE,
- managed_job_state.ManagedJobScheduleState.WAITING):
- # Job has not been scheduled yet.
- continue
- elif (schedule_state ==
- managed_job_state.ManagedJobScheduleState.LAUNCHING):
- # This should only be the case for a very short period of
- # time between marking the job as submitted and writing the
- # launched controller process pid back to the database (see
- # scheduler.maybe_schedule_next_jobs).
- # TODO(cooperc): Find a way to detect if we get stuck in
- # this state.
- logger.info(f'Job {job_id_} is in LAUNCHING state, '
- 'but controller process hasn\'t started yet.')
- continue
- # All other statuses are unexpected. Proceed to mark as failed.
- logger.error(f'Expected to find a controller pid for state '
- f'{schedule_state.value} but found none.')
- failure_reason = ('No controller pid set for '
- f'{schedule_state.value}')
- else:
- logger.debug(f'Checking controller pid {pid}')
- if _controller_process_alive(pid, job_id_):
- # The controller is still running.
- continue
- # Otherwise, proceed to mark the job as failed.
- logger.error(f'Controller process for {job_id_} seems to be '
- 'dead.')
- failure_reason = 'Controller process is dead'
+ def _cleanup_job_clusters(job_id: int) -> Optional[str]:
+ """Clean up clusters for a job. Returns error message if any.

- logger.error(f'Controller process for job {job_id_} has exited '
- 'abnormally. Setting the job status to FAILED_CONTROLLER.')
+ This function should not throw any exception. If it fails, it will
+ capture the error message, and log/return it.
+ """
+ error_msg = None
+ tasks = managed_job_state.get_managed_jobs(job_id)
  for task in tasks:
  task_name = task['job_name']
- # Tear down the abnormal cluster to avoid resource leakage.
- cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
+ cluster_name = generate_managed_job_cluster_name(task_name, job_id)
  handle = global_user_state.get_handle_from_cluster_name(
  cluster_name)
- # If the cluster exists, terminate it.
  if handle is not None:
- terminate_cluster(cluster_name)
+ try:
+ terminate_cluster(cluster_name)
+ except Exception as e: # pylint: disable=broad-except
+ error_msg = (
+ f'Failed to terminate cluster {cluster_name}: '
+ f'{common_utils.format_exception(e, use_bracket=True)}')
+ logger.exception(error_msg, exc_info=e)
+ return error_msg
+
+ # For backwards compatible jobs
+ # TODO(cooperc): Remove before 0.11.0.
+ def _handle_legacy_job(job_id: int):
+ controller_status = job_lib.get_status(job_id)
+ if controller_status is None or controller_status.is_terminal():
+ logger.error(f'Controller process for legacy job {job_id} is '
+ 'in an unexpected state.')
+
+ cleanup_error = _cleanup_job_clusters(job_id)
+ if cleanup_error:
+ # Unconditionally set the job to failed_controller if the
+ # cleanup fails.
+ managed_job_state.set_failed(
+ job_id,
+ task_id=None,
+ failure_type=managed_job_state.ManagedJobStatus.
+ FAILED_CONTROLLER,
+ failure_reason=
+ 'Legacy controller process has exited abnormally, and '
+ f'cleanup failed: {cleanup_error}. For more details, run: '
+ f'sky jobs logs --controller {job_id}',
+ override_terminal=True)
+ return
+
+ # It's possible for the job to have transitioned to
+ # another terminal state while between when we checked its
+ # state and now. In that case, set_failed won't do
+ # anything, which is fine.
+ managed_job_state.set_failed(
+ job_id,
+ task_id=None,
+ failure_type=managed_job_state.ManagedJobStatus.
+ FAILED_CONTROLLER,
+ failure_reason=(
+ 'Legacy controller process has exited abnormally. For '
+ f'more details, run: sky jobs logs --controller {job_id}'))
+
+ # Get jobs that need checking (non-terminal or not DONE)
+ job_ids = managed_job_state.get_jobs_to_check_status(job_id)
+ if not job_ids:
+ # job_id is already terminal, or if job_id is None, there are no jobs
+ # that need to be checked.
+ return
+
+ for job_id in job_ids:
+ tasks = managed_job_state.get_managed_jobs(job_id)
+ # Note: controller_pid and schedule_state are in the job_info table
+ # which is joined to the spot table, so all tasks with the same job_id
+ # will have the same value for these columns. This is what lets us just
+ # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
+ schedule_state = tasks[0]['schedule_state']
+
+ # Backwards compatibility: this job was submitted when ray was still
+ # used for managing the parallelism of job controllers.
+ # TODO(cooperc): Remove before 0.11.0.
+ if (schedule_state is
+ managed_job_state.ManagedJobScheduleState.INVALID):
+ _handle_legacy_job(job_id)
+ continue
+
+ # For jobs with schedule state:
+ pid = tasks[0]['controller_pid']
+ if pid is None:
+ if schedule_state in (
+ managed_job_state.ManagedJobScheduleState.INACTIVE,
+ managed_job_state.ManagedJobScheduleState.WAITING):
+ # For these states, the controller hasn't been started yet.
+ # This is expected.
+ continue
+
+ if (schedule_state ==
+ managed_job_state.ManagedJobScheduleState.LAUNCHING):
+ # This is unlikely but technically possible. There's a brief
+ # period between marking job as scheduled (LAUNCHING) and
+ # actually launching the controller process and writing the pid
+ # back to the table.
+ # TODO(cooperc): Find a way to detect if we get stuck in this
+ # state.
+ logger.info(f'Job {job_id} is in {schedule_state.value} state, '
+ 'but controller process hasn\'t started yet.')
+ continue
+
+ logger.error(f'Expected to find a controller pid for state '
+ f'{schedule_state.value} but found none.')
+ failure_reason = f'No controller pid set for {schedule_state.value}'
+ else:
+ logger.debug(f'Checking controller pid {pid}')
+ if _controller_process_alive(pid, job_id):
+ # The controller is still running, so this job is fine.
+ continue
+
+ # Double check job is not already DONE before marking as failed, to
+ # avoid the race where the controller marked itself as DONE and
+ # exited between the state check and the pid check. Since the job
+ # controller process will mark itself DONE _before_ exiting, if it
+ # has exited and it's still not DONE now, it is abnormal.
+ if (managed_job_state.get_job_schedule_state(job_id) ==
+ managed_job_state.ManagedJobScheduleState.DONE):
+ # Never mind, the job is DONE now. This is fine.
+ continue
+
+ logger.error(f'Controller process for {job_id} seems to be dead.')
+ failure_reason = 'Controller process is dead'
+
+ # At this point, either pid is None or process is dead.

  # The controller process for this managed job is not running: it must
  # have exited abnormally, and we should set the job status to
  # FAILED_CONTROLLER.
- # The `set_failed` will only update the task's status if the
- # status is non-terminal.
+ logger.error(f'Controller process for job {job_id} has exited '
+ 'abnormally. Setting the job status to FAILED_CONTROLLER.')
+
+ # Cleanup clusters and capture any errors.
+ cleanup_error = _cleanup_job_clusters(job_id)
+ cleanup_error_msg = ''
+ if cleanup_error:
+ cleanup_error_msg = f'Also, cleanup failed: {cleanup_error}. '
+
+ # Set all tasks to FAILED_CONTROLLER, regardless of current status.
+ # This may change a job from SUCCEEDED or another terminal state to
+ # FAILED_CONTROLLER. This is what we want - we are sure that this
+ # controller process crashed, so we want to capture that even if the
+ # underlying job succeeded.
+ # Note: 2+ invocations of update_managed_jobs_statuses could be running
+ # at the same time, so this could override the FAILED_CONTROLLER status
+ # set by another invocation of update_managed_jobs_statuses. That should
+ # be okay. The only difference could be that one process failed to clean
+ # up the cluster while the other succeeds. No matter which
+ # failure_reason ends up in the database, the outcome is acceptable.
+ # We assume that no other code path outside the controller process will
+ # update the job status.
  managed_job_state.set_failed(
- job_id_,
+ job_id,
  task_id=None,
  failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
  failure_reason=
- f'Controller process has exited abnormally ({failure_reason}). For '
- f'more details, run: sky jobs logs --controller {job_id_}')
- scheduler.job_done(job_id_, idempotent=True)
-
- # Some jobs may be in a terminal status, but are not yet DONE. For instance,
- # they may be still cleaning up resources, etc. Such jobs won't be captured
- # by the above check, which only looks at nonterminal jobs. So, check the
- # controller liveness of all jobs that should have live controller
- # processes.
- for job_info in managed_job_state.get_schedule_live_jobs(job_id):
- if not job_info['controller_pid']:
- # Technically, a job with no controller process but in LAUNCHING
- # schedule state can happen very briefly after the job is set to
- # LAUNCHING but before the controller process is actually spawned.
- # However, if we observe any state other than LAUNCHING, something
- # is clearly wrong.
- if (job_info['schedule_state'] !=
- managed_job_state.ManagedJobScheduleState.LAUNCHING):
- logger.error(
- f'Missing controller PID for {job_info["job_id"]}. '
- 'Setting to DONE.')
- scheduler.job_done(job_info['job_id'])
- else:
- logger.info(f'LAUNCHING job {job_info["job_id"]} has no '
- 'controller process yet. Skipping.')
+ f'Controller process has exited abnormally ({failure_reason}). '
+ f'{cleanup_error_msg}'
+ f'For more details, run: sky jobs logs --controller {job_id}',
+ override_terminal=True)

- elif not _controller_process_alive(job_info['controller_pid'],
- job_info['job_id']):
- logger.error(
- f'Controller process for job {job_info["job_id"]} is not '
- 'alive. Marking the job as DONE.')
- scheduler.job_done(job_info['job_id'])
+ scheduler.job_done(job_id, idempotent=True)


  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
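Note: a condensed sketch (not the real function) of the per-job decision in update_managed_jobs_statuses above, using plain values in place of database rows:

from typing import Optional

def needs_failed_controller(schedule_state: str, pid: Optional[int],
                            pid_alive: bool, is_done: bool) -> Optional[bool]:
    if schedule_state == 'INVALID':
        return None  # legacy job, handled by _handle_legacy_job
    if pid is None:
        # INACTIVE/WAITING: controller not started yet; LAUNCHING: pid not
        # written back yet. Any other state without a pid is abnormal.
        return schedule_state not in ('INACTIVE', 'WAITING', 'LAUNCHING')
    if pid_alive:
        return False
    # Dead pid: abnormal only if the controller did not mark itself DONE.
    return not is_done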
@@ -382,7 +432,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
  f'{job_status.value}. Skipped.')
  continue

- update_managed_job_status(job_id)
+ update_managed_jobs_statuses(job_id)

  # Send the signal to the jobs controller.
  signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
@@ -424,36 +474,24 @@ def cancel_job_by_name(job_name: str) -> str:

  def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  """Stream logs by job id."""
- controller_status = job_lib.get_status(job_id)
- status_msg = ux_utils.spinner_message(
- 'Waiting for controller process to be RUNNING') + '{status_str}'
- status_display = rich_utils.safe_status(status_msg.format(status_str=''))
+
+ def should_keep_logging(status: managed_job_state.ManagedJobStatus) -> bool:
+ # If we see CANCELLING, just exit - we could miss some job logs but the
+ # job will be terminated momentarily anyway so we don't really care.
+ return (not status.is_terminal() and
+ status != managed_job_state.ManagedJobStatus.CANCELLING)
+
+ msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
+ status_display = rich_utils.safe_status(msg)
  num_tasks = managed_job_state.get_num_tasks(job_id)

  with status_display:
- prev_msg = None
- while (controller_status != job_lib.JobStatus.RUNNING and
- (controller_status is None or
- not controller_status.is_terminal())):
- status_str = 'None'
- if controller_status is not None:
- status_str = controller_status.value
- msg = status_msg.format(status_str=f' (status: {status_str})')
- if msg != prev_msg:
- status_display.update(msg)
- prev_msg = msg
- time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
- controller_status = job_lib.get_status(job_id)
-
- msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
- status_display.update(msg)
  prev_msg = msg
- managed_job_status = managed_job_state.get_status(job_id)
- while managed_job_status is None:
+ while (managed_job_status :=
+ managed_job_state.get_status(job_id)) is None:
  time.sleep(1)
- managed_job_status = managed_job_state.get_status(job_id)

- if managed_job_status.is_terminal():
+ if not should_keep_logging(managed_job_status):
  job_msg = ''
  if managed_job_status.is_failed():
  job_msg = ('\nFailure reason: '
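Note: a minimal sketch of the assignment-expression polling pattern used above (Python 3.8+ walrus operator), with a stand-in get_status() instead of the real state query:

import time

_fake_statuses = iter([None, None, 'PENDING'])

def get_status():
    return next(_fake_statuses)

while (status := get_status()) is None:
    time.sleep(0.1)
print(status)  # prints: PENDING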
@@ -480,10 +518,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  task_id, managed_job_status = (
  managed_job_state.get_latest_task_id_status(job_id))

- # task_id and managed_job_status can be None if the controller process
- # just started and the managed job status has not set to PENDING yet.
- while (managed_job_status is None or
- not managed_job_status.is_terminal()):
+ # We wait for managed_job_status to be not None above. Once we see that
+ # it's not None, we don't expect it to every become None again.
+ assert managed_job_status is not None, (job_id, task_id,
+ managed_job_status)
+
+ while should_keep_logging(managed_job_status):
  handle = None
  if task_id is not None:
  task_name = managed_job_state.get_task_name(job_id, task_id)
@@ -513,8 +553,11 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
  task_id, managed_job_status = (
  managed_job_state.get_latest_task_id_status(job_id))
+ assert managed_job_status is not None, (job_id, task_id,
+ managed_job_status)
  continue
- assert managed_job_status is not None
+ assert (managed_job_status ==
+ managed_job_state.ManagedJobStatus.RUNNING)
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
  status_display.stop()
  returncode = backend.tail_logs(handle,
@@ -568,6 +611,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  managed_job_status :=
  managed_job_state.get_status(job_id)):
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+ assert managed_job_status is not None, (
+ job_id, managed_job_status)
  continue

  if task_id == num_tasks - 1:
@@ -593,6 +638,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  if original_task_id != task_id:
  break
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+ assert managed_job_status is not None, (job_id, task_id,
+ managed_job_status)
  continue

  # The job can be cancelled by the user or the controller (when
@@ -608,7 +655,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  # state.
  managed_job_status = managed_job_state.get_status(job_id)
  assert managed_job_status is not None, job_id
- if managed_job_status.is_terminal():
+ if not should_keep_logging(managed_job_status):
  break
  logger.info(f'{colorama.Fore.YELLOW}The job cluster is preempted '
  f'or failed.{colorama.Style.RESET_ALL}')
@@ -623,6 +670,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  # managed job state is updated.
  time.sleep(3 * JOB_STATUS_CHECK_GAP_SECONDS)
  managed_job_status = managed_job_state.get_status(job_id)
+ assert managed_job_status is not None, (job_id, managed_job_status)

  # The managed_job_status may not be in terminal status yet, since the
  # controller has not updated the managed job state yet. We wait for a while,
@@ -630,7 +678,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  wait_seconds = 0
  managed_job_status = managed_job_state.get_status(job_id)
  assert managed_job_status is not None, job_id
- while (not managed_job_status.is_terminal() and follow and
+ while (should_keep_logging(managed_job_status) and follow and
  wait_seconds < _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS):
  time.sleep(1)
  wait_seconds += 1
@@ -694,10 +742,7 @@ def stream_logs(job_id: Optional[int],
  if job_status is None:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(f'Job {job_id} not found.')
- # We shouldn't count CANCELLING as terminal here, the controller is
- # still cleaning up.
- if (job_status.is_terminal() and job_status !=
- managed_job_state.ManagedJobStatus.CANCELLING):
+ if job_status.is_terminal():
  # Don't keep waiting. If the log file is not created by this
  # point, it never will be. This job may have been submitted
  # using an old version that did not create the log file, so this
@@ -729,6 +774,10 @@ def stream_logs(job_id: Optional[int],
  print(end='', flush=True)

  # Check if the job if finished.
+ # TODO(cooperc): The controller can still be
+ # cleaning up if job is in a terminal status
+ # (e.g. SUCCEEDED). We want to follow those logs
+ # too. Use DONE instead?
  job_status = managed_job_state.get_status(job_id)
  assert job_status is not None, (job_id, job_name)
  if job_status.is_terminal():
sky/provision/kubernetes/utils.py CHANGED
@@ -7,6 +7,7 @@ import os
  import re
  import shutil
  import subprocess
+ import time
  import typing
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
  from urllib.parse import urlparse
@@ -105,6 +106,75 @@ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG = ('Pod {pod_name} not found in namespace '

  logger = sky_logging.init_logger(__name__)

+ # Default retry settings for Kubernetes API calls
+ DEFAULT_MAX_RETRIES = 3
+ DEFAULT_RETRY_INTERVAL_SECONDS = 1
+
+
+ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
+ retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
+ resource_type: Optional[str] = None):
+ """Decorator to retry Kubernetes API calls on transient failures.
+
+ Args:
+ max_retries: Maximum number of retry attempts
+ retry_interval: Initial seconds to wait between retries
+ resource_type: Type of resource being accessed (e.g. 'node', 'pod').
+ Used to provide more specific error messages.
+ """
+
+ def decorator(func):
+
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ last_exception = None
+ backoff = common_utils.Backoff(initial_backoff=retry_interval,
+ max_backoff_factor=3)
+
+ for attempt in range(max_retries):
+ try:
+ return func(*args, **kwargs)
+ except (kubernetes.max_retry_error(),
+ kubernetes.api_exception(),
+ kubernetes.config_exception()) as e:
+ last_exception = e
+ # Don't retry on permanent errors like 401 (Unauthorized)
+ # or 403 (Forbidden)
+ if (isinstance(e, kubernetes.api_exception()) and
+ e.status in (401, 403)):
+ raise
+ if attempt < max_retries - 1:
+ sleep_time = backoff.current_backoff()
+ logger.debug(f'Kubernetes API call {func.__name__} '
+ f'failed with {str(e)}. Retrying in '
+ f'{sleep_time:.1f}s...')
+ time.sleep(sleep_time)
+ continue
+
+ # Format error message based on the type of exception
+ resource_msg = f' when trying to get {resource_type} info' \
+ if resource_type else ''
+ debug_cmd = f' To debug, run: kubectl get {resource_type}s' \
+ if resource_type else ''
+
+ if isinstance(last_exception, kubernetes.max_retry_error()):
+ error_msg = f'Timed out{resource_msg} from Kubernetes cluster.'
+ elif isinstance(last_exception, kubernetes.api_exception()):
+ error_msg = (f'Kubernetes API error{resource_msg}: '
+ f'{str(last_exception)}')
+ else:
+ error_msg = (f'Kubernetes configuration error{resource_msg}: '
+ f'{str(last_exception)}')
+
+ raise exceptions.ResourcesUnavailableError(
+ f'{error_msg}'
+ f' Please check if the cluster is healthy and retry.'
+ f'{debug_cmd}') from last_exception
+
+ return wrapper
+
+ return decorator
+

  class GPULabelFormatter:
  """Base class to define a GPU label formatter for a Kubernetes cluster
@@ -446,6 +516,7 @@ def detect_accelerator_resource(


  @functools.lru_cache(maxsize=10)
+ @_retry_on_error(resource_type='node')
  def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
  """Gets the kubernetes nodes in the context.
 
@@ -454,17 +525,12 @@ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
  if context is None:
  context = get_current_kube_config_context_name()

- try:
- nodes = kubernetes.core_api(context).list_node(
- _request_timeout=kubernetes.API_TIMEOUT).items
- except kubernetes.max_retry_error():
- raise exceptions.ResourcesUnavailableError(
- 'Timed out when trying to get node info from Kubernetes cluster. '
- 'Please check if the cluster is healthy and retry. To debug, run: '
- 'kubectl get nodes') from None
+ nodes = kubernetes.core_api(context).list_node(
+ _request_timeout=kubernetes.API_TIMEOUT).items
  return nodes


+ @_retry_on_error(resource_type='pod')
  def get_all_pods_in_kubernetes_cluster(
  context: Optional[str] = None) -> List[Any]:
  """Gets pods in all namespaces in kubernetes cluster indicated by context.
@@ -474,14 +540,8 @@ def get_all_pods_in_kubernetes_cluster(
  if context is None:
  context = get_current_kube_config_context_name()

- try:
- pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
- _request_timeout=kubernetes.API_TIMEOUT).items
- except kubernetes.max_retry_error():
- raise exceptions.ResourcesUnavailableError(
- 'Timed out when trying to get pod info from Kubernetes cluster. '
- 'Please check if the cluster is healthy and retry. To debug, run: '
- 'kubectl get pods') from None
+ pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
+ _request_timeout=kubernetes.API_TIMEOUT).items
  return pods

@@ -1758,8 +1818,6 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
  else:
  destination[key].extend(value)
  else:
- if destination is None:
- destination = {}
  destination[key] = value
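Note: rebinding a parameter inside a function does not change the caller's variable, so the removed branch could only ever build a dict that was then discarded (presumably destination is never None at the call sites). A small illustrative sketch (hypothetical helper, not the real merge_dicts):

def merge_into(destination):
    if destination is None:
        destination = {}  # only rebinds the local name
    destination['k'] = 'v'

d = None
merge_into(d)
print(d)  # still None -- the caller never sees the new dict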
 
 
sky/skylet/constants.py CHANGED
@@ -86,7 +86,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
  # cluster yaml is updated.
  #
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
- SKYLET_VERSION = '10'
+ SKYLET_VERSION = '11'
  # The version of the lib files that skylet/jobs use. Whenever there is an API
  # change for the job_lib or log_lib, we need to bump this version, so that the
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py CHANGED
@@ -74,7 +74,7 @@ class ManagedJobEvent(SkyletEvent):
  EVENT_INTERVAL_SECONDS = 300

  def _run(self):
- managed_job_utils.update_managed_job_status()
+ managed_job_utils.update_managed_jobs_statuses()
  managed_job_scheduler.maybe_schedule_next_jobs()

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: skypilot-nightly
- Version: 1.0.0.dev20250123
+ Version: 1.0.0.dev20250125
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
@@ -1,7 +1,7 @@
- sky/__init__.py,sha256=5YS9q5x2qExYmujWZJJRzlLsBW8uMQ2gDBxLqpt3sgo,5944
+ sky/__init__.py,sha256=byguYOHI2wvQyWrh97v5OmKwEiIEv4lxNHqBSxTPCXc,5944
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
  sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
- sky/check.py,sha256=s8deMVL-k9y8gd519K7NWZc3DqWsEySwiAr0uH3Vvcc,9459
+ sky/check.py,sha256=qTpm3N1zUZi2inEZPsrbt278B3h8nsk2gnepzIgLybE,10899
  sky/cli.py,sha256=suOjHrt7mQTK47Z9ZQjogyUwnxfsKZ3_eP86AI29Dko,213891
  sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
  sky/core.py,sha256=fE1rn4Ku94S0XmWTO5-6t6eT6aaJImNczRqEnTe8v7Q,38742
@@ -50,7 +50,7 @@ sky/clouds/do.py,sha256=zqibtq1gxNPSNkSkZFPfP5yplfIKCwBss3ry0o4C17c,11198
  sky/clouds/fluidstack.py,sha256=u2I6jXEtTqgqRWi2EafMsKqc8VkUq1cR6CSDUvk72_U,12407
  sky/clouds/gcp.py,sha256=6QOnefFsYiLCcnajjduLHsayqJ641bBu42jPTpvy7Mc,55007
  sky/clouds/ibm.py,sha256=0ArRTQx1_DpTNGByFhukzFedEDzmVjBsGiiques1bQ0,21447
- sky/clouds/kubernetes.py,sha256=OSkglBxvSimmdR8rctb3PfSzkIf5I7vLb5vT0Z18lkw,31544
+ sky/clouds/kubernetes.py,sha256=oZg4Lpn2ZBikyc5NTJIziUPEY0xs2mtz546ButhkZ7g,31541
  sky/clouds/lambda_cloud.py,sha256=42AmcN2X_wdBMuAw606nR_pQCBAy5QFiAo711_WRqDE,12672
  sky/clouds/oci.py,sha256=VpPxpMJv52QePVdwdK9EuiMyqjp70dk8_rgUVv5Y-2w,27028
  sky/clouds/paperspace.py,sha256=F0Sj1RcqRb5fPjrr8qbdeY-JdfAHcRPc902pZOi4erw,10889
@@ -98,12 +98,12 @@ sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
  sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
  sky/jobs/constants.py,sha256=6RphkJ6pmafQ7XYW5qwId1Zvqb99HJelA9kgrgfNR7o,1421
- sky/jobs/controller.py,sha256=Qv7vOj4OXkbnZF0F9wKrlJsYhkSNJjJ1Mgrn2FyQyaM,28250
+ sky/jobs/controller.py,sha256=0WcOk8xRZ-mZWuza-WE-ICKZTgZvXxNzj9pWXUslm6E,28312
  sky/jobs/core.py,sha256=2_Q9thiBPnd3i2nDqyUtQY-dsGZ1kRgAdnLcXHoycYo,19938
  sky/jobs/recovery_strategy.py,sha256=m-EA-MWXPFrgx2CYFPr6MmgeUoDTEBmY2xruD2PRSGY,26365
  sky/jobs/scheduler.py,sha256=WAvNb8-vBk8q1zFordFdpH7gxqWDjPHDGZZay6aodOk,12028
- sky/jobs/state.py,sha256=Cjv2UEKfk3j7enXaCkU9CDqsvUfYZ3FWnYEH5HMachs,38153
- sky/jobs/utils.py,sha256=waKmLbUNRXeuYKBn_U7sekSFGAEgoPp9QemUULK4Y9k,49491
+ sky/jobs/state.py,sha256=bvBNZMg3DzPfS4eHNzMqYaMui2cqnWoWGDIaiOpaXSk,40770
+ sky/jobs/utils.py,sha256=RGVytFmB6SmKK3qZp_8UID_T5ssxSJOgwCDgIvRmhtM,51785
  sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
  sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -149,7 +149,7 @@ sky/provision/kubernetes/config.py,sha256=bXwOGdSAnXCkDreew0KsSUqSv3ZrptNeevqat7
  sky/provision/kubernetes/instance.py,sha256=AQikdRgNklpeMgiEd4w2Hh7kGssVABsy0aCh9xsKi5Y,50313
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
  sky/provision/kubernetes/network_utils.py,sha256=52BZY_5ynCH6IXlivKObYyAHDgQCJyAJIjmM7J4MpFo,11393
- sky/provision/kubernetes/utils.py,sha256=BklPlHXKNTNKamdAygnQ_sOIROq1bN3xbIPxwNRqMV0,104774
+ sky/provision/kubernetes/utils.py,sha256=Soyq-8h1i0ZYjTzVZRgwbyAkfEbNrAR3G2-krzIr6Rk,107132
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
  sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -207,8 +207,8 @@ sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
  sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
- sky/skylet/constants.py,sha256=qW5tilb-D4B5IVlwphCFhPHxB3q63ICjgTbx-ftfAQQ,16022
- sky/skylet/events.py,sha256=8xK2J_KOPUugZUFQunnrTCMtGJaI8Fodtv6HJjBLsAI,12532
+ sky/skylet/constants.py,sha256=uLEVhMZXpIlj7b_03ixAI6rC6fTM1k5xPUWR4LvzQyo,16022
+ sky/skylet/events.py,sha256=0bOjUYpphuAficD9wDB5NOan2vwJDaRqdnm4sl0RK0U,12535
  sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
  sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
  sky/skylet/log_lib.pyi,sha256=rRk4eUX0RHGs1QL9CXsJq6RE7FqqxZlfuPJOLXTvg7I,4453
@@ -289,9 +289,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
- skypilot_nightly-1.0.0.dev20250123.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20250123.dist-info/METADATA,sha256=oMFccWsTysJeiqQGWjLUCaqdn922FrgqWFy6gsJol7Q,21038
- skypilot_nightly-1.0.0.dev20250123.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- skypilot_nightly-1.0.0.dev20250123.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20250123.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20250123.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20250125.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20250125.dist-info/METADATA,sha256=8ozTZDBrQLiIaTS3-_CStvAfJE7XPmuwGGWneS_gj7o,21038
+ skypilot_nightly-1.0.0.dev20250125.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ skypilot_nightly-1.0.0.dev20250125.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20250125.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20250125.dist-info/RECORD,,