skypilot-nightly 1.0.0.dev20250123__py3-none-any.whl → 1.0.0.dev20250125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/check.py +31 -1
- sky/clouds/kubernetes.py +3 -3
- sky/jobs/controller.py +4 -1
- sky/jobs/state.py +79 -14
- sky/jobs/utils.py +188 -139
- sky/provision/kubernetes/utils.py +76 -18
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +1 -1
- {skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/RECORD +15 -15
- {skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '485b1cd4688d5ac984cc666f372b55009cb064b7'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250125'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/check.py
CHANGED
@@ -155,7 +155,8 @@ def check(
         # Pretty print for UX.
         if not quiet:
             enabled_clouds_str = '\n :heavy_check_mark: '.join(
-                [''] +
+                [''] +
+                [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
             rich.print('\n[green]:tada: Enabled clouds :tada:'
                        f'{enabled_clouds_str}[/green]')

@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
     r2_credential_mounts = cloudflare.get_credential_file_mounts()
     file_mounts.update(r2_credential_mounts)
     return file_mounts
+
+
+def _format_enabled_cloud(cloud_name: str) -> str:
+    if cloud_name == repr(sky_clouds.Kubernetes()):
+        # Get enabled contexts for Kubernetes
+        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+        if not existing_contexts:
+            return cloud_name
+
+        # Check if allowed_contexts is explicitly set in config
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        # Format the context info with consistent styling
+        if allowed_contexts is not None:
+            contexts_formatted = []
+            for i, context in enumerate(existing_contexts):
+                # TODO: We should use ux_utils.INDENT_SYMBOL and
+                # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+                # here we are using rich. We should migrate this file to
+                # use colorama as we do in the rest of the codebase.
+                symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+                contexts_formatted.append(f'\n {symbol}{context}')
+            context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+        else:
+            context_info = f'Active context: {existing_contexts[0]}'
+
+        return f'{cloud_name}[/green][dim]\n └── {context_info}[/dim][green]'
+    return cloud_name
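The new _format_enabled_cloud helper renders the enabled Kubernetes contexts as a small tree under the cloud name, using '├── ' for intermediate entries and '└── ' for the last one. A minimal standalone sketch of just that formatting step (hypothetical names, no SkyPilot imports):

    from typing import List

    def format_context_tree(contexts: List[str]) -> str:
        """Render context names as an indented tree; the last entry gets '└── '."""
        lines = []
        for i, context in enumerate(contexts):
            symbol = '└── ' if i == len(contexts) - 1 else '├── '
            lines.append(f'\n    {symbol}{context}')
        return 'Allowed contexts:' + ''.join(lines)

    print(format_context_tree(['gke_my-proj_us-central1_cluster-a', 'kind-kind']))
    # Allowed contexts:
    #     ├── gke_my-proj_us-central1_cluster-a
    #     └── kind-kind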
sky/clouds/kubernetes.py
CHANGED
@@ -131,7 +131,7 @@ class Kubernetes(clouds.Cloud):
                            'Ignoring these contexts.')

     @classmethod
-    def
+    def existing_allowed_contexts(cls) -> List[str]:
         """Get existing allowed contexts.

         If None is returned in the list, it means that we are running in a pod
@@ -175,7 +175,7 @@ class Kubernetes(clouds.Cloud):
                 use_spot: bool, region: Optional[str],
                 zone: Optional[str]) -> List[clouds.Region]:
         del accelerators, zone, use_spot # unused
-        existing_contexts = cls.
+        existing_contexts = cls.existing_allowed_contexts()

         regions = []
         for context in existing_contexts:
@@ -591,7 +591,7 @@ class Kubernetes(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         # Test using python API
         try:
-            existing_allowed_contexts = cls.
+            existing_allowed_contexts = cls.existing_allowed_contexts()
         except ImportError as e:
             return (False,
                     f'{common_utils.format_exception(e, use_bracket=True)}')
sky/jobs/controller.py
CHANGED
sky/jobs/state.py
CHANGED
@@ -230,12 +230,12 @@ class ManagedJobStatus(enum.Enum):
     # RECOVERING: The cluster is preempted, and the controller process is
     # recovering the cluster (relaunching/failover).
     RECOVERING = 'RECOVERING'
-    # Terminal statuses
-    # SUCCEEDED: The job is finished successfully.
-    SUCCEEDED = 'SUCCEEDED'
     # CANCELLING: The job is requested to be cancelled by the user, and the
     # controller is cleaning up the cluster.
     CANCELLING = 'CANCELLING'
+    # Terminal statuses
+    # SUCCEEDED: The job is finished successfully.
+    SUCCEEDED = 'SUCCEEDED'
     # CANCELLED: The job is cancelled by the user. When the managed job is in
     # CANCELLED status, the cluster has been cleaned up.
     CANCELLED = 'CANCELLED'
@@ -281,7 +281,6 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_PRECHECKS,
             cls.FAILED_NO_RESOURCE,
             cls.FAILED_CONTROLLER,
-            cls.CANCELLING,
             cls.CANCELLED,
         ]

@@ -512,8 +511,12 @@ def set_failed(
         failure_reason: str,
         callback_func: Optional[CallbackType] = None,
         end_time: Optional[float] = None,
+        override_terminal: bool = False,
 ):
-    """Set an entire job or task to failed
+    """Set an entire job or task to failed.
+
+    By default, don't override tasks that are already terminal (that is, for
+    which end_at is already set).

     Args:
         job_id: The job id.
@@ -522,12 +525,13 @@ def set_failed(
         failure_type: The failure type. One of ManagedJobStatus.FAILED_*.
         failure_reason: The failure reason.
         end_time: The end time. If None, the current time will be used.
+        override_terminal: If True, override the current status even if end_at
+            is already set.
     """
     assert failure_type.is_failed(), failure_type
     end_time = time.time() if end_time is None else end_time

-    fields_to_set = {
-        'end_at': end_time,
+    fields_to_set: Dict[str, Any] = {
         'status': failure_type.value,
         'failure_reason': failure_reason,
     }
@@ -542,14 +546,31 @@ def set_failed(
         # affect the job duration calculation.
         fields_to_set['last_recovered_at'] = end_time
         set_str = ', '.join(f'{k}=(?)' for k in fields_to_set)
-
+        task_query_str = '' if task_id is None else 'AND task_id=(?)'
+        task_value = [] if task_id is None else [
+            task_id,
+        ]

-
-
-
-
-
-
+        if override_terminal:
+            # Use COALESCE for end_at to avoid overriding the existing end_at if
+            # it's already set.
+            cursor.execute(
+                f"""\
+                UPDATE spot SET
+                end_at = COALESCE(end_at, ?),
+                {set_str}
+                WHERE spot_job_id=(?) {task_query_str}""",
+                (end_time, *list(fields_to_set.values()), job_id, *task_value))
+        else:
+            # Only set if end_at is null, i.e. the previous status is not
+            # terminal.
+            cursor.execute(
+                f"""\
+                UPDATE spot SET
+                end_at = (?),
+                {set_str}
+                WHERE spot_job_id=(?) {task_query_str} AND end_at IS null""",
+                (end_time, *list(fields_to_set.values()), job_id, *task_value))
     if callback_func:
         callback_func('FAILED')
     logger.info(failure_reason)
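The two UPDATE branches differ only in how they treat a row that is already terminal: with override_terminal, COALESCE(end_at, ?) rewrites the status but keeps an end_at that was already set, while the default branch's "end_at IS null" guard skips such rows entirely. A small in-memory illustration of that difference (toy table, not the real spot schema):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE spot (spot_job_id INTEGER, status TEXT, end_at REAL)')
    conn.execute("INSERT INTO spot VALUES (1, 'SUCCEEDED', 100.0)")  # already terminal

    # Default branch: guarded by "end_at IS null", so the terminal row is untouched.
    conn.execute(
        "UPDATE spot SET end_at = (?), status = (?) "
        "WHERE spot_job_id = 1 AND end_at IS null", (200.0, 'FAILED_CONTROLLER'))
    print(conn.execute('SELECT status, end_at FROM spot').fetchone())
    # ('SUCCEEDED', 100.0)

    # override_terminal branch: status is overridden, COALESCE keeps the old end_at.
    conn.execute(
        "UPDATE spot SET end_at = COALESCE(end_at, ?), status = (?) "
        "WHERE spot_job_id = 1", (200.0, 'FAILED_CONTROLLER'))
    print(conn.execute('SELECT status, end_at FROM spot').fetchone())
    # ('FAILED_CONTROLLER', 100.0)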
@@ -677,6 +698,50 @@ def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
         return jobs


+def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
+    """Get jobs that need controller process checking.
+
+    Args:
+        job_id: Optional job ID to check. If None, checks all jobs.
+
+    Returns a list of job_ids, including the following:
+    - For jobs with schedule state: jobs that have schedule state not DONE
+    - For legacy jobs (no schedule state): jobs that are in non-terminal status
+    """
+    job_filter = '' if job_id is None else 'AND spot.spot_job_id=(?)'
+    job_value = () if job_id is None else (job_id,)
+
+    status_filter_str = ', '.join(['?'] *
+                                  len(ManagedJobStatus.terminal_statuses()))
+    terminal_status_values = [
+        status.value for status in ManagedJobStatus.terminal_statuses()
+    ]
+
+    # Get jobs that are either:
+    # 1. Have schedule state that is not DONE, or
+    # 2. Have no schedule state (legacy) AND are in non-terminal status
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            f"""\
+            SELECT DISTINCT spot.spot_job_id
+            FROM spot
+            LEFT OUTER JOIN job_info
+            ON spot.spot_job_id=job_info.spot_job_id
+            WHERE (
+                (job_info.schedule_state IS NOT NULL AND
+                 job_info.schedule_state IS NOT ?)
+                OR
+                (job_info.schedule_state IS NULL AND
+                 status NOT IN ({status_filter_str}))
+            )
+            {job_filter}
+            ORDER BY spot.spot_job_id DESC""", [
+                ManagedJobScheduleState.DONE.value, *terminal_status_values,
+                *job_value
+            ]).fetchall()
+        return [row[0] for row in rows if row[0] is not None]
+
+
 def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get all job ids by name."""
     name_filter = ''
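Note that the query compares schedule_state with "IS NOT ?" rather than "!= ?". In SQLite, "!=" against a NULL yields NULL (so the row is filtered out), whereas "IS NOT" treats NULL as an ordinary comparable value. A tiny illustration of the difference:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    # Three-valued logic: NULL != 'DONE' evaluates to NULL, which is not true.
    print(conn.execute("SELECT NULL != 'DONE'").fetchone())      # (None,)
    # IS NOT compares NULL like a value and returns true here.
    print(conn.execute("SELECT NULL IS NOT 'DONE'").fetchone())  # (1,)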
sky/jobs/utils.py
CHANGED
@@ -159,7 +159,7 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
         return False


-def
+def update_managed_jobs_statuses(job_id: Optional[int] = None):
     """Update managed job status if the controller process failed abnormally.

     Check the status of the controller process. If it is not running, it must
@@ -168,125 +168,175 @@ def update_managed_job_status(job_id: Optional[int] = None):
     when above happens, which could be not accurate based on the frequency this
     function is called.

-    Note: we expect that job_id, if provided, refers to a nonterminal job
+    Note: we expect that job_id, if provided, refers to a nonterminal job or a
+    job that has not completed its cleanup (schedule state not DONE).
     """

-
-
-    # a terminal status during the course of this function. The set_failed()
-    # called below will not update the state for jobs that already have a
-    # terminal status, so it should be fine.
-        job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
-    else:
-        job_ids = [job_id]
-    for job_id_ in job_ids:
-
-        failure_reason = None
-
-        tasks = managed_job_state.get_managed_jobs(job_id_)
-        schedule_state = tasks[0]['schedule_state']
-        if schedule_state is None:
-            # Backwards compatibility: this job was submitted when ray was still
-            # used for managing the parallelism of job controllers.
-            # TODO(cooperc): Remove before 0.11.0.
-            controller_status = job_lib.get_status(job_id_)
-            if controller_status is None or controller_status.is_terminal():
-                logger.error(f'Controller process for legacy job {job_id_} is '
-                             'in an unexpected state.')
-                failure_reason = 'Legacy job is in an unexpected state'
-
-                # Continue to mark the job as failed.
-            else:
-                # Still running.
-                continue
-        else:
-            pid = tasks[0]['controller_pid']
-            if pid is None:
-                if schedule_state in (
-                        managed_job_state.ManagedJobScheduleState.INACTIVE,
-                        managed_job_state.ManagedJobScheduleState.WAITING):
-                    # Job has not been scheduled yet.
-                    continue
-                elif (schedule_state ==
-                      managed_job_state.ManagedJobScheduleState.LAUNCHING):
-                    # This should only be the case for a very short period of
-                    # time between marking the job as submitted and writing the
-                    # launched controller process pid back to the database (see
-                    # scheduler.maybe_schedule_next_jobs).
-                    # TODO(cooperc): Find a way to detect if we get stuck in
-                    # this state.
-                    logger.info(f'Job {job_id_} is in LAUNCHING state, '
-                                'but controller process hasn\'t started yet.')
-                    continue
-                # All other statuses are unexpected. Proceed to mark as failed.
-                logger.error(f'Expected to find a controller pid for state '
-                             f'{schedule_state.value} but found none.')
-                failure_reason = ('No controller pid set for '
-                                  f'{schedule_state.value}')
-            else:
-                logger.debug(f'Checking controller pid {pid}')
-                if _controller_process_alive(pid, job_id_):
-                    # The controller is still running.
-                    continue
-                # Otherwise, proceed to mark the job as failed.
-                logger.error(f'Controller process for {job_id_} seems to be '
-                             'dead.')
-                failure_reason = 'Controller process is dead'
+    def _cleanup_job_clusters(job_id: int) -> Optional[str]:
+        """Clean up clusters for a job. Returns error message if any.

-
-
+        This function should not throw any exception. If it fails, it will
+        capture the error message, and log/return it.
+        """
+        error_msg = None
+        tasks = managed_job_state.get_managed_jobs(job_id)
         for task in tasks:
             task_name = task['job_name']
-
-            cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
+            cluster_name = generate_managed_job_cluster_name(task_name, job_id)
             handle = global_user_state.get_handle_from_cluster_name(
                 cluster_name)
-            # If the cluster exists, terminate it.
             if handle is not None:
-
+                try:
+                    terminate_cluster(cluster_name)
+                except Exception as e:  # pylint: disable=broad-except
+                    error_msg = (
+                        f'Failed to terminate cluster {cluster_name}: '
+                        f'{common_utils.format_exception(e, use_bracket=True)}')
+                    logger.exception(error_msg, exc_info=e)
+        return error_msg
+
+    # For backwards compatible jobs
+    # TODO(cooperc): Remove before 0.11.0.
+    def _handle_legacy_job(job_id: int):
+        controller_status = job_lib.get_status(job_id)
+        if controller_status is None or controller_status.is_terminal():
+            logger.error(f'Controller process for legacy job {job_id} is '
+                         'in an unexpected state.')
+
+            cleanup_error = _cleanup_job_clusters(job_id)
+            if cleanup_error:
+                # Unconditionally set the job to failed_controller if the
+                # cleanup fails.
+                managed_job_state.set_failed(
+                    job_id,
+                    task_id=None,
+                    failure_type=managed_job_state.ManagedJobStatus.
+                    FAILED_CONTROLLER,
+                    failure_reason=
+                    'Legacy controller process has exited abnormally, and '
+                    f'cleanup failed: {cleanup_error}. For more details, run: '
+                    f'sky jobs logs --controller {job_id}',
+                    override_terminal=True)
+                return
+
+            # It's possible for the job to have transitioned to
+            # another terminal state while between when we checked its
+            # state and now. In that case, set_failed won't do
+            # anything, which is fine.
+            managed_job_state.set_failed(
+                job_id,
+                task_id=None,
+                failure_type=managed_job_state.ManagedJobStatus.
+                FAILED_CONTROLLER,
+                failure_reason=(
+                    'Legacy controller process has exited abnormally. For '
+                    f'more details, run: sky jobs logs --controller {job_id}'))
+
+    # Get jobs that need checking (non-terminal or not DONE)
+    job_ids = managed_job_state.get_jobs_to_check_status(job_id)
+    if not job_ids:
+        # job_id is already terminal, or if job_id is None, there are no jobs
+        # that need to be checked.
+        return
+
+    for job_id in job_ids:
+        tasks = managed_job_state.get_managed_jobs(job_id)
+        # Note: controller_pid and schedule_state are in the job_info table
+        # which is joined to the spot table, so all tasks with the same job_id
+        # will have the same value for these columns. This is what lets us just
+        # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
+        schedule_state = tasks[0]['schedule_state']
+
+        # Backwards compatibility: this job was submitted when ray was still
+        # used for managing the parallelism of job controllers.
+        # TODO(cooperc): Remove before 0.11.0.
+        if (schedule_state is
+                managed_job_state.ManagedJobScheduleState.INVALID):
+            _handle_legacy_job(job_id)
+            continue
+
+        # For jobs with schedule state:
+        pid = tasks[0]['controller_pid']
+        if pid is None:
+            if schedule_state in (
+                    managed_job_state.ManagedJobScheduleState.INACTIVE,
+                    managed_job_state.ManagedJobScheduleState.WAITING):
+                # For these states, the controller hasn't been started yet.
+                # This is expected.
+                continue
+
+            if (schedule_state ==
+                    managed_job_state.ManagedJobScheduleState.LAUNCHING):
+                # This is unlikely but technically possible. There's a brief
+                # period between marking job as scheduled (LAUNCHING) and
+                # actually launching the controller process and writing the pid
+                # back to the table.
+                # TODO(cooperc): Find a way to detect if we get stuck in this
+                # state.
+                logger.info(f'Job {job_id} is in {schedule_state.value} state, '
+                            'but controller process hasn\'t started yet.')
+                continue
+
+            logger.error(f'Expected to find a controller pid for state '
+                         f'{schedule_state.value} but found none.')
+            failure_reason = f'No controller pid set for {schedule_state.value}'
+        else:
+            logger.debug(f'Checking controller pid {pid}')
+            if _controller_process_alive(pid, job_id):
+                # The controller is still running, so this job is fine.
+                continue
+
+            # Double check job is not already DONE before marking as failed, to
+            # avoid the race where the controller marked itself as DONE and
+            # exited between the state check and the pid check. Since the job
+            # controller process will mark itself DONE _before_ exiting, if it
+            # has exited and it's still not DONE now, it is abnormal.
+            if (managed_job_state.get_job_schedule_state(job_id) ==
+                    managed_job_state.ManagedJobScheduleState.DONE):
+                # Never mind, the job is DONE now. This is fine.
+                continue
+
+            logger.error(f'Controller process for {job_id} seems to be dead.')
+            failure_reason = 'Controller process is dead'
+
+        # At this point, either pid is None or process is dead.

         # The controller process for this managed job is not running: it must
         # have exited abnormally, and we should set the job status to
         # FAILED_CONTROLLER.
-
-
+        logger.error(f'Controller process for job {job_id} has exited '
+                     'abnormally. Setting the job status to FAILED_CONTROLLER.')
+
+        # Cleanup clusters and capture any errors.
+        cleanup_error = _cleanup_job_clusters(job_id)
+        cleanup_error_msg = ''
+        if cleanup_error:
+            cleanup_error_msg = f'Also, cleanup failed: {cleanup_error}. '
+
+        # Set all tasks to FAILED_CONTROLLER, regardless of current status.
+        # This may change a job from SUCCEEDED or another terminal state to
+        # FAILED_CONTROLLER. This is what we want - we are sure that this
+        # controller process crashed, so we want to capture that even if the
+        # underlying job succeeded.
+        # Note: 2+ invocations of update_managed_jobs_statuses could be running
+        # at the same time, so this could override the FAILED_CONTROLLER status
+        # set by another invocation of update_managed_jobs_statuses. That should
+        # be okay. The only difference could be that one process failed to clean
+        # up the cluster while the other succeeds. No matter which
+        # failure_reason ends up in the database, the outcome is acceptable.
+        # We assume that no other code path outside the controller process will
+        # update the job status.
         managed_job_state.set_failed(
-
+            job_id,
             task_id=None,
             failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
             failure_reason=
-            f'Controller process has exited abnormally ({failure_reason}).
-            f'
-
-
-    # Some jobs may be in a terminal status, but are not yet DONE. For instance,
-    # they may be still cleaning up resources, etc. Such jobs won't be captured
-    # by the above check, which only looks at nonterminal jobs. So, check the
-    # controller liveness of all jobs that should have live controller
-    # processes.
-    for job_info in managed_job_state.get_schedule_live_jobs(job_id):
-        if not job_info['controller_pid']:
-            # Technically, a job with no controller process but in LAUNCHING
-            # schedule state can happen very briefly after the job is set to
-            # LAUNCHING but before the controller process is actually spawned.
-            # However, if we observe any state other than LAUNCHING, something
-            # is clearly wrong.
-            if (job_info['schedule_state'] !=
-                    managed_job_state.ManagedJobScheduleState.LAUNCHING):
-                logger.error(
-                    f'Missing controller PID for {job_info["job_id"]}. '
-                    'Setting to DONE.')
-                scheduler.job_done(job_info['job_id'])
-            else:
-                logger.info(f'LAUNCHING job {job_info["job_id"]} has no '
-                            'controller process yet. Skipping.')
+            f'Controller process has exited abnormally ({failure_reason}). '
+            f'{cleanup_error_msg}'
+            f'For more details, run: sky jobs logs --controller {job_id}',
+            override_terminal=True)

-
-                job_info['job_id']):
-            logger.error(
-                f'Controller process for job {job_info["job_id"]} is not '
-                'alive. Marking the job as DONE.')
-            scheduler.job_done(job_info['job_id'])
+        scheduler.job_done(job_id, idempotent=True)


 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
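The rewritten update_managed_jobs_statuses reduces to a simple loop: for each job that still needs checking, skip it if its controller process is alive or the job already reached the DONE schedule state; otherwise clean up its clusters and force FAILED_CONTROLLER. A compressed sketch of that control flow with hypothetical helpers (liveness approximated here with os.kill(pid, 0), which is not necessarily how _controller_process_alive is implemented):

    import os
    from typing import Iterable, Optional

    def _pid_alive(pid: int) -> bool:
        """Best-effort liveness probe: signal 0 checks existence without killing."""
        try:
            os.kill(pid, 0)
            return True
        except ProcessLookupError:
            return False
        except PermissionError:
            return True  # process exists but belongs to another user

    def cleanup_clusters(job_id: int) -> Optional[str]:
        """Placeholder for terminating any clusters the job still owns."""
        return None  # no cleanup error in this sketch

    def mark_failed_controller(job_id: int, reason: str) -> None:
        """Placeholder for set_failed(..., override_terminal=True)."""
        print(f'job {job_id} -> FAILED_CONTROLLER: {reason}')

    def check_controllers(jobs: Iterable[dict]) -> None:
        # Each dict is assumed to carry job_id, controller_pid and schedule_state.
        for job in jobs:
            pid = job.get('controller_pid')
            if pid is not None and _pid_alive(pid):
                continue  # controller still running; nothing to do
            if job.get('schedule_state') == 'DONE':
                continue  # controller exited cleanly after marking itself DONE
            error = cleanup_clusters(job['job_id'])
            reason = 'Controller process is dead'
            if error:
                reason += f'. Also, cleanup failed: {error}'
            mark_failed_controller(job['job_id'], reason)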
@@ -382,7 +432,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
                            f'{job_status.value}. Skipped.')
             continue

-
+        update_managed_jobs_statuses(job_id)

         # Send the signal to the jobs controller.
         signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
@@ -424,36 +474,24 @@ def cancel_job_by_name(job_name: str) -> str:

 def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     """Stream logs by job id."""
-
-
-
-
+
+    def should_keep_logging(status: managed_job_state.ManagedJobStatus) -> bool:
+        # If we see CANCELLING, just exit - we could miss some job logs but the
+        # job will be terminated momentarily anyway so we don't really care.
+        return (not status.is_terminal() and
+                status != managed_job_state.ManagedJobStatus.CANCELLING)
+
+    msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
+    status_display = rich_utils.safe_status(msg)
     num_tasks = managed_job_state.get_num_tasks(job_id)

     with status_display:
-        prev_msg = None
-        while (controller_status != job_lib.JobStatus.RUNNING and
-               (controller_status is None or
-                not controller_status.is_terminal())):
-            status_str = 'None'
-            if controller_status is not None:
-                status_str = controller_status.value
-            msg = status_msg.format(status_str=f' (status: {status_str})')
-            if msg != prev_msg:
-                status_display.update(msg)
-                prev_msg = msg
-            time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
-            controller_status = job_lib.get_status(job_id)
-
-        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
-        status_display.update(msg)
         prev_msg = msg
-        managed_job_status
-
+        while (managed_job_status :=
+               managed_job_state.get_status(job_id)) is None:
             time.sleep(1)
-            managed_job_status = managed_job_state.get_status(job_id)

-        if managed_job_status
+        if not should_keep_logging(managed_job_status):
             job_msg = ''
             if managed_job_status.is_failed():
                 job_msg = ('\nFailure reason: '
@@ -480,10 +518,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
         task_id, managed_job_status = (
             managed_job_state.get_latest_task_id_status(job_id))

-        #
-        #
-
-
+        # We wait for managed_job_status to be not None above. Once we see that
+        # it's not None, we don't expect it to every become None again.
+        assert managed_job_status is not None, (job_id, task_id,
+                                                managed_job_status)
+
+        while should_keep_logging(managed_job_status):
             handle = None
             if task_id is not None:
                 task_name = managed_job_state.get_task_name(job_id, task_id)
@@ -513,8 +553,11 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
                 task_id, managed_job_status = (
                     managed_job_state.get_latest_task_id_status(job_id))
+                assert managed_job_status is not None, (job_id, task_id,
+                                                        managed_job_status)
                 continue
-            assert managed_job_status
+            assert (managed_job_status ==
+                    managed_job_state.ManagedJobStatus.RUNNING)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
             status_display.stop()
             returncode = backend.tail_logs(handle,
@@ -568,6 +611,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                        managed_job_status :=
                        managed_job_state.get_status(job_id)):
                     time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                    assert managed_job_status is not None, (
+                        job_id, managed_job_status)
                 continue

             if task_id == num_tasks - 1:
@@ -593,6 +638,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                     if original_task_id != task_id:
                         break
                     time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                    assert managed_job_status is not None, (job_id, task_id,
+                                                            managed_job_status)
                 continue

             # The job can be cancelled by the user or the controller (when
@@ -608,7 +655,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             # state.
             managed_job_status = managed_job_state.get_status(job_id)
             assert managed_job_status is not None, job_id
-            if managed_job_status
+            if not should_keep_logging(managed_job_status):
                 break
             logger.info(f'{colorama.Fore.YELLOW}The job cluster is preempted '
                         f'or failed.{colorama.Style.RESET_ALL}')
@@ -623,6 +670,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             # managed job state is updated.
             time.sleep(3 * JOB_STATUS_CHECK_GAP_SECONDS)
             managed_job_status = managed_job_state.get_status(job_id)
+            assert managed_job_status is not None, (job_id, managed_job_status)

     # The managed_job_status may not be in terminal status yet, since the
     # controller has not updated the managed job state yet. We wait for a while,
@@ -630,7 +678,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     wait_seconds = 0
     managed_job_status = managed_job_state.get_status(job_id)
     assert managed_job_status is not None, job_id
-    while (
+    while (should_keep_logging(managed_job_status) and follow and
            wait_seconds < _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS):
         time.sleep(1)
         wait_seconds += 1
@@ -694,10 +742,7 @@ def stream_logs(job_id: Optional[int],
     if job_status is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Job {job_id} not found.')
-
-    # still cleaning up.
-    if (job_status.is_terminal() and job_status !=
-            managed_job_state.ManagedJobStatus.CANCELLING):
+    if job_status.is_terminal():
         # Don't keep waiting. If the log file is not created by this
         # point, it never will be. This job may have been submitted
         # using an old version that did not create the log file, so this
@@ -729,6 +774,10 @@ def stream_logs(job_id: Optional[int],
                 print(end='', flush=True)

                 # Check if the job if finished.
+                # TODO(cooperc): The controller can still be
+                # cleaning up if job is in a terminal status
+                # (e.g. SUCCEEDED). We want to follow those logs
+                # too. Use DONE instead?
                 job_status = managed_job_state.get_status(job_id)
                 assert job_status is not None, (job_id, job_name)
                 if job_status.is_terminal():
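The old controller-status polling loop is replaced by a single assignment-expression loop that waits until the managed job status first appears, after which the code asserts it never becomes None again. The pattern in isolation, with a toy get_status standing in for managed_job_state.get_status:

    import time
    from typing import Optional

    _answers = iter([None, None, 'RUNNING'])

    def get_status(job_id: int) -> Optional[str]:
        """Toy stand-in: returns None twice before a real status shows up."""
        return next(_answers)

    job_id = 42
    while (status := get_status(job_id)) is None:
        time.sleep(0.1)  # shortened; the real loop sleeps 1 second per poll
    print(status)  # RUNNING
    # From here on, every later refresh can assert the status is not None.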
sky/provision/kubernetes/utils.py
CHANGED
@@ -7,6 +7,7 @@ import os
 import re
 import shutil
 import subprocess
+import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
@@ -105,6 +106,75 @@ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG = ('Pod {pod_name} not found in namespace '

 logger = sky_logging.init_logger(__name__)

+# Default retry settings for Kubernetes API calls
+DEFAULT_MAX_RETRIES = 3
+DEFAULT_RETRY_INTERVAL_SECONDS = 1
+
+
+def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
+                    retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
+                    resource_type: Optional[str] = None):
+    """Decorator to retry Kubernetes API calls on transient failures.
+
+    Args:
+        max_retries: Maximum number of retry attempts
+        retry_interval: Initial seconds to wait between retries
+        resource_type: Type of resource being accessed (e.g. 'node', 'pod').
+            Used to provide more specific error messages.
+    """
+
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            backoff = common_utils.Backoff(initial_backoff=retry_interval,
+                                           max_backoff_factor=3)
+
+            for attempt in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except (kubernetes.max_retry_error(),
+                        kubernetes.api_exception(),
+                        kubernetes.config_exception()) as e:
+                    last_exception = e
+                    # Don't retry on permanent errors like 401 (Unauthorized)
+                    # or 403 (Forbidden)
+                    if (isinstance(e, kubernetes.api_exception()) and
+                            e.status in (401, 403)):
+                        raise
+                    if attempt < max_retries - 1:
+                        sleep_time = backoff.current_backoff()
+                        logger.debug(f'Kubernetes API call {func.__name__} '
+                                     f'failed with {str(e)}. Retrying in '
+                                     f'{sleep_time:.1f}s...')
+                        time.sleep(sleep_time)
+                        continue
+
+            # Format error message based on the type of exception
+            resource_msg = f' when trying to get {resource_type} info' \
+                if resource_type else ''
+            debug_cmd = f' To debug, run: kubectl get {resource_type}s' \
+                if resource_type else ''
+
+            if isinstance(last_exception, kubernetes.max_retry_error()):
+                error_msg = f'Timed out{resource_msg} from Kubernetes cluster.'
+            elif isinstance(last_exception, kubernetes.api_exception()):
+                error_msg = (f'Kubernetes API error{resource_msg}: '
+                             f'{str(last_exception)}')
+            else:
+                error_msg = (f'Kubernetes configuration error{resource_msg}: '
+                             f'{str(last_exception)}')
+
+            raise exceptions.ResourcesUnavailableError(
+                f'{error_msg}'
+                f' Please check if the cluster is healthy and retry.'
+                f'{debug_cmd}') from last_exception
+
+        return wrapper
+
+    return decorator
+

 class GPULabelFormatter:
     """Base class to define a GPU label formatter for a Kubernetes cluster
@@ -446,6 +516,7 @@ def detect_accelerator_resource(


 @functools.lru_cache(maxsize=10)
+@_retry_on_error(resource_type='node')
 def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
     """Gets the kubernetes nodes in the context.

@@ -454,17 +525,12 @@ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()

-
-
-            _request_timeout=kubernetes.API_TIMEOUT).items
-    except kubernetes.max_retry_error():
-        raise exceptions.ResourcesUnavailableError(
-            'Timed out when trying to get node info from Kubernetes cluster. '
-            'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get nodes') from None
+    nodes = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT).items
     return nodes


+@_retry_on_error(resource_type='pod')
 def get_all_pods_in_kubernetes_cluster(
         context: Optional[str] = None) -> List[Any]:
     """Gets pods in all namespaces in kubernetes cluster indicated by context.
@@ -474,14 +540,8 @@ def get_all_pods_in_kubernetes_cluster(
     if context is None:
         context = get_current_kube_config_context_name()

-
-
-            _request_timeout=kubernetes.API_TIMEOUT).items
-    except kubernetes.max_retry_error():
-        raise exceptions.ResourcesUnavailableError(
-            'Timed out when trying to get pod info from Kubernetes cluster. '
-            'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods') from None
+    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT).items
     return pods


@@ -1758,8 +1818,6 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
         else:
             destination[key].extend(value)
     else:
-        if destination is None:
-            destination = {}
         destination[key] = value

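The _retry_on_error decorator retries transient Kubernetes API failures with exponential backoff, skips retries on 401/403 responses, and converts the final failure into a ResourcesUnavailableError. A self-contained sketch of the same decorator pattern, using generic exceptions and a plain doubling backoff in place of SkyPilot's kubernetes adaptor and common_utils.Backoff:

    import functools
    import time

    def retry_on_error(max_retries: int = 3,
                       initial_interval: float = 1.0,
                       retriable=(ConnectionError, TimeoutError)):
        """Retry a function on transient errors, doubling the wait each attempt."""

        def decorator(func):

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                last_exc = None
                wait = initial_interval
                for attempt in range(max_retries):
                    try:
                        return func(*args, **kwargs)
                    except retriable as e:
                        last_exc = e
                        if attempt < max_retries - 1:
                            time.sleep(wait)
                            wait *= 2  # exponential backoff
                raise RuntimeError(
                    f'{func.__name__} failed after {max_retries} attempts'
                ) from last_exc

            return wrapper

        return decorator

    @retry_on_error(max_retries=3, initial_interval=0.1)
    def flaky_list_nodes():
        # Always fails in this demo; calling it raises RuntimeError after 3 tries.
        raise ConnectionError('transient API hiccup')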
sky/skylet/constants.py
CHANGED
@@ -86,7 +86,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '11'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py
CHANGED
{skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/RECORD
RENAMED
@@ -1,7 +1,7 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=byguYOHI2wvQyWrh97v5OmKwEiIEv4lxNHqBSxTPCXc,5944
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
-sky/check.py,sha256=
+sky/check.py,sha256=qTpm3N1zUZi2inEZPsrbt278B3h8nsk2gnepzIgLybE,10899
 sky/cli.py,sha256=suOjHrt7mQTK47Z9ZQjogyUwnxfsKZ3_eP86AI29Dko,213891
 sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
 sky/core.py,sha256=fE1rn4Ku94S0XmWTO5-6t6eT6aaJImNczRqEnTe8v7Q,38742
@@ -50,7 +50,7 @@ sky/clouds/do.py,sha256=zqibtq1gxNPSNkSkZFPfP5yplfIKCwBss3ry0o4C17c,11198
 sky/clouds/fluidstack.py,sha256=u2I6jXEtTqgqRWi2EafMsKqc8VkUq1cR6CSDUvk72_U,12407
 sky/clouds/gcp.py,sha256=6QOnefFsYiLCcnajjduLHsayqJ641bBu42jPTpvy7Mc,55007
 sky/clouds/ibm.py,sha256=0ArRTQx1_DpTNGByFhukzFedEDzmVjBsGiiques1bQ0,21447
-sky/clouds/kubernetes.py,sha256=
+sky/clouds/kubernetes.py,sha256=oZg4Lpn2ZBikyc5NTJIziUPEY0xs2mtz546ButhkZ7g,31541
 sky/clouds/lambda_cloud.py,sha256=42AmcN2X_wdBMuAw606nR_pQCBAy5QFiAo711_WRqDE,12672
 sky/clouds/oci.py,sha256=VpPxpMJv52QePVdwdK9EuiMyqjp70dk8_rgUVv5Y-2w,27028
 sky/clouds/paperspace.py,sha256=F0Sj1RcqRb5fPjrr8qbdeY-JdfAHcRPc902pZOi4erw,10889
@@ -98,12 +98,12 @@ sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
 sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
 sky/jobs/constants.py,sha256=6RphkJ6pmafQ7XYW5qwId1Zvqb99HJelA9kgrgfNR7o,1421
-sky/jobs/controller.py,sha256=
+sky/jobs/controller.py,sha256=0WcOk8xRZ-mZWuza-WE-ICKZTgZvXxNzj9pWXUslm6E,28312
 sky/jobs/core.py,sha256=2_Q9thiBPnd3i2nDqyUtQY-dsGZ1kRgAdnLcXHoycYo,19938
 sky/jobs/recovery_strategy.py,sha256=m-EA-MWXPFrgx2CYFPr6MmgeUoDTEBmY2xruD2PRSGY,26365
 sky/jobs/scheduler.py,sha256=WAvNb8-vBk8q1zFordFdpH7gxqWDjPHDGZZay6aodOk,12028
-sky/jobs/state.py,sha256=
-sky/jobs/utils.py,sha256=
+sky/jobs/state.py,sha256=bvBNZMg3DzPfS4eHNzMqYaMui2cqnWoWGDIaiOpaXSk,40770
+sky/jobs/utils.py,sha256=RGVytFmB6SmKK3qZp_8UID_T5ssxSJOgwCDgIvRmhtM,51785
 sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -149,7 +149,7 @@ sky/provision/kubernetes/config.py,sha256=bXwOGdSAnXCkDreew0KsSUqSv3ZrptNeevqat7
 sky/provision/kubernetes/instance.py,sha256=AQikdRgNklpeMgiEd4w2Hh7kGssVABsy0aCh9xsKi5Y,50313
 sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
 sky/provision/kubernetes/network_utils.py,sha256=52BZY_5ynCH6IXlivKObYyAHDgQCJyAJIjmM7J4MpFo,11393
-sky/provision/kubernetes/utils.py,sha256=
+sky/provision/kubernetes/utils.py,sha256=Soyq-8h1i0ZYjTzVZRgwbyAkfEbNrAR3G2-krzIr6Rk,107132
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -207,8 +207,8 @@ sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
 sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
-sky/skylet/constants.py,sha256=
-sky/skylet/events.py,sha256=
+sky/skylet/constants.py,sha256=uLEVhMZXpIlj7b_03ixAI6rC6fTM1k5xPUWR4LvzQyo,16022
+sky/skylet/events.py,sha256=0bOjUYpphuAficD9wDB5NOan2vwJDaRqdnm4sl0RK0U,12535
 sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
 sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
 sky/skylet/log_lib.pyi,sha256=rRk4eUX0RHGs1QL9CXsJq6RE7FqqxZlfuPJOLXTvg7I,4453
@@ -289,9 +289,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20250125.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250125.dist-info/METADATA,sha256=8ozTZDBrQLiIaTS3-_CStvAfJE7XPmuwGGWneS_gj7o,21038
+skypilot_nightly-1.0.0.dev20250125.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+skypilot_nightly-1.0.0.dev20250125.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250125.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250125.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250123.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/top_level.txt
RENAMED
File without changes