skypilot-nightly 1.0.0.dev20241106__py3-none-any.whl → 1.0.0.dev20241107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/provision/lambda_cloud/lambda_utils.py +2 -2
- sky/skylet/job_lib.py +55 -31
- sky/skylet/providers/scp/node_provider.py +1 -1
- {skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/RECORD +10 -10
- {skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'fe2ce9a262c059722ddce46f5594fc2ca2370c0d'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241106'
|
38
|
+
__version__ = '1.0.0.dev20241107'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/provision/lambda_cloud/lambda_utils.py
CHANGED
@@ -82,7 +82,7 @@ def raise_lambda_error(response: requests.Response) -> None:
|
|
82
82
|
if status_code == 200:
|
83
83
|
return
|
84
84
|
if status_code == 429:
|
85
|
-
# https://docs.lambdalabs.com/cloud/
|
85
|
+
# https://docs.lambdalabs.com/public-cloud/cloud-api/
|
86
86
|
raise LambdaCloudError('Your API requests are being rate limited.')
|
87
87
|
try:
|
88
88
|
resp_json = response.json()
|
@@ -145,7 +145,7 @@ class LambdaCloudClient:
|
|
145
145
|
# Most API requests are rate limited at ~1 request every second but
|
146
146
|
# launch requests are rate limited at ~1 request every 10 seconds.
|
147
147
|
# So don't use launch requests to check availability.
|
148
|
-
# See https://docs.lambdalabs.com/cloud/
|
148
|
+
# See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
|
149
149
|
available_regions = (self.list_catalog()[instance_type]
|
150
150
|
['regions_with_capacity_available'])
|
151
151
|
available_regions = [reg['name'] for reg in available_regions]
|
sky/skylet/job_lib.py
CHANGED
@@ -181,14 +181,19 @@ class JobScheduler:
|
|
181
181
|
subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)
|
182
182
|
|
183
183
|
def schedule_step(self, force_update_jobs: bool = False) -> None:
|
184
|
-
|
185
|
-
if len(jobs) > 0 or force_update_jobs:
|
184
|
+
if force_update_jobs:
|
186
185
|
update_status()
|
186
|
+
pending_jobs = self._get_pending_jobs()
|
187
187
|
# TODO(zhwu, mraheja): One optimization can be allowing more than one
|
188
188
|
# job staying in the pending state after ray job submit, so that to be
|
189
189
|
# faster to schedule a large amount of jobs.
|
190
|
-
for job_id, run_cmd, submit, created_time in jobs:
|
190
|
+
for job_id, run_cmd, submit, created_time in pending_jobs:
|
191
191
|
with filelock.FileLock(_get_lock_path(job_id)):
|
192
|
+
# We don't have to refresh the job status before checking, as
|
193
|
+
# the job status will only be stale in rare cases where ray job
|
194
|
+
# crashes; or the job stays in INIT state for a long time.
|
195
|
+
# In those cases, the periodic JobSchedulerEvent event will
|
196
|
+
# update the job status every 300 seconds.
|
192
197
|
status = get_status_no_lock(job_id)
|
193
198
|
if (status not in _PRE_RESOURCE_STATUSES or
|
194
199
|
created_time < psutil.boot_time()):
|
@@ -202,7 +207,7 @@ class JobScheduler:
|
|
202
207
|
self._run_job(job_id, run_cmd)
|
203
208
|
return
|
204
209
|
|
205
|
-
def
|
210
|
+
def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
|
206
211
|
"""Returns the metadata for jobs in the pending jobs table
|
207
212
|
|
208
213
|
The information contains job_id, run command, submit time,
|
@@ -214,7 +219,7 @@ class JobScheduler:
|
|
214
219
|
class FIFOScheduler(JobScheduler):
|
215
220
|
"""First in first out job scheduler"""
|
216
221
|
|
217
|
-
def
|
222
|
+
def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
|
218
223
|
return list(
|
219
224
|
_CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
|
220
225
|
|
@@ -534,25 +539,13 @@ def update_job_status(job_ids: List[int],
|
|
534
539
|
|
535
540
|
This function should only be run on the remote instance with ray>=2.4.0.
|
536
541
|
"""
|
542
|
+
echo = logger.info if not silent else logger.debug
|
537
543
|
if len(job_ids) == 0:
|
538
544
|
return []
|
539
545
|
|
540
|
-
# TODO: if too slow, directly query against redis.
|
541
546
|
ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids]
|
542
|
-
|
543
547
|
job_client = _create_ray_job_submission_client()
|
544
548
|
|
545
|
-
# In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
|
546
|
-
# which contains the job status (str) and submission_id (str).
|
547
|
-
ray_job_query_time = time.time()
|
548
|
-
job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
|
549
|
-
|
550
|
-
job_details = {}
|
551
|
-
ray_job_ids_set = set(ray_job_ids)
|
552
|
-
for job_detail in job_detail_lists:
|
553
|
-
if job_detail.submission_id in ray_job_ids_set:
|
554
|
-
job_details[job_detail.submission_id] = job_detail
|
555
|
-
|
556
549
|
statuses = []
|
557
550
|
for job_id, ray_job_id in zip(job_ids, ray_job_ids):
|
558
551
|
# Per-job status lock is required because between the job status
|
@@ -560,15 +553,48 @@ def update_job_status(job_ids: List[int],
|
|
560
553
|
# can be modified by the generated ray program.
|
561
554
|
with filelock.FileLock(_get_lock_path(job_id)):
|
562
555
|
status = None
|
563
|
-
|
564
|
-
|
565
|
-
|
556
|
+
job_record = _get_jobs_by_ids([job_id])[0]
|
557
|
+
original_status = job_record['status']
|
558
|
+
job_submitted_at = job_record['submitted_at']
|
559
|
+
|
560
|
+
ray_job_query_time = time.time()
|
561
|
+
if original_status == JobStatus.INIT:
|
562
|
+
if (job_submitted_at >= psutil.boot_time() and job_submitted_at
|
563
|
+
>= ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
|
564
|
+
# The job id is reserved, but the job is not submitted yet.
|
565
|
+
# We should keep it in INIT.
|
566
|
+
status = JobStatus.INIT
|
567
|
+
else:
|
568
|
+
# We always immediately submit job after the job id is
|
569
|
+
# allocated, i.e. INIT -> PENDING, if a job stays in INIT
|
570
|
+
# for too long, it is likely the job submission process
|
571
|
+
# was killed before the job is submitted. We should set it
|
572
|
+
# to FAILED then. Note, if ray job indicates the job is
|
573
|
+
# running, we will change status to PENDING below.
|
574
|
+
echo(f'INIT job {job_id} is stale, setting to FAILED')
|
575
|
+
status = JobStatus.FAILED
|
576
|
+
|
577
|
+
try:
|
578
|
+
# Querying status within the lock is safer than querying
|
579
|
+
# outside, as it avoids the race condition when job table is
|
580
|
+
# updated after the ray job status query.
|
581
|
+
# Also, getting per-job status is faster than querying all jobs,
|
582
|
+
# when there are significant number of finished jobs.
|
583
|
+
# Reference: getting 124 finished jobs takes 0.038s, while
|
584
|
+
# querying a single job takes 0.006s, 10 jobs takes 0.066s.
|
585
|
+
# TODO: if too slow, directly query against redis.
|
586
|
+
ray_job_status = job_client.get_job_status(ray_job_id)
|
587
|
+
status = _RAY_TO_JOB_STATUS_MAP[ray_job_status.value]
|
588
|
+
except RuntimeError:
|
589
|
+
# Job not found.
|
590
|
+
pass
|
591
|
+
|
566
592
|
pending_job = _get_pending_job(job_id)
|
567
593
|
if pending_job is not None:
|
568
594
|
if pending_job['created_time'] < psutil.boot_time():
|
569
|
-
|
570
|
-
|
571
|
-
|
595
|
+
echo(f'Job {job_id} is stale, setting to FAILED: '
|
596
|
+
f'created_time={pending_job["created_time"]}, '
|
597
|
+
f'boot_time={psutil.boot_time()}')
|
572
598
|
# The job is stale as it is created before the instance
|
573
599
|
# is booted, e.g. the instance is rebooted.
|
574
600
|
status = JobStatus.FAILED
|
@@ -583,22 +609,20 @@ def update_job_status(job_ids: List[int],
|
|
583
609
|
# as stale.
|
584
610
|
status = JobStatus.PENDING
|
585
611
|
|
586
|
-
original_status = get_status_no_lock(job_id)
|
587
612
|
assert original_status is not None, (job_id, status)
|
588
613
|
if status is None:
|
589
614
|
status = original_status
|
590
615
|
if (original_status is not None and
|
591
616
|
not original_status.is_terminal()):
|
592
|
-
|
593
|
-
|
617
|
+
echo(f'Ray job status for job {job_id} is None, '
|
618
|
+
'setting it to FAILED.')
|
594
619
|
# The job may be stale, when the instance is restarted
|
595
620
|
# (the ray redis is volatile). We need to reset the
|
596
621
|
# status of the task to FAILED if its original status
|
597
622
|
# is RUNNING or PENDING.
|
598
623
|
status = JobStatus.FAILED
|
599
624
|
_set_status_no_lock(job_id, status)
|
600
|
-
|
601
|
-
logger.info(f'Updated job {job_id} status to {status}')
|
625
|
+
echo(f'Updated job {job_id} status to {status}')
|
602
626
|
else:
|
603
627
|
# Taking max of the status is necessary because:
|
604
628
|
# 1. It avoids race condition, where the original status has
|
@@ -611,10 +635,10 @@ def update_job_status(job_ids: List[int],
|
|
611
635
|
# DB) would already have that value. So we take the max here to
|
612
636
|
# keep it at later status.
|
613
637
|
status = max(status, original_status)
|
638
|
+
assert status is not None, (job_id, status, original_status)
|
614
639
|
if status != original_status: # Prevents redundant update.
|
615
640
|
_set_status_no_lock(job_id, status)
|
616
|
-
|
617
|
-
logger.info(f'Updated job {job_id} status to {status}')
|
641
|
+
echo(f'Updated job {job_id} status to {status}')
|
618
642
|
statuses.append(status)
|
619
643
|
return statuses
|
620
644
|
|
sky/skylet/providers/scp/node_provider.py
CHANGED
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
|
|
180
180
|
metadata['tags'] = instance_info['tags']
|
181
181
|
# TODO(ewzeng): The internal ip is hard to get, so set it to the
|
182
182
|
# external ip as a hack. This should be changed in the future.
|
183
|
-
# https://docs.lambdalabs.com/cloud/learn-private-ip-address
|
183
|
+
# https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
|
184
184
|
metadata['internal_ip'] = vm['ip']
|
185
185
|
metadata['external_ip'] = vm['external_ip']
|
186
186
|
return metadata
|
{skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=LDYVc006Bm6m_yCUJiTKF3oPp3_O3ODjp1KhoU5meCE,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
@@ -146,7 +146,7 @@ sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=
|
|
146
146
|
sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
|
147
147
|
sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
|
148
148
|
sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
|
149
|
-
sky/provision/lambda_cloud/lambda_utils.py,sha256=
|
149
|
+
sky/provision/lambda_cloud/lambda_utils.py,sha256=wIXV1Qe362f8Q9u8DSx2e9IJs4CF03Jr3idHCzhlRz4,9879
|
150
150
|
sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
|
151
151
|
sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
|
152
152
|
sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
|
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
|
|
192
192
|
sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
|
193
193
|
sky/skylet/constants.py,sha256=TL-O0ZoxA1ZeNvKXzzA_UyIMXsma7flbsDZ1N_o9dKg,14468
|
194
194
|
sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
|
195
|
-
sky/skylet/job_lib.py,sha256
|
195
|
+
sky/skylet/job_lib.py,sha256=-SCbpJRiWMSwvhDjUwfwnvBap7Y5B3ol1l_PDPra3XI,36860
|
196
196
|
sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
|
197
197
|
sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
|
198
198
|
sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
|
@@ -209,7 +209,7 @@ sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gM
|
|
209
209
|
sky/skylet/providers/oci/utils.py,sha256=lCpdklxgSwK-hqErTicpIe_xkpSlIc8u943C-9_MJfU,508
|
210
210
|
sky/skylet/providers/scp/__init__.py,sha256=15SiAh1YphXkZsHySaw_CeAmXRdoM4JtNIAt7SLbUvg,91
|
211
211
|
sky/skylet/providers/scp/config.py,sha256=lhMXyG9btMlg59nmvtnMdIDN07jBbQOheAx-bHbGbhw,5077
|
212
|
-
sky/skylet/providers/scp/node_provider.py,sha256=
|
212
|
+
sky/skylet/providers/scp/node_provider.py,sha256=W5J-170JVIpwT9Fv20fJ_PpdAVsqx9pigE-RkkG_kQE,22459
|
213
213
|
sky/skylet/ray_patches/__init__.py,sha256=IoCzj9zFaHW-n__bLD8dgC2pJMTfZRxRpr8rZGvMyrw,2761
|
214
214
|
sky/skylet/ray_patches/autoscaler.py.patch,sha256=cZK15L29aay-qx6JoGVPNsPIo3UiG0bAHh8fqfFd-44,291
|
215
215
|
sky/skylet/ray_patches/cli.py.patch,sha256=ooEAr3OfA6LN7v4iaNltY6w63TaOFssgw9iKWP49iJc,349
|
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
274
274
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
275
275
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
276
276
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
277
|
-
skypilot_nightly-1.0.0.dev20241106.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.dev20241106.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
280
|
-
skypilot_nightly-1.0.0.dev20241106.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
-
skypilot_nightly-1.0.0.dev20241106.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
-
skypilot_nightly-1.0.0.dev20241106.dist-info/RECORD,,
|
277
|
+
skypilot_nightly-1.0.0.dev20241107.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
+
skypilot_nightly-1.0.0.dev20241107.dist-info/METADATA,sha256=ICnKtcpMVvZVf_1H6k63r29XgS_-heZ4BcgH-p5J5s4,19708
|
279
|
+
skypilot_nightly-1.0.0.dev20241107.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
280
|
+
skypilot_nightly-1.0.0.dev20241107.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
+
skypilot_nightly-1.0.0.dev20241107.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
+
skypilot_nightly-1.0.0.dev20241107.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20241106.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|