skypilot-nightly 1.0.0.dev20241106__py3-none-any.whl → 1.0.0.dev20241107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '1dcd2f009c1989cbc130e5b8490170a5a96c3e23'
8
+ _SKYPILOT_COMMIT_SHA = 'fe2ce9a262c059722ddce46f5594fc2ca2370c0d'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241106'
38
+ __version__ = '1.0.0.dev20241107'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -82,7 +82,7 @@ def raise_lambda_error(response: requests.Response) -> None:
82
82
  if status_code == 200:
83
83
  return
84
84
  if status_code == 429:
85
- # https://docs.lambdalabs.com/cloud/rate-limiting/
85
+ # https://docs.lambdalabs.com/public-cloud/cloud-api/
86
86
  raise LambdaCloudError('Your API requests are being rate limited.')
87
87
  try:
88
88
  resp_json = response.json()
@@ -145,7 +145,7 @@ class LambdaCloudClient:
145
145
  # Most API requests are rate limited at ~1 request every second but
146
146
  # launch requests are rate limited at ~1 request every 10 seconds.
147
147
  # So don't use launch requests to check availability.
148
- # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
148
+ # See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
149
149
  available_regions = (self.list_catalog()[instance_type]
150
150
  ['regions_with_capacity_available'])
151
151
  available_regions = [reg['name'] for reg in available_regions]
sky/skylet/job_lib.py CHANGED
@@ -181,14 +181,19 @@ class JobScheduler:
181
181
  subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)
182
182
 
183
183
  def schedule_step(self, force_update_jobs: bool = False) -> None:
184
- jobs = self._get_jobs()
185
- if len(jobs) > 0 or force_update_jobs:
184
+ if force_update_jobs:
186
185
  update_status()
186
+ pending_jobs = self._get_pending_jobs()
187
187
  # TODO(zhwu, mraheja): One optimization would be to allow more than one
188
188
  # job to stay in the pending state after ray job submit, so that it is
189
189
  # faster to schedule a large number of jobs.
190
- for job_id, run_cmd, submit, created_time in jobs:
190
+ for job_id, run_cmd, submit, created_time in pending_jobs:
191
191
  with filelock.FileLock(_get_lock_path(job_id)):
192
+ # We don't have to refresh the job status before checking, as
193
+ # the job status will only be stale in rare cases where ray job
194
+ # crashes; or the job stays in INIT state for a long time.
195
+ # In those cases, the periodic JobSchedulerEvent will
196
+ # update the job status every 300 seconds.
192
197
  status = get_status_no_lock(job_id)
193
198
  if (status not in _PRE_RESOURCE_STATUSES or
194
199
  created_time < psutil.boot_time()):
@@ -202,7 +207,7 @@ class JobScheduler:
202
207
  self._run_job(job_id, run_cmd)
203
208
  return
204
209
 
205
- def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
210
+ def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
206
211
  """Returns the metadata for jobs in the pending jobs table
207
212
 
208
213
  The information contains job_id, run command, submit time,
@@ -214,7 +219,7 @@ class JobScheduler:
214
219
  class FIFOScheduler(JobScheduler):
215
220
  """First in first out job scheduler"""
216
221
 
217
- def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
222
+ def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
218
223
  return list(
219
224
  _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
220
225
 
@@ -534,25 +539,13 @@ def update_job_status(job_ids: List[int],
534
539
 
535
540
  This function should only be run on the remote instance with ray>=2.4.0.
536
541
  """
542
+ echo = logger.info if not silent else logger.debug
537
543
  if len(job_ids) == 0:
538
544
  return []
539
545
 
540
- # TODO: if too slow, directly query against redis.
541
546
  ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids]
542
-
543
547
  job_client = _create_ray_job_submission_client()
544
548
 
545
- # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
546
- # which contains the job status (str) and submission_id (str).
547
- ray_job_query_time = time.time()
548
- job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
549
-
550
- job_details = {}
551
- ray_job_ids_set = set(ray_job_ids)
552
- for job_detail in job_detail_lists:
553
- if job_detail.submission_id in ray_job_ids_set:
554
- job_details[job_detail.submission_id] = job_detail
555
-
556
549
  statuses = []
557
550
  for job_id, ray_job_id in zip(job_ids, ray_job_ids):
558
551
  # Per-job status lock is required because between the job status
@@ -560,15 +553,48 @@ def update_job_status(job_ids: List[int],
560
553
  # can be modified by the generated ray program.
561
554
  with filelock.FileLock(_get_lock_path(job_id)):
562
555
  status = None
563
- if ray_job_id in job_details:
564
- ray_status = job_details[ray_job_id].status
565
- status = _RAY_TO_JOB_STATUS_MAP[ray_status]
556
+ job_record = _get_jobs_by_ids([job_id])[0]
557
+ original_status = job_record['status']
558
+ job_submitted_at = job_record['submitted_at']
559
+
560
+ ray_job_query_time = time.time()
561
+ if original_status == JobStatus.INIT:
562
+ if (job_submitted_at >= psutil.boot_time() and job_submitted_at
563
+ >= ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
564
+ # The job id is reserved, but the job is not submitted yet.
565
+ # We should keep it in INIT.
566
+ status = JobStatus.INIT
567
+ else:
568
+ # We always immediately submit job after the job id is
569
+ # allocated, i.e. INIT -> PENDING, if a job stays in INIT
570
+ # for too long, it is likely the job submission process
571
+ # was killed before the job is submitted. We should set it
572
+ # to FAILED then. Note, if ray job indicates the job is
573
+ # running, we will change status to PENDING below.
574
+ echo(f'INIT job {job_id} is stale, setting to FAILED')
575
+ status = JobStatus.FAILED
576
+
577
+ try:
578
+ # Querying status within the lock is safer than querying
579
+ # outside, as it avoids the race condition when job table is
580
+ # updated after the ray job status query.
581
+ # Also, getting per-job status is faster than querying all jobs,
582
+ # when there are a significant number of finished jobs.
583
+ # Reference: getting 124 finished jobs takes 0.038s, while
584
+ # querying a single job takes 0.006s, 10 jobs takes 0.066s.
585
+ # TODO: if too slow, directly query against redis.
586
+ ray_job_status = job_client.get_job_status(ray_job_id)
587
+ status = _RAY_TO_JOB_STATUS_MAP[ray_job_status.value]
588
+ except RuntimeError:
589
+ # Job not found.
590
+ pass
591
+
566
592
  pending_job = _get_pending_job(job_id)
567
593
  if pending_job is not None:
568
594
  if pending_job['created_time'] < psutil.boot_time():
569
- logger.info(f'Job {job_id} is stale, setting to FAILED: '
570
- f'created_time={pending_job["created_time"]}, '
571
- f'boot_time={psutil.boot_time()}')
595
+ echo(f'Job {job_id} is stale, setting to FAILED: '
596
+ f'created_time={pending_job["created_time"]}, '
597
+ f'boot_time={psutil.boot_time()}')
572
598
  # The job is stale as it is created before the instance
573
599
  # is booted, e.g. the instance is rebooted.
574
600
  status = JobStatus.FAILED
@@ -583,22 +609,20 @@ def update_job_status(job_ids: List[int],
583
609
  # as stale.
584
610
  status = JobStatus.PENDING
585
611
 
586
- original_status = get_status_no_lock(job_id)
587
612
  assert original_status is not None, (job_id, status)
588
613
  if status is None:
589
614
  status = original_status
590
615
  if (original_status is not None and
591
616
  not original_status.is_terminal()):
592
- logger.info(f'Ray job status for job {job_id} is None, '
593
- 'setting it to FAILED.')
617
+ echo(f'Ray job status for job {job_id} is None, '
618
+ 'setting it to FAILED.')
594
619
  # The job may be stale, when the instance is restarted
595
620
  # (the ray redis is volatile). We need to reset the
596
621
  # status of the task to FAILED if its original status
597
622
  # is RUNNING or PENDING.
598
623
  status = JobStatus.FAILED
599
624
  _set_status_no_lock(job_id, status)
600
- if not silent:
601
- logger.info(f'Updated job {job_id} status to {status}')
625
+ echo(f'Updated job {job_id} status to {status}')
602
626
  else:
603
627
  # Taking max of the status is necessary because:
604
628
  # 1. It avoids race condition, where the original status has
@@ -611,10 +635,10 @@ def update_job_status(job_ids: List[int],
611
635
  # DB) would already have that value. So we take the max here to
612
636
  # keep it at later status.
613
637
  status = max(status, original_status)
638
+ assert status is not None, (job_id, status, original_status)
614
639
  if status != original_status: # Prevents redundant update.
615
640
  _set_status_no_lock(job_id, status)
616
- if not silent:
617
- logger.info(f'Updated job {job_id} status to {status}')
641
+ echo(f'Updated job {job_id} status to {status}')
618
642
  statuses.append(status)
619
643
  return statuses
620
644
 
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
180
180
  metadata['tags'] = instance_info['tags']
181
181
  # TODO(ewzeng): The internal ip is hard to get, so set it to the
182
182
  # external ip as a hack. This should be changed in the future.
183
- # https://docs.lambdalabs.com/cloud/learn-private-ip-address/
183
+ # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
184
184
  metadata['internal_ip'] = vm['ip']
185
185
  metadata['external_ip'] = vm['external_ip']
186
186
  return metadata
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241106
3
+ Version: 1.0.0.dev20241107
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,4 +1,4 @@
1
- sky/__init__.py,sha256=gJi4nCnW9_tfOdSmOh1s0EemDMl3aeTk1lG8K9lrsHA,5882
1
+ sky/__init__.py,sha256=LDYVc006Bm6m_yCUJiTKF3oPp3_O3ODjp1KhoU5meCE,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -146,7 +146,7 @@ sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=
146
146
  sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
147
147
  sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
148
148
  sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
149
- sky/provision/lambda_cloud/lambda_utils.py,sha256=H8uaaMEpLn5cqGCdhUH_oJiccv_cuMguUNAl0NqB0Ik,9873
149
+ sky/provision/lambda_cloud/lambda_utils.py,sha256=wIXV1Qe362f8Q9u8DSx2e9IJs4CF03Jr3idHCzhlRz4,9879
150
150
  sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
151
151
  sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
152
152
  sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
192
192
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
193
193
  sky/skylet/constants.py,sha256=TL-O0ZoxA1ZeNvKXzzA_UyIMXsma7flbsDZ1N_o9dKg,14468
194
194
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
195
- sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
195
+ sky/skylet/job_lib.py,sha256=-SCbpJRiWMSwvhDjUwfwnvBap7Y5B3ol1l_PDPra3XI,36860
196
196
  sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
197
197
  sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
198
198
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -209,7 +209,7 @@ sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gM
209
209
  sky/skylet/providers/oci/utils.py,sha256=lCpdklxgSwK-hqErTicpIe_xkpSlIc8u943C-9_MJfU,508
210
210
  sky/skylet/providers/scp/__init__.py,sha256=15SiAh1YphXkZsHySaw_CeAmXRdoM4JtNIAt7SLbUvg,91
211
211
  sky/skylet/providers/scp/config.py,sha256=lhMXyG9btMlg59nmvtnMdIDN07jBbQOheAx-bHbGbhw,5077
212
- sky/skylet/providers/scp/node_provider.py,sha256=5HjFEGqKAqVcszEpcan_IzY9NKVMQdm2BUgGdfw9aUY,22411
212
+ sky/skylet/providers/scp/node_provider.py,sha256=W5J-170JVIpwT9Fv20fJ_PpdAVsqx9pigE-RkkG_kQE,22459
213
213
  sky/skylet/ray_patches/__init__.py,sha256=IoCzj9zFaHW-n__bLD8dgC2pJMTfZRxRpr8rZGvMyrw,2761
214
214
  sky/skylet/ray_patches/autoscaler.py.patch,sha256=cZK15L29aay-qx6JoGVPNsPIo3UiG0bAHh8fqfFd-44,291
215
215
  sky/skylet/ray_patches/cli.py.patch,sha256=ooEAr3OfA6LN7v4iaNltY6w63TaOFssgw9iKWP49iJc,349
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
274
274
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
275
275
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
276
276
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
277
- skypilot_nightly-1.0.0.dev20241106.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
- skypilot_nightly-1.0.0.dev20241106.dist-info/METADATA,sha256=xDHkghCeZl-VGNYV5hps-0I-il3EKeUO9Rg7JcokqPI,19708
279
- skypilot_nightly-1.0.0.dev20241106.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
- skypilot_nightly-1.0.0.dev20241106.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
- skypilot_nightly-1.0.0.dev20241106.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
- skypilot_nightly-1.0.0.dev20241106.dist-info/RECORD,,
277
+ skypilot_nightly-1.0.0.dev20241107.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
+ skypilot_nightly-1.0.0.dev20241107.dist-info/METADATA,sha256=ICnKtcpMVvZVf_1H6k63r29XgS_-heZ4BcgH-p5J5s4,19708
279
+ skypilot_nightly-1.0.0.dev20241107.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
+ skypilot_nightly-1.0.0.dev20241107.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
+ skypilot_nightly-1.0.0.dev20241107.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
+ skypilot_nightly-1.0.0.dev20241107.dist-info/RECORD,,