skypilot-nightly 1.0.0.dev20241105__py3-none-any.whl → 1.0.0.dev20241107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
 
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'c24a0b3a8f9b1d8193e27f976da9f3d71867506f'
+ _SKYPILOT_COMMIT_SHA = 'fe2ce9a262c059722ddce46f5594fc2ca2370c0d'
 
 
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241105'
+ __version__ = '1.0.0.dev20241107'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -3278,9 +3278,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  f'{cd} && {constants.SKY_RAY_CMD} job submit '
  '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
  f'--submission-id {job_id}-$(whoami) --no-wait '
- # Redirect stderr to /dev/null to avoid distracting error from ray.
- f'"{constants.SKY_PYTHON_CMD} -u {script_path} > {remote_log_path} '
- '2> /dev/null"')
+ f'"{constants.SKY_PYTHON_CMD} -u {script_path} '
+ # Do not use &>, which is not POSIX and may not work.
+ # Note that the order of ">filename 2>&1" matters.
+ f'> {remote_log_path} 2>&1"')
 
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
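
The redirection rewrite above is easy to get backwards. A minimal standalone sketch, not SkyPilot code, of why the order of '> file 2>&1' matters under POSIX sh:

import subprocess

# Minimal sketch, not SkyPilot code. With POSIX sh, '2>&1' must come *after*
# '> file'; otherwise stderr is duplicated to the old stdout (the terminal)
# before stdout is redirected to the file.
noisy = 'echo out; echo err 1>&2'

# Both streams end up in the log file.
subprocess.run(f'({noisy}) > /tmp/demo.log 2>&1', shell=True, check=True)

# Only stdout reaches the log file; stderr still goes to the terminal.
subprocess.run(f'({noisy}) 2>&1 > /tmp/demo.log', shell=True, check=True)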
sky/provision/instance_setup.py CHANGED
@@ -283,11 +283,27 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
  # the same credentials. Otherwise, `ray status` will fail to fetch the
  # available nodes.
  # Reference: https://github.com/skypilot-org/skypilot/issues/2441
- cmd = (f'{constants.SKY_RAY_CMD} stop; '
- 'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
- 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
- f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
- _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
+ cmd = (
+ f'{constants.SKY_RAY_CMD} stop; '
+ 'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
+ 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
+ # worker_maximum_startup_concurrency controls the maximum number of
+ # workers that can be started concurrently. However, it also controls
+ # this warning message:
+ # https://github.com/ray-project/ray/blob/d5d03e6e24ae3cfafb87637ade795fb1480636e6/src/ray/raylet/worker_pool.cc#L1535-L1545
+ # maximum_startup_concurrency defaults to the number of CPUs given by
+ # multiprocessing.cpu_count() or manually specified to ray. (See
+ # https://github.com/ray-project/ray/blob/fab26e1813779eb568acba01281c6dd963c13635/python/ray/_private/services.py#L1622-L1624.)
+ # The warning will show when the number of workers is >4x the
+ # maximum_startup_concurrency, so typically 4x CPU count. However, the
+ # job controller uses 0.25cpu reservations, and each job can use two
+ # workers (one for the submitted job and one for remote actors),
+ # resulting in a worker count of 8x CPUs or more. Increase the
+ # worker_maximum_startup_concurrency to 3x CPUs so that we will only see
+ # the warning when the worker count is >12x CPUs.
+ 'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
+ f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
+ _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
 
  logger.info(f'Running command on head node: {cmd}')
  # TODO(zhwu): add the output to log files.
  returncode, stdout, stderr = head_runner.run(
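
The comment block above packs in some arithmetic. A worked example with a hypothetical 8-CPU head node, not SkyPilot code, of the thresholds it describes:

# Hypothetical CPU count, purely to illustrate the comment above.
cpus = 8
default_warning_threshold = 4 * cpus  # Ray warns above 4x its default concurrency (= CPU count): 32 workers.

controller_jobs = int(cpus / 0.25)    # 0.25-CPU reservations -> up to 32 concurrent jobs.
workers = 2 * controller_jobs         # Two workers per job -> 64 workers, i.e. 8x CPUs.

raised_concurrency = 3 * cpus                       # RAY_worker_maximum_startup_concurrency=$(( 3 * nproc )).
raised_warning_threshold = 4 * raised_concurrency   # Warn only above 96 workers, i.e. 12x CPUs.

assert workers > default_warning_threshold  # The warning would fire with the default setting.
assert workers < raised_warning_threshold   # It no longer fires with the raised concurrency.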
sky/provision/kubernetes/instance.py CHANGED
@@ -2,7 +2,7 @@
  import copy
  import json
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Callable, Dict, List, Optional
  import uuid
 
  from sky import exceptions
@@ -24,6 +24,8 @@ from sky.utils import ux_utils
 
  POLL_INTERVAL = 2
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
+ _MAX_RETRIES = 3
+ NUM_THREADS = subprocess_utils.get_parallel_threads() * 2
 
  logger = sky_logging.init_logger(__name__)
  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
@@ -304,6 +306,33 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  time.sleep(1)
 
 
+ def _run_function_with_retries(func: Callable,
+ operation_name: str,
+ max_retries: int = _MAX_RETRIES,
+ retry_delay: int = 5) -> Any:
+ """Runs a function with retries on Kubernetes errors.
+
+ Args:
+ func: Function to retry
+ operation_name: Name of the operation for logging
+ max_retries: Maximum number of retry attempts
+ retry_delay: Delay between retries in seconds
+
+ Raises:
+ The last exception encountered if all retries fail.
+ """
+ for attempt in range(max_retries + 1):
+ try:
+ return func()
+ except config_lib.KubernetesError:
+ if attempt < max_retries:
+ logger.warning(f'Failed to {operation_name} - '
+ f'retrying in {retry_delay} seconds.')
+ time.sleep(retry_delay)
+ else:
+ raise
+
+
  def _set_env_vars_in_pods(namespace: str, context: Optional[str],
  new_pods: List):
  """Setting environment variables in pods.
@@ -323,14 +352,27 @@ def _set_env_vars_in_pods(namespace: str, context: Optional[str],
  """
  set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
 
- for new_pod in new_pods:
+ def _set_env_vars_thread(new_pod):
+ pod_name = new_pod.metadata.name
+ logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
+ f'{"-"*20}')
  runner = command_runner.KubernetesCommandRunner(
- ((namespace, context), new_pod.metadata.name))
- rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
- require_outputs=True,
- stream_logs=False)
- _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
- new_pod.metadata.name, rc, stdout)
+ ((namespace, context), pod_name))
+
+ def _run_env_vars_cmd():
+ rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
+ pod_name, rc, stdout)
+
+ _run_function_with_retries(_run_env_vars_cmd,
+ f'set env vars in pod {pod_name}')
+ logger.info(f'{"-"*20}End: Set up env vars in pod {pod_name!r} '
+ f'{"-"*20}')
+
+ subprocess_utils.run_in_parallel(_set_env_vars_thread, new_pods,
+ NUM_THREADS)
 
 
  def _check_user_privilege(namespace: str, context: Optional[str],
@@ -350,23 +392,37 @@ def _check_user_privilege(namespace: str, context: Optional[str],
  ' fi; '
  'fi')
 
- for new_node in new_nodes:
- runner = command_runner.KubernetesCommandRunner(
- ((namespace, context), new_node.metadata.name))
+ # This check needs to run on a per-image basis, so running the check on
+ # any one pod is sufficient.
+ new_node = new_nodes[0]
+ pod_name = new_node.metadata.name
+
+ runner = command_runner.KubernetesCommandRunner(
+ ((namespace, context), pod_name))
+ logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
+ f'{"-"*20}')
+
+ def _run_privilege_check():
  rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
  require_outputs=True,
  separate_stderr=True,
  stream_logs=False)
  _raise_command_running_error('check user privilege',
- check_k8s_user_sudo_cmd,
- new_node.metadata.name, rc,
+ check_k8s_user_sudo_cmd, pod_name, rc,
  stdout + stderr)
- if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
- raise config_lib.KubernetesError(
- 'Insufficient system privileges detected. '
- 'Ensure the default user has root access or '
- '"sudo" is installed and the user is added to the sudoers '
- 'from the image.')
+ return stdout
+
+ stdout = _run_function_with_retries(
+ _run_privilege_check, f'check user privilege in pod {pod_name!r}')
+
+ if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
+ raise config_lib.KubernetesError(
+ 'Insufficient system privileges detected. '
+ 'Ensure the default user has root access or '
+ '"sudo" is installed and the user is added to the sudoers '
+ 'from the image.')
+ logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
+ f'{"-"*20}')
 
 
  def _setup_ssh_in_pods(namespace: str, context: Optional[str],
@@ -405,14 +461,19 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
  runner = command_runner.KubernetesCommandRunner(
  ((namespace, context), pod_name))
  logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
- rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
- require_outputs=True,
- stream_logs=False)
- _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name, rc,
- stdout)
+
+ def _run_ssh_setup():
+ rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name,
+ rc, stdout)
+
+ _run_function_with_retries(_run_ssh_setup,
+ f'setup ssh in pod {pod_name!r}')
  logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
 
- subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes)
+ subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes, NUM_THREADS)
 
 
  def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -765,12 +826,17 @@ def terminate_instances(
  def _is_head(pod) -> bool:
  return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
 
- for pod_name, pod in pods.items():
- logger.debug(f'Terminating instance {pod_name}: {pod}')
+ def _terminate_pod_thread(pod_info):
+ pod_name, pod = pod_info
  if _is_head(pod) and worker_only:
- continue
+ return
+ logger.debug(f'Terminating instance {pod_name}: {pod}')
  _terminate_node(namespace, context, pod_name)
 
+ # Run pod termination in parallel
+ subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
+ NUM_THREADS)
+
 
 
  def get_cluster_info(
sky/provision/lambda_cloud/lambda_utils.py CHANGED
@@ -82,7 +82,7 @@ def raise_lambda_error(response: requests.Response) -> None:
  if status_code == 200:
  return
  if status_code == 429:
- # https://docs.lambdalabs.com/cloud/rate-limiting/
+ # https://docs.lambdalabs.com/public-cloud/cloud-api/
  raise LambdaCloudError('Your API requests are being rate limited.')
  try:
  resp_json = response.json()
@@ -145,7 +145,7 @@ class LambdaCloudClient:
  # Most API requests are rate limited at ~1 request every second but
  # launch requests are rate limited at ~1 request every 10 seconds.
  # So don't use launch requests to check availability.
- # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
+ # See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
  available_regions = (self.list_catalog()[instance_type]
  ['regions_with_capacity_available'])
  available_regions = [reg['name'] for reg in available_regions]
sky/skylet/job_lib.py CHANGED
@@ -181,14 +181,19 @@ class JobScheduler:
  subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)
 
  def schedule_step(self, force_update_jobs: bool = False) -> None:
- jobs = self._get_jobs()
- if len(jobs) > 0 or force_update_jobs:
+ if force_update_jobs:
  update_status()
+ pending_jobs = self._get_pending_jobs()
  # TODO(zhwu, mraheja): One optimization can be allowing more than one
  # job staying in the pending state after ray job submit, so that to be
  # faster to schedule a large amount of jobs.
- for job_id, run_cmd, submit, created_time in jobs:
+ for job_id, run_cmd, submit, created_time in pending_jobs:
  with filelock.FileLock(_get_lock_path(job_id)):
+ # We don't have to refresh the job status before checking, as
+ # the job status will only be stale in rare cases where ray job
+ # crashes; or the job stays in INIT state for a long time.
+ # In those cases, the periodic JobSchedulerEvent event will
+ # update the job status every 300 seconds.
  status = get_status_no_lock(job_id)
  if (status not in _PRE_RESOURCE_STATUSES or
  created_time < psutil.boot_time()):
@@ -202,7 +207,7 @@ class JobScheduler:
  self._run_job(job_id, run_cmd)
  return
 
- def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
+ def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
  """Returns the metadata for jobs in the pending jobs table
 
  The information contains job_id, run command, submit time,
@@ -214,7 +219,7 @@ class JobScheduler:
  class FIFOScheduler(JobScheduler):
  """First in first out job scheduler"""
 
- def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
+ def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
  return list(
  _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
 
@@ -534,25 +539,13 @@ def update_job_status(job_ids: List[int],
 
  This function should only be run on the remote instance with ray>=2.4.0.
  """
+ echo = logger.info if not silent else logger.debug
  if len(job_ids) == 0:
  return []
 
- # TODO: if too slow, directly query against redis.
  ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids]
-
  job_client = _create_ray_job_submission_client()
 
- # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
- # which contains the job status (str) and submission_id (str).
- ray_job_query_time = time.time()
- job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
-
- job_details = {}
- ray_job_ids_set = set(ray_job_ids)
- for job_detail in job_detail_lists:
- if job_detail.submission_id in ray_job_ids_set:
- job_details[job_detail.submission_id] = job_detail
-
  statuses = []
  for job_id, ray_job_id in zip(job_ids, ray_job_ids):
  # Per-job status lock is required because between the job status
@@ -560,15 +553,48 @@ def update_job_status(job_ids: List[int],
  # can be modified by the generated ray program.
  with filelock.FileLock(_get_lock_path(job_id)):
  status = None
- if ray_job_id in job_details:
- ray_status = job_details[ray_job_id].status
- status = _RAY_TO_JOB_STATUS_MAP[ray_status]
+ job_record = _get_jobs_by_ids([job_id])[0]
+ original_status = job_record['status']
+ job_submitted_at = job_record['submitted_at']
+
+ ray_job_query_time = time.time()
+ if original_status == JobStatus.INIT:
+ if (job_submitted_at >= psutil.boot_time() and job_submitted_at
+ >= ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
+ # The job id is reserved, but the job is not submitted yet.
+ # We should keep it in INIT.
+ status = JobStatus.INIT
+ else:
+ # We always immediately submit job after the job id is
+ # allocated, i.e. INIT -> PENDING, if a job stays in INIT
+ # for too long, it is likely the job submission process
+ # was killed before the job is submitted. We should set it
+ # to FAILED then. Note, if ray job indicates the job is
+ # running, we will change status to PENDING below.
+ echo(f'INIT job {job_id} is stale, setting to FAILED')
+ status = JobStatus.FAILED
+
+ try:
+ # Querying status within the lock is safer than querying
+ # outside, as it avoids the race condition when job table is
+ # updated after the ray job status query.
+ # Also, getting per-job status is faster than querying all jobs,
+ # when there are significant number of finished jobs.
+ # Reference: getting 124 finished jobs takes 0.038s, while
+ # querying a single job takes 0.006s, 10 jobs takes 0.066s.
+ # TODO: if too slow, directly query against redis.
+ ray_job_status = job_client.get_job_status(ray_job_id)
+ status = _RAY_TO_JOB_STATUS_MAP[ray_job_status.value]
+ except RuntimeError:
+ # Job not found.
+ pass
+
  pending_job = _get_pending_job(job_id)
  if pending_job is not None:
  if pending_job['created_time'] < psutil.boot_time():
- logger.info(f'Job {job_id} is stale, setting to FAILED: '
- f'created_time={pending_job["created_time"]}, '
- f'boot_time={psutil.boot_time()}')
+ echo(f'Job {job_id} is stale, setting to FAILED: '
+ f'created_time={pending_job["created_time"]}, '
+ f'boot_time={psutil.boot_time()}')
  # The job is stale as it is created before the instance
  # is booted, e.g. the instance is rebooted.
  status = JobStatus.FAILED
@@ -583,22 +609,20 @@ def update_job_status(job_ids: List[int],
  # as stale.
  status = JobStatus.PENDING
 
- original_status = get_status_no_lock(job_id)
  assert original_status is not None, (job_id, status)
  if status is None:
  status = original_status
  if (original_status is not None and
  not original_status.is_terminal()):
- logger.info(f'Ray job status for job {job_id} is None, '
- 'setting it to FAILED.')
+ echo(f'Ray job status for job {job_id} is None, '
+ 'setting it to FAILED.')
  # The job may be stale, when the instance is restarted
  # (the ray redis is volatile). We need to reset the
  # status of the task to FAILED if its original status
  # is RUNNING or PENDING.
  status = JobStatus.FAILED
  _set_status_no_lock(job_id, status)
- if not silent:
- logger.info(f'Updated job {job_id} status to {status}')
+ echo(f'Updated job {job_id} status to {status}')
  else:
  # Taking max of the status is necessary because:
  # 1. It avoids race condition, where the original status has
@@ -611,10 +635,10 @@ def update_job_status(job_ids: List[int],
  # DB) would already have that value. So we take the max here to
  # keep it at later status.
  status = max(status, original_status)
+ assert status is not None, (job_id, status, original_status)
  if status != original_status: # Prevents redundant update.
  _set_status_no_lock(job_id, status)
- if not silent:
- logger.info(f'Updated job {job_id} status to {status}')
+ echo(f'Updated job {job_id} status to {status}')
  statuses.append(status)
  return statuses
sky/skylet/providers/scp/node_provider.py CHANGED
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
  metadata['tags'] = instance_info['tags']
  # TODO(ewzeng): The internal ip is hard to get, so set it to the
  # external ip as a hack. This should be changed in the future.
- # https://docs.lambdalabs.com/cloud/learn-private-ip-address/
+ # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
  metadata['internal_ip'] = vm['ip']
  metadata['external_ip'] = vm['external_ip']
  return metadata
sky/utils/command_runner.py CHANGED
@@ -237,6 +237,23 @@ class CommandRunner:
  rsync_command.append(prefix_command)
  rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]
 
+ def _get_remote_home_dir_with_retry():
+ backoff = common_utils.Backoff(initial_backoff=1,
+ max_backoff_factor=5)
+ retries_left = max_retry
+ assert retries_left > 0, f'max_retry {max_retry} must be positive.'
+ while retries_left >= 0:
+ try:
+ return get_remote_home_dir()
+ except Exception: # pylint: disable=broad-except
+ if retries_left == 0:
+ raise
+ sleep_time = backoff.current_backoff()
+ logger.warning(f'Failed to get remote home dir '
+ f'- retrying in {sleep_time} seconds.')
+ retries_left -= 1
+ time.sleep(sleep_time)
+
  # --filter
  # The source is a local path, so we need to resolve it.
  resolved_source = pathlib.Path(source).expanduser().resolve()
@@ -261,7 +278,7 @@ class CommandRunner:
  if up:
  resolved_target = target
  if target.startswith('~'):
- remote_home_dir = get_remote_home_dir()
+ remote_home_dir = _get_remote_home_dir_with_retry()
  resolved_target = target.replace('~', remote_home_dir)
  full_source_str = str(resolved_source)
  if resolved_source.is_dir():
@@ -273,7 +290,7 @@ class CommandRunner:
  else:
  resolved_source = source
  if source.startswith('~'):
- remote_home_dir = get_remote_home_dir()
+ remote_home_dir = _get_remote_home_dir_with_retry()
  resolved_source = source.replace('~', remote_home_dir)
  rsync_command.extend([
  f'{node_destination}:{resolved_source!r}',
@@ -656,6 +673,8 @@ class SSHCommandRunner(CommandRunner):
  class KubernetesCommandRunner(CommandRunner):
  """Runner for Kubernetes commands."""
 
+ _MAX_RETRIES_FOR_RSYNC = 3
+
  def __init__(
  self,
  node: Tuple[Tuple[str, Optional[str]], str],
@@ -798,7 +817,7 @@ class KubernetesCommandRunner(CommandRunner):
  # Advanced options.
  log_path: str = os.devnull,
  stream_logs: bool = True,
- max_retry: int = 1,
+ max_retry: int = _MAX_RETRIES_FOR_RSYNC,
  ) -> None:
  """Uses 'rsync' to sync 'source' to 'target'.
 
sky/utils/subprocess_utils.py CHANGED
@@ -50,17 +50,27 @@ def get_parallel_threads() -> int:
  return max(4, cpu_count - 1)
 
 
- def run_in_parallel(func: Callable, args: Iterable[Any]) -> List[Any]:
+ def run_in_parallel(func: Callable,
+ args: Iterable[Any],
+ num_threads: Optional[int] = None) -> List[Any]:
  """Run a function in parallel on a list of arguments.
 
  The function 'func' should raise a CommandError if the command fails.
 
+ Args:
+ func: The function to run in parallel
+ args: Iterable of arguments to pass to func
+ num_threads: Number of threads to use. If None, uses
+ get_parallel_threads()
+
  Returns:
  A list of the return values of the function func, in the same order as the
  arguments.
  """
  # Reference: https://stackoverflow.com/questions/25790279/python-multiprocessing-early-termination # pylint: disable=line-too-long
- with pool.ThreadPool(processes=get_parallel_threads()) as p:
+ processes = num_threads if num_threads is not None else get_parallel_threads(
+ )
+ with pool.ThreadPool(processes=processes) as p:
  # Run the function in parallel on the arguments, keeping the order.
  return list(p.imap(func, args))
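
A short usage sketch of the extended run_in_parallel signature; the worker function and pod names below are placeholders:

from sky.utils import subprocess_utils


def _label_one_pod(pod_name: str) -> str:
    # Placeholder for real per-pod work (e.g. the pod setup helpers above).
    return pod_name


# Default behaviour is unchanged: omit num_threads to use get_parallel_threads().
subprocess_utils.run_in_parallel(_label_one_pod, ['pod-0', 'pod-1'])

# I/O-bound callers can widen the pool, as the Kubernetes provisioner now does
# with NUM_THREADS = subprocess_utils.get_parallel_threads() * 2.
subprocess_utils.run_in_parallel(
    _label_one_pod, ['pod-0', 'pod-1'],
    num_threads=subprocess_utils.get_parallel_threads() * 2)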
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: skypilot-nightly
- Version: 1.0.0.dev20241105
+ Version: 1.0.0.dev20241107
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
@@ -1,4 +1,4 @@
- sky/__init__.py,sha256=KPG1pZjMsW9mwIrklMgLfKnzG4EXAac9U_tEhB9VUrE,5882
+ sky/__init__.py,sha256=LDYVc006Bm6m_yCUJiTKF3oPp3_O3ODjp1KhoU5meCE,5882
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
  sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
- sky/backends/cloud_vm_ray_backend.py,sha256=jdG17FDAOUoHjXib2P73Hhdl9yXoDJxPTY5Dyqvp6j4,232757
+ sky/backends/cloud_vm_ray_backend.py,sha256=jlX1atSF4L31ZMzC_tnBaWnxvc2Wb8DRwt5G_ukrlJk,232799
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
  sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -107,7 +107,7 @@ sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,626
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
  sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
  sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
- sky/provision/instance_setup.py,sha256=n1Px_KOYZl7Rf1WLXrfTTHyqxyA8_5QTN9BNLjQRkgc,22427
+ sky/provision/instance_setup.py,sha256=c6i_NC6GrW4hXAQIU5_dUBbnThjZQNS3cL2M6yMtzes,23616
  sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
  sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
  sky/provision/provisioner.py,sha256=mTvtBjS-Xz64LJcyeHx_-wdM8Gin8D49YRaV_TADaz4,25334
@@ -137,7 +137,7 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
  sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
  sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
  sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
- sky/provision/kubernetes/instance.py,sha256=1dN2vdh-ZdeIe39ZxH5DAnnc8kXHWpzD6q-f14-8cDE,41576
+ sky/provision/kubernetes/instance.py,sha256=rY43hZOInP20kYofW0MGs7wDbJ4NxMw1FtKAJAPGIOU,43960
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
  sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
  sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
@@ -146,7 +146,7 @@ sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=
  sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
  sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
  sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
- sky/provision/lambda_cloud/lambda_utils.py,sha256=H8uaaMEpLn5cqGCdhUH_oJiccv_cuMguUNAl0NqB0Ik,9873
+ sky/provision/lambda_cloud/lambda_utils.py,sha256=wIXV1Qe362f8Q9u8DSx2e9IJs4CF03Jr3idHCzhlRz4,9879
  sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
  sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
  sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
  sky/skylet/constants.py,sha256=TL-O0ZoxA1ZeNvKXzzA_UyIMXsma7flbsDZ1N_o9dKg,14468
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
- sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
+ sky/skylet/job_lib.py,sha256=-SCbpJRiWMSwvhDjUwfwnvBap7Y5B3ol1l_PDPra3XI,36860
  sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
  sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -209,7 +209,7 @@ sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gM
  sky/skylet/providers/oci/utils.py,sha256=lCpdklxgSwK-hqErTicpIe_xkpSlIc8u943C-9_MJfU,508
  sky/skylet/providers/scp/__init__.py,sha256=15SiAh1YphXkZsHySaw_CeAmXRdoM4JtNIAt7SLbUvg,91
  sky/skylet/providers/scp/config.py,sha256=lhMXyG9btMlg59nmvtnMdIDN07jBbQOheAx-bHbGbhw,5077
- sky/skylet/providers/scp/node_provider.py,sha256=5HjFEGqKAqVcszEpcan_IzY9NKVMQdm2BUgGdfw9aUY,22411
+ sky/skylet/providers/scp/node_provider.py,sha256=W5J-170JVIpwT9Fv20fJ_PpdAVsqx9pigE-RkkG_kQE,22459
  sky/skylet/ray_patches/__init__.py,sha256=IoCzj9zFaHW-n__bLD8dgC2pJMTfZRxRpr8rZGvMyrw,2761
  sky/skylet/ray_patches/autoscaler.py.patch,sha256=cZK15L29aay-qx6JoGVPNsPIo3UiG0bAHh8fqfFd-44,291
  sky/skylet/ray_patches/cli.py.patch,sha256=ooEAr3OfA6LN7v4iaNltY6w63TaOFssgw9iKWP49iJc,349
@@ -245,7 +245,7 @@ sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
  sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO9U,5943
  sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
- sky/utils/command_runner.py,sha256=3CDcqRXEmoe3C-t2P58McgcRg6p9m5haUWYj1rOLuqM,34858
+ sky/utils/command_runner.py,sha256=seU7uX9CrxiC8WOWBKHW94m67-V6DYghqRXhYdUIdQI,35756
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
  sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
  sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
@@ -257,7 +257,7 @@ sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
  sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
  sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
  sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
- sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
+ sky/utils/subprocess_utils.py,sha256=mMFCTfxbyav5LJ1epJJXkgfFYmd828naTOMVfYjuEWY,6905
  sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
  sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
  sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
- skypilot_nightly-1.0.0.dev20241105.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20241105.dist-info/METADATA,sha256=9ZfrVo53ijjnGcFaUYwHlYlHq2Hy7NlHs715qguqkSU,19708
- skypilot_nightly-1.0.0.dev20241105.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
- skypilot_nightly-1.0.0.dev20241105.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20241105.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20241105.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20241107.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20241107.dist-info/METADATA,sha256=ICnKtcpMVvZVf_1H6k63r29XgS_-heZ4BcgH-p5J5s4,19708
+ skypilot_nightly-1.0.0.dev20241107.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ skypilot_nightly-1.0.0.dev20241107.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20241107.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20241107.dist-info/RECORD,,