skypilot-nightly 1.0.0.dev20241105__py3-none-any.whl → 1.0.0.dev20241107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/provision/instance_setup.py +21 -5
- sky/provision/kubernetes/instance.py +94 -28
- sky/provision/lambda_cloud/lambda_utils.py +2 -2
- sky/skylet/job_lib.py +55 -31
- sky/skylet/providers/scp/node_provider.py +1 -1
- sky/utils/command_runner.py +22 -3
- sky/utils/subprocess_utils.py +12 -2
- {skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/RECORD +15 -15
- {skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'fe2ce9a262c059722ddce46f5594fc2ca2370c0d'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20241107'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -3278,9 +3278,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             f'{cd} && {constants.SKY_RAY_CMD} job submit '
             '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
             f'--submission-id {job_id}-$(whoami) --no-wait '
-
-
-
+            f'"{constants.SKY_PYTHON_CMD} -u {script_path} '
+            # Do not use &>, which is not POSIX and may not work.
+            # Note that the order of ">filename 2>&1" matters.
+            f'> {remote_log_path} 2>&1"')

         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
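The new command avoids the bash-only `&>` and relies on POSIX redirection order instead. A minimal standalone sketch of that behavior (the temp-file paths are illustrative; plain /bin/sh is assumed):

import subprocess

cmd = '{ echo out; echo err >&2; }'

# '>file 2>&1': stdout is pointed at the file first, then stderr is duplicated
# onto stdout, so both streams land in the file.
subprocess.run(f'{cmd} > /tmp/both.log 2>&1', shell=True, executable='/bin/sh')
print(open('/tmp/both.log').read())      # 'out' and 'err'

# '2>&1 >file': stderr is duplicated onto the original stdout (the terminal)
# before stdout is redirected, so 'err' never reaches the file.
subprocess.run(f'{cmd} 2>&1 > /tmp/only_out.log', shell=True, executable='/bin/sh')
print(open('/tmp/only_out.log').read())  # only 'out'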
sky/provision/instance_setup.py
CHANGED
@@ -283,11 +283,27 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
     # the same credentials. Otherwise, `ray status` will fail to fetch the
     # available nodes.
     # Reference: https://github.com/skypilot-org/skypilot/issues/2441
-    cmd = (
-
-
-
-
+    cmd = (
+        f'{constants.SKY_RAY_CMD} stop; '
+        'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
+        'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
+        # worker_maximum_startup_concurrency controls the maximum number of
+        # workers that can be started concurrently. However, it also controls
+        # this warning message:
+        # https://github.com/ray-project/ray/blob/d5d03e6e24ae3cfafb87637ade795fb1480636e6/src/ray/raylet/worker_pool.cc#L1535-L1545
+        # maximum_startup_concurrency defaults to the number of CPUs given by
+        # multiprocessing.cpu_count() or manually specified to ray. (See
+        # https://github.com/ray-project/ray/blob/fab26e1813779eb568acba01281c6dd963c13635/python/ray/_private/services.py#L1622-L1624.)
+        # The warning will show when the number of workers is >4x the
+        # maximum_startup_concurrency, so typically 4x CPU count. However, the
+        # job controller uses 0.25cpu reservations, and each job can use two
+        # workers (one for the submitted job and one for remote actors),
+        # resulting in a worker count of 8x CPUs or more. Increase the
+        # worker_maximum_startup_concurrency to 3x CPUs so that we will only see
+        # the warning when the worker count is >12x CPUs.
+        'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
+        f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
+        _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
     logger.info(f'Running command on head node: {cmd}')
     # TODO(zhwu): add the output to log files.
     returncode, stdout, stderr = head_runner.run(
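The long comment above boils down to simple arithmetic about Ray's startup-concurrency warning. A minimal sketch of that arithmetic (variable names here are illustrative only):

import multiprocessing

cpus = multiprocessing.cpu_count()

# Ray warns when the worker count exceeds 4x maximum_startup_concurrency,
# which defaults to the CPU count.
default_warn_threshold = 4 * cpus

# With 0.25-CPU job reservations and up to two workers per job, the worker
# count can reach roughly 8x the CPU count, tripping the default threshold.
worst_case_workers = int(cpus / 0.25) * 2  # == 8 * cpus

# Setting RAY_worker_maximum_startup_concurrency to 3x CPUs moves the warning
# threshold to 12x CPUs, comfortably above the expected worst case.
raised_warn_threshold = 4 * (3 * cpus)

print(default_warn_threshold, worst_case_workers, raised_warn_threshold)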
sky/provision/kubernetes/instance.py
CHANGED
@@ -2,7 +2,7 @@
 import copy
 import json
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 import uuid

 from sky import exceptions
@@ -24,6 +24,8 @@ from sky.utils import ux_utils

 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
+_MAX_RETRIES = 3
+NUM_THREADS = subprocess_utils.get_parallel_threads() * 2

 logger = sky_logging.init_logger(__name__)
 TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
@@ -304,6 +306,33 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
         time.sleep(1)


+def _run_function_with_retries(func: Callable,
+                               operation_name: str,
+                               max_retries: int = _MAX_RETRIES,
+                               retry_delay: int = 5) -> Any:
+    """Runs a function with retries on Kubernetes errors.
+
+    Args:
+        func: Function to retry
+        operation_name: Name of the operation for logging
+        max_retries: Maximum number of retry attempts
+        retry_delay: Delay between retries in seconds
+
+    Raises:
+        The last exception encountered if all retries fail.
+    """
+    for attempt in range(max_retries + 1):
+        try:
+            return func()
+        except config_lib.KubernetesError:
+            if attempt < max_retries:
+                logger.warning(f'Failed to {operation_name} - '
+                               f'retrying in {retry_delay} seconds.')
+                time.sleep(retry_delay)
+            else:
+                raise
+
+
 def _set_env_vars_in_pods(namespace: str, context: Optional[str],
                           new_pods: List):
     """Setting environment variables in pods.
@@ -323,14 +352,27 @@ def _set_env_vars_in_pods(namespace: str, context: Optional[str],
     """
     set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD

-
+    def _set_env_vars_thread(new_pod):
+        pod_name = new_pod.metadata.name
+        logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
+                    f'{"-"*20}')
         runner = command_runner.KubernetesCommandRunner(
-            ((namespace, context),
-
-
-
-
-
+            ((namespace, context), pod_name))
+
+        def _run_env_vars_cmd():
+            rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
+                                       require_outputs=True,
+                                       stream_logs=False)
+            _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
+                                         pod_name, rc, stdout)
+
+        _run_function_with_retries(_run_env_vars_cmd,
+                                   f'set env vars in pod {pod_name}')
+        logger.info(f'{"-"*20}End: Set up env vars in pod {pod_name!r} '
+                    f'{"-"*20}')
+
+    subprocess_utils.run_in_parallel(_set_env_vars_thread, new_pods,
+                                     NUM_THREADS)


 def _check_user_privilege(namespace: str, context: Optional[str],
@@ -350,23 +392,37 @@ def _check_user_privilege(namespace: str, context: Optional[str],
         ' fi; '
         'fi')

-
-
-
+    # This check needs to run on a per-image basis, so running the check on
+    # any one pod is sufficient.
+    new_node = new_nodes[0]
+    pod_name = new_node.metadata.name
+
+    runner = command_runner.KubernetesCommandRunner(
+        ((namespace, context), pod_name))
+    logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
+                f'{"-"*20}')
+
+    def _run_privilege_check():
         rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
                                         require_outputs=True,
                                         separate_stderr=True,
                                         stream_logs=False)
         _raise_command_running_error('check user privilege',
-                                     check_k8s_user_sudo_cmd,
-                                     new_node.metadata.name, rc,
+                                     check_k8s_user_sudo_cmd, pod_name, rc,
                                      stdout + stderr)
-
-
-
-
-
-
+        return stdout
+
+    stdout = _run_function_with_retries(
+        _run_privilege_check, f'check user privilege in pod {pod_name!r}')
+
+    if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
+        raise config_lib.KubernetesError(
+            'Insufficient system privileges detected. '
+            'Ensure the default user has root access or '
+            '"sudo" is installed and the user is added to the sudoers '
+            'from the image.')
+    logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
+                f'{"-"*20}')


 def _setup_ssh_in_pods(namespace: str, context: Optional[str],
@@ -405,14 +461,19 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
         runner = command_runner.KubernetesCommandRunner(
             ((namespace, context), pod_name))
         logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
-
-
-
-
-
+
+        def _run_ssh_setup():
+            rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
+                                       require_outputs=True,
+                                       stream_logs=False)
+            _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name,
+                                         rc, stdout)
+
+        _run_function_with_retries(_run_ssh_setup,
+                                   f'setup ssh in pod {pod_name!r}')
         logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')

-    subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes)
+    subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes, NUM_THREADS)


 def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -765,12 +826,17 @@ def terminate_instances(
     def _is_head(pod) -> bool:
         return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'

-
-
+    def _terminate_pod_thread(pod_info):
+        pod_name, pod = pod_info
         if _is_head(pod) and worker_only:
-
+            return
+        logger.debug(f'Terminating instance {pod_name}: {pod}')
         _terminate_node(namespace, context, pod_name)

+    # Run pod termination in parallel
+    subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
+                                     NUM_THREADS)
+

 def get_cluster_info(
     region: str,
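The hunks above combine two patterns: wrap each per-pod setup step in a retry helper, then fan the per-pod work out over a thread pool. A minimal, self-contained sketch of that control flow (the KubernetesError class and pod names below are stand-ins for the config_lib types and pod objects in the diff, not sky's real ones):

import time
from multiprocessing import pool
from typing import Any, Callable


class KubernetesError(Exception):
    """Stand-in for the config_lib.KubernetesError caught in the diff."""


def run_with_retries(func: Callable[[], Any], operation_name: str,
                     max_retries: int = 3, retry_delay: int = 5) -> Any:
    # Retry only the Kubernetes-specific error; re-raise after the last attempt.
    for attempt in range(max_retries + 1):
        try:
            return func()
        except KubernetesError:
            if attempt < max_retries:
                print(f'Failed to {operation_name} - retrying in {retry_delay}s.')
                time.sleep(retry_delay)
            else:
                raise


def setup_pod(pod_name: str) -> str:
    # Each per-pod step becomes a closure handed to the retry helper.
    return run_with_retries(lambda: f'{pod_name}: ok', f'set up pod {pod_name}')


pods = ['head', 'worker-0', 'worker-1']
with pool.ThreadPool(processes=2) as p:  # NUM_THREADS in the real module
    print(list(p.imap(setup_pod, pods)))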
sky/provision/lambda_cloud/lambda_utils.py
CHANGED
@@ -82,7 +82,7 @@ def raise_lambda_error(response: requests.Response) -> None:
     if status_code == 200:
         return
     if status_code == 429:
-        # https://docs.lambdalabs.com/cloud/
+        # https://docs.lambdalabs.com/public-cloud/cloud-api/
         raise LambdaCloudError('Your API requests are being rate limited.')
     try:
         resp_json = response.json()
@@ -145,7 +145,7 @@ class LambdaCloudClient:
         # Most API requests are rate limited at ~1 request every second but
         # launch requests are rate limited at ~1 request every 10 seconds.
         # So don't use launch requests to check availability.
-        # See https://docs.lambdalabs.com/cloud/
+        # See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
         available_regions = (self.list_catalog()[instance_type]
                              ['regions_with_capacity_available'])
         available_regions = [reg['name'] for reg in available_regions]
sky/skylet/job_lib.py
CHANGED
@@ -181,14 +181,19 @@ class JobScheduler:
         subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)

     def schedule_step(self, force_update_jobs: bool = False) -> None:
-
-        if len(jobs) > 0 or force_update_jobs:
+        if force_update_jobs:
             update_status()
+        pending_jobs = self._get_pending_jobs()
         # TODO(zhwu, mraheja): One optimization can be allowing more than one
         # job staying in the pending state after ray job submit, so that to be
         # faster to schedule a large amount of jobs.
-        for job_id, run_cmd, submit, created_time in
+        for job_id, run_cmd, submit, created_time in pending_jobs:
             with filelock.FileLock(_get_lock_path(job_id)):
+                # We don't have to refresh the job status before checking, as
+                # the job status will only be stale in rare cases where ray job
+                # crashes; or the job stays in INIT state for a long time.
+                # In those cases, the periodic JobSchedulerEvent event will
+                # update the job status every 300 seconds.
                 status = get_status_no_lock(job_id)
                 if (status not in _PRE_RESOURCE_STATUSES or
                         created_time < psutil.boot_time()):
@@ -202,7 +207,7 @@ class JobScheduler:
                 self._run_job(job_id, run_cmd)
                 return

-    def
+    def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
         """Returns the metadata for jobs in the pending jobs table

         The information contains job_id, run command, submit time,
@@ -214,7 +219,7 @@ class JobScheduler:
 class FIFOScheduler(JobScheduler):
     """First in first out job scheduler"""

-    def
+    def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
         return list(
             _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))

@@ -534,25 +539,13 @@ def update_job_status(job_ids: List[int],

     This function should only be run on the remote instance with ray>=2.4.0.
     """
+    echo = logger.info if not silent else logger.debug
     if len(job_ids) == 0:
         return []

-    # TODO: if too slow, directly query against redis.
     ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids]
-
     job_client = _create_ray_job_submission_client()

-    # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
-    # which contains the job status (str) and submission_id (str).
-    ray_job_query_time = time.time()
-    job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
-
-    job_details = {}
-    ray_job_ids_set = set(ray_job_ids)
-    for job_detail in job_detail_lists:
-        if job_detail.submission_id in ray_job_ids_set:
-            job_details[job_detail.submission_id] = job_detail
-
     statuses = []
     for job_id, ray_job_id in zip(job_ids, ray_job_ids):
         # Per-job status lock is required because between the job status
@@ -560,15 +553,48 @@ def update_job_status(job_ids: List[int],
         # can be modified by the generated ray program.
         with filelock.FileLock(_get_lock_path(job_id)):
             status = None
-
-
-
+            job_record = _get_jobs_by_ids([job_id])[0]
+            original_status = job_record['status']
+            job_submitted_at = job_record['submitted_at']
+
+            ray_job_query_time = time.time()
+            if original_status == JobStatus.INIT:
+                if (job_submitted_at >= psutil.boot_time() and job_submitted_at
+                        >= ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
+                    # The job id is reserved, but the job is not submitted yet.
+                    # We should keep it in INIT.
+                    status = JobStatus.INIT
+                else:
+                    # We always immediately submit job after the job id is
+                    # allocated, i.e. INIT -> PENDING, if a job stays in INIT
+                    # for too long, it is likely the job submission process
+                    # was killed before the job is submitted. We should set it
+                    # to FAILED then. Note, if ray job indicates the job is
+                    # running, we will change status to PENDING below.
+                    echo(f'INIT job {job_id} is stale, setting to FAILED')
+                    status = JobStatus.FAILED
+
+            try:
+                # Querying status within the lock is safer than querying
+                # outside, as it avoids the race condition when job table is
+                # updated after the ray job status query.
+                # Also, getting per-job status is faster than querying all jobs,
+                # when there are significant number of finished jobs.
+                # Reference: getting 124 finished jobs takes 0.038s, while
+                # querying a single job takes 0.006s, 10 jobs takes 0.066s.
+                # TODO: if too slow, directly query against redis.
+                ray_job_status = job_client.get_job_status(ray_job_id)
+                status = _RAY_TO_JOB_STATUS_MAP[ray_job_status.value]
+            except RuntimeError:
+                # Job not found.
+                pass
+
             pending_job = _get_pending_job(job_id)
             if pending_job is not None:
                 if pending_job['created_time'] < psutil.boot_time():
-
-
-
+                    echo(f'Job {job_id} is stale, setting to FAILED: '
+                         f'created_time={pending_job["created_time"]}, '
+                         f'boot_time={psutil.boot_time()}')
                     # The job is stale as it is created before the instance
                     # is booted, e.g. the instance is rebooted.
                     status = JobStatus.FAILED
@@ -583,22 +609,20 @@ def update_job_status(job_ids: List[int],
                     # as stale.
                     status = JobStatus.PENDING

-            original_status = get_status_no_lock(job_id)
             assert original_status is not None, (job_id, status)
             if status is None:
                 status = original_status
                 if (original_status is not None and
                         not original_status.is_terminal()):
-
-
+                    echo(f'Ray job status for job {job_id} is None, '
+                         'setting it to FAILED.')
                     # The job may be stale, when the instance is restarted
                     # (the ray redis is volatile). We need to reset the
                     # status of the task to FAILED if its original status
                     # is RUNNING or PENDING.
                     status = JobStatus.FAILED
                     _set_status_no_lock(job_id, status)
-
-                    logger.info(f'Updated job {job_id} status to {status}')
+                    echo(f'Updated job {job_id} status to {status}')
             else:
                 # Taking max of the status is necessary because:
                 # 1. It avoids race condition, where the original status has
@@ -611,10 +635,10 @@ def update_job_status(job_ids: List[int],
                 # DB) would already have that value. So we take the max here to
                 # keep it at later status.
                 status = max(status, original_status)
+            assert status is not None, (job_id, status, original_status)
             if status != original_status:  # Prevents redundant update.
                 _set_status_no_lock(job_id, status)
-
-                logger.info(f'Updated job {job_id} status to {status}')
+                echo(f'Updated job {job_id} status to {status}')
         statuses.append(status)
     return statuses

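A minimal sketch of the INIT-staleness decision added above. The condition mirrors the diff; _PENDING_SUBMIT_GRACE_PERIOD is the constant referenced there, but the value used here is illustrative, not job_lib's:

import time
import psutil

_PENDING_SUBMIT_GRACE_PERIOD = 60  # seconds; illustrative value only


def init_job_is_stale(job_submitted_at: float, query_time: float) -> bool:
    # A job submitted after the last boot and within the grace period is just
    # waiting to be handed to ray and stays INIT; otherwise the submission
    # process likely died and the job should be marked FAILED.
    still_pending_submit = (
        job_submitted_at >= psutil.boot_time() and
        job_submitted_at >= query_time - _PENDING_SUBMIT_GRACE_PERIOD)
    return not still_pending_submit


now = time.time()
print(init_job_is_stale(now - 5, now))    # False: keep INIT
print(init_job_is_stale(now - 600, now))  # True: mark FAILED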
sky/skylet/providers/scp/node_provider.py
CHANGED
@@ -180,7 +180,7 @@ class SCPNodeProvider(NodeProvider):
         metadata['tags'] = instance_info['tags']
         # TODO(ewzeng): The internal ip is hard to get, so set it to the
         # external ip as a hack. This should be changed in the future.
-        # https://docs.lambdalabs.com/cloud/learn-private-ip-address
+        # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
         metadata['internal_ip'] = vm['ip']
         metadata['external_ip'] = vm['external_ip']
         return metadata
sky/utils/command_runner.py
CHANGED
@@ -237,6 +237,23 @@ class CommandRunner:
             rsync_command.append(prefix_command)
         rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]

+        def _get_remote_home_dir_with_retry():
+            backoff = common_utils.Backoff(initial_backoff=1,
+                                           max_backoff_factor=5)
+            retries_left = max_retry
+            assert retries_left > 0, f'max_retry {max_retry} must be positive.'
+            while retries_left >= 0:
+                try:
+                    return get_remote_home_dir()
+                except Exception:  # pylint: disable=broad-except
+                    if retries_left == 0:
+                        raise
+                    sleep_time = backoff.current_backoff()
+                    logger.warning(f'Failed to get remote home dir '
+                                   f'- retrying in {sleep_time} seconds.')
+                    retries_left -= 1
+                    time.sleep(sleep_time)
+
         # --filter
         # The source is a local path, so we need to resolve it.
         resolved_source = pathlib.Path(source).expanduser().resolve()
@@ -261,7 +278,7 @@ class CommandRunner:
         if up:
             resolved_target = target
             if target.startswith('~'):
-                remote_home_dir =
+                remote_home_dir = _get_remote_home_dir_with_retry()
                 resolved_target = target.replace('~', remote_home_dir)
             full_source_str = str(resolved_source)
             if resolved_source.is_dir():
@@ -273,7 +290,7 @@ class CommandRunner:
         else:
             resolved_source = source
             if source.startswith('~'):
-                remote_home_dir =
+                remote_home_dir = _get_remote_home_dir_with_retry()
                 resolved_source = source.replace('~', remote_home_dir)
             rsync_command.extend([
                 f'{node_destination}:{resolved_source!r}',
@@ -656,6 +673,8 @@ class SSHCommandRunner(CommandRunner):
 class KubernetesCommandRunner(CommandRunner):
     """Runner for Kubernetes commands."""

+    _MAX_RETRIES_FOR_RSYNC = 3
+
     def __init__(
         self,
         node: Tuple[Tuple[str, Optional[str]], str],
@@ -798,7 +817,7 @@ class KubernetesCommandRunner(CommandRunner):
             # Advanced options.
             log_path: str = os.devnull,
             stream_logs: bool = True,
-            max_retry: int =
+            max_retry: int = _MAX_RETRIES_FOR_RSYNC,
     ) -> None:
         """Uses 'rsync' to sync 'source' to 'target'.

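A minimal sketch of the retry-with-backoff wrapper added above. The Backoff class below is a simplified stand-in for sky.utils.common_utils.Backoff (its internals here are assumed, not copied), and only the control flow mirrors the diff:

import time


class Backoff:
    """Simplified exponential backoff, doubling up to a cap."""

    def __init__(self, initial_backoff: float = 1, max_backoff_factor: int = 5):
        self._backoff = initial_backoff
        self._max = initial_backoff * max_backoff_factor

    def current_backoff(self) -> float:
        current = self._backoff
        self._backoff = min(self._backoff * 2, self._max)
        return current


def get_with_retry(get_remote_home_dir, max_retry: int = 3) -> str:
    backoff = Backoff(initial_backoff=1, max_backoff_factor=5)
    retries_left = max_retry
    while retries_left >= 0:
        try:
            return get_remote_home_dir()
        except Exception:  # pylint: disable=broad-except
            if retries_left == 0:
                raise
            sleep_time = backoff.current_backoff()
            print(f'Failed to get remote home dir - retrying in {sleep_time}s.')
            retries_left -= 1
            time.sleep(sleep_time)


print(get_with_retry(lambda: '/home/ubuntu'))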
sky/utils/subprocess_utils.py
CHANGED
@@ -50,17 +50,27 @@ def get_parallel_threads() -> int:
     return max(4, cpu_count - 1)


-def run_in_parallel(func: Callable,
+def run_in_parallel(func: Callable,
+                    args: Iterable[Any],
+                    num_threads: Optional[int] = None) -> List[Any]:
     """Run a function in parallel on a list of arguments.

     The function 'func' should raise a CommandError if the command fails.

+    Args:
+        func: The function to run in parallel
+        args: Iterable of arguments to pass to func
+        num_threads: Number of threads to use. If None, uses
+            get_parallel_threads()
+
     Returns:
         A list of the return values of the function func, in the same order as the
         arguments.
     """
     # Reference: https://stackoverflow.com/questions/25790279/python-multiprocessing-early-termination  # pylint: disable=line-too-long
-
+    processes = num_threads if num_threads is not None else get_parallel_threads(
+    )
+    with pool.ThreadPool(processes=processes) as p:
         # Run the function in parallel on the arguments, keeping the order.
         return list(p.imap(func, args))

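A short usage sketch for the updated run_in_parallel signature: the new optional num_threads argument caps the thread-pool size and falls back to get_parallel_threads() when omitted (the square helper is illustrative):

from sky.utils import subprocess_utils


def square(x: int) -> int:
    return x * x


# Default pool size, i.e. get_parallel_threads().
print(subprocess_utils.run_in_parallel(square, [1, 2, 3]))

# Explicit pool size, as the Kubernetes provisioner now does via NUM_THREADS.
print(subprocess_utils.run_in_parallel(square, [1, 2, 3], num_threads=2))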
{skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=LDYVc006Bm6m_yCUJiTKF3oPp3_O3ODjp1KhoU5meCE,5882
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
 sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
-sky/backends/cloud_vm_ray_backend.py,sha256=
+sky/backends/cloud_vm_ray_backend.py,sha256=jlX1atSF4L31ZMzC_tnBaWnxvc2Wb8DRwt5G_ukrlJk,232799
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
 sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -107,7 +107,7 @@ sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,626
 sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
 sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
 sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
-sky/provision/instance_setup.py,sha256=
+sky/provision/instance_setup.py,sha256=c6i_NC6GrW4hXAQIU5_dUBbnThjZQNS3cL2M6yMtzes,23616
 sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
 sky/provision/provisioner.py,sha256=mTvtBjS-Xz64LJcyeHx_-wdM8Gin8D49YRaV_TADaz4,25334
@@ -137,7 +137,7 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
 sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
 sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
 sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
-sky/provision/kubernetes/instance.py,sha256=
+sky/provision/kubernetes/instance.py,sha256=rY43hZOInP20kYofW0MGs7wDbJ4NxMw1FtKAJAPGIOU,43960
 sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
 sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
 sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
@@ -146,7 +146,7 @@ sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
 sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
 sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
-sky/provision/lambda_cloud/lambda_utils.py,sha256=
+sky/provision/lambda_cloud/lambda_utils.py,sha256=wIXV1Qe362f8Q9u8DSx2e9IJs4CF03Jr3idHCzhlRz4,9879
 sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
 sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
 sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
 sky/skylet/constants.py,sha256=TL-O0ZoxA1ZeNvKXzzA_UyIMXsma7flbsDZ1N_o9dKg,14468
 sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
-sky/skylet/job_lib.py,sha256
+sky/skylet/job_lib.py,sha256=-SCbpJRiWMSwvhDjUwfwnvBap7Y5B3ol1l_PDPra3XI,36860
 sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
 sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
 sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -209,7 +209,7 @@ sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gM
 sky/skylet/providers/oci/utils.py,sha256=lCpdklxgSwK-hqErTicpIe_xkpSlIc8u943C-9_MJfU,508
 sky/skylet/providers/scp/__init__.py,sha256=15SiAh1YphXkZsHySaw_CeAmXRdoM4JtNIAt7SLbUvg,91
 sky/skylet/providers/scp/config.py,sha256=lhMXyG9btMlg59nmvtnMdIDN07jBbQOheAx-bHbGbhw,5077
-sky/skylet/providers/scp/node_provider.py,sha256=
+sky/skylet/providers/scp/node_provider.py,sha256=W5J-170JVIpwT9Fv20fJ_PpdAVsqx9pigE-RkkG_kQE,22459
 sky/skylet/ray_patches/__init__.py,sha256=IoCzj9zFaHW-n__bLD8dgC2pJMTfZRxRpr8rZGvMyrw,2761
 sky/skylet/ray_patches/autoscaler.py.patch,sha256=cZK15L29aay-qx6JoGVPNsPIo3UiG0bAHh8fqfFd-44,291
 sky/skylet/ray_patches/cli.py.patch,sha256=ooEAr3OfA6LN7v4iaNltY6w63TaOFssgw9iKWP49iJc,349
@@ -245,7 +245,7 @@ sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
 sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO9U,5943
 sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
-sky/utils/command_runner.py,sha256=
+sky/utils/command_runner.py,sha256=seU7uX9CrxiC8WOWBKHW94m67-V6DYghqRXhYdUIdQI,35756
 sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
 sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
 sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
@@ -257,7 +257,7 @@ sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
 sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
 sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
 sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
-sky/utils/subprocess_utils.py,sha256=
+sky/utils/subprocess_utils.py,sha256=mMFCTfxbyav5LJ1epJJXkgfFYmd828naTOMVfYjuEWY,6905
 sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
 sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20241107.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241107.dist-info/METADATA,sha256=ICnKtcpMVvZVf_1H6k63r29XgS_-heZ4BcgH-p5J5s4,19708
+skypilot_nightly-1.0.0.dev20241107.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+skypilot_nightly-1.0.0.dev20241107.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241107.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241107.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/LICENSE
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/WHEEL
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/entry_points.txt
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20241105.dist-info → skypilot_nightly-1.0.0.dev20241107.dist-info}/top_level.txt
RENAMED
File without changes