skypilot-nightly 1.0.0.dev20250130__py3-none-any.whl → 1.0.0.dev20250201__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +0 -17
- sky/jobs/controller.py +21 -13
- sky/jobs/core.py +1 -5
- sky/jobs/utils.py +22 -0
- sky/provision/kubernetes/utils.py +72 -5
- sky/serve/core.py +2 -2
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +1 -1
- sky/utils/controller_utils.py +66 -32
- {skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/METADATA +3 -3
- {skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/RECORD +16 -16
- {skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '269dfb19286a79f4f3a233aa525c73f6562dae37'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250130'
|
38
|
+
__version__ = '1.0.0.dev20250201'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -3600,12 +3600,6 @@ def jobs():
|
|
3600
3600
|
default=False,
|
3601
3601
|
required=False,
|
3602
3602
|
help='Skip confirmation prompt.')
|
3603
|
-
# TODO(cooperc): remove this flag before releasing 0.8.0
|
3604
|
-
@click.option('--fast',
|
3605
|
-
default=False,
|
3606
|
-
is_flag=True,
|
3607
|
-
help=('[Deprecated] Does nothing. Previous flag behavior is now '
|
3608
|
-
'enabled by default.'))
|
3609
3603
|
@timeline.event
|
3610
3604
|
@usage_lib.entrypoint
|
3611
3605
|
def jobs_launch(
|
@@ -3631,7 +3625,6 @@ def jobs_launch(
|
|
3631
3625
|
ports: Tuple[str],
|
3632
3626
|
detach_run: bool,
|
3633
3627
|
yes: bool,
|
3634
|
-
fast: bool,
|
3635
3628
|
):
|
3636
3629
|
"""Launch a managed job from a YAML or a command.
|
3637
3630
|
|
@@ -3674,16 +3667,6 @@ def jobs_launch(
|
|
3674
3667
|
job_recovery=job_recovery,
|
3675
3668
|
)
|
3676
3669
|
|
3677
|
-
# Deprecation. The default behavior is fast, and the flag will be removed.
|
3678
|
-
# The flag was not present in 0.7.x (only nightly), so we will remove before
|
3679
|
-
# 0.8.0 so that it never enters a stable release.
|
3680
|
-
if fast:
|
3681
|
-
click.secho(
|
3682
|
-
'Flag --fast is deprecated, as the behavior is now default. The '
|
3683
|
-
'flag will be removed soon. Please do not use it, so that you '
|
3684
|
-
'avoid "No such option" errors.',
|
3685
|
-
fg='yellow')
|
3686
|
-
|
3687
3670
|
if not isinstance(task_or_dag, sky.Dag):
|
3688
3671
|
assert isinstance(task_or_dag, sky.Task), task_or_dag
|
3689
3672
|
with sky.Dag() as dag:
|
sky/jobs/controller.py
CHANGED
@@ -224,8 +224,8 @@ class JobsController:
|
|
224
224
|
self._backend, cluster_name)
|
225
225
|
|
226
226
|
if job_status == job_lib.JobStatus.SUCCEEDED:
|
227
|
-
end_time = managed_job_utils.get_job_timestamp(
|
228
|
-
self._backend, cluster_name, get_end_time=True)
|
227
|
+
end_time = managed_job_utils.try_to_get_job_end_time(
|
228
|
+
self._backend, cluster_name)
|
229
229
|
# The job is done. Set the job to SUCCEEDED first before start
|
230
230
|
# downloading and streaming the logs to make it more responsive.
|
231
231
|
managed_job_state.set_succeeded(self._job_id,
|
@@ -235,15 +235,23 @@ class JobsController:
|
|
235
235
|
logger.info(
|
236
236
|
f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
|
237
237
|
f'Cleaning up the cluster {cluster_name}.')
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
238
|
+
try:
|
239
|
+
clusters = backend_utils.get_clusters(
|
240
|
+
cluster_names=[cluster_name],
|
241
|
+
refresh=False,
|
242
|
+
include_controller=False)
|
243
|
+
if clusters:
|
244
|
+
assert len(clusters) == 1, (clusters, cluster_name)
|
245
|
+
handle = clusters[0].get('handle')
|
246
|
+
# Best effort to download and stream the logs.
|
247
|
+
self._download_log_and_stream(task_id, handle)
|
248
|
+
except Exception as e: # pylint: disable=broad-except
|
249
|
+
# We don't want to crash here, so just log and continue.
|
250
|
+
logger.warning(
|
251
|
+
f'Failed to download and stream logs: '
|
252
|
+
f'{common_utils.format_exception(e)}',
|
253
|
+
exc_info=True)
|
254
|
+
|
247
255
|
# Only clean up the cluster, not the storages, because tasks may
|
248
256
|
# share storages.
|
249
257
|
managed_job_utils.terminate_cluster(cluster_name=cluster_name)
|
@@ -291,8 +299,8 @@ class JobsController:
|
|
291
299
|
continue
|
292
300
|
elif job_status in job_lib.JobStatus.user_code_failure_states():
|
293
301
|
# The user code has probably crashed, fail immediately.
|
294
|
-
end_time = managed_job_utils.get_job_timestamp(
|
295
|
-
self._backend, cluster_name, get_end_time=True)
|
302
|
+
end_time = managed_job_utils.try_to_get_job_end_time(
|
303
|
+
self._backend, cluster_name)
|
296
304
|
logger.info(
|
297
305
|
'The user job failed. Please check the logs below.\n'
|
298
306
|
f'== Logs of the user job (ID: {self._job_id}) ==\n')
|
sky/jobs/core.py
CHANGED
@@ -41,8 +41,6 @@ def launch(
|
|
41
41
|
name: Optional[str] = None,
|
42
42
|
stream_logs: bool = True,
|
43
43
|
detach_run: bool = False,
|
44
|
-
# TODO(cooperc): remove fast arg before 0.8.0
|
45
|
-
fast: bool = True, # pylint: disable=unused-argument for compatibility
|
46
44
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
47
45
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
48
46
|
"""Launch a managed job.
|
@@ -54,8 +52,6 @@ def launch(
|
|
54
52
|
managed job.
|
55
53
|
name: Name of the managed job.
|
56
54
|
detach_run: Whether to detach the run.
|
57
|
-
fast: [Deprecated] Does nothing, and will be removed soon. We will
|
58
|
-
always use fast mode as it's fully safe now.
|
59
55
|
|
60
56
|
Raises:
|
61
57
|
ValueError: cluster does not exist. Or, the entrypoint is not a valid
|
@@ -101,7 +97,7 @@ def launch(
|
|
101
97
|
ux_utils.spinner_message('Initializing managed job')):
|
102
98
|
for task_ in dag.tasks:
|
103
99
|
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
104
|
-
task_,
|
100
|
+
task_, task_type='jobs')
|
105
101
|
|
106
102
|
with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
|
107
103
|
mode='w') as f:
|
sky/jobs/utils.py
CHANGED
@@ -356,6 +356,28 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
356
356
|
return float(stdout)
|
357
357
|
|
358
358
|
|
359
|
+
def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
|
360
|
+
cluster_name: str) -> float:
|
361
|
+
"""Try to get the end time of the job.
|
362
|
+
|
363
|
+
If the job is preempted or we can't connect to the instance for whatever
|
364
|
+
reason, fall back to the current time.
|
365
|
+
"""
|
366
|
+
try:
|
367
|
+
return get_job_timestamp(backend, cluster_name, get_end_time=True)
|
368
|
+
except exceptions.CommandError as e:
|
369
|
+
if e.returncode == 255:
|
370
|
+
# Failed to connect - probably the instance was preempted since the
|
371
|
+
# job completed. We shouldn't crash here, so just log and use the
|
372
|
+
# current time.
|
373
|
+
logger.info(f'Failed to connect to the instance {cluster_name} '
|
374
|
+
'since the job completed. Assuming the instance '
|
375
|
+
'was preempted.')
|
376
|
+
return time.time()
|
377
|
+
else:
|
378
|
+
raise
|
379
|
+
|
380
|
+
|
359
381
|
def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
|
360
382
|
"""Run event callback for the task."""
|
361
383
|
|
sky/provision/kubernetes/utils.py
CHANGED
@@ -184,6 +184,22 @@ class GPULabelFormatter:
|
|
184
184
|
key:value pair to use as node selector for GPU nodes.
|
185
185
|
"""
|
186
186
|
|
187
|
+
@classmethod
|
188
|
+
def get_tpu_topology_label_key(cls) -> str:
|
189
|
+
"""Returns the label for TPU topology used by the Kubernetes cluster.
|
190
|
+
|
191
|
+
Only implemented by formatters that support TPUs.
|
192
|
+
"""
|
193
|
+
raise NotImplementedError
|
194
|
+
|
195
|
+
@classmethod
|
196
|
+
def get_tpu_topology_label_value(cls, acc_type: str, acc_count: int) -> str:
|
197
|
+
"""Returns the TPU topology value for the given TPU type and count.
|
198
|
+
|
199
|
+
Only implemented by formatters that support TPUs.
|
200
|
+
"""
|
201
|
+
raise NotImplementedError
|
202
|
+
|
187
203
|
@classmethod
|
188
204
|
def get_label_key(cls, accelerator: Optional[str] = None) -> str:
|
189
205
|
"""Returns the label key for GPU type used by the Kubernetes cluster"""
|
@@ -320,12 +336,32 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
320
336
|
GKE nodes by default are populated with `cloud.google.com/gke-accelerator`
|
321
337
|
label, which is used to identify the GPU type.
|
322
338
|
"""
|
323
|
-
|
324
339
|
GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
|
325
340
|
TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
|
326
341
|
ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'
|
327
342
|
TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
|
328
343
|
|
344
|
+
# Mapping from TPU type to {count: topologies}. Used to determine topology
|
345
|
+
# label to use in an autoscaling environment. For list of topologies, see:
|
346
|
+
# tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
|
347
|
+
# tpu v5p: https://cloud.google.com/tpu/docs/v5p
|
348
|
+
# TODO(romilb): Add support for TPU v4 and v6.
|
349
|
+
GKE_TPU_TOPOLOGIES = {
|
350
|
+
'tpu-v5-lite-podslice': {
|
351
|
+
1: '1x1',
|
352
|
+
4: '2x2',
|
353
|
+
8: '2x4'
|
354
|
+
},
|
355
|
+
'tpu-v5-lite-device': {
|
356
|
+
1: '1x1',
|
357
|
+
4: '2x2',
|
358
|
+
8: '2x4'
|
359
|
+
},
|
360
|
+
'tpu-v5p-slice': {
|
361
|
+
4: '2x2x1'
|
362
|
+
},
|
363
|
+
}
|
364
|
+
|
329
365
|
@classmethod
|
330
366
|
def get_label_key(cls, accelerator: Optional[str] = None) -> str:
|
331
367
|
if accelerator is not None and accelerator.startswith('tpu-'):
|
@@ -344,6 +380,24 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
344
380
|
def get_tpu_topology_label_key(cls) -> str:
|
345
381
|
return cls.TPU_TOPOLOGY_LABEL_KEY
|
346
382
|
|
383
|
+
@classmethod
|
384
|
+
def get_tpu_topology_label_value(cls, acc_type: str, acc_count: int) -> str:
|
385
|
+
"""Returns the TPU topology label value for the given TPU count.
|
386
|
+
|
387
|
+
e.g. tpu-v5-lite-podslice:8 -> '2x4'
|
388
|
+
"""
|
389
|
+
count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
|
390
|
+
{}).get(acc_count, None)
|
391
|
+
if count_to_topology is None:
|
392
|
+
supported_tpus = {
|
393
|
+
tpu: list(topologies.values())
|
394
|
+
for tpu, topologies in cls.GKE_TPU_TOPOLOGIES.items()
|
395
|
+
}
|
396
|
+
raise ValueError(
|
397
|
+
f'No TPU topology found for {acc_type} with count {acc_count}. '
|
398
|
+
f'Supported TPU types and counts: {supported_tpus}')
|
399
|
+
return count_to_topology
|
400
|
+
|
347
401
|
@classmethod
|
348
402
|
def get_label_value(cls, accelerator: str) -> str:
|
349
403
|
return get_gke_accelerator_name(accelerator)
|
@@ -633,6 +687,7 @@ def check_instance_fits(context: Optional[str],
|
|
633
687
|
# If GPU/TPUs are requested, check if GPU/TPU type is available, and
|
634
688
|
# if so, check if CPU and memory requirements on the specific node are
|
635
689
|
# met.
|
690
|
+
assert acc_count is not None, (acc_type, acc_count)
|
636
691
|
try:
|
637
692
|
gpu_label_key, gpu_label_val, _, _ = (
|
638
693
|
get_accelerator_label_key_value(context, acc_type, acc_count))
|
@@ -677,7 +732,7 @@ def check_instance_fits(context: Optional[str],
|
|
677
732
|
def get_accelerator_label_key_value(
|
678
733
|
context: Optional[str],
|
679
734
|
acc_type: str,
|
680
|
-
acc_count:
|
735
|
+
acc_count: int,
|
681
736
|
check_mode=False
|
682
737
|
) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
|
683
738
|
"""Returns the label key and value for the given GPU/TPU type.
|
@@ -723,8 +778,15 @@ def get_accelerator_label_key_value(
|
|
723
778
|
formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
|
724
779
|
assert formatter is not None, ('Unsupported autoscaler type:'
|
725
780
|
f' {autoscaler_type}')
|
781
|
+
tpu_topology_label_key = None
|
782
|
+
tpu_topology_label_value = None
|
783
|
+
if is_tpu_on_gke(acc_type):
|
784
|
+
assert formatter == GKELabelFormatter, formatter
|
785
|
+
tpu_topology_label_key = formatter.get_tpu_topology_label_key()
|
786
|
+
tpu_topology_label_value = formatter.get_tpu_topology_label_value(
|
787
|
+
acc_type, acc_count)
|
726
788
|
return formatter.get_label_key(acc_type), formatter.get_label_value(
|
727
|
-
acc_type),
|
789
|
+
acc_type), tpu_topology_label_key, tpu_topology_label_value
|
728
790
|
|
729
791
|
has_gpus, cluster_resources = detect_accelerator_resource(context)
|
730
792
|
if has_gpus:
|
@@ -787,7 +849,12 @@ def get_accelerator_label_key_value(
|
|
787
849
|
if node_metadata_labels.get(
|
788
850
|
label_formatter.TPU_LABEL_KEY) == acc_type:
|
789
851
|
topology_label_key = (
|
790
|
-
label_formatter.
|
852
|
+
label_formatter.get_tpu_topology_label_key(
|
853
|
+
))
|
854
|
+
# Instead of using get_tpu_topology_label_value,
|
855
|
+
# we use the node's label value to determine the
|
856
|
+
# topology. This is to make sure the node's
|
857
|
+
# available topology matches our request.
|
791
858
|
topology_value = node_metadata_labels.get(
|
792
859
|
topology_label_key)
|
793
860
|
assert topology_value is not None
|
@@ -2340,7 +2407,7 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
|
|
2340
2407
|
|
2341
2408
|
|
2342
2409
|
def is_tpu_on_gke(accelerator: str) -> bool:
|
2343
|
-
"""
|
2410
|
+
"""Determines if the given accelerator is a TPU supported on GKE."""
|
2344
2411
|
return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
|
2345
2412
|
|
2346
2413
|
|
sky/serve/core.py
CHANGED
@@ -175,7 +175,7 @@ def up(
|
|
175
175
|
with rich_utils.safe_status(
|
176
176
|
ux_utils.spinner_message('Initializing service')):
|
177
177
|
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
178
|
-
task,
|
178
|
+
task, task_type='serve')
|
179
179
|
|
180
180
|
tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
|
181
181
|
service_name, task)
|
@@ -458,7 +458,7 @@ def update(
|
|
458
458
|
with rich_utils.safe_status(
|
459
459
|
ux_utils.spinner_message('Initializing service')):
|
460
460
|
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
461
|
-
task,
|
461
|
+
task, task_type='serve')
|
462
462
|
|
463
463
|
code = serve_utils.ServeCodeGen.add_version(service_name)
|
464
464
|
returncode, version_string_payload, stderr = backend.run_on_head(
|
sky/setup_files/dependencies.py
CHANGED
@@ -122,7 +122,7 @@ extras_require: Dict[str, List[str]] = {
|
|
122
122
|
'scp': local_ray,
|
123
123
|
'oci': ['oci'] + local_ray,
|
124
124
|
# Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
|
125
|
-
'kubernetes': ['kubernetes>=20.0.0
|
125
|
+
'kubernetes': ['kubernetes>=20.0.0,!=32.0.0'],
|
126
126
|
'remote': remote,
|
127
127
|
# For the container registry auth api. Reference:
|
128
128
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
sky/skylet/constants.py
CHANGED
@@ -268,7 +268,7 @@ CLUSTER_NAME_VALID_REGEX = '[a-zA-Z]([-_.a-zA-Z0-9]*[a-zA-Z0-9])?'
|
|
268
268
|
# Used for translate local file mounts to cloud storage. Please refer to
|
269
269
|
# sky/execution.py::_maybe_translate_local_file_mounts_and_sync_up for
|
270
270
|
# more details.
|
271
|
-
FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{id}'
|
271
|
+
FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{user_hash}-{id}'
|
272
272
|
FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}'
|
273
273
|
FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files'
|
274
274
|
|
sky/utils/controller_utils.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7
7
|
import tempfile
|
8
8
|
import typing
|
9
9
|
from typing import Any, Dict, Iterable, List, Optional, Set
|
10
|
+
import uuid
|
10
11
|
|
11
12
|
import colorama
|
12
13
|
|
@@ -314,6 +315,8 @@ def download_and_stream_latest_job_log(
|
|
314
315
|
"""Downloads and streams the latest job log.
|
315
316
|
|
316
317
|
This function is only used by jobs controller and sky serve controller.
|
318
|
+
|
319
|
+
If the log cannot be fetched for any reason, return None.
|
317
320
|
"""
|
318
321
|
os.makedirs(local_dir, exist_ok=True)
|
319
322
|
log_file = None
|
@@ -328,31 +331,47 @@ def download_and_stream_latest_job_log(
|
|
328
331
|
# job_ids all represent the same logical managed job.
|
329
332
|
job_ids=None,
|
330
333
|
local_dir=local_dir)
|
331
|
-
except
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
334
|
+
except Exception as e: # pylint: disable=broad-except
|
335
|
+
# We want to avoid crashing the controller. sync_down_logs() is pretty
|
336
|
+
# complicated and could crash in various places (creating remote
|
337
|
+
# runners, executing remote code, decoding the payload, etc.). So, we
|
338
|
+
# use a broad except and just return None.
|
339
|
+
logger.info(
|
340
|
+
f'Failed to download the logs: '
|
341
|
+
f'{common_utils.format_exception(e)}',
|
342
|
+
exc_info=True)
|
343
|
+
return None
|
344
|
+
|
345
|
+
if not log_dirs:
|
346
|
+
logger.error('Failed to find the logs for the user program.')
|
347
|
+
return None
|
348
|
+
|
349
|
+
log_dir = list(log_dirs.values())[0]
|
350
|
+
log_file = os.path.join(log_dir, 'run.log')
|
351
|
+
|
352
|
+
# Print the logs to the console.
|
353
|
+
# TODO(zhwu): refactor this into log_utils, along with the refactoring for
|
354
|
+
# the log_lib.tail_logs.
|
355
|
+
try:
|
356
|
+
with open(log_file, 'r', encoding='utf-8') as f:
|
357
|
+
# Stream the logs to the console without reading the whole file into
|
358
|
+
# memory.
|
359
|
+
start_streaming = False
|
360
|
+
for line in f:
|
361
|
+
if log_lib.LOG_FILE_START_STREAMING_AT in line:
|
362
|
+
start_streaming = True
|
363
|
+
if start_streaming:
|
364
|
+
print(line, end='', flush=True)
|
365
|
+
except FileNotFoundError:
|
366
|
+
logger.error('Failed to find the logs for the user '
|
367
|
+
f'program at {log_file}.')
|
368
|
+
except Exception as e: # pylint: disable=broad-except
|
369
|
+
logger.error(
|
370
|
+
f'Failed to stream the logs for the user program at '
|
371
|
+
f'{log_file}: {common_utils.format_exception(e)}',
|
372
|
+
exc_info=True)
|
373
|
+
# Return the log_file anyway.
|
374
|
+
|
356
375
|
return log_file
|
357
376
|
|
358
377
|
|
@@ -642,7 +661,7 @@ def replace_skypilot_config_path_in_file_mounts(
|
|
642
661
|
|
643
662
|
|
644
663
|
def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
645
|
-
|
664
|
+
task_type: str) -> None:
|
646
665
|
"""Translates local->VM mounts into Storage->VM, then syncs up any Storage.
|
647
666
|
|
648
667
|
Eagerly syncing up local->Storage ensures Storage->VM would work at task
|
@@ -651,6 +670,13 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
651
670
|
If there are no local source paths to be translated, this function would
|
652
671
|
still sync up any storage mounts with local source paths (which do not
|
653
672
|
undergo translation).
|
673
|
+
|
674
|
+
When jobs.bucket or serve.bucket is not specified, an intermediate storage
|
675
|
+
dedicated for the job is created for the workdir and local file mounts and
|
676
|
+
the storage is deleted when the job finishes. We don't share the storage
|
677
|
+
between jobs, because jobs might have different resources requirements, and
|
678
|
+
sharing storage between jobs may cause egress costs or slower transfer
|
679
|
+
speeds.
|
654
680
|
"""
|
655
681
|
|
656
682
|
# ================================================================
|
@@ -669,11 +695,17 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
669
695
|
store.delete()
|
670
696
|
with ux_utils.print_exception_no_traceback():
|
671
697
|
raise exceptions.StorageBucketCreateError(
|
672
|
-
f'
|
673
|
-
'Please check
|
698
|
+
f'{task_type.capitalize()} bucket {store.name!r} does not '
|
699
|
+
f'exist. Please check {task_type}.bucket configuration in '
|
674
700
|
'your SkyPilot config.')
|
675
701
|
|
676
|
-
|
702
|
+
# We use uuid to generate a unique run id for the job, so that the bucket/
|
703
|
+
# subdirectory name is unique across different jobs/services.
|
704
|
+
# We should not use common_utils.get_usage_run_id() here, because when
|
705
|
+
# Python API is used, the run id will be the same across multiple
|
706
|
+
# jobs.launch/serve.up calls after the sky is imported.
|
707
|
+
run_id = common_utils.base36_encode(uuid.uuid4().hex)[:8]
|
708
|
+
user_hash = common_utils.get_user_hash()
|
677
709
|
original_file_mounts = task.file_mounts if task.file_mounts else {}
|
678
710
|
original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
|
679
711
|
|
@@ -701,13 +733,15 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
701
733
|
|
702
734
|
# Get the bucket name for the workdir and file mounts,
|
703
735
|
# we store all these files in same bucket from config.
|
704
|
-
bucket_wth_prefix = skypilot_config.get_nested((
|
736
|
+
bucket_wth_prefix = skypilot_config.get_nested((task_type, 'bucket'), None)
|
705
737
|
store_kwargs: Dict[str, Any] = {}
|
706
738
|
if bucket_wth_prefix is None:
|
707
739
|
store_type = store_cls = sub_path = None
|
708
740
|
storage_account_name = region = None
|
709
741
|
bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
|
710
|
-
username=common_utils.get_cleaned_username(),
|
742
|
+
username=common_utils.get_cleaned_username(),
|
743
|
+
user_hash=user_hash,
|
744
|
+
id=run_id)
|
711
745
|
else:
|
712
746
|
store_type, store_cls, bucket_name, sub_path, storage_account_name, \
|
713
747
|
region = storage_lib.StoreType.get_fields_from_store_url(
|
@@ -798,7 +832,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
798
832
|
constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
|
799
833
|
os.makedirs(local_fm_path, exist_ok=True)
|
800
834
|
file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
|
801
|
-
|
835
|
+
task_type)
|
802
836
|
if copy_mounts_with_file_in_src:
|
803
837
|
src_to_file_id = {}
|
804
838
|
for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
|
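{skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@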
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.dev20250130
|
3
|
+
Version: 1.0.0.dev20250201
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -81,7 +81,7 @@ Provides-Extra: oci
|
|
81
81
|
Requires-Dist: oci; extra == "oci"
|
82
82
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "oci"
|
83
83
|
Provides-Extra: kubernetes
|
84
|
-
Requires-Dist: kubernetes
|
84
|
+
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "kubernetes"
|
85
85
|
Provides-Extra: remote
|
86
86
|
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "remote"
|
87
87
|
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "remote"
|
@@ -133,7 +133,7 @@ Requires-Dist: colorama<0.4.5; extra == "all"
|
|
133
133
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
134
134
|
Requires-Dist: oci; extra == "all"
|
135
135
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
136
|
-
Requires-Dist: kubernetes
|
136
|
+
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
137
137
|
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "all"
|
138
138
|
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "all"
|
139
139
|
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sys_platform != "darwin") and extra == "all"
|
{skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=WGHbjb3X_Lpj1KaiFHN_O0Fcj0db6JHWo7SmKXIOIzo,5529
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
|
4
4
|
sky/check.py,sha256=qTpm3N1zUZi2inEZPsrbt278B3h8nsk2gnepzIgLybE,10899
|
5
|
-
sky/cli.py,sha256
|
5
|
+
sky/cli.py,sha256=B-YWYiKnfSGdSOXtAY8SRGOGhneUeNPBjXFZ0FuLZ8w,214131
|
6
6
|
sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
|
7
7
|
sky/core.py,sha256=fE1rn4Ku94S0XmWTO5-6t6eT6aaJImNczRqEnTe8v7Q,38742
|
8
8
|
sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
|
@@ -98,12 +98,12 @@ sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
|
|
98
98
|
sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
|
99
99
|
sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
|
100
100
|
sky/jobs/constants.py,sha256=6RphkJ6pmafQ7XYW5qwId1Zvqb99HJelA9kgrgfNR7o,1421
|
101
|
-
sky/jobs/controller.py,sha256=
|
102
|
-
sky/jobs/core.py,sha256=
|
101
|
+
sky/jobs/controller.py,sha256=h4F60FMja-GHlyNpbujqb8lx82P4qf0ghKkXORfYMWY,28694
|
102
|
+
sky/jobs/core.py,sha256=16oNEXz6HuoPYjnIa9UZBciwZKPGOwhkBd_mkWw4iOw,20063
|
103
103
|
sky/jobs/recovery_strategy.py,sha256=m-EA-MWXPFrgx2CYFPr6MmgeUoDTEBmY2xruD2PRSGY,26365
|
104
104
|
sky/jobs/scheduler.py,sha256=WAvNb8-vBk8q1zFordFdpH7gxqWDjPHDGZZay6aodOk,12028
|
105
105
|
sky/jobs/state.py,sha256=bvBNZMg3DzPfS4eHNzMqYaMui2cqnWoWGDIaiOpaXSk,40770
|
106
|
-
sky/jobs/utils.py,sha256=
|
106
|
+
sky/jobs/utils.py,sha256=9tCKeY2x1lOgFQdaxqx6tZd2zd2e3pdUOQGvgvbf1Rk,52682
|
107
107
|
sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
|
108
108
|
sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
|
109
109
|
sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
|
@@ -149,7 +149,7 @@ sky/provision/kubernetes/config.py,sha256=bXwOGdSAnXCkDreew0KsSUqSv3ZrptNeevqat7
|
|
149
149
|
sky/provision/kubernetes/instance.py,sha256=AQikdRgNklpeMgiEd4w2Hh7kGssVABsy0aCh9xsKi5Y,50313
|
150
150
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
151
151
|
sky/provision/kubernetes/network_utils.py,sha256=52BZY_5ynCH6IXlivKObYyAHDgQCJyAJIjmM7J4MpFo,11393
|
152
|
-
sky/provision/kubernetes/utils.py,sha256=
|
152
|
+
sky/provision/kubernetes/utils.py,sha256=4kSEx6NZB3MAsDqCxLO-elo7EO6Coh-9wypwVqs3jgk,109895
|
153
153
|
sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
|
154
154
|
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
|
155
155
|
sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
|
@@ -191,7 +191,7 @@ sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
|
|
191
191
|
sky/serve/autoscalers.py,sha256=OxaynplCqbmrMA3fIGhxkugaGm-50QoI8S1fIfHK0M0,31667
|
192
192
|
sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
|
193
193
|
sky/serve/controller.py,sha256=jtzWHsLHnVPQ727ZpDZTUpGTtIOssbnQpXeWOyAuW_s,11886
|
194
|
-
sky/serve/core.py,sha256=
|
194
|
+
sky/serve/core.py,sha256=ANjALyYiQUmcpWjQ1YJor2rqHJypQpzuQxuIPnDyEk0,35788
|
195
195
|
sky/serve/load_balancer.py,sha256=2nkMPRvy-h7hJL4Qq__tkT8nIAVC_nmjyXf8mMGYEFk,13658
|
196
196
|
sky/serve/load_balancing_policies.py,sha256=XVj76qBgqh7h6wfx53RKQFzBefDWTE4TCdCEtFLLtI4,5398
|
197
197
|
sky/serve/replica_managers.py,sha256=SW7k2iivUZ6dw_YMgGYOHOGD9_yyV4byfKa8e5t8_HE,57587
|
@@ -200,14 +200,14 @@ sky/serve/serve_utils.py,sha256=m1Zcjslnzcr5AAppzV48WDOwMWjRaXotTUd_iN-dHgc,4065
|
|
200
200
|
sky/serve/service.py,sha256=DPU1PJGuHa1WaNqxYqgpmqd4LA9jBbQM-KlLrA6C1M0,12156
|
201
201
|
sky/serve/service_spec.py,sha256=Q0qnFRjNnfGIpksubH5VqPKIlvpWs5had_Ma_PSHyo8,16940
|
202
202
|
sky/setup_files/MANIFEST.in,sha256=WF0T89NLichHxZDDSQzvSpiONtAEFyur2MPmGczgTIo,555
|
203
|
-
sky/setup_files/dependencies.py,sha256=
|
203
|
+
sky/setup_files/dependencies.py,sha256=Z7dJM8wN-pAjXnLl3Q3hE1h2NmRZ8R6zfprF0kC5-Zw,6235
|
204
204
|
sky/setup_files/setup.py,sha256=HMqAIxHrhtQUOlm6_Iz5E_bL4dUvsYgXc9YVQIFayPs,7417
|
205
205
|
sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
|
206
206
|
sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
207
207
|
sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
|
208
208
|
sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
|
209
209
|
sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
|
210
|
-
sky/skylet/constants.py,sha256=
|
210
|
+
sky/skylet/constants.py,sha256=cMUJmj9iEY7dFW5pllijwrUlcKQmsJxgQSSrvTq9Ua8,16057
|
211
211
|
sky/skylet/events.py,sha256=0bOjUYpphuAficD9wDB5NOan2vwJDaRqdnm4sl0RK0U,12535
|
212
212
|
sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
|
213
213
|
sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
|
@@ -263,7 +263,7 @@ sky/utils/command_runner.py,sha256=ewDjFxcCOv0OeG2aUOIfVWmTls65up9DvSnAXURvGfM,3
|
|
263
263
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
264
264
|
sky/utils/common_utils.py,sha256=Kh0iymQl9I4HXxYSc3TTcv-xeso27pU_1hGNOc9Xw2o,25370
|
265
265
|
sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
|
266
|
-
sky/utils/controller_utils.py,sha256=
|
266
|
+
sky/utils/controller_utils.py,sha256=z7f4iQqaJEyxNlHXY83nAgkcppEATZu8OfUuk-dJinc,45826
|
267
267
|
sky/utils/dag_utils.py,sha256=R1yhJssvzDg13p6PJIC8OkYFBiR64eIx5xQeRpAG9n4,6099
|
268
268
|
sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
|
269
269
|
sky/utils/env_options.py,sha256=E5iwRFBUY2Iq6e0y0c1Mv5OSQ4MRNdk0-p38xUyVerc,1366
|
@@ -289,9 +289,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
289
289
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
290
290
|
sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
|
291
291
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
|
292
|
-
skypilot_nightly-1.0.0.
|
293
|
-
skypilot_nightly-1.0.0.
|
294
|
-
skypilot_nightly-1.0.0.
|
295
|
-
skypilot_nightly-1.0.0.
|
296
|
-
skypilot_nightly-1.0.0.
|
297
|
-
skypilot_nightly-1.0.0.
|
292
|
+
skypilot_nightly-1.0.0.dev20250201.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
293
|
+
skypilot_nightly-1.0.0.dev20250201.dist-info/METADATA,sha256=ZH9ZKsIKy2cBJ6efiplTjtdutTTG5FgKXyvhJ_OmWjk,21251
|
294
|
+
skypilot_nightly-1.0.0.dev20250201.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
295
|
+
skypilot_nightly-1.0.0.dev20250201.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
296
|
+
skypilot_nightly-1.0.0.dev20250201.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
297
|
+
skypilot_nightly-1.0.0.dev20250201.dist-info/RECORD,,
|
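{skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/LICENSE
RENAMED
File without changes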
|
{skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/WHEEL
RENAMED
File without changes
|
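{skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/entry_points.txt
RENAMED
File without changes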
|
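{skypilot_nightly-1.0.0.dev20250130.dist-info → skypilot_nightly-1.0.0.dev20250201.dist-info}/top_level.txt
RENAMED
File without changes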
|