skypilot-nightly 1.0.0.dev20250130__py3-none-any.whl → 1.0.0.dev20250201__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'db90a41f4dff842aaaacd105d77f9e9a4e29b4bc'
8
+ _SKYPILOT_COMMIT_SHA = '269dfb19286a79f4f3a233aa525c73f6562dae37'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250130'
38
+ __version__ = '1.0.0.dev20250201'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/cli.py CHANGED
@@ -3600,12 +3600,6 @@ def jobs():
3600
3600
  default=False,
3601
3601
  required=False,
3602
3602
  help='Skip confirmation prompt.')
3603
- # TODO(cooperc): remove this flag before releasing 0.8.0
3604
- @click.option('--fast',
3605
- default=False,
3606
- is_flag=True,
3607
- help=('[Deprecated] Does nothing. Previous flag behavior is now '
3608
- 'enabled by default.'))
3609
3603
  @timeline.event
3610
3604
  @usage_lib.entrypoint
3611
3605
  def jobs_launch(
@@ -3631,7 +3625,6 @@ def jobs_launch(
3631
3625
  ports: Tuple[str],
3632
3626
  detach_run: bool,
3633
3627
  yes: bool,
3634
- fast: bool,
3635
3628
  ):
3636
3629
  """Launch a managed job from a YAML or a command.
3637
3630
 
@@ -3674,16 +3667,6 @@ def jobs_launch(
3674
3667
  job_recovery=job_recovery,
3675
3668
  )
3676
3669
 
3677
- # Deprecation. The default behavior is fast, and the flag will be removed.
3678
- # The flag was not present in 0.7.x (only nightly), so we will remove before
3679
- # 0.8.0 so that it never enters a stable release.
3680
- if fast:
3681
- click.secho(
3682
- 'Flag --fast is deprecated, as the behavior is now default. The '
3683
- 'flag will be removed soon. Please do not use it, so that you '
3684
- 'avoid "No such option" errors.',
3685
- fg='yellow')
3686
-
3687
3670
  if not isinstance(task_or_dag, sky.Dag):
3688
3671
  assert isinstance(task_or_dag, sky.Task), task_or_dag
3689
3672
  with sky.Dag() as dag:
sky/jobs/controller.py CHANGED
@@ -224,8 +224,8 @@ class JobsController:
224
224
  self._backend, cluster_name)
225
225
 
226
226
  if job_status == job_lib.JobStatus.SUCCEEDED:
227
- end_time = managed_job_utils.get_job_timestamp(
228
- self._backend, cluster_name, get_end_time=True)
227
+ end_time = managed_job_utils.try_to_get_job_end_time(
228
+ self._backend, cluster_name)
229
229
  # The job is done. Set the job to SUCCEEDED first before start
230
230
  # downloading and streaming the logs to make it more responsive.
231
231
  managed_job_state.set_succeeded(self._job_id,
@@ -235,15 +235,23 @@ class JobsController:
235
235
  logger.info(
236
236
  f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
237
237
  f'Cleaning up the cluster {cluster_name}.')
238
- clusters = backend_utils.get_clusters(
239
- cluster_names=[cluster_name],
240
- refresh=False,
241
- include_controller=False)
242
- if clusters:
243
- assert len(clusters) == 1, (clusters, cluster_name)
244
- handle = clusters[0].get('handle')
245
- # Best effort to download and stream the logs.
246
- self._download_log_and_stream(task_id, handle)
238
+ try:
239
+ clusters = backend_utils.get_clusters(
240
+ cluster_names=[cluster_name],
241
+ refresh=False,
242
+ include_controller=False)
243
+ if clusters:
244
+ assert len(clusters) == 1, (clusters, cluster_name)
245
+ handle = clusters[0].get('handle')
246
+ # Best effort to download and stream the logs.
247
+ self._download_log_and_stream(task_id, handle)
248
+ except Exception as e: # pylint: disable=broad-except
249
+ # We don't want to crash here, so just log and continue.
250
+ logger.warning(
251
+ f'Failed to download and stream logs: '
252
+ f'{common_utils.format_exception(e)}',
253
+ exc_info=True)
254
+
247
255
  # Only clean up the cluster, not the storages, because tasks may
248
256
  # share storages.
249
257
  managed_job_utils.terminate_cluster(cluster_name=cluster_name)
@@ -291,8 +299,8 @@ class JobsController:
291
299
  continue
292
300
  elif job_status in job_lib.JobStatus.user_code_failure_states():
293
301
  # The user code has probably crashed, fail immediately.
294
- end_time = managed_job_utils.get_job_timestamp(
295
- self._backend, cluster_name, get_end_time=True)
302
+ end_time = managed_job_utils.try_to_get_job_end_time(
303
+ self._backend, cluster_name)
296
304
  logger.info(
297
305
  'The user job failed. Please check the logs below.\n'
298
306
  f'== Logs of the user job (ID: {self._job_id}) ==\n')
sky/jobs/core.py CHANGED
@@ -41,8 +41,6 @@ def launch(
41
41
  name: Optional[str] = None,
42
42
  stream_logs: bool = True,
43
43
  detach_run: bool = False,
44
- # TODO(cooperc): remove fast arg before 0.8.0
45
- fast: bool = True, # pylint: disable=unused-argument for compatibility
46
44
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
47
45
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
48
46
  """Launch a managed job.
@@ -54,8 +52,6 @@ def launch(
54
52
  managed job.
55
53
  name: Name of the managed job.
56
54
  detach_run: Whether to detach the run.
57
- fast: [Deprecated] Does nothing, and will be removed soon. We will
58
- always use fast mode as it's fully safe now.
59
55
 
60
56
  Raises:
61
57
  ValueError: cluster does not exist. Or, the entrypoint is not a valid
@@ -101,7 +97,7 @@ def launch(
101
97
  ux_utils.spinner_message('Initializing managed job')):
102
98
  for task_ in dag.tasks:
103
99
  controller_utils.maybe_translate_local_file_mounts_and_sync_up(
104
- task_, path='jobs')
100
+ task_, task_type='jobs')
105
101
 
106
102
  with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
107
103
  mode='w') as f:
sky/jobs/utils.py CHANGED
@@ -356,6 +356,28 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
356
356
  return float(stdout)
357
357
 
358
358
 
359
+ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
360
+ cluster_name: str) -> float:
361
+ """Try to get the end time of the job.
362
+
363
+ If the job is preempted or we can't connect to the instance for whatever
364
+ reason, fall back to the current time.
365
+ """
366
+ try:
367
+ return get_job_timestamp(backend, cluster_name, get_end_time=True)
368
+ except exceptions.CommandError as e:
369
+ if e.returncode == 255:
370
+ # Failed to connect - probably the instance was preempted since the
371
+ # job completed. We shouldn't crash here, so just log and use the
372
+ # current time.
373
+ logger.info(f'Failed to connect to the instance {cluster_name} '
374
+ 'since the job completed. Assuming the instance '
375
+ 'was preempted.')
376
+ return time.time()
377
+ else:
378
+ raise
379
+
380
+
359
381
  def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
360
382
  """Run event callback for the task."""
361
383
 
@@ -184,6 +184,22 @@ class GPULabelFormatter:
184
184
  key:value pair to use as node selector for GPU nodes.
185
185
  """
186
186
 
187
+ @classmethod
188
+ def get_tpu_topology_label_key(cls) -> str:
189
+ """Returns the label for TPU topology used by the Kubernetes cluster.
190
+
191
+ Only implemented by formatters that support TPUs.
192
+ """
193
+ raise NotImplementedError
194
+
195
+ @classmethod
196
+ def get_tpu_topology_label_value(cls, acc_type: str, acc_count: int) -> str:
197
+ """Returns the TPU topology value for the given TPU type and count.
198
+
199
+ Only implemented by formatters that support TPUs.
200
+ """
201
+ raise NotImplementedError
202
+
187
203
  @classmethod
188
204
  def get_label_key(cls, accelerator: Optional[str] = None) -> str:
189
205
  """Returns the label key for GPU type used by the Kubernetes cluster"""
@@ -320,12 +336,32 @@ class GKELabelFormatter(GPULabelFormatter):
320
336
  GKE nodes by default are populated with `cloud.google.com/gke-accelerator`
321
337
  label, which is used to identify the GPU type.
322
338
  """
323
-
324
339
  GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
325
340
  TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
326
341
  ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'
327
342
  TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
328
343
 
344
+ # Mapping from TPU type to {count: topologies}. Used to determine topology
345
+ # label to use in an autoscaling environment. For list of topologies, see:
346
+ # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
347
+ # tpu v5p: https://cloud.google.com/tpu/docs/v5p
348
+ # TODO(romilb): Add support for TPU v4 and v6.
349
+ GKE_TPU_TOPOLOGIES = {
350
+ 'tpu-v5-lite-podslice': {
351
+ 1: '1x1',
352
+ 4: '2x2',
353
+ 8: '2x4'
354
+ },
355
+ 'tpu-v5-lite-device': {
356
+ 1: '1x1',
357
+ 4: '2x2',
358
+ 8: '2x4'
359
+ },
360
+ 'tpu-v5p-slice': {
361
+ 4: '2x2x1'
362
+ },
363
+ }
364
+
329
365
  @classmethod
330
366
  def get_label_key(cls, accelerator: Optional[str] = None) -> str:
331
367
  if accelerator is not None and accelerator.startswith('tpu-'):
@@ -344,6 +380,24 @@ class GKELabelFormatter(GPULabelFormatter):
344
380
  def get_tpu_topology_label_key(cls) -> str:
345
381
  return cls.TPU_TOPOLOGY_LABEL_KEY
346
382
 
383
+ @classmethod
384
+ def get_tpu_topology_label_value(cls, acc_type: str, acc_count: int) -> str:
385
+ """Returns the TPU topology label value for the given TPU type and count.
386
+
387
+ e.g. tpu-v5-lite-podslice:8 -> '2x4'
388
+ """
389
+ count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
390
+ {}).get(acc_count, None)
391
+ if count_to_topology is None:
392
+ supported_tpus = {
393
+ tpu: list(topologies.values())
394
+ for tpu, topologies in cls.GKE_TPU_TOPOLOGIES.items()
395
+ }
396
+ raise ValueError(
397
+ f'No TPU topology found for {acc_type} with count {acc_count}. '
398
+ f'Supported TPU types and counts: {supported_tpus}')
399
+ return count_to_topology
400
+
347
401
  @classmethod
348
402
  def get_label_value(cls, accelerator: str) -> str:
349
403
  return get_gke_accelerator_name(accelerator)
@@ -633,6 +687,7 @@ def check_instance_fits(context: Optional[str],
633
687
  # If GPU/TPUs are requested, check if GPU/TPU type is available, and
634
688
  # if so, check if CPU and memory requirements on the specific node are
635
689
  # met.
690
+ assert acc_count is not None, (acc_type, acc_count)
636
691
  try:
637
692
  gpu_label_key, gpu_label_val, _, _ = (
638
693
  get_accelerator_label_key_value(context, acc_type, acc_count))
@@ -677,7 +732,7 @@ def check_instance_fits(context: Optional[str],
677
732
  def get_accelerator_label_key_value(
678
733
  context: Optional[str],
679
734
  acc_type: str,
680
- acc_count: Optional[int],
735
+ acc_count: int,
681
736
  check_mode=False
682
737
  ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
683
738
  """Returns the label key and value for the given GPU/TPU type.
@@ -723,8 +778,15 @@ def get_accelerator_label_key_value(
723
778
  formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
724
779
  assert formatter is not None, ('Unsupported autoscaler type:'
725
780
  f' {autoscaler_type}')
781
+ tpu_topology_label_key = None
782
+ tpu_topology_label_value = None
783
+ if is_tpu_on_gke(acc_type):
784
+ assert formatter == GKELabelFormatter, formatter
785
+ tpu_topology_label_key = formatter.get_tpu_topology_label_key()
786
+ tpu_topology_label_value = formatter.get_tpu_topology_label_value(
787
+ acc_type, acc_count)
726
788
  return formatter.get_label_key(acc_type), formatter.get_label_value(
727
- acc_type), None, None
789
+ acc_type), tpu_topology_label_key, tpu_topology_label_value
728
790
 
729
791
  has_gpus, cluster_resources = detect_accelerator_resource(context)
730
792
  if has_gpus:
@@ -787,7 +849,12 @@ def get_accelerator_label_key_value(
787
849
  if node_metadata_labels.get(
788
850
  label_formatter.TPU_LABEL_KEY) == acc_type:
789
851
  topology_label_key = (
790
- label_formatter.TPU_TOPOLOGY_LABEL_KEY)
852
+ label_formatter.get_tpu_topology_label_key(
853
+ ))
854
+ # Instead of using get_tpu_topology_label_value,
855
+ # we use the node's label value to determine the
856
+ # topology. This is to make sure the node's
857
+ # available topology matches our request.
791
858
  topology_value = node_metadata_labels.get(
792
859
  topology_label_key)
793
860
  assert topology_value is not None
@@ -2340,7 +2407,7 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
2340
2407
 
2341
2408
 
2342
2409
  def is_tpu_on_gke(accelerator: str) -> bool:
2343
- """Determins if the given accelerator is a TPU supported on GKE."""
2410
+ """Determines if the given accelerator is a TPU supported on GKE."""
2344
2411
  return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
2345
2412
 
2346
2413
 
sky/serve/core.py CHANGED
@@ -175,7 +175,7 @@ def up(
175
175
  with rich_utils.safe_status(
176
176
  ux_utils.spinner_message('Initializing service')):
177
177
  controller_utils.maybe_translate_local_file_mounts_and_sync_up(
178
- task, path='serve')
178
+ task, task_type='serve')
179
179
 
180
180
  tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
181
181
  service_name, task)
@@ -458,7 +458,7 @@ def update(
458
458
  with rich_utils.safe_status(
459
459
  ux_utils.spinner_message('Initializing service')):
460
460
  controller_utils.maybe_translate_local_file_mounts_and_sync_up(
461
- task, path='serve')
461
+ task, task_type='serve')
462
462
 
463
463
  code = serve_utils.ServeCodeGen.add_version(service_name)
464
464
  returncode, version_string_payload, stderr = backend.run_on_head(
@@ -122,7 +122,7 @@ extras_require: Dict[str, List[str]] = {
122
122
  'scp': local_ray,
123
123
  'oci': ['oci'] + local_ray,
124
124
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
125
- 'kubernetes': ['kubernetes>=20.0.0,<32.0.0'],
125
+ 'kubernetes': ['kubernetes>=20.0.0,!=32.0.0'],
126
126
  'remote': remote,
127
127
  # For the container registry auth api. Reference:
128
128
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
sky/skylet/constants.py CHANGED
@@ -268,7 +268,7 @@ CLUSTER_NAME_VALID_REGEX = '[a-zA-Z]([-_.a-zA-Z0-9]*[a-zA-Z0-9])?'
268
268
  # Used for translate local file mounts to cloud storage. Please refer to
269
269
  # sky/execution.py::_maybe_translate_local_file_mounts_and_sync_up for
270
270
  # more details.
271
- FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{id}'
271
+ FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{user_hash}-{id}'
272
272
  FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}'
273
273
  FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files'
274
274
 
@@ -7,6 +7,7 @@ import os
7
7
  import tempfile
8
8
  import typing
9
9
  from typing import Any, Dict, Iterable, List, Optional, Set
10
+ import uuid
10
11
 
11
12
  import colorama
12
13
 
@@ -314,6 +315,8 @@ def download_and_stream_latest_job_log(
314
315
  """Downloads and streams the latest job log.
315
316
 
316
317
  This function is only used by jobs controller and sky serve controller.
318
+
319
+ If the log cannot be fetched for any reason, return None.
317
320
  """
318
321
  os.makedirs(local_dir, exist_ok=True)
319
322
  log_file = None
@@ -328,31 +331,47 @@ def download_and_stream_latest_job_log(
328
331
  # job_ids all represent the same logical managed job.
329
332
  job_ids=None,
330
333
  local_dir=local_dir)
331
- except exceptions.CommandError as e:
332
- logger.info(f'Failed to download the logs: '
333
- f'{common_utils.format_exception(e)}')
334
- else:
335
- if not log_dirs:
336
- logger.error('Failed to find the logs for the user program.')
337
- else:
338
- log_dir = list(log_dirs.values())[0]
339
- log_file = os.path.join(log_dir, 'run.log')
340
- # Print the logs to the console.
341
- # TODO(zhwu): refactor this into log_utils, along with the
342
- # refactoring for the log_lib.tail_logs.
343
- try:
344
- with open(log_file, 'r', encoding='utf-8') as f:
345
- # Stream the logs to the console without reading the whole
346
- # file into memory.
347
- start_streaming = False
348
- for line in f:
349
- if log_lib.LOG_FILE_START_STREAMING_AT in line:
350
- start_streaming = True
351
- if start_streaming:
352
- print(line, end='', flush=True)
353
- except FileNotFoundError:
354
- logger.error('Failed to find the logs for the user '
355
- f'program at {log_file}.')
334
+ except Exception as e: # pylint: disable=broad-except
335
+ # We want to avoid crashing the controller. sync_down_logs() is pretty
336
+ # complicated and could crash in various places (creating remote
337
+ # runners, executing remote code, decoding the payload, etc.). So, we
338
+ # use a broad except and just return None.
339
+ logger.info(
340
+ f'Failed to download the logs: '
341
+ f'{common_utils.format_exception(e)}',
342
+ exc_info=True)
343
+ return None
344
+
345
+ if not log_dirs:
346
+ logger.error('Failed to find the logs for the user program.')
347
+ return None
348
+
349
+ log_dir = list(log_dirs.values())[0]
350
+ log_file = os.path.join(log_dir, 'run.log')
351
+
352
+ # Print the logs to the console.
353
+ # TODO(zhwu): refactor this into log_utils, along with the refactoring for
354
+ # the log_lib.tail_logs.
355
+ try:
356
+ with open(log_file, 'r', encoding='utf-8') as f:
357
+ # Stream the logs to the console without reading the whole file into
358
+ # memory.
359
+ start_streaming = False
360
+ for line in f:
361
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
362
+ start_streaming = True
363
+ if start_streaming:
364
+ print(line, end='', flush=True)
365
+ except FileNotFoundError:
366
+ logger.error('Failed to find the logs for the user '
367
+ f'program at {log_file}.')
368
+ except Exception as e: # pylint: disable=broad-except
369
+ logger.error(
370
+ f'Failed to stream the logs for the user program at '
371
+ f'{log_file}: {common_utils.format_exception(e)}',
372
+ exc_info=True)
373
+ # Return the log_file anyway.
374
+
356
375
  return log_file
357
376
 
358
377
 
@@ -642,7 +661,7 @@ def replace_skypilot_config_path_in_file_mounts(
642
661
 
643
662
 
644
663
  def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
645
- path: str) -> None:
664
+ task_type: str) -> None:
646
665
  """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
647
666
 
648
667
  Eagerly syncing up local->Storage ensures Storage->VM would work at task
@@ -651,6 +670,13 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
651
670
  If there are no local source paths to be translated, this function would
652
671
  still sync up any storage mounts with local source paths (which do not
653
672
  undergo translation).
673
+
674
+ When jobs.bucket or serve.bucket is not specified, an intermediate storage
675
+ dedicated for the job is created for the workdir and local file mounts and
676
+ the storage is deleted when the job finishes. We don't share the storage
677
+ between jobs, because jobs might have different resource requirements, and
678
+ sharing storage between jobs may cause egress costs or slower transfer
679
+ speeds.
654
680
  """
655
681
 
656
682
  # ================================================================
@@ -669,11 +695,17 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
669
695
  store.delete()
670
696
  with ux_utils.print_exception_no_traceback():
671
697
  raise exceptions.StorageBucketCreateError(
672
- f'Jobs bucket {store.name!r} does not exist. '
673
- 'Please check jobs.bucket configuration in '
698
+ f'{task_type.capitalize()} bucket {store.name!r} does not '
699
+ f'exist. Please check {task_type}.bucket configuration in '
674
700
  'your SkyPilot config.')
675
701
 
676
- run_id = common_utils.get_usage_run_id()[:8]
702
+ # We use uuid to generate a unique run id for the job, so that the bucket/
703
+ # subdirectory name is unique across different jobs/services.
704
+ # We should not use common_utils.get_usage_run_id() here, because when
705
+ # Python API is used, the run id will be the same across multiple
706
+ # jobs.launch/serve.up calls after the sky is imported.
707
+ run_id = common_utils.base36_encode(uuid.uuid4().hex)[:8]
708
+ user_hash = common_utils.get_user_hash()
677
709
  original_file_mounts = task.file_mounts if task.file_mounts else {}
678
710
  original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
679
711
 
@@ -701,13 +733,15 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
701
733
 
702
734
  # Get the bucket name for the workdir and file mounts,
703
735
  # we store all these files in same bucket from config.
704
- bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None)
736
+ bucket_wth_prefix = skypilot_config.get_nested((task_type, 'bucket'), None)
705
737
  store_kwargs: Dict[str, Any] = {}
706
738
  if bucket_wth_prefix is None:
707
739
  store_type = store_cls = sub_path = None
708
740
  storage_account_name = region = None
709
741
  bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
710
- username=common_utils.get_cleaned_username(), id=run_id)
742
+ username=common_utils.get_cleaned_username(),
743
+ user_hash=user_hash,
744
+ id=run_id)
711
745
  else:
712
746
  store_type, store_cls, bucket_name, sub_path, storage_account_name, \
713
747
  region = storage_lib.StoreType.get_fields_from_store_url(
@@ -798,7 +832,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
798
832
  constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
799
833
  os.makedirs(local_fm_path, exist_ok=True)
800
834
  file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
801
- path)
835
+ task_type)
802
836
  if copy_mounts_with_file_in_src:
803
837
  src_to_file_id = {}
804
838
  for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250130
3
+ Version: 1.0.0.dev20250201
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -81,7 +81,7 @@ Provides-Extra: oci
81
81
  Requires-Dist: oci; extra == "oci"
82
82
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "oci"
83
83
  Provides-Extra: kubernetes
84
- Requires-Dist: kubernetes<32.0.0,>=20.0.0; extra == "kubernetes"
84
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "kubernetes"
85
85
  Provides-Extra: remote
86
86
  Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "remote"
87
87
  Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "remote"
@@ -133,7 +133,7 @@ Requires-Dist: colorama<0.4.5; extra == "all"
133
133
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
134
134
  Requires-Dist: oci; extra == "all"
135
135
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
136
- Requires-Dist: kubernetes<32.0.0,>=20.0.0; extra == "all"
136
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
137
137
  Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "all"
138
138
  Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "all"
139
139
  Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sys_platform != "darwin") and extra == "all"
@@ -1,8 +1,8 @@
1
- sky/__init__.py,sha256=_2mFMzFyJAHhW4BNdYQodCXMrGYYxEh_Dod5PwSOzJc,5529
1
+ sky/__init__.py,sha256=WGHbjb3X_Lpj1KaiFHN_O0Fcj0db6JHWo7SmKXIOIzo,5529
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
4
4
  sky/check.py,sha256=qTpm3N1zUZi2inEZPsrbt278B3h8nsk2gnepzIgLybE,10899
5
- sky/cli.py,sha256=-41DELQxAwIqLN3T1meUctCnjomujK9PK7njIiQUhKc,214883
5
+ sky/cli.py,sha256=B-YWYiKnfSGdSOXtAY8SRGOGhneUeNPBjXFZ0FuLZ8w,214131
6
6
  sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
7
7
  sky/core.py,sha256=fE1rn4Ku94S0XmWTO5-6t6eT6aaJImNczRqEnTe8v7Q,38742
8
8
  sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
@@ -98,12 +98,12 @@ sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
98
98
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
99
99
  sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
100
100
  sky/jobs/constants.py,sha256=6RphkJ6pmafQ7XYW5qwId1Zvqb99HJelA9kgrgfNR7o,1421
101
- sky/jobs/controller.py,sha256=0WcOk8xRZ-mZWuza-WE-ICKZTgZvXxNzj9pWXUslm6E,28312
102
- sky/jobs/core.py,sha256=99fXKR-lhhFI4fTV7aGpbtbqgAkSgZBeUJ8Zt-68ar4,20314
101
+ sky/jobs/controller.py,sha256=h4F60FMja-GHlyNpbujqb8lx82P4qf0ghKkXORfYMWY,28694
102
+ sky/jobs/core.py,sha256=16oNEXz6HuoPYjnIa9UZBciwZKPGOwhkBd_mkWw4iOw,20063
103
103
  sky/jobs/recovery_strategy.py,sha256=m-EA-MWXPFrgx2CYFPr6MmgeUoDTEBmY2xruD2PRSGY,26365
104
104
  sky/jobs/scheduler.py,sha256=WAvNb8-vBk8q1zFordFdpH7gxqWDjPHDGZZay6aodOk,12028
105
105
  sky/jobs/state.py,sha256=bvBNZMg3DzPfS4eHNzMqYaMui2cqnWoWGDIaiOpaXSk,40770
106
- sky/jobs/utils.py,sha256=RGVytFmB6SmKK3qZp_8UID_T5ssxSJOgwCDgIvRmhtM,51785
106
+ sky/jobs/utils.py,sha256=9tCKeY2x1lOgFQdaxqx6tZd2zd2e3pdUOQGvgvbf1Rk,52682
107
107
  sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
108
108
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
109
109
  sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -149,7 +149,7 @@ sky/provision/kubernetes/config.py,sha256=bXwOGdSAnXCkDreew0KsSUqSv3ZrptNeevqat7
149
149
  sky/provision/kubernetes/instance.py,sha256=AQikdRgNklpeMgiEd4w2Hh7kGssVABsy0aCh9xsKi5Y,50313
150
150
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
151
151
  sky/provision/kubernetes/network_utils.py,sha256=52BZY_5ynCH6IXlivKObYyAHDgQCJyAJIjmM7J4MpFo,11393
152
- sky/provision/kubernetes/utils.py,sha256=Soyq-8h1i0ZYjTzVZRgwbyAkfEbNrAR3G2-krzIr6Rk,107132
152
+ sky/provision/kubernetes/utils.py,sha256=4kSEx6NZB3MAsDqCxLO-elo7EO6Coh-9wypwVqs3jgk,109895
153
153
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
154
154
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
155
155
  sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -191,7 +191,7 @@ sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
191
191
  sky/serve/autoscalers.py,sha256=OxaynplCqbmrMA3fIGhxkugaGm-50QoI8S1fIfHK0M0,31667
192
192
  sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
193
193
  sky/serve/controller.py,sha256=jtzWHsLHnVPQ727ZpDZTUpGTtIOssbnQpXeWOyAuW_s,11886
194
- sky/serve/core.py,sha256=rUCXibVGixO7j7b7nUcBY1pCcejOoa6tztIVf5MQ9bw,35778
194
+ sky/serve/core.py,sha256=ANjALyYiQUmcpWjQ1YJor2rqHJypQpzuQxuIPnDyEk0,35788
195
195
  sky/serve/load_balancer.py,sha256=2nkMPRvy-h7hJL4Qq__tkT8nIAVC_nmjyXf8mMGYEFk,13658
196
196
  sky/serve/load_balancing_policies.py,sha256=XVj76qBgqh7h6wfx53RKQFzBefDWTE4TCdCEtFLLtI4,5398
197
197
  sky/serve/replica_managers.py,sha256=SW7k2iivUZ6dw_YMgGYOHOGD9_yyV4byfKa8e5t8_HE,57587
@@ -200,14 +200,14 @@ sky/serve/serve_utils.py,sha256=m1Zcjslnzcr5AAppzV48WDOwMWjRaXotTUd_iN-dHgc,4065
200
200
  sky/serve/service.py,sha256=DPU1PJGuHa1WaNqxYqgpmqd4LA9jBbQM-KlLrA6C1M0,12156
201
201
  sky/serve/service_spec.py,sha256=Q0qnFRjNnfGIpksubH5VqPKIlvpWs5had_Ma_PSHyo8,16940
202
202
  sky/setup_files/MANIFEST.in,sha256=WF0T89NLichHxZDDSQzvSpiONtAEFyur2MPmGczgTIo,555
203
- sky/setup_files/dependencies.py,sha256=OftFwWuV41sb_ZMD5euA6DABZx1Th1V_vCZcLV9CyMU,6234
203
+ sky/setup_files/dependencies.py,sha256=Z7dJM8wN-pAjXnLl3Q3hE1h2NmRZ8R6zfprF0kC5-Zw,6235
204
204
  sky/setup_files/setup.py,sha256=HMqAIxHrhtQUOlm6_Iz5E_bL4dUvsYgXc9YVQIFayPs,7417
205
205
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
206
206
  sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
207
207
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
208
208
  sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
209
209
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
210
- sky/skylet/constants.py,sha256=w_6GtXX6g0Rns4dA6-reQmd2YOHzTuTb-THkXFeznsg,16045
210
+ sky/skylet/constants.py,sha256=cMUJmj9iEY7dFW5pllijwrUlcKQmsJxgQSSrvTq9Ua8,16057
211
211
  sky/skylet/events.py,sha256=0bOjUYpphuAficD9wDB5NOan2vwJDaRqdnm4sl0RK0U,12535
212
212
  sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
213
213
  sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
@@ -263,7 +263,7 @@ sky/utils/command_runner.py,sha256=ewDjFxcCOv0OeG2aUOIfVWmTls65up9DvSnAXURvGfM,3
263
263
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
264
264
  sky/utils/common_utils.py,sha256=Kh0iymQl9I4HXxYSc3TTcv-xeso27pU_1hGNOc9Xw2o,25370
265
265
  sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
266
- sky/utils/controller_utils.py,sha256=g4wvp6BrXUcwjRbMvy_LBtZPMPOzHXeRWyEoXORoZrU,44381
266
+ sky/utils/controller_utils.py,sha256=z7f4iQqaJEyxNlHXY83nAgkcppEATZu8OfUuk-dJinc,45826
267
267
  sky/utils/dag_utils.py,sha256=R1yhJssvzDg13p6PJIC8OkYFBiR64eIx5xQeRpAG9n4,6099
268
268
  sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
269
269
  sky/utils/env_options.py,sha256=E5iwRFBUY2Iq6e0y0c1Mv5OSQ4MRNdk0-p38xUyVerc,1366
@@ -289,9 +289,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
289
289
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
290
290
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
291
291
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
292
- skypilot_nightly-1.0.0.dev20250130.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
293
- skypilot_nightly-1.0.0.dev20250130.dist-info/METADATA,sha256=h7cYgI0Xy2rdgSRNc8aciOdrnEJ4oqZuAlnCy0N9xb8,21249
294
- skypilot_nightly-1.0.0.dev20250130.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
295
- skypilot_nightly-1.0.0.dev20250130.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
296
- skypilot_nightly-1.0.0.dev20250130.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
297
- skypilot_nightly-1.0.0.dev20250130.dist-info/RECORD,,
292
+ skypilot_nightly-1.0.0.dev20250201.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
293
+ skypilot_nightly-1.0.0.dev20250201.dist-info/METADATA,sha256=ZH9ZKsIKy2cBJ6efiplTjtdutTTG5FgKXyvhJ_OmWjk,21251
294
+ skypilot_nightly-1.0.0.dev20250201.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
295
+ skypilot_nightly-1.0.0.dev20250201.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
296
+ skypilot_nightly-1.0.0.dev20250201.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
297
+ skypilot_nightly-1.0.0.dev20250201.dist-info/RECORD,,