skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
@@ -206,6 +206,9 @@ def _get_cloud_dependencies_installation_commands(
206
206
  # installed, so we don't check that.
207
207
  python_packages: Set[str] = set()
208
208
 
209
+ # add flask to the controller dependencies for dashboard
210
+ python_packages.add('flask')
211
+
209
212
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
210
213
  commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
211
214
  f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
@@ -649,10 +652,27 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
649
652
  still sync up any storage mounts with local source paths (which do not
650
653
  undergo translation).
651
654
  """
655
+
652
656
  # ================================================================
653
657
  # Translate the workdir and local file mounts to cloud file mounts.
654
658
  # ================================================================
655
659
 
660
+ def _sub_path_join(sub_path: Optional[str], path: str) -> str:
661
+ if sub_path is None:
662
+ return path
663
+ return os.path.join(sub_path, path).strip('/')
664
+
665
+ def assert_no_bucket_creation(store: storage_lib.AbstractStore) -> None:
666
+ if store.is_sky_managed:
667
+ # Bucket was created, this should not happen since the user configured
668
+ # the bucket and we assumed it already exists.
669
+ store.delete()
670
+ with ux_utils.print_exception_no_traceback():
671
+ raise exceptions.StorageBucketCreateError(
672
+ f'Jobs bucket {store.name!r} does not exist. '
673
+ 'Please check jobs.bucket configuration in '
674
+ 'your SkyPilot config.')
675
+
656
676
  run_id = common_utils.get_usage_run_id()[:8]
657
677
  original_file_mounts = task.file_mounts if task.file_mounts else {}
658
678
  original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
@@ -679,11 +699,27 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
679
699
  ux_utils.spinner_message(
680
700
  f'Translating {msg} to SkyPilot Storage...'))
681
701
 
702
+ # Get the bucket name for the workdir and file mounts,
703
+ # we store all these files in the same bucket from config.
704
+ bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None)
705
+ store_kwargs: Dict[str, Any] = {}
706
+ if bucket_wth_prefix is None:
707
+ store_type = store_cls = sub_path = None
708
+ storage_account_name = region = None
709
+ bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
710
+ username=common_utils.get_cleaned_username(), id=run_id)
711
+ else:
712
+ store_type, store_cls, bucket_name, sub_path, storage_account_name, \
713
+ region = storage_lib.StoreType.get_fields_from_store_url(
714
+ bucket_wth_prefix)
715
+ if storage_account_name is not None:
716
+ store_kwargs['storage_account_name'] = storage_account_name
717
+ if region is not None:
718
+ store_kwargs['region'] = region
719
+
682
720
  # Step 1: Translate the workdir to SkyPilot storage.
683
721
  new_storage_mounts = {}
684
722
  if task.workdir is not None:
685
- bucket_name = constants.WORKDIR_BUCKET_NAME.format(
686
- username=common_utils.get_cleaned_username(), id=run_id)
687
723
  workdir = task.workdir
688
724
  task.workdir = None
689
725
  if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
@@ -691,14 +727,28 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
691
727
  raise ValueError(
692
728
  f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the '
693
729
  'workdir and file_mounts contains it as the target.')
694
- new_storage_mounts[
695
- constants.
696
- SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({
697
- 'name': bucket_name,
698
- 'source': workdir,
699
- 'persistent': False,
700
- 'mode': 'COPY',
701
- })
730
+ bucket_sub_path = _sub_path_join(
731
+ sub_path,
732
+ constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))
733
+ stores = None
734
+ if store_type is not None:
735
+ assert store_cls is not None
736
+ with sky_logging.silent():
737
+ stores = {
738
+ store_type: store_cls(name=bucket_name,
739
+ source=workdir,
740
+ _bucket_sub_path=bucket_sub_path,
741
+ **store_kwargs)
742
+ }
743
+ assert_no_bucket_creation(stores[store_type])
744
+
745
+ storage_obj = storage_lib.Storage(name=bucket_name,
746
+ source=workdir,
747
+ persistent=False,
748
+ mode=storage_lib.StorageMode.COPY,
749
+ stores=stores,
750
+ _bucket_sub_path=bucket_sub_path)
751
+ new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage_obj
702
752
  # Check of the existence of the workdir in file_mounts is done in
703
753
  # the task construction.
704
754
  logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} '
@@ -716,27 +766,37 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
716
766
  if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
717
767
  copy_mounts_with_file_in_src[dst] = src
718
768
  continue
719
- bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
720
- username=common_utils.get_cleaned_username(),
721
- id=f'{run_id}-{i}',
722
- )
723
- new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config({
724
- 'name': bucket_name,
725
- 'source': src,
726
- 'persistent': False,
727
- 'mode': 'COPY',
728
- })
769
+ bucket_sub_path = _sub_path_join(
770
+ sub_path, constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id))
771
+ stores = None
772
+ if store_type is not None:
773
+ assert store_cls is not None
774
+ with sky_logging.silent():
775
+ store = store_cls(name=bucket_name,
776
+ source=src,
777
+ _bucket_sub_path=bucket_sub_path,
778
+ **store_kwargs)
779
+
780
+ stores = {store_type: store}
781
+ assert_no_bucket_creation(stores[store_type])
782
+ storage_obj = storage_lib.Storage(name=bucket_name,
783
+ source=src,
784
+ persistent=False,
785
+ mode=storage_lib.StorageMode.COPY,
786
+ stores=stores,
787
+ _bucket_sub_path=bucket_sub_path)
788
+ new_storage_mounts[dst] = storage_obj
729
789
  logger.info(f' {colorama.Style.DIM}Folder : {src!r} '
730
790
  f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
731
791
 
732
792
  # Step 3: Translate local file mounts with file in src to SkyPilot storage.
733
793
  # Hard link the files in src to a temporary directory, and upload folder.
794
+ file_mounts_tmp_subpath = _sub_path_join(
795
+ sub_path, constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id))
734
796
  local_fm_path = os.path.join(
735
797
  tempfile.gettempdir(),
736
798
  constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
737
799
  os.makedirs(local_fm_path, exist_ok=True)
738
- file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format(
739
- username=common_utils.get_cleaned_username(), id=run_id)
740
800
  file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
741
801
  path)
742
802
  if copy_mounts_with_file_in_src:
@@ -745,14 +805,27 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
745
805
  src_to_file_id[src] = i
746
806
  os.link(os.path.abspath(os.path.expanduser(src)),
747
807
  os.path.join(local_fm_path, f'file-{i}'))
748
-
749
- new_storage_mounts[
750
- file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({
751
- 'name': file_bucket_name,
752
- 'source': local_fm_path,
753
- 'persistent': False,
754
- 'mode': 'MOUNT',
755
- })
808
+ stores = None
809
+ if store_type is not None:
810
+ assert store_cls is not None
811
+ with sky_logging.silent():
812
+ stores = {
813
+ store_type: store_cls(
814
+ name=bucket_name,
815
+ source=local_fm_path,
816
+ _bucket_sub_path=file_mounts_tmp_subpath,
817
+ **store_kwargs)
818
+ }
819
+ assert_no_bucket_creation(stores[store_type])
820
+ storage_obj = storage_lib.Storage(
821
+ name=bucket_name,
822
+ source=local_fm_path,
823
+ persistent=False,
824
+ mode=storage_lib.StorageMode.MOUNT,
825
+ stores=stores,
826
+ _bucket_sub_path=file_mounts_tmp_subpath)
827
+
828
+ new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
756
829
  if file_mount_remote_tmp_dir in original_storage_mounts:
757
830
  with ux_utils.print_exception_no_traceback():
758
831
  raise ValueError(
@@ -762,8 +835,9 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
762
835
  sources = list(src_to_file_id.keys())
763
836
  sources_str = '\n '.join(sources)
764
837
  logger.info(f' {colorama.Style.DIM}Files (listed below) '
765
- f' -> storage: {file_bucket_name}:'
838
+ f' -> storage: {bucket_name}:'
766
839
  f'\n {sources_str}{colorama.Style.RESET_ALL}')
840
+
767
841
  rich_utils.force_update_status(
768
842
  ux_utils.spinner_message('Uploading translated local files/folders'))
769
843
  task.update_storage_mounts(new_storage_mounts)
@@ -779,7 +853,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
779
853
  ux_utils.spinner_message('Uploading local sources to storage[/] '
780
854
  '[dim]View storages: sky storage ls'))
781
855
  try:
782
- task.sync_storage_mounts()
856
+ task.sync_storage_mounts(force_sync=bucket_wth_prefix is not None)
783
857
  except (ValueError, exceptions.NoCloudAccessError) as e:
784
858
  if 'No enabled cloud for storage' in str(e) or isinstance(
785
859
  e, exceptions.NoCloudAccessError):
@@ -809,10 +883,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
809
883
  # file_mount_remote_tmp_dir will only exist when there are files in
810
884
  # the src for copy mounts.
811
885
  storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
812
- store_type = list(storage_obj.stores.keys())[0]
813
- store_object = storage_obj.stores[store_type]
886
+ curr_store_type = list(storage_obj.stores.keys())[0]
887
+ store_object = storage_obj.stores[curr_store_type]
814
888
  bucket_url = storage_lib.StoreType.get_endpoint_url(
815
- store_object, file_bucket_name)
889
+ store_object, bucket_name)
890
+ bucket_url += f'/{file_mounts_tmp_subpath}'
816
891
  for dst, src in copy_mounts_with_file_in_src.items():
817
892
  file_id = src_to_file_id[src]
818
893
  new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
@@ -829,8 +904,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
829
904
  store_types = list(storage_obj.stores.keys())
830
905
  assert len(store_types) == 1, (
831
906
  'We only support one store type for now.', storage_obj.stores)
832
- store_type = store_types[0]
833
- store_object = storage_obj.stores[store_type]
907
+ curr_store_type = store_types[0]
908
+ store_object = storage_obj.stores[curr_store_type]
834
909
  storage_obj.source = storage_lib.StoreType.get_endpoint_url(
835
910
  store_object, storage_obj.name)
836
911
  storage_obj.force_delete = True
@@ -847,8 +922,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
847
922
  store_types = list(storage_obj.stores.keys())
848
923
  assert len(store_types) == 1, (
849
924
  'We only support one store type for now.', storage_obj.stores)
850
- store_type = store_types[0]
851
- store_object = storage_obj.stores[store_type]
925
+ curr_store_type = store_types[0]
926
+ store_object = storage_obj.stores[curr_store_type]
852
927
  source = storage_lib.StoreType.get_endpoint_url(
853
928
  store_object, storage_obj.name)
854
929
  new_storage = storage_lib.Storage.from_yaml_config({
sky/utils/db_utils.py CHANGED
@@ -4,11 +4,27 @@ import sqlite3
4
4
  import threading
5
5
  from typing import Any, Callable, Optional
6
6
 
7
+ # This parameter (passed to sqlite3.connect) controls how long we will wait to
8
+ # obtain a database lock (not necessarily during connection, but whenever it is
9
+ # needed). It is not a connection timeout.
10
+ # Even in WAL mode, only a single writer is allowed at a time. Other writers
11
+ # will block until the write lock can be obtained. This behavior is described in
12
+ # the SQLite documentation for WAL: https://www.sqlite.org/wal.html
13
+ # Python's default timeout is 5s. In normal usage, lock contention is very low,
14
+ # and this is more than sufficient. However, in some highly concurrent cases,
15
+ # such as a jobs controller suddenly recovering thousands of jobs at once, we
16
+ # can see a small number of processes that take much longer to obtain the lock.
17
+ # In contrived highly contentious cases, around 0.1% of transactions will take
18
+ # >30s to take the lock. We have not seen cases that take >60s. For cases up to
19
+ # 1000x parallelism, this is thus thought to be a conservative setting.
20
+ # For more info, see the PR description for #4552.
21
+ _DB_TIMEOUT_S = 60
22
+
7
23
 
8
24
  @contextlib.contextmanager
9
25
  def safe_cursor(db_path: str):
10
26
  """A newly created, auto-committing, auto-closing cursor."""
11
- conn = sqlite3.connect(db_path)
27
+ conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
12
28
  cursor = conn.cursor()
13
29
  try:
14
30
  yield cursor
@@ -79,8 +95,6 @@ class SQLiteConn(threading.local):
79
95
  def __init__(self, db_path: str, create_table: Callable):
80
96
  super().__init__()
81
97
  self.db_path = db_path
82
- # NOTE: We use a timeout of 10 seconds to avoid database locked
83
- # errors. This is a hack, but it works.
84
- self.conn = sqlite3.connect(db_path, timeout=10)
98
+ self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
85
99
  self.cursor = self.conn.cursor()
86
100
  create_table(self.cursor, self.conn)
@@ -93,11 +93,11 @@ cleanup_agent_node() {
93
93
 
94
94
  check_gpu() {
95
95
  local NODE_IP=$1
96
- run_remote "$NODE_IP" "
97
- if command -v nvidia-smi &> /dev/null; then
98
- nvidia-smi --list-gpus | grep 'GPU 0'
99
- fi
100
- "
96
+ if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
97
+ return 0 # GPU detected
98
+ else
99
+ return 1 # No GPU detected
100
+ fi
101
101
  }
102
102
 
103
103
  # Pre-flight checks
sky/utils/log_utils.py CHANGED
@@ -5,6 +5,8 @@ import types
5
5
  from typing import Callable, Iterator, List, Optional, TextIO, Type
6
6
 
7
7
  import colorama
8
+ # slow due to https://github.com/python-pendulum/pendulum/issues/808
9
+ # FIXME(aylei): bump pendulum once it gets fixed
8
10
  import pendulum
9
11
  import prettytable
10
12
 
@@ -137,31 +137,35 @@ def simplify_ports(ports: List[str]) -> List[str]:
137
137
  return port_set_to_ranges(port_ranges_to_set(ports))
138
138
 
139
139
 
140
+ def format_resource(resource: 'resources_lib.Resources',
141
+ simplify: bool = False) -> str:
142
+ if simplify:
143
+ cloud = resource.cloud
144
+ if resource.accelerators is None:
145
+ vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
146
+ resource.instance_type)
147
+ hardware = f'vCPU={int(vcpu)}'
148
+ else:
149
+ hardware = f'{resource.accelerators}'
150
+ spot = '[Spot]' if resource.use_spot else ''
151
+ return f'{cloud}({spot}{hardware})'
152
+ else:
153
+ # accelerator_args is way too long.
154
+ # Convert from:
155
+ # GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long
156
+ # to:
157
+ # GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
158
+ pattern = ', accelerator_args={.*}'
159
+ launched_resource_str = re.sub(pattern, '...', str(resource))
160
+ return launched_resource_str
161
+
162
+
140
163
  def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
141
164
  simplify: bool = False) -> str:
142
165
  if (handle.launched_nodes is not None and
143
166
  handle.launched_resources is not None):
144
- if simplify:
145
- cloud = handle.launched_resources.cloud
146
- if handle.launched_resources.accelerators is None:
147
- vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
148
- handle.launched_resources.instance_type)
149
- hardware = f'vCPU={int(vcpu)}'
150
- else:
151
- hardware = f'{handle.launched_resources.accelerators}'
152
- spot = '[Spot]' if handle.launched_resources.use_spot else ''
153
- return f'{handle.launched_nodes}x {cloud}({spot}{hardware})'
154
- else:
155
- launched_resource_str = str(handle.launched_resources)
156
- # accelerator_args is way too long.
157
- # Convert from:
158
- # GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long
159
- # to:
160
- # GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
161
- pattern = ', accelerator_args={.*}'
162
- launched_resource_str = re.sub(pattern, '...',
163
- launched_resource_str)
164
- return f'{handle.launched_nodes}x {launched_resource_str}'
167
+ return (f'{handle.launched_nodes}x '
168
+ f'{format_resource(handle.launched_resources, simplify)}')
165
169
  return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
166
170
 
167
171
 
sky/utils/schemas.py CHANGED
@@ -299,6 +299,12 @@ def get_storage_schema():
299
299
  mode.value for mode in storage.StorageMode
300
300
  ]
301
301
  },
302
+ '_is_sky_managed': {
303
+ 'type': 'boolean',
304
+ },
305
+ '_bucket_sub_path': {
306
+ 'type': 'string',
307
+ },
302
308
  '_force_delete': {
303
309
  'type': 'boolean',
304
310
  }
@@ -390,6 +396,19 @@ def get_service_schema():
390
396
  'case_insensitive_enum': list(
391
397
  load_balancing_policies.LB_POLICIES.keys())
392
398
  },
399
+ 'tls': {
400
+ 'type': 'object',
401
+ 'required': ['keyfile', 'certfile'],
402
+ 'additionalProperties': False,
403
+ 'properties': {
404
+ 'keyfile': {
405
+ 'type': 'string',
406
+ },
407
+ 'certfile': {
408
+ 'type': 'string',
409
+ },
410
+ },
411
+ },
393
412
  }
394
413
  }
395
414
 
@@ -721,6 +740,11 @@ def get_config_schema():
721
740
  'resources': resources_schema,
722
741
  }
723
742
  },
743
+ 'bucket': {
744
+ 'type': 'string',
745
+ 'pattern': '^(https|s3|gs|r2|cos)://.+',
746
+ 'required': [],
747
+ }
724
748
  }
725
749
  }
726
750
  cloud_configs = {
@@ -875,6 +899,9 @@ def get_config_schema():
875
899
  'image_tag_gpu': {
876
900
  'type': 'string',
877
901
  },
902
+ 'vcn_ocid': {
903
+ 'type': 'string',
904
+ },
878
905
  'vcn_subnet': {
879
906
  'type': 'string',
880
907
  },
@@ -3,9 +3,10 @@ from multiprocessing import pool
3
3
  import os
4
4
  import random
5
5
  import resource
6
+ import shlex
6
7
  import subprocess
7
8
  import time
8
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
9
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
9
10
 
10
11
  import colorama
11
12
  import psutil
@@ -97,12 +98,10 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
97
98
 
98
99
 
99
100
  def run_in_parallel(func: Callable,
100
- args: Iterable[Any],
101
+ args: List[Any],
101
102
  num_threads: Optional[int] = None) -> List[Any]:
102
103
  """Run a function in parallel on a list of arguments.
103
104
 
104
- The function 'func' should raise a CommandError if the command fails.
105
-
106
105
  Args:
107
106
  func: The function to run in parallel
108
107
  args: Iterable of arguments to pass to func
@@ -111,14 +110,23 @@ def run_in_parallel(func: Callable,
111
110
 
112
111
  Returns:
113
112
  A list of the return values of the function func, in the same order as the
114
- arguments.
113
+ arguments.
114
+
115
+ Raises:
116
+ Exception: The first exception encountered.
115
117
  """
116
- # Reference: https://stackoverflow.com/questions/25790279/python-multiprocessing-early-termination # pylint: disable=line-too-long
117
- processes = num_threads if num_threads is not None else get_parallel_threads(
118
- )
118
+ # Short-circuit for short lists
119
+ if len(args) == 0:
120
+ return []
121
+ if len(args) == 1:
122
+ return [func(args[0])]
123
+
124
+ processes = (num_threads
125
+ if num_threads is not None else get_parallel_threads())
126
+
119
127
  with pool.ThreadPool(processes=processes) as p:
120
- # Run the function in parallel on the arguments, keeping the order.
121
- return list(p.imap(func, args))
128
+ ordered_iterators = p.imap(func, args)
129
+ return list(ordered_iterators)
122
130
 
123
131
 
124
132
  def handle_returncode(returncode: int,
@@ -293,3 +301,39 @@ def kill_process_daemon(process_pid: int) -> None:
293
301
  # Disable input
294
302
  stdin=subprocess.DEVNULL,
295
303
  )
304
+
305
+
306
+ def launch_new_process_tree(cmd: str, log_output: str = '/dev/null') -> int:
307
+ """Launch a new process that will not be a child of the current process.
308
+
309
+ This will launch bash in a new session, which will launch the given cmd.
310
+ This will ensure that cmd is in its own process tree, and once bash exits,
311
+ will not be a descendant of the current process. This is useful for job
312
+ launching.
313
+
314
+ Returns the pid of the launched cmd.
315
+ """
316
+ # Use nohup to ensure the job driver process is a separate process tree,
317
+ # instead of being a child of the current process. This is important to
318
+ # avoid a chain of driver processes (job driver can call schedule_step() to
319
+ # submit new jobs, and the new job can also call schedule_step()
320
+ # recursively).
321
+ #
322
+ # echo $! will output the PID of the last background process started in the
323
+ # current shell, so we can retrieve it and record in the DB.
324
+ #
325
+ # TODO(zhwu): A more elegant solution is to use another daemon process to be
326
+ # in charge of starting these driver processes, instead of starting them in
327
+ # the current process.
328
+ wrapped_cmd = (f'nohup bash -c {shlex.quote(cmd)} '
329
+ f'</dev/null >{log_output} 2>&1 & echo $!')
330
+ proc = subprocess.run(wrapped_cmd,
331
+ stdout=subprocess.PIPE,
332
+ stderr=subprocess.PIPE,
333
+ stdin=subprocess.DEVNULL,
334
+ start_new_session=True,
335
+ check=True,
336
+ shell=True,
337
+ text=True)
338
+ # Get the PID of the detached process
339
+ return int(proc.stdout.strip())
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241227
3
+ Version: 1.0.0.dev20250124
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -90,11 +90,15 @@ Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and s
90
90
  Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "remote"
91
91
  Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "remote"
92
92
  Provides-Extra: runpod
93
- Requires-Dist: runpod>=1.5.1; extra == "runpod"
93
+ Requires-Dist: runpod>=1.6.1; extra == "runpod"
94
94
  Provides-Extra: fluidstack
95
95
  Provides-Extra: cudo
96
96
  Requires-Dist: cudo-compute>=0.1.10; extra == "cudo"
97
97
  Provides-Extra: paperspace
98
+ Provides-Extra: do
99
+ Requires-Dist: pydo>=0.3.0; extra == "do"
100
+ Requires-Dist: azure-core>=1.24.0; extra == "do"
101
+ Requires-Dist: azure-common; extra == "do"
98
102
  Provides-Extra: vsphere
99
103
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
100
104
  Provides-Extra: all
@@ -136,9 +140,21 @@ Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sy
136
140
  Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and sys_platform != "darwin") and extra == "all"
137
141
  Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"
138
142
  Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "all"
139
- Requires-Dist: runpod>=1.5.1; extra == "all"
143
+ Requires-Dist: runpod>=1.6.1; extra == "all"
140
144
  Requires-Dist: cudo-compute>=0.1.10; extra == "all"
145
+ Requires-Dist: pydo>=0.3.0; extra == "all"
146
+ Requires-Dist: azure-core>=1.24.0; extra == "all"
147
+ Requires-Dist: azure-common; extra == "all"
141
148
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
149
+ Dynamic: author
150
+ Dynamic: classifier
151
+ Dynamic: description
152
+ Dynamic: description-content-type
153
+ Dynamic: license
154
+ Dynamic: project-url
155
+ Dynamic: provides-extra
156
+ Dynamic: requires-dist
157
+ Dynamic: summary
142
158
 
143
159
  <p align="center">
144
160
  <img alt="SkyPilot" src="https://raw.githubusercontent.com/skypilot-org/skypilot/master/docs/source/images/skypilot-wide-light-1k.png" width=55%>
@@ -165,6 +181,7 @@ Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
165
181
 
166
182
  ----
167
183
  :fire: *News* :fire:
184
+ - [Jan 2025] Launch and Serve **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** on Kubernetes or Any Cloud: [**example**](./llm/deepseek-r1/)
168
185
  - [Oct 2024] :tada: **SkyPilot crossed 1M+ downloads** :tada:: Thank you to our community! [**Twitter/X**](https://x.com/skypilot_org/status/1844770841718067638)
169
186
  - [Sep 2024] Point, Launch and Serve **Llama 3.2** on Kubernetes or Any Cloud: [**example**](./llm/llama-3_2/)
170
187
  - [Sep 2024] Run and deploy [**Pixtral**](./llm/pixtral), the first open-source multimodal model from Mistral AI.
@@ -335,6 +352,8 @@ Read the research:
335
352
  - [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
336
353
  - [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)
337
354
 
355
+ SkyPilot was initially started at the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley and has since gained many industry contributors. Read more about the project's origin [here](https://docs.skypilot.co/en/latest/sky-computing.html).
356
+
338
357
  ## Support and Questions
339
358
  We are excited to hear your feedback!
340
359
  * For issues and feature requests, please [open a GitHub issue](https://github.com/skypilot-org/skypilot/issues/new).