skypilot-nightly 1.0.0.dev20250716__py3-none-any.whl → 1.0.0.dev20250717__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (51)
  1. sky/__init__.py +4 -2
  2. sky/backends/backend.py +8 -4
  3. sky/backends/cloud_vm_ray_backend.py +50 -1
  4. sky/backends/docker_utils.py +1 -1
  5. sky/backends/local_docker_backend.py +2 -1
  6. sky/catalog/common.py +60 -50
  7. sky/catalog/data_fetchers/fetch_gcp.py +1 -0
  8. sky/catalog/gcp_catalog.py +24 -7
  9. sky/catalog/kubernetes_catalog.py +5 -1
  10. sky/client/cli/command.py +180 -77
  11. sky/client/cli/git.py +549 -0
  12. sky/client/common.py +1 -1
  13. sky/clouds/gcp.py +1 -1
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-3fad5d4a0541a02d.js → webpack-c3b45b7b0eaef66f.js} +1 -1
  17. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  18. sky/dashboard/out/clusters/[cluster].html +1 -1
  19. sky/dashboard/out/clusters.html +1 -1
  20. sky/dashboard/out/config.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra/[context].html +1 -1
  23. sky/dashboard/out/infra.html +1 -1
  24. sky/dashboard/out/jobs/[job].html +1 -1
  25. sky/dashboard/out/jobs.html +1 -1
  26. sky/dashboard/out/users.html +1 -1
  27. sky/dashboard/out/volumes.html +1 -1
  28. sky/dashboard/out/workspace/new.html +1 -1
  29. sky/dashboard/out/workspaces/[name].html +1 -1
  30. sky/dashboard/out/workspaces.html +1 -1
  31. sky/exceptions.py +5 -0
  32. sky/execution.py +1 -1
  33. sky/provision/kubernetes/utils.py +6 -0
  34. sky/server/common.py +4 -3
  35. sky/setup_files/MANIFEST.in +1 -0
  36. sky/setup_files/dependencies.py +2 -0
  37. sky/task.py +12 -2
  38. sky/utils/command_runner.py +144 -35
  39. sky/utils/controller_utils.py +4 -3
  40. sky/utils/git.py +9 -0
  41. sky/utils/git_clone.sh +460 -0
  42. sky/utils/schemas.py +15 -1
  43. {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/METADATA +3 -1
  44. {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/RECORD +50 -47
  45. sky/dashboard/out/_next/static/chunks/4869.c139c0124e677fc8.js +0 -16
  46. /sky/dashboard/out/_next/static/{gVXjeFhvtWXyOsx9xYNvM → Et5IQ5Y3WvH608nXClo4z}/_buildManifest.js +0 -0
  47. /sky/dashboard/out/_next/static/{gVXjeFhvtWXyOsx9xYNvM → Et5IQ5Y3WvH608nXClo4z}/_ssgManifest.js +0 -0
  48. {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/WHEEL +0 -0
  49. {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/entry_points.txt +0 -0
  50. {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/licenses/LICENSE +0 -0
  51. {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '41c25f4073c1d5024d415c63f8e7d584fbb5517a'
+_SKYPILOT_COMMIT_SHA = '5ed1775c386bbd0cc4b9d1c80fc0d2d91b352870'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250716'
+__version__ = '1.0.0.dev20250717'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -88,6 +88,7 @@ from sky.admin_policy import UserRequest
 from sky.catalog import list_accelerators
 from sky.client.sdk import api_cancel
 from sky.client.sdk import api_info
+from sky.client.sdk import api_login
 from sky.client.sdk import api_server_logs
 from sky.client.sdk import api_start
 from sky.client.sdk import api_status
@@ -206,6 +207,7 @@ __all__ = [
     'api_status',
     'api_cancel',
     'api_info',
+    'api_login',
     'api_start',
     'api_stop',
     'api_server_logs',
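
Note: the newly exported api_login brings API-server login into the top-level namespace alongside the other api_* helpers. A minimal sketch of how a client script might use it; the endpoint parameter and value below are assumptions for illustration, so consult sky/client/sdk.py for the authoritative signature:

    # Hypothetical usage of the new top-level export; the endpoint argument
    # is an assumption -- see sky.client.sdk.api_login for the real signature.
    import sky

    sky.api_login('http://127.0.0.1:46580')  # point the client at an API server
    print(sky.api_info())                    # then query the server as usual
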
sky/backends/backend.py CHANGED
@@ -1,6 +1,6 @@
 """Sky backend interface."""
 import typing
-from typing import Dict, Generic, Optional, Tuple
+from typing import Any, Dict, Generic, Optional, Tuple, Union
 
 from sky.usage import usage_lib
 from sky.utils import cluster_utils
@@ -90,8 +90,10 @@ class Backend(Generic[_ResourceHandleType]):
 
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_workdir')
-    def sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
-        return self._sync_workdir(handle, workdir)
+    def sync_workdir(self, handle: _ResourceHandleType,
+                     workdir: Union[Path, Dict[str, Any]],
+                     envs_and_secrets: Dict[str, str]) -> None:
+        return self._sync_workdir(handle, workdir, envs_and_secrets)
 
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_file_mounts')
@@ -165,7 +167,9 @@ class Backend(Generic[_ResourceHandleType]):
     ) -> Tuple[Optional[_ResourceHandleType], bool]:
         raise NotImplementedError
 
-    def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
+    def _sync_workdir(self, handle: _ResourceHandleType,
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
        raise NotImplementedError
 
     def _sync_file_mounts(
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -3240,10 +3240,59 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             common_utils.remove_file_if_exists(lock_path)
 
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
-                      workdir: Path) -> None:
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         # Even though provision() takes care of it, there may be cases where
         # this function is called in isolation, without calling provision(),
         # e.g., in CLI. So we should rerun rsync_up.
+        if isinstance(workdir, dict):
+            self._sync_git_workdir(handle, envs_and_secrets)
+        else:
+            self._sync_path_workdir(handle, workdir)
+
+    def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
+                          envs_and_secrets: Dict[str, str]) -> None:
+        style = colorama.Style
+        ip_list = handle.external_ips()
+        assert ip_list is not None, 'external_ips is not cached in handle'
+
+        log_path = os.path.join(self.log_dir, 'workdir_sync.log')
+
+        # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
+        runners = handle.get_command_runners()
+
+        def _sync_git_workdir_node(
+                runner: command_runner.CommandRunner) -> None:
+            # Type assertion to help mypy understand the type
+            assert hasattr(
+                runner, 'git_clone'
+            ), f'CommandRunner should have git_clone method, ' \
+               f'got {type(runner)}'
+            runner.git_clone(
+                target_dir=SKY_REMOTE_WORKDIR,
+                log_path=log_path,
+                stream_logs=False,
+                max_retry=3,
+                envs_and_secrets=envs_and_secrets,
+            )
+
+        num_nodes = handle.launched_nodes
+        plural = 's' if num_nodes > 1 else ''
+        logger.info(
+            f'  {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+        os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
+        os.system(f'touch {log_path}')
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
+
+    def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
+                           workdir: Path) -> None:
         fore = colorama.Fore
         style = colorama.Style
         ip_list = handle.external_ips()
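
Note: the net effect of this hunk is a type-based dispatch: workdir stays a Path for the classic rsync flow, while a dict (the new git-based workdir spec) routes to a per-node git_clone. A standalone sketch of the same dispatch; the dict keys 'url' and 'ref' are illustrative stand-ins, not the schema actually defined in sky/utils/schemas.py:

    from pathlib import Path
    from typing import Any, Dict, Union

    def sync_workdir(workdir: Union[Path, Dict[str, Any]]) -> str:
        # Dict form: a git workdir spec. The 'url'/'ref' keys are hypothetical
        # stand-ins for whatever the real schema accepts.
        if isinstance(workdir, dict):
            return f"git clone {workdir['url']} @ {workdir.get('ref', 'HEAD')}"
        # Path form: fall back to the classic rsync-style copy.
        return f"rsync {workdir} -> remote"

    print(sync_workdir(Path('~/my-project')))
    print(sync_workdir({'url': 'https://github.com/org/repo.git', 'ref': 'main'}))
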
sky/backends/docker_utils.py CHANGED
@@ -168,7 +168,7 @@ def build_dockerimage(task: task_mod.Task,
                           build_dir=temp_dir)
 
     dst = os.path.join(temp_dir, SKY_DOCKER_WORKDIR)
-    if task.workdir is not None:
+    if task.workdir is not None and isinstance(task.workdir, str):
         # Copy workdir contents to tempdir
         shutil.copytree(os.path.expanduser(task.workdir), dst)
     else:
sky/backends/local_docker_backend.py CHANGED
@@ -178,7 +178,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         return handle, False
 
     def _sync_workdir(self, handle: LocalDockerResourceHandle,
-                      workdir: Path) -> None:
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         """Workdir is sync'd by adding to the docker image.
 
         This happens in the execute step.
sky/catalog/common.py CHANGED
@@ -13,6 +13,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import registry
 from sky.utils import rich_utils
@@ -125,17 +126,21 @@ class LazyDataFrame:
 
     We don't need to load the catalog for every SkyPilot call, and this class
     allows us to load the catalog only when needed.
+
+    Use update_if_stale_func to pass in a function that decides whether to
+    update the catalog on disk, updates it if needed, and returns
+    a bool indicating whether the update was done.
     """
 
-    def __init__(self, filename: str, update_func: Callable[[], None]):
+    def __init__(self, filename: str, update_if_stale_func: Callable[[], bool]):
         self._filename = filename
         self._df: Optional['pd.DataFrame'] = None
-        self._update_func = update_func
+        self._update_if_stale_func = update_if_stale_func
 
+    @annotations.lru_cache(scope='request')
     def _load_df(self) -> 'pd.DataFrame':
-        if self._df is None:
+        if self._update_if_stale_func() or self._df is None:
             try:
-                self._update_func()
                 self._df = pd.read_csv(self._filename)
             except Exception as e:  # pylint: disable=broad-except
                 # As users can manually modify the catalog, read_csv can fail.
@@ -193,55 +198,60 @@ def read_catalog(filename: str,
         return last_update + pull_frequency_hours * 3600 < time.time()
 
     def _update_catalog():
+        # Fast path: Exit early to avoid lock contention.
+        if not _need_update():
+            return False
+
         # Atomic check, to avoid conflicts with other processes.
         with filelock.FileLock(meta_path + '.lock'):
-            if _need_update():
-                url = f'{constants.HOSTED_CATALOG_DIR_URL}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
-                url_fallback = f'{constants.HOSTED_CATALOG_DIR_URL_S3_MIRROR}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
-                headers = {'User-Agent': 'SkyPilot/0.7'}
-                update_frequency_str = ''
-                if pull_frequency_hours is not None:
-                    update_frequency_str = (
-                        f' (every {pull_frequency_hours} hours)')
-                with rich_utils.safe_status(
-                        ux_utils.spinner_message(
-                            f'Updating {cloud} catalog: {filename}') +
-                        f'{update_frequency_str}'):
-                    try:
-                        r = requests.get(url=url, headers=headers)
-                        if r.status_code == 429:
-                            # fallback to s3 mirror, github introduced rate
-                            # limit after 2025-05, see
-                            # https://github.com/skypilot-org/skypilot/issues/5438
-                            # for more details
-                            r = requests.get(url=url_fallback, headers=headers)
-                        r.raise_for_status()
-                    except requests.exceptions.RequestException as e:
-                        error_str = (f'Failed to fetch {cloud} catalog '
-                                     f'{filename}. ')
-                        if os.path.exists(catalog_path):
-                            logger.warning(
-                                f'{error_str}Using cached catalog files.')
-                            # Update catalog file modification time.
-                            os.utime(catalog_path, None)  # Sets to current time
-                        else:
-                            logger.error(
-                                f'{error_str}Please check your internet '
-                                'connection.')
-                            with ux_utils.print_exception_no_traceback():
-                                raise e
-                    else:
-                        # Download successful, save the catalog to a local file.
-                        os.makedirs(os.path.dirname(catalog_path),
-                                    exist_ok=True)
-                        with open(catalog_path, 'w', encoding='utf-8') as f:
-                            f.write(r.text)
-                        with open(meta_path + '.md5', 'w',
-                                  encoding='utf-8') as f:
-                            f.write(hashlib.md5(r.text.encode()).hexdigest())
-                        logger.debug(f'Updated {cloud} catalog {filename}.')
-
-    return LazyDataFrame(catalog_path, update_func=_update_catalog)
+            # Double check after acquiring the lock.
+            if not _need_update():
+                return False
+
+            url = f'{constants.HOSTED_CATALOG_DIR_URL}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            url_fallback = f'{constants.HOSTED_CATALOG_DIR_URL_S3_MIRROR}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            headers = {'User-Agent': 'SkyPilot/0.7'}
+            update_frequency_str = ''
+            if pull_frequency_hours is not None:
+                update_frequency_str = (
+                    f' (every {pull_frequency_hours} hours)')
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message(
+                        f'Updating {cloud} catalog: {filename}') +
+                    f'{update_frequency_str}'):
+                try:
+                    r = requests.get(url=url, headers=headers)
+                    if r.status_code == 429:
+                        # fallback to s3 mirror, github introduced rate
+                        # limit after 2025-05, see
+                        # https://github.com/skypilot-org/skypilot/issues/5438
+                        # for more details
+                        r = requests.get(url=url_fallback, headers=headers)
+                    r.raise_for_status()
+                except requests.exceptions.RequestException as e:
+                    error_str = (f'Failed to fetch {cloud} catalog '
+                                 f'{filename}. ')
+                    if os.path.exists(catalog_path):
+                        logger.warning(
+                            f'{error_str}Using cached catalog files.')
+                        # Update catalog file modification time.
+                        os.utime(catalog_path, None)  # Sets to current time
+                    else:
+                        logger.error(f'{error_str}Please check your internet '
+                                     'connection.')
+                        with ux_utils.print_exception_no_traceback():
+                            raise e
+                else:
+                    # Download successful, save the catalog to a local file.
+                    os.makedirs(os.path.dirname(catalog_path), exist_ok=True)
+                    with open(catalog_path, 'w', encoding='utf-8') as f:
+                        f.write(r.text)
+                    with open(meta_path + '.md5', 'w', encoding='utf-8') as f:
+                        f.write(hashlib.md5(r.text.encode()).hexdigest())
+                    logger.debug(f'Updated {cloud} catalog {filename}.')
+            return True
+
+    return LazyDataFrame(catalog_path, update_if_stale_func=_update_catalog)
 
 
 def _get_instance_type(
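
Note: the restructured _update_catalog is a double-checked locking pattern around the file lock: a lock-free staleness check as the fast path, then a re-check after acquiring the lock so that concurrent processes don't all re-download. A minimal sketch of the same pattern; the cache path and staleness rule are illustrative stand-ins:

    import os
    import time

    import filelock  # third-party: pip install filelock

    CACHE = '/tmp/catalog.csv'
    MAX_AGE_SECONDS = 3600  # stand-in for pull_frequency_hours * 3600

    def _stale() -> bool:
        return (not os.path.exists(CACHE) or
                os.path.getmtime(CACHE) + MAX_AGE_SECONDS < time.time())

    def update_if_stale() -> bool:
        if not _stale():      # fast path: no lock taken at all
            return False
        with filelock.FileLock(CACHE + '.lock'):
            if not _stale():  # double check: another process won the race
                return False
            # Stand-in for the actual catalog download.
            with open(CACHE, 'w', encoding='utf-8') as f:
                f.write('instance_type,price\n')
            return True
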
sky/catalog/data_fetchers/fetch_gcp.py CHANGED
@@ -198,6 +198,7 @@ SERIES_TO_DESCRIPTION = {
     'n1': 'N1 Predefined Instance',
     'n2': 'N2 Instance',
     'n2d': 'N2D AMD Instance',
+    'n4': 'N4 Instance',
     't2a': 'T2A Arm Instance',
     't2d': 'T2D AMD Instance',
 }
sky/catalog/gcp_catalog.py CHANGED
@@ -37,20 +37,37 @@ _image_df = common.read_catalog('gcp/images.csv',
 _quotas_df = common.read_catalog('gcp/accelerator_quota_mapping.csv',
                                  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 
-# We will select from the following three CPU instance families:
+# We will select from the following six CPU instance families:
 _DEFAULT_INSTANCE_FAMILY = [
-    # This is the latest general-purpose instance family as of Mar 2023.
-    # CPU: Intel Ice Lake 8373C or Cascade Lake 6268CL.
+    # This is a widely used general-purpose instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 4 GiB RAM per 1 vCPU;
     'n2-standard',
-    # This is the latest memory-optimized instance family as of Mar 2023.
-    # CPU: Intel Ice Lake 8373C or Cascade Lake 6268CL.
+    # This is a memory-optimized instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 8 GiB RAM per 1 vCPU;
     'n2-highmem',
-    # This is the latest compute-optimized instance family as of Mar 2023.
-    # CPU: Intel Ice Lake 8373C or Cascade Lake 6268CL.
+    # This is a compute-optimized instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 1 GiB RAM per 1 vCPU;
     'n2-highcpu',
+    # This is the latest general-purpose instance family as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 4 GiB RAM per 1 vCPU;
+    'n4-standard',
+    # This is the latest general-purpose instance family
+    # with a higher vCPU to memory ratio as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 2 GiB RAM per 1 vCPU;
+    'n4-highcpu',
+    # This is the latest general-purpose instance family
+    # with a lower vCPU to memory ratio as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 8 GiB RAM per 1 vCPU;
+    'n4-highmem',
 ]
 # n2 is not allowed for launching GPUs for now.
 _DEFAULT_HOST_VM_FAMILY = (
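
Note: to make the memory ratios in the comments concrete, recall that GCP composes instance type names as <family>-<vCPUs> (e.g. n4-standard-8 is 8 vCPUs with 32 GiB). A toy sketch, not SkyPilot's actual selection logic, enumerating the candidates the six families imply:

    # GiB of RAM per vCPU, taken from the comments above.
    GIB_PER_VCPU = {
        'n2-standard': 4, 'n2-highmem': 8, 'n2-highcpu': 1,
        'n4-standard': 4, 'n4-highcpu': 2, 'n4-highmem': 8,
    }

    def candidates(vcpus: int):
        # Hypothetical helper: (instance_type, memory_gib) pairs per family.
        return [(f'{family}-{vcpus}', ratio * vcpus)
                for family, ratio in GIB_PER_VCPU.items()]

    print(candidates(8))  # e.g. ('n4-standard-8', 32), ('n4-highmem-8', 64), ...
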
sky/catalog/kubernetes_catalog.py CHANGED
@@ -195,6 +195,10 @@ def _list_accelerators(
             accelerator_name = lf.get_accelerator_from_label_value(
                 node.metadata.labels.get(key))
 
+            # Heterogeneous clusters may have some nodes with empty labels.
+            if not accelerator_name:
+                continue
+
             # Exclude multi-host TPUs from being processed.
             # TODO(Doyoung): Remove the logic when adding support for
             # multi-host TPUs.
@@ -212,7 +216,7 @@ def _list_accelerators(
                 kubernetes_utils.get_node_accelerator_count(
                     node.status.allocatable))
 
-            if accelerator_name and accelerator_count > 0:
+            if accelerator_count > 0:
                 # TPUs are counted in a different way compared to GPUs.
                 # Multi-node GPUs can be split into smaller units and be
                 # provisioned, but TPUs are considered as an atomic unit.
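
Note: the added guard handles heterogeneous clusters where only some nodes carry a GPU label, which also lets the later check drop its accelerator_name test. A toy sketch of the same filtering over plain dicts; the node shapes and the 'gpu' label key are illustrative, not the Kubernetes API:

    # Toy node records; in SkyPilot these come from the Kubernetes API and the
    # label key depends on the cluster's GPU labeling scheme.
    nodes = [
        {'labels': {'gpu': 'nvidia-h100'}, 'allocatable': 8},
        {'labels': {}, 'allocatable': 0},           # CPU-only node: no GPU label
        {'labels': {'gpu': ''}, 'allocatable': 4},  # node with an empty label value
    ]

    counts: dict = {}
    for node in nodes:
        accelerator_name = node['labels'].get('gpu')
        if not accelerator_name:  # the new guard: skip missing/empty labels
            continue
        if node['allocatable'] > 0:
            counts[accelerator_name] = (counts.get(accelerator_name, 0) +
                                        node['allocatable'])

    print(counts)  # {'nvidia-h100': 8}
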