skypilot-nightly 1.0.0.dev20250710__py3-none-any.whl → 1.0.0.dev20250712__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. sky/__init__.py +2 -2
  2. sky/clouds/kubernetes.py +137 -23
  3. sky/core.py +3 -1
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → Xv9sc7FbOn47FoLhF0fUv}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/{1043-1b39779691bb4030.js → 1043-5e5ef6198735ff7e.js} +1 -1
  7. sky/dashboard/out/_next/static/chunks/1871-cf1a47986d716dd2.js +6 -0
  8. sky/dashboard/out/_next/static/chunks/6601-d38d10f957dff832.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/{6989-6ff4e45dfb49d11d.js → 6989-eab0e9c16b64fd9f.js} +1 -1
  10. sky/dashboard/out/_next/static/chunks/938-8e25c8ea0baa271a.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4608dc89f95eba89.js +6 -0
  12. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-980d6f6b64ca7833.js +16 -0
  13. sky/dashboard/out/_next/static/chunks/{webpack-fd62f17bd9ce1fcc.js → webpack-4d50ce5087a63a95.js} +1 -1
  14. sky/dashboard/out/_next/static/css/a713705ccc8fe059.css +3 -0
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/global_user_state.py +10 -11
  30. sky/jobs/state.py +10 -11
  31. sky/jobs/utils.py +11 -3
  32. sky/optimizer.py +22 -14
  33. sky/provision/kubernetes/utils.py +132 -0
  34. sky/setup_files/dependencies.py +1 -0
  35. sky/skypilot_config.py +4 -1
  36. sky/templates/kubernetes-ray.yml.j2 +298 -10
  37. sky/users/permission.py +15 -1
  38. sky/users/token_service.py +25 -3
  39. sky/utils/schemas.py +3 -0
  40. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/METADATA +3 -1
  41. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/RECORD +49 -49
  42. sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
  43. sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +0 -1
  44. sky/dashboard/out/_next/static/chunks/938-044ad21de8b4626b.js +0 -1
  45. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +0 -6
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +0 -16
  47. sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
  48. /sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → Xv9sc7FbOn47FoLhF0fUv}/_ssgManifest.js +0 -0
  49. /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-49ff6c04332cc621.js} +0 -0
  50. /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
  51. /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
  52. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/WHEEL +0 -0
  53. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/entry_points.txt +0 -0
  54. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/licenses/LICENSE +0 -0
  55. {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -220,17 +220,16 @@ def _glob_to_similar(glob_pattern):
220
220
  return like_pattern
221
221
 
222
222
 
223
- def create_table():
223
+ def create_table(engine: sqlalchemy.engine.Engine):
224
224
  # Enable WAL mode to avoid locking issues.
225
225
  # See: issue #1441 and PR #1509
226
226
  # https://github.com/microsoft/WSL/issues/2395
227
227
  # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
228
228
  # This may cause the database locked problem from WSL issue #1441.
229
- if (_SQLALCHEMY_ENGINE.dialect.name
230
- == db_utils.SQLAlchemyDialect.SQLITE.value and
229
+ if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
231
230
  not common_utils.is_wsl()):
232
231
  try:
233
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
232
+ with orm.Session(engine) as session:
234
233
  session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
235
234
  session.commit()
236
235
  except sqlalchemy_exc.OperationalError as e:
@@ -240,12 +239,12 @@ def create_table():
240
239
  # is not critical and is likely to be enabled by other processes.
241
240
 
242
241
  # Create tables if they don't exist
243
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, _SQLALCHEMY_ENGINE)
242
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
244
243
 
245
244
  # For backward compatibility.
246
245
  # TODO(zhwu): Remove this function after all users have migrated to
247
246
  # the latest version of SkyPilot.
248
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
247
+ with orm.Session(engine) as session:
249
248
  # Add autostop column to clusters table
250
249
  db_utils.add_column_to_table_sqlalchemy(session,
251
250
  'clusters',
@@ -391,15 +390,15 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
391
390
  conn_string = skypilot_config.get_nested(('db',), None)
392
391
  if conn_string:
393
392
  logger.debug(f'using db URI from {conn_string}')
394
- _SQLALCHEMY_ENGINE = sqlalchemy.create_engine(
395
- conn_string, poolclass=sqlalchemy.NullPool)
393
+ engine = sqlalchemy.create_engine(conn_string,
394
+ poolclass=sqlalchemy.NullPool)
396
395
  else:
397
396
  db_path = os.path.expanduser('~/.sky/state.db')
398
397
  pathlib.Path(db_path).parents[0].mkdir(parents=True,
399
398
  exist_ok=True)
400
- _SQLALCHEMY_ENGINE = sqlalchemy.create_engine('sqlite:///' +
401
- db_path)
402
- create_table()
399
+ engine = sqlalchemy.create_engine('sqlite:///' + db_path)
400
+ create_table(engine)
401
+ _SQLALCHEMY_ENGINE = engine
403
402
  return _SQLALCHEMY_ENGINE
404
403
 
405
404
 
sky/jobs/state.py CHANGED
@@ -112,17 +112,16 @@ ha_recovery_script_table = sqlalchemy.Table(
112
112
  )
113
113
 
114
114
 
115
- def create_table():
115
+ def create_table(engine: sqlalchemy.engine.Engine):
116
116
  # Enable WAL mode to avoid locking issues.
117
117
  # See: issue #3863, #1441 and PR #1509
118
118
  # https://github.com/microsoft/WSL/issues/2395
119
119
  # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
120
120
  # This may cause the database locked problem from WSL issue #1441.
121
- if (_SQLALCHEMY_ENGINE.dialect.name
122
- == db_utils.SQLAlchemyDialect.SQLITE.value and
121
+ if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
123
122
  not common_utils.is_wsl()):
124
123
  try:
125
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
124
+ with orm.Session(engine) as session:
126
125
  session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
127
126
  session.commit()
128
127
  except sqlalchemy_exc.OperationalError as e:
@@ -132,10 +131,10 @@ def create_table():
132
131
  # is not critical and is likely to be enabled by other processes.
133
132
 
134
133
  # Create tables if they don't exist
135
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, _SQLALCHEMY_ENGINE)
134
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
136
135
 
137
136
  # Backward compatibility: add columns that not exist in older databases
138
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
137
+ with orm.Session(engine) as session:
139
138
  db_utils.add_column_to_table_sqlalchemy(session, 'spot',
140
139
  'failure_reason',
141
140
  sqlalchemy.Text())
@@ -228,15 +227,15 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
228
227
  conn_string = skypilot_config.get_nested(('db',), None)
229
228
  if conn_string:
230
229
  logger.debug(f'using db URI from {conn_string}')
231
- _SQLALCHEMY_ENGINE = sqlalchemy.create_engine(
232
- conn_string, poolclass=sqlalchemy.NullPool)
230
+ engine = sqlalchemy.create_engine(conn_string,
231
+ poolclass=sqlalchemy.NullPool)
233
232
  else:
234
233
  db_path = os.path.expanduser('~/.sky/spot_jobs.db')
235
234
  pathlib.Path(db_path).parents[0].mkdir(parents=True,
236
235
  exist_ok=True)
237
- _SQLALCHEMY_ENGINE = sqlalchemy.create_engine('sqlite:///' +
238
- db_path)
239
- create_table()
236
+ engine = sqlalchemy.create_engine('sqlite:///' + db_path)
237
+ create_table(engine)
238
+ _SQLALCHEMY_ENGINE = engine
240
239
  return _SQLALCHEMY_ENGINE
241
240
 
242
241
 
sky/jobs/utils.py CHANGED
@@ -30,6 +30,7 @@ from sky.backends import backend_utils
30
30
  from sky.jobs import constants as managed_job_constants
31
31
  from sky.jobs import scheduler
32
32
  from sky.jobs import state as managed_job_state
33
+ from sky.server import common as server_common
33
34
  from sky.skylet import constants
34
35
  from sky.skylet import job_lib
35
36
  from sky.skylet import log_lib
@@ -38,6 +39,7 @@ from sky.utils import annotations
38
39
  from sky.utils import command_runner
39
40
  from sky.utils import common_utils
40
41
  from sky.utils import controller_utils
42
+ from sky.utils import env_options
41
43
  from sky.utils import infra_utils
42
44
  from sky.utils import log_utils
43
45
  from sky.utils import message_utils
@@ -128,9 +130,15 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
128
130
  time.sleep(backoff.current_backoff())
129
131
 
130
132
 
131
- def _check_consolidation_mode_consistency(
133
+ def _validate_consolidation_mode_config(
132
134
  current_is_consolidation_mode: bool) -> None:
133
- """Check the consistency of the consolidation mode."""
135
+ """Validate the consolidation mode config."""
136
+ if (current_is_consolidation_mode and
137
+ not env_options.Options.IS_DEVELOPER.get() and
138
+ server_common.is_api_server_local()):
139
+ with ux_utils.print_exception_no_traceback():
140
+ raise exceptions.NotSupportedError(
141
+ 'Consolidation mode is not supported when running locally.')
134
142
  # Check whether the consolidation mode config is changed.
135
143
  if current_is_consolidation_mode:
136
144
  controller_cn = (
@@ -176,7 +184,7 @@ def _check_consolidation_mode_consistency(
176
184
  def is_consolidation_mode() -> bool:
177
185
  consolidation_mode = skypilot_config.get_nested(
178
186
  ('jobs', 'controller', 'consolidation_mode'), default_value=False)
179
- _check_consolidation_mode_consistency(consolidation_mode)
187
+ _validate_consolidation_mode_config(consolidation_mode)
180
188
  return consolidation_mode
181
189
 
182
190
 
sky/optimizer.py CHANGED
@@ -997,23 +997,29 @@ class Optimizer:
997
997
  @staticmethod
998
998
  def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
999
999
  for node, candidate_set in node_to_candidate_map.items():
1000
- if node.best_resources:
1001
- accelerator = node.best_resources.accelerators
1002
- else:
1003
- accelerator = list(node.resources)[0].accelerators
1000
+ best_resources = node.best_resources
1001
+ if best_resources is None:
1002
+ best_resources = list(node.resources)[0]
1004
1003
  is_multi_instances = False
1005
- if accelerator:
1006
- acc_name, acc_count = list(accelerator.items())[0]
1004
+ if best_resources.accelerators:
1005
+ acc_name, acc_count = list(
1006
+ best_resources.accelerators.items())[0]
1007
1007
  for cloud, candidate_list in candidate_set.items():
1008
- if len(candidate_list) > 1:
1008
+ # Filter only the candidates matching the best
1009
+ # resources chosen by the optimizer.
1010
+ best_resources_candidates = [
1011
+ res for res in candidate_list if
1012
+ res.get_accelerators_str() == f'{acc_name}:{acc_count}'
1013
+ ]
1014
+ if len(best_resources_candidates) > 1:
1009
1015
  is_multi_instances = True
1010
- instance_list = [
1016
+ instance_list = set([
1011
1017
  res.instance_type
1012
- for res in candidate_list
1018
+ for res in best_resources_candidates
1013
1019
  if res.instance_type is not None
1014
- ]
1020
+ ])
1015
1021
  candidate_str = resources_utils.format_resource(
1016
- candidate_list[0], simplify=True)
1022
+ best_resources, simplify=True)
1017
1023
 
1018
1024
  logger.info(
1019
1025
  f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
@@ -1327,8 +1333,7 @@ def _fill_in_launchable_resources(
1327
1333
  launchable: Dict[resources_lib.Resources, List[resources_lib.Resources]] = (
1328
1334
  collections.defaultdict(list))
1329
1335
  all_fuzzy_candidates = set()
1330
- cloud_candidates: _PerCloudCandidates = collections.defaultdict(
1331
- List[resources_lib.Resources])
1336
+ cloud_candidates: _PerCloudCandidates = collections.defaultdict(list)
1332
1337
  resource_hints: Dict[resources_lib.Resources,
1333
1338
  List[str]] = collections.defaultdict(list)
1334
1339
  if blocked_resources is None:
@@ -1365,7 +1370,10 @@ def _fill_in_launchable_resources(
1365
1370
  launchable[resources].extend(
1366
1371
  resources_utils.make_launchables_for_valid_region_zones(
1367
1372
  cheapest))
1368
- cloud_candidates[cloud] = feasible_resources.resources_list
1373
+ # Each cloud can occur multiple times in feasible_list,
1374
+ # for different region/zone.
1375
+ cloud_candidates[cloud].extend(
1376
+ feasible_resources.resources_list)
1369
1377
  else:
1370
1378
  all_fuzzy_candidates.update(
1371
1379
  feasible_resources.fuzzy_candidate_list)
sky/provision/kubernetes/utils.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Kubernetes utilities for SkyPilot."""
2
2
  import dataclasses
3
+ import enum
3
4
  import functools
4
5
  import hashlib
5
6
  import json
@@ -57,6 +58,69 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
57
58
  # and store all data that needs to be persisted in future.
58
59
  HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
59
60
 
61
+
62
+ class KubernetesHighPerformanceNetworkType(enum.Enum):
63
+ """Enum for different Kubernetes cluster types with high performance
64
+ network configurations.
65
+
66
+ This enum defines cluster types that support optimized networking for
67
+ distributed ML workloads:
68
+ - GCP_TCPX: GKE clusters with GPUDirect-TCPX support
69
+ (A3 High instances: a3-highgpu-8g)
70
+ - GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
71
+ (A3 Mega instances: a3-megagpu-8g)
72
+ - GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
73
+ (A4/A3 Ultra instances)
74
+ - NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
75
+ low-latency networking
76
+ - NONE: Standard clusters without specialized networking optimizations
77
+
78
+ The network configurations align with corresponding VM-based
79
+ implementations:
80
+ - GCP settings match
81
+ sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
82
+ - Nebius settings match the InfiniBand configuration used in Nebius VMs
83
+ """
84
+
85
+ GCP_TCPX = 'gcp_tcpx'
86
+ GCP_TCPXO = 'gcp_tcpxo'
87
+ GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
88
+ NEBIUS = 'nebius'
89
+ NONE = 'none'
90
+
91
+ def get_network_env_vars(self) -> Dict[str, str]:
92
+ """Get network environment variables for this cluster type."""
93
+ if self == KubernetesHighPerformanceNetworkType.NEBIUS:
94
+ # Nebius cluster with InfiniBand - use InfiniBand optimizations
95
+ return {
96
+ 'NCCL_IB_HCA': 'mlx5',
97
+ 'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
98
+ 'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
99
+ }
100
+ else:
101
+ # GCP clusters and generic clusters - environment variables are
102
+ # handled directly in the template
103
+ return {}
104
+
105
+ def supports_high_performance_networking(self) -> bool:
106
+ """Check if this cluster type supports high performance networking."""
107
+ return self is not KubernetesHighPerformanceNetworkType.NONE
108
+
109
+ def supports_gpu_direct(self) -> bool:
110
+ """Check if this cluster type supports GPUDirect networking."""
111
+ return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
112
+ KubernetesHighPerformanceNetworkType.GCP_TCPXO,
113
+ KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
114
+
115
+ def requires_ipc_lock_capability(self) -> bool:
116
+ """Check if this cluster type requires IPC_LOCK capability."""
117
+ return self.supports_high_performance_networking()
118
+
119
+ def requires_tcpxo_daemon(self) -> bool:
120
+ """Check if this cluster type requires TCPXO daemon."""
121
+ return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
122
+
123
+
60
124
  # TODO(romilb): Move constants to constants.py
61
125
  DEFAULT_NAMESPACE = 'default'
62
126
 
@@ -758,6 +822,74 @@ class GKEAutoscaler(Autoscaler):
758
822
  return True
759
823
  return False
760
824
 
825
+ @classmethod
826
+ @annotations.lru_cache(scope='request', maxsize=10)
827
+ def get_available_machine_types(cls, context: str) -> List[str]:
828
+ """Returns the list of machine types that are available in the cluster.
829
+ """
830
+ # Assume context naming convention of
831
+ # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
832
+ valid, project_id, location, cluster_name = cls._validate_context_name(
833
+ context)
834
+ if not valid:
835
+ # Context name is not in the format of
836
+ # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
837
+ # Cannot determine if the context can autoscale.
838
+ # Return empty list.
839
+ logger.debug(f'Context {context} is not in the format of '
840
+ f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
841
+ 'Returning empty machine type list.')
842
+ return []
843
+ try:
844
+ logger.debug(
845
+ f'Attempting to get information about cluster {cluster_name}')
846
+ container_service = gcp.build('container',
847
+ 'v1',
848
+ credentials=None,
849
+ cache_discovery=False)
850
+ cluster = container_service.projects().locations().clusters().get(
851
+ name=f'projects/{project_id}'
852
+ f'/locations/{location}'
853
+ f'/clusters/{cluster_name}').execute()
854
+ except ImportError:
855
+ # If the gcp module is not installed, return empty list.
856
+ # Remind the user once per day to install the gcp module for better
857
+ # pod scheduling with GKE autoscaler.
858
+ if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
859
+ logger.info(
860
+ 'Could not fetch autoscaler information from GKE. '
861
+ 'Run pip install "skypilot[gcp]" for more intelligent pod '
862
+ 'scheduling with GKE autoscaler.')
863
+ cls._pip_install_gcp_hint_last_sent = time.time()
864
+ return []
865
+ except gcp.http_error_exception() as e:
866
+ # Cluster information is not available.
867
+ # Return empty list.
868
+ logger.debug(f'{e.message}', exc_info=True)
869
+ return []
870
+
871
+ machine_types = []
872
+ # Get the list of machine types that are available in the cluster.
873
+ node_pools = cluster.get('nodePools', [])
874
+ for node_pool in node_pools:
875
+ name = node_pool.get('name', '')
876
+ logger.debug(f'Checking if node pool {name} '
877
+ 'has autoscaling enabled.')
878
+ autoscaling_enabled = (node_pool.get('autoscaling',
879
+ {}).get('enabled', False))
880
+ if autoscaling_enabled:
881
+ logger.debug(f'Node pool {name} has autoscaling enabled.')
882
+ try:
883
+ machine_type = node_pool.get('config',
884
+ {}).get('machineType', '')
885
+ if machine_type:
886
+ machine_types.append(machine_type)
887
+ except KeyError:
888
+ logger.debug(f'Encountered KeyError while checking machine '
889
+ f'type of node pool {name}.')
890
+ continue
891
+ return machine_types
892
+
761
893
  @classmethod
762
894
  def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
763
895
  """Validates the context name is in the format of
sky/setup_files/dependencies.py CHANGED
@@ -129,6 +129,7 @@ extras_require: Dict[str, List[str]] = {
129
129
  'azure-mgmt-compute>=33.0.0',
130
130
  'azure-storage-blob>=12.23.1',
131
131
  'msgraph-sdk',
132
+ 'msrestazure',
132
133
  ] + local_ray,
133
134
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
134
135
  # parameter for stopping instances. Reference:
sky/skypilot_config.py CHANGED
@@ -52,6 +52,7 @@ import contextlib
52
52
  import copy
53
53
  import json
54
54
  import os
55
+ import pathlib
55
56
  import tempfile
56
57
  import threading
57
58
  import typing
@@ -848,7 +849,9 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
848
849
 
849
850
  global_config_path = _resolve_server_config_path()
850
851
  if global_config_path is None:
851
- global_config_path = get_user_config_path()
852
+ # Fallback to ~/.sky/config.yaml, and make sure it exists.
853
+ global_config_path = os.path.expanduser(get_user_config_path())
854
+ pathlib.Path(global_config_path).touch(exist_ok=True)
852
855
 
853
856
  db_updated = False
854
857
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None: