skypilot-nightly 1.0.0.dev20250710__py3-none-any.whl → 1.0.0.dev20250712__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/clouds/kubernetes.py +137 -23
- sky/core.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → Xv9sc7FbOn47FoLhF0fUv}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1043-1b39779691bb4030.js → 1043-5e5ef6198735ff7e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1871-cf1a47986d716dd2.js +6 -0
- sky/dashboard/out/_next/static/chunks/6601-d38d10f957dff832.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6ff4e45dfb49d11d.js → 6989-eab0e9c16b64fd9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/938-8e25c8ea0baa271a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4608dc89f95eba89.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-980d6f6b64ca7833.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-fd62f17bd9ce1fcc.js → webpack-4d50ce5087a63a95.js} +1 -1
- sky/dashboard/out/_next/static/css/a713705ccc8fe059.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +10 -11
- sky/jobs/state.py +10 -11
- sky/jobs/utils.py +11 -3
- sky/optimizer.py +22 -14
- sky/provision/kubernetes/utils.py +132 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +4 -1
- sky/templates/kubernetes-ray.yml.j2 +298 -10
- sky/users/permission.py +15 -1
- sky/users/token_service.py +25 -3
- sky/utils/schemas.py +3 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/RECORD +49 -49
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
- sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-044ad21de8b4626b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +0 -16
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
- /sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → Xv9sc7FbOn47FoLhF0fUv}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-49ff6c04332cc621.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250712.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -220,17 +220,16 @@ def _glob_to_similar(glob_pattern):
|
|
220
220
|
return like_pattern
|
221
221
|
|
222
222
|
|
223
|
-
def create_table():
|
223
|
+
def create_table(engine: sqlalchemy.engine.Engine):
|
224
224
|
# Enable WAL mode to avoid locking issues.
|
225
225
|
# See: issue #1441 and PR #1509
|
226
226
|
# https://github.com/microsoft/WSL/issues/2395
|
227
227
|
# TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
|
228
228
|
# This may cause the database locked problem from WSL issue #1441.
|
229
|
-
if (
|
230
|
-
== db_utils.SQLAlchemyDialect.SQLITE.value and
|
229
|
+
if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
|
231
230
|
not common_utils.is_wsl()):
|
232
231
|
try:
|
233
|
-
with orm.Session(
|
232
|
+
with orm.Session(engine) as session:
|
234
233
|
session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
|
235
234
|
session.commit()
|
236
235
|
except sqlalchemy_exc.OperationalError as e:
|
@@ -240,12 +239,12 @@ def create_table():
|
|
240
239
|
# is not critical and is likely to be enabled by other processes.
|
241
240
|
|
242
241
|
# Create tables if they don't exist
|
243
|
-
db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
|
242
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
|
244
243
|
|
245
244
|
# For backward compatibility.
|
246
245
|
# TODO(zhwu): Remove this function after all users have migrated to
|
247
246
|
# the latest version of SkyPilot.
|
248
|
-
with orm.Session(
|
247
|
+
with orm.Session(engine) as session:
|
249
248
|
# Add autostop column to clusters table
|
250
249
|
db_utils.add_column_to_table_sqlalchemy(session,
|
251
250
|
'clusters',
|
@@ -391,15 +390,15 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
391
390
|
conn_string = skypilot_config.get_nested(('db',), None)
|
392
391
|
if conn_string:
|
393
392
|
logger.debug(f'using db URI from {conn_string}')
|
394
|
-
|
395
|
-
|
393
|
+
engine = sqlalchemy.create_engine(conn_string,
|
394
|
+
poolclass=sqlalchemy.NullPool)
|
396
395
|
else:
|
397
396
|
db_path = os.path.expanduser('~/.sky/state.db')
|
398
397
|
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
399
398
|
exist_ok=True)
|
400
|
-
|
401
|
-
|
402
|
-
|
399
|
+
engine = sqlalchemy.create_engine('sqlite:///' + db_path)
|
400
|
+
create_table(engine)
|
401
|
+
_SQLALCHEMY_ENGINE = engine
|
403
402
|
return _SQLALCHEMY_ENGINE
|
404
403
|
|
405
404
|
|
sky/jobs/state.py
CHANGED
@@ -112,17 +112,16 @@ ha_recovery_script_table = sqlalchemy.Table(
|
|
112
112
|
)
|
113
113
|
|
114
114
|
|
115
|
-
def create_table():
|
115
|
+
def create_table(engine: sqlalchemy.engine.Engine):
|
116
116
|
# Enable WAL mode to avoid locking issues.
|
117
117
|
# See: issue #3863, #1441 and PR #1509
|
118
118
|
# https://github.com/microsoft/WSL/issues/2395
|
119
119
|
# TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
|
120
120
|
# This may cause the database locked problem from WSL issue #1441.
|
121
|
-
if (
|
122
|
-
== db_utils.SQLAlchemyDialect.SQLITE.value and
|
121
|
+
if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
|
123
122
|
not common_utils.is_wsl()):
|
124
123
|
try:
|
125
|
-
with orm.Session(
|
124
|
+
with orm.Session(engine) as session:
|
126
125
|
session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
|
127
126
|
session.commit()
|
128
127
|
except sqlalchemy_exc.OperationalError as e:
|
@@ -132,10 +131,10 @@ def create_table():
|
|
132
131
|
# is not critical and is likely to be enabled by other processes.
|
133
132
|
|
134
133
|
# Create tables if they don't exist
|
135
|
-
db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
|
134
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
|
136
135
|
|
137
136
|
# Backward compatibility: add columns that not exist in older databases
|
138
|
-
with orm.Session(
|
137
|
+
with orm.Session(engine) as session:
|
139
138
|
db_utils.add_column_to_table_sqlalchemy(session, 'spot',
|
140
139
|
'failure_reason',
|
141
140
|
sqlalchemy.Text())
|
@@ -228,15 +227,15 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
228
227
|
conn_string = skypilot_config.get_nested(('db',), None)
|
229
228
|
if conn_string:
|
230
229
|
logger.debug(f'using db URI from {conn_string}')
|
231
|
-
|
232
|
-
|
230
|
+
engine = sqlalchemy.create_engine(conn_string,
|
231
|
+
poolclass=sqlalchemy.NullPool)
|
233
232
|
else:
|
234
233
|
db_path = os.path.expanduser('~/.sky/spot_jobs.db')
|
235
234
|
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
236
235
|
exist_ok=True)
|
237
|
-
|
238
|
-
|
239
|
-
|
236
|
+
engine = sqlalchemy.create_engine('sqlite:///' + db_path)
|
237
|
+
create_table(engine)
|
238
|
+
_SQLALCHEMY_ENGINE = engine
|
240
239
|
return _SQLALCHEMY_ENGINE
|
241
240
|
|
242
241
|
|
sky/jobs/utils.py
CHANGED
@@ -30,6 +30,7 @@ from sky.backends import backend_utils
|
|
30
30
|
from sky.jobs import constants as managed_job_constants
|
31
31
|
from sky.jobs import scheduler
|
32
32
|
from sky.jobs import state as managed_job_state
|
33
|
+
from sky.server import common as server_common
|
33
34
|
from sky.skylet import constants
|
34
35
|
from sky.skylet import job_lib
|
35
36
|
from sky.skylet import log_lib
|
@@ -38,6 +39,7 @@ from sky.utils import annotations
|
|
38
39
|
from sky.utils import command_runner
|
39
40
|
from sky.utils import common_utils
|
40
41
|
from sky.utils import controller_utils
|
42
|
+
from sky.utils import env_options
|
41
43
|
from sky.utils import infra_utils
|
42
44
|
from sky.utils import log_utils
|
43
45
|
from sky.utils import message_utils
|
@@ -128,9 +130,15 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
|
|
128
130
|
time.sleep(backoff.current_backoff())
|
129
131
|
|
130
132
|
|
131
|
-
def
|
133
|
+
def _validate_consolidation_mode_config(
|
132
134
|
current_is_consolidation_mode: bool) -> None:
|
133
|
-
"""
|
135
|
+
"""Validate the consolidation mode config."""
|
136
|
+
if (current_is_consolidation_mode and
|
137
|
+
not env_options.Options.IS_DEVELOPER.get() and
|
138
|
+
server_common.is_api_server_local()):
|
139
|
+
with ux_utils.print_exception_no_traceback():
|
140
|
+
raise exceptions.NotSupportedError(
|
141
|
+
'Consolidation mode is not supported when running locally.')
|
134
142
|
# Check whether the consolidation mode config is changed.
|
135
143
|
if current_is_consolidation_mode:
|
136
144
|
controller_cn = (
|
@@ -176,7 +184,7 @@ def _check_consolidation_mode_consistency(
|
|
176
184
|
def is_consolidation_mode() -> bool:
|
177
185
|
consolidation_mode = skypilot_config.get_nested(
|
178
186
|
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
179
|
-
|
187
|
+
_validate_consolidation_mode_config(consolidation_mode)
|
180
188
|
return consolidation_mode
|
181
189
|
|
182
190
|
|
sky/optimizer.py
CHANGED
@@ -997,23 +997,29 @@ class Optimizer:
|
|
997
997
|
@staticmethod
|
998
998
|
def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
|
999
999
|
for node, candidate_set in node_to_candidate_map.items():
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
accelerator = list(node.resources)[0].accelerators
|
1000
|
+
best_resources = node.best_resources
|
1001
|
+
if best_resources is None:
|
1002
|
+
best_resources = list(node.resources)[0]
|
1004
1003
|
is_multi_instances = False
|
1005
|
-
if
|
1006
|
-
acc_name, acc_count = list(
|
1004
|
+
if best_resources.accelerators:
|
1005
|
+
acc_name, acc_count = list(
|
1006
|
+
best_resources.accelerators.items())[0]
|
1007
1007
|
for cloud, candidate_list in candidate_set.items():
|
1008
|
-
|
1008
|
+
# Filter only the candidates matching the best
|
1009
|
+
# resources chosen by the optimizer.
|
1010
|
+
best_resources_candidates = [
|
1011
|
+
res for res in candidate_list if
|
1012
|
+
res.get_accelerators_str() == f'{acc_name}:{acc_count}'
|
1013
|
+
]
|
1014
|
+
if len(best_resources_candidates) > 1:
|
1009
1015
|
is_multi_instances = True
|
1010
|
-
instance_list = [
|
1016
|
+
instance_list = set([
|
1011
1017
|
res.instance_type
|
1012
|
-
for res in
|
1018
|
+
for res in best_resources_candidates
|
1013
1019
|
if res.instance_type is not None
|
1014
|
-
]
|
1020
|
+
])
|
1015
1021
|
candidate_str = resources_utils.format_resource(
|
1016
|
-
|
1022
|
+
best_resources, simplify=True)
|
1017
1023
|
|
1018
1024
|
logger.info(
|
1019
1025
|
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
@@ -1327,8 +1333,7 @@ def _fill_in_launchable_resources(
|
|
1327
1333
|
launchable: Dict[resources_lib.Resources, List[resources_lib.Resources]] = (
|
1328
1334
|
collections.defaultdict(list))
|
1329
1335
|
all_fuzzy_candidates = set()
|
1330
|
-
cloud_candidates: _PerCloudCandidates = collections.defaultdict(
|
1331
|
-
List[resources_lib.Resources])
|
1336
|
+
cloud_candidates: _PerCloudCandidates = collections.defaultdict(list)
|
1332
1337
|
resource_hints: Dict[resources_lib.Resources,
|
1333
1338
|
List[str]] = collections.defaultdict(list)
|
1334
1339
|
if blocked_resources is None:
|
@@ -1365,7 +1370,10 @@ def _fill_in_launchable_resources(
|
|
1365
1370
|
launchable[resources].extend(
|
1366
1371
|
resources_utils.make_launchables_for_valid_region_zones(
|
1367
1372
|
cheapest))
|
1368
|
-
|
1373
|
+
# Each cloud can occur multiple times in feasible_list,
|
1374
|
+
# for different region/zone.
|
1375
|
+
cloud_candidates[cloud].extend(
|
1376
|
+
feasible_resources.resources_list)
|
1369
1377
|
else:
|
1370
1378
|
all_fuzzy_candidates.update(
|
1371
1379
|
feasible_resources.fuzzy_candidate_list)
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Kubernetes utilities for SkyPilot."""
|
2
2
|
import dataclasses
|
3
|
+
import enum
|
3
4
|
import functools
|
4
5
|
import hashlib
|
5
6
|
import json
|
@@ -57,6 +58,69 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
|
|
57
58
|
# and store all data that needs to be persisted in future.
|
58
59
|
HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
|
59
60
|
|
61
|
+
|
62
|
+
class KubernetesHighPerformanceNetworkType(enum.Enum):
|
63
|
+
"""Enum for different Kubernetes cluster types with high performance
|
64
|
+
network configurations.
|
65
|
+
|
66
|
+
This enum defines cluster types that support optimized networking for
|
67
|
+
distributed ML workloads:
|
68
|
+
- GCP_TCPX: GKE clusters with GPUDirect-TCPX support
|
69
|
+
(A3 High instances: a3-highgpu-8g)
|
70
|
+
- GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
|
71
|
+
(A3 Mega instances: a3-megagpu-8g)
|
72
|
+
- GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
|
73
|
+
(A4/A3 Ultra instances)
|
74
|
+
- NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
|
75
|
+
low-latency networking
|
76
|
+
- NONE: Standard clusters without specialized networking optimizations
|
77
|
+
|
78
|
+
The network configurations align with corresponding VM-based
|
79
|
+
implementations:
|
80
|
+
- GCP settings match
|
81
|
+
sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
|
82
|
+
- Nebius settings match the InfiniBand configuration used in Nebius VMs
|
83
|
+
"""
|
84
|
+
|
85
|
+
GCP_TCPX = 'gcp_tcpx'
|
86
|
+
GCP_TCPXO = 'gcp_tcpxo'
|
87
|
+
GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
|
88
|
+
NEBIUS = 'nebius'
|
89
|
+
NONE = 'none'
|
90
|
+
|
91
|
+
def get_network_env_vars(self) -> Dict[str, str]:
|
92
|
+
"""Get network environment variables for this cluster type."""
|
93
|
+
if self == KubernetesHighPerformanceNetworkType.NEBIUS:
|
94
|
+
# Nebius cluster with InfiniBand - use InfiniBand optimizations
|
95
|
+
return {
|
96
|
+
'NCCL_IB_HCA': 'mlx5',
|
97
|
+
'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
|
98
|
+
'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
|
99
|
+
}
|
100
|
+
else:
|
101
|
+
# GCP clusters and generic clusters - environment variables are
|
102
|
+
# handled directly in the template
|
103
|
+
return {}
|
104
|
+
|
105
|
+
def supports_high_performance_networking(self) -> bool:
|
106
|
+
"""Check if this cluster type supports high performance networking."""
|
107
|
+
return self is not KubernetesHighPerformanceNetworkType.NONE
|
108
|
+
|
109
|
+
def supports_gpu_direct(self) -> bool:
|
110
|
+
"""Check if this cluster type supports GPUDirect networking."""
|
111
|
+
return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
112
|
+
KubernetesHighPerformanceNetworkType.GCP_TCPXO,
|
113
|
+
KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
|
114
|
+
|
115
|
+
def requires_ipc_lock_capability(self) -> bool:
|
116
|
+
"""Check if this cluster type requires IPC_LOCK capability."""
|
117
|
+
return self.supports_high_performance_networking()
|
118
|
+
|
119
|
+
def requires_tcpxo_daemon(self) -> bool:
|
120
|
+
"""Check if this cluster type requires TCPXO daemon."""
|
121
|
+
return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
|
122
|
+
|
123
|
+
|
60
124
|
# TODO(romilb): Move constants to constants.py
|
61
125
|
DEFAULT_NAMESPACE = 'default'
|
62
126
|
|
@@ -758,6 +822,74 @@ class GKEAutoscaler(Autoscaler):
|
|
758
822
|
return True
|
759
823
|
return False
|
760
824
|
|
825
|
+
@classmethod
|
826
|
+
@annotations.lru_cache(scope='request', maxsize=10)
|
827
|
+
def get_available_machine_types(cls, context: str) -> List[str]:
|
828
|
+
"""Returns the list of machine types that are available in the cluster.
|
829
|
+
"""
|
830
|
+
# Assume context naming convention of
|
831
|
+
# gke_PROJECT-ID_LOCATION_CLUSTER-NAME
|
832
|
+
valid, project_id, location, cluster_name = cls._validate_context_name(
|
833
|
+
context)
|
834
|
+
if not valid:
|
835
|
+
# Context name is not in the format of
|
836
|
+
# gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
|
837
|
+
# Cannot determine if the context can autoscale.
|
838
|
+
# Return empty list.
|
839
|
+
logger.debug(f'Context {context} is not in the format of '
|
840
|
+
f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
|
841
|
+
'Returning empty machine type list.')
|
842
|
+
return []
|
843
|
+
try:
|
844
|
+
logger.debug(
|
845
|
+
f'Attempting to get information about cluster {cluster_name}')
|
846
|
+
container_service = gcp.build('container',
|
847
|
+
'v1',
|
848
|
+
credentials=None,
|
849
|
+
cache_discovery=False)
|
850
|
+
cluster = container_service.projects().locations().clusters().get(
|
851
|
+
name=f'projects/{project_id}'
|
852
|
+
f'/locations/{location}'
|
853
|
+
f'/clusters/{cluster_name}').execute()
|
854
|
+
except ImportError:
|
855
|
+
# If the gcp module is not installed, return empty list.
|
856
|
+
# Remind the user once per day to install the gcp module for better
|
857
|
+
# pod scheduling with GKE autoscaler.
|
858
|
+
if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
|
859
|
+
logger.info(
|
860
|
+
'Could not fetch autoscaler information from GKE. '
|
861
|
+
'Run pip install "skypilot[gcp]" for more intelligent pod '
|
862
|
+
'scheduling with GKE autoscaler.')
|
863
|
+
cls._pip_install_gcp_hint_last_sent = time.time()
|
864
|
+
return []
|
865
|
+
except gcp.http_error_exception() as e:
|
866
|
+
# Cluster information is not available.
|
867
|
+
# Return empty list.
|
868
|
+
logger.debug(f'{e.message}', exc_info=True)
|
869
|
+
return []
|
870
|
+
|
871
|
+
machine_types = []
|
872
|
+
# Get the list of machine types that are available in the cluster.
|
873
|
+
node_pools = cluster.get('nodePools', [])
|
874
|
+
for node_pool in node_pools:
|
875
|
+
name = node_pool.get('name', '')
|
876
|
+
logger.debug(f'Checking if node pool {name} '
|
877
|
+
'has autoscaling enabled.')
|
878
|
+
autoscaling_enabled = (node_pool.get('autoscaling',
|
879
|
+
{}).get('enabled', False))
|
880
|
+
if autoscaling_enabled:
|
881
|
+
logger.debug(f'Node pool {name} has autoscaling enabled.')
|
882
|
+
try:
|
883
|
+
machine_type = node_pool.get('config',
|
884
|
+
{}).get('machineType', '')
|
885
|
+
if machine_type:
|
886
|
+
machine_types.append(machine_type)
|
887
|
+
except KeyError:
|
888
|
+
logger.debug(f'Encountered KeyError while checking machine '
|
889
|
+
f'type of node pool {name}.')
|
890
|
+
continue
|
891
|
+
return machine_types
|
892
|
+
|
761
893
|
@classmethod
|
762
894
|
def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
|
763
895
|
"""Validates the context name is in the format of
|
sky/setup_files/dependencies.py
CHANGED
@@ -129,6 +129,7 @@ extras_require: Dict[str, List[str]] = {
|
|
129
129
|
'azure-mgmt-compute>=33.0.0',
|
130
130
|
'azure-storage-blob>=12.23.1',
|
131
131
|
'msgraph-sdk',
|
132
|
+
'msrestazure',
|
132
133
|
] + local_ray,
|
133
134
|
# We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
|
134
135
|
# parameter for stopping instances. Reference:
|
sky/skypilot_config.py
CHANGED
@@ -52,6 +52,7 @@ import contextlib
|
|
52
52
|
import copy
|
53
53
|
import json
|
54
54
|
import os
|
55
|
+
import pathlib
|
55
56
|
import tempfile
|
56
57
|
import threading
|
57
58
|
import typing
|
@@ -848,7 +849,9 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
848
849
|
|
849
850
|
global_config_path = _resolve_server_config_path()
|
850
851
|
if global_config_path is None:
|
851
|
-
|
852
|
+
# Fallback to ~/.sky/config.yaml, and make sure it exists.
|
853
|
+
global_config_path = os.path.expanduser(get_user_config_path())
|
854
|
+
pathlib.Path(global_config_path).touch(exist_ok=True)
|
852
855
|
|
853
856
|
db_updated = False
|
854
857
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|