skypilot-nightly 1.0.0.dev20250204__py3-none-any.whl → 1.0.0.dev20250206__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/vast.py +29 -0
- sky/authentication.py +18 -0
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +1 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/vast.py +279 -0
- sky/jobs/dashboard/dashboard.py +10 -3
- sky/jobs/dashboard/templates/index.html +117 -52
- sky/jobs/scheduler.py +14 -5
- sky/jobs/utils.py +10 -19
- sky/provision/__init__.py +1 -0
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +161 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/utils/controller_utils.py +5 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/RECORD +29 -20
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -60,6 +60,14 @@ logger = sky_logging.init_logger('sky.jobs.controller')
 _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
+# Based on testing, assume a running job uses 350MB memory.
+JOB_MEMORY_MB = 350
+# Past 2000 simultaneous jobs, we become unstable.
+# See https://github.com/skypilot-org/skypilot/issues/4649.
+MAX_JOB_LIMIT = 2000
+# Number of ongoing launches allowed per CPU.
+LAUNCHES_PER_CPU = 4
+
 
 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
@@ -247,15 +255,16 @@ def _set_alive_waiting(job_id: int) -> None:
 
 
 def _get_job_parallelism() -> int:
-
-
-
-
+    job_memory = JOB_MEMORY_MB * 1024 * 1024
+
+    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
+
+    return max(job_limit, 1)
 
 
 def _get_launch_parallelism() -> int:
     cpus = os.cpu_count()
-    return cpus *
+    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
 
 
 def _can_start_new_job() -> bool:
sky/jobs/utils.py
CHANGED
@@ -965,7 +965,8 @@ def format_job_table(
         'STATUS',
     ]
     if show_all:
-
+        # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
+        columns += ['STARTED', 'CLUSTER', 'REGION', 'SCHED. STATE', 'DETAILS']
     if tasks_have_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)
@@ -984,20 +985,10 @@ def format_job_table(
         # by the task_id.
         jobs[get_hash(task)].append(task)
 
-    def generate_details(failure_reason: Optional[str],
-                         schedule_state: Optional[str]) -> str:
-        description = ''
-        if schedule_state is not None:
-            description += f'Scheduler: {schedule_state}'
-            if failure_reason is not None:
-                description += ', '
+    def generate_details(failure_reason: Optional[str]) -> str:
         if failure_reason is not None:
-
-
-        if description == '':
-            return '-'
-
-        return description
+            return f'Failure: {failure_reason}'
+        return '-'
 
 
     for job_hash, job_tasks in jobs.items():
@@ -1050,13 +1041,13 @@ def format_job_table(
                 status_str,
             ]
             if show_all:
-                schedule_state = job_tasks[0]['schedule_state']
                 failure_reason = job_tasks[current_task_id]['failure_reason']
                 job_values.extend([
                     '-',
                     '-',
                     '-',
-
+                    job_tasks[0]['schedule_state'],
+                    generate_details(failure_reason),
                 ])
             if tasks_have_user:
                 job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1087,14 +1078,14 @@ def format_job_table(
             # schedule_state is only set at the job level, so if we have
             # more than one task, only display on the aggregated row.
             schedule_state = (task['schedule_state']
-                              if len(job_tasks) == 1 else
+                              if len(job_tasks) == 1 else '-')
             values.extend([
                 # STARTED
                 log_utils.readable_time_duration(task['start_at']),
                 task['cluster_resources'],
                 task['region'],
-
-
+                schedule_state,
+                generate_details(task['failure_reason']),
             ])
             if tasks_have_user:
                 values.insert(0, task.get('user', '-'))
sky/provision/__init__.py
CHANGED
@@ -22,6 +22,7 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
 from sky.utils import timeline
sky/provision/vast/__init__.py
ADDED
@@ -0,0 +1,10 @@
+"""Vast provisioner for SkyPilot."""
+
+from sky.provision.vast.config import bootstrap_instances
+from sky.provision.vast.instance import cleanup_ports
+from sky.provision.vast.instance import get_cluster_info
+from sky.provision.vast.instance import query_instances
+from sky.provision.vast.instance import run_instances
+from sky.provision.vast.instance import stop_instances
+from sky.provision.vast.instance import terminate_instances
+from sky.provision.vast.instance import wait_instances
sky/provision/vast/config.py
ADDED
@@ -0,0 +1,11 @@
+"""Vast configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config
sky/provision/vast/instance.py
ADDED
@@ -0,0 +1,247 @@
+"""Vast instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+from sky.provision.vast import utils
+from sky.utils import common_utils
+from sky.utils import ux_utils
+
+POLL_INTERVAL = 10
+
+logger = sky_logging.init_logger(__name__)
+# a much more convenient method
+status_filter = lambda machine_dict, stat_list: {
+    k: v for k, v in machine_dict.items() if v['status'] in stat_list
+}
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]],
+                      head_only: bool = False) -> Dict[str, Any]:
+
+    instances = utils.list_instances()
+    possible_names = [f'{cluster_name_on_cloud}-head']
+    if not head_only:
+        possible_names.append(f'{cluster_name_on_cloud}-worker')
+
+    filtered_instances = {}
+    for instance_id, instance in instances.items():
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        if instance.get('name') in possible_names:
+            filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            return inst_id
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+    pending_status = ['CREATED', 'RESTARTING']
+
+    created_instance_ids = []
+    instances: Dict[str, Any] = {}
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, None)
+        if not status_filter(instances, pending_status):
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready.')
+        time.sleep(POLL_INTERVAL)
+
+    running_instances = status_filter(instances, ['RUNNING'])
+    head_instance_id = _get_head_instance_id(running_instances)
+    stopped_instances = status_filter(instances, ['EXITED', 'STOPPED'])
+
+    if config.resume_stopped_nodes and stopped_instances:
+        for instance in stopped_instances.values():
+            utils.start(instance['id'])
+    else:
+        to_start_count = config.count - (len(running_instances) +
+                                         len(stopped_instances))
+        if to_start_count < 0:
+            raise RuntimeError(f'Cluster {cluster_name_on_cloud} already has '
+                               f'{len(running_instances)} nodes,'
+                               f'but {config.count} are required.')
+        if to_start_count == 0:
+            if head_instance_id is None:
+                raise RuntimeError(
+                    f'Cluster {cluster_name_on_cloud} has no head node.')
+            logger.info(
+                f'Cluster {cluster_name_on_cloud} already has '
+                f'{len(running_instances)} nodes, no need to start more.')
+            return common.ProvisionRecord(provider_name='vast',
+                                          cluster_name=cluster_name_on_cloud,
+                                          region=region,
+                                          zone=None,
+                                          head_instance_id=head_instance_id,
+                                          resumed_instance_ids=[],
+                                          created_instance_ids=[])
+
+        for _ in range(to_start_count):
+            node_type = 'head' if head_instance_id is None else 'worker'
+            try:
+                instance_id = utils.launch(
+                    name=f'{cluster_name_on_cloud}-{node_type}',
+                    instance_type=config.node_config['InstanceType'],
+                    region=region,
+                    disk_size=config.node_config['DiskSize'],
+                    preemptible=config.node_config['Preemptible'],
+                    image_name=config.node_config['ImageId'])
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(f'run_instances error: {e}')
+                raise
+            logger.info(f'Launched instance {instance_id}.')
+            created_instance_ids.append(instance_id)
+            if head_instance_id is None:
+                head_instance_id = instance_id
+
+    # Wait for instances to be ready.
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+        ready_instance_cnt = 0
+        for instance_id, instance in instances.items():
+            if instance.get('ssh_port') is not None:
+                ready_instance_cnt += 1
+        logger.info('Waiting for instances to be ready: '
+                    f'({ready_instance_cnt}/{config.count}).')
+        if ready_instance_cnt == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+
+    head_instance_id = _get_head_instance_id(utils.list_instances())
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(provider_name='vast',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=None,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=[],
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('stop', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('remove', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def action_instances(
+    fn: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    for inst_id, inst in instances.items():
+        logger.debug(f'Instance {fn} {inst_id}: {inst}')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            getattr(utils, fn)(inst_id)
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to {fn} instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['local_ipaddrs'].strip(),
+                external_ip=instance_info['public_ipaddr'],
+                ssh_port=instance_info['ports']['22/tcp'][0]['HostPort'],
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='vast',
+        provider_config=provider_config,
+    )
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    raise NotImplementedError('open_ports is not supported for Vast')
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    # "running", "frozen", "stopped", "unknown", "loading"
+    status_map = {
+        'LOADING': status_lib.ClusterStatus.INIT,
+        'EXITED': status_lib.ClusterStatus.STOPPED,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'RUNNING': status_lib.ClusterStatus.UP,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = status
+    return statuses
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
sky/provision/vast/utils.py
ADDED
@@ -0,0 +1,161 @@
+# pylint: disable=assignment-from-no-return
+#
+# The pylint exception above is an accommodation for
+# false positives generated by pylint for the Vast
+# python sdk.
+#
+"""Vast library wrapper for SkyPilot."""
+from typing import Any, Dict, List
+
+from sky import sky_logging
+from sky.adaptors import vast
+
+logger = sky_logging.init_logger(__name__)
+
+
+def list_instances() -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    instances = vast.vast().show_instances()
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances:
+        instance['id'] = str(instance['id'])
+        info = instance
+
+        if isinstance(instance['actual_status'], str):
+            info['status'] = instance['actual_status'].upper()
+        else:
+            info['status'] = 'UNKNOWN'
+        info['name'] = instance['label']
+
+        instance_dict[instance['id']] = info
+
+    return instance_dict
+
+
+def launch(name: str, instance_type: str, region: str, disk_size: int,
+           image_name: str, preemptible: bool) -> str:
+    """Launches an instance with the given parameters.
+
+    Converts the instance_type to the Vast GPU name, finds the specs for the
+    GPU, and launches the instance.
+
+    Notes:
+
+      * `georegion`: This is a feature flag to provide an additional
+        scope of geographical specificity while maintaining backward
+        compatibility.
+
+      * `chunked`: This is a feature flag to give breadth to the
+        snowflake nature of the vast catalog marketplace. It rounds
+        down various specifications of machines to emulate an instance
+        type and make them more interchangeable.
+
+      * `disk_size`: We look for instances that are of the requested
+        size or greater than it. For instance, `disk_size=100` might
+        return something with `disk_size` at 102 or even 1000.
+
+        The disk size {xx} GB may not exactly match the requested
+        size {yy} GB, so extra disk cost may be charged.
+
+      * `geolocation`: Geolocation on Vast can be as specific as the
+        host chooses to be. They can say, for instance, "Yutakachō,
+        Shinagawa District, Tokyo, JP." Such a specific geolocation
+        as ours would fail to return this host in a simple string
+        comparison if a user searched for "JP".
+
+        Since, regardless of specificity, all our geolocations end
+        in two-letter country codes, we just snip that to conform
+        to how many providers state their geolocation.
+
+      * Since the catalog is cached, we can't guarantee availability
+        of any machine at the point of inquiry. As a consequence we
+        search for the machine again and potentially return a failure
+        if there is no availability.
+
+      * We pass in the cpu_ram here as a guarantor to make sure the
+        instance we match with will be compliant with the requested
+        amount of memory.
+
+      * Vast instance types are an invention for skypilot. Refer to
+        service_catalog/vast_catalog.py for the current construction
+        of the type.
+
+    """
+    cpu_ram = float(instance_type.split('-')[-1]) / 1024
+    gpu_name = instance_type.split('-')[1].replace('_', ' ')
+    num_gpus = int(instance_type.split('-')[0].replace('x', ''))
+
+    query = ' '.join([
+        'chunked=true',
+        'georegion=true',
+        f'geolocation="{region[-2:]}"',
+        f'disk_space>={disk_size}',
+        f'num_gpus={num_gpus}',
+        f'gpu_name="{gpu_name}"',
+        f'cpu_ram>="{cpu_ram}"',
+    ])
+
+    instance_list = vast.vast().search_offers(query=query)
+
+    if isinstance(instance_list, int) or len(instance_list) == 0:
+        raise RuntimeError('Failed to create instances, could not find an '
+                           f'offer that satisfies the requirements "{query}".')
+
+    instance_touse = instance_list[0]
+
+    launch_params = {
+        'id': instance_touse['id'],
+        'direct': True,
+        'ssh': True,
+        'env': '-e __SOURCE=skypilot',
+        'onstart_cmd': ';'.join([
+            'touch ~/.no_auto_tmux',
+            f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
+        ]),
+        'label': name,
+        'image': image_name
+    }
+
+    if preemptible:
+        launch_params['min_bid'] = instance_touse['min_bid']
+
+    new_instance_contract = vast.vast().create_instance(**launch_params)
+
+    new_instance = vast.vast().show_instance(
+        id=new_instance_contract['new_contract'])
+
+    return new_instance['id']
+
+
+def start(instance_id: str) -> None:
+    """Starts the given instance."""
+    vast.vast().start_instance(id=instance_id)
+
+
+def stop(instance_id: str) -> None:
+    """Stops the given instance."""
+    vast.vast().stop_instance(id=instance_id)
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    vast.vast().destroy_instance(id=instance_id)
+
+
+def get_ssh_ports(cluster_name: str) -> List[int]:
+    """Gets the SSH ports for the given cluster."""
+    logger.debug(f'Getting SSH ports for cluster {cluster_name}.')
+
+    instances = list_instances()
+    possible_names = [f'{cluster_name}-head', f'{cluster_name}-worker']
+
+    ssh_ports = []
+
+    for instance in instances.values():
+        if instance['name'] in possible_names:
+            ssh_ports.append(instance['ssh_port'])
+    assert ssh_ports, (
+        f'Could not find any instances for cluster {cluster_name}.')
+
+    return ssh_ports
sky/setup_files/dependencies.py
CHANGED
@@ -131,6 +131,7 @@ extras_require: Dict[str, List[str]] = {
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
     'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
+    'vast': ['vastai-sdk>=0.1.12'],
     'vsphere': [
         'pyvmomi==8.0.1.0.2',
         # vsphere-automation-sdk is also required, but it does not have
sky/templates/vast-ray.yml.j2
ADDED
@@ -0,0 +1,70 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of worker nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.vast
+  region: "{{region}}"
+  disable_launch_config_check: true
+
+auth:
+  ssh_user: root
+  ssh_private_key: {{ssh_private_key}}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      DiskSize: {{disk_size}}
+      ImageId: {{image_id}}
+      Preemptible: {{use_spot}}
+      PublicKey: |-
+        skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+#   current num items (num SSH connections): 1
+setup_commands:
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+
+
+# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/utils/controller_utils.py
CHANGED
@@ -261,6 +261,11 @@ def _get_cloud_dependencies_installation_commands(
             if controller != Controllers.JOBS_CONTROLLER:
                 # We only need IBM deps on the jobs controller.
                 cloud_python_dependencies = []
+        elif isinstance(cloud, clouds.Vast):
+            step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
+            commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
+                            'pip list | grep vastai_sdk > /dev/null 2>&1 || '
+                            'pip install "vastai_sdk>=0.1.12" > /dev/null 2>&1')
 
         python_packages.update(cloud_python_dependencies)
 
{skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250206.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250204
+Version: 1.0.0.dev20250206
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -99,6 +99,8 @@ Provides-Extra: do
 Requires-Dist: pydo>=0.3.0; extra == "do"
 Requires-Dist: azure-core>=1.24.0; extra == "do"
 Requires-Dist: azure-common; extra == "do"
+Provides-Extra: vast
+Requires-Dist: vastai-sdk>=0.1.12; extra == "vast"
 Provides-Extra: vsphere
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
 Provides-Extra: all
@@ -145,6 +147,7 @@ Requires-Dist: cudo-compute>=0.1.10; extra == "all"
 Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: azure-core>=1.24.0; extra == "all"
 Requires-Dist: azure-common; extra == "all"
+Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
 Dynamic: author
 Dynamic: classifier