skypilot-nightly 1.0.0.dev20250203__py3-none-any.whl → 1.0.0.dev20250205__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/vast.py +29 -0
- sky/authentication.py +18 -0
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +1 -0
- sky/check.py +2 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/vast.py +279 -0
- sky/jobs/dashboard/dashboard.py +156 -20
- sky/jobs/dashboard/templates/index.html +557 -78
- sky/jobs/scheduler.py +14 -5
- sky/provision/__init__.py +1 -0
- sky/provision/lambda_cloud/instance.py +17 -1
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +161 -0
- sky/serve/serve_state.py +23 -21
- sky/setup_files/dependencies.py +1 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/utils/controller_utils.py +5 -0
- {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/RECORD +31 -22
- {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -60,6 +60,14 @@ logger = sky_logging.init_logger('sky.jobs.controller')
 _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
+# Based on testing, assume a running job uses 350MB memory.
+JOB_MEMORY_MB = 350
+# Past 2000 simultaneous jobs, we become unstable.
+# See https://github.com/skypilot-org/skypilot/issues/4649.
+MAX_JOB_LIMIT = 2000
+# Number of ongoing launches allowed per CPU.
+LAUNCHES_PER_CPU = 4
+
 
 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
@@ -247,15 +255,16 @@ def _set_alive_waiting(job_id: int) -> None:
 
 
 def _get_job_parallelism() -> int:
-
-
-
-
+    job_memory = JOB_MEMORY_MB * 1024 * 1024
+
+    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
+
+    return max(job_limit, 1)
 
 
 def _get_launch_parallelism() -> int:
     cpus = os.cpu_count()
-    return cpus *
+    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
 
 
 def _can_start_new_job() -> bool:
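A minimal sketch of how these new limits combine, assuming `psutil` (already a scheduler dependency) and an illustrative machine size:

    import os
    import psutil

    JOB_MEMORY_MB = 350      # assumed per-job memory footprint (from the diff above)
    MAX_JOB_LIMIT = 2000     # hard cap to avoid instability
    LAUNCHES_PER_CPU = 4

    # e.g. a 16 GiB controller: min(16 GiB // 350 MiB, 2000) == 46 parallel jobs
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    job_parallelism = max(
        min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT), 1)

    cpus = os.cpu_count()
    launch_parallelism = cpus * LAUNCHES_PER_CPU if cpus is not None else 1
    print(job_parallelism, launch_parallelism)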
sky/provision/__init__.py
CHANGED
@@ -22,6 +22,7 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
 from sky.utils import timeline
sky/provision/lambda_cloud/instance.py
CHANGED
@@ -64,6 +64,21 @@ def _get_ssh_key_name(prefix: str = '') -> str:
     return name
 
 
+def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
+    private_ip = instance_info.get('private_ip')
+    if private_ip is None:
+        if single_node:
+            # The Lambda cloud API may return an instance info without
+            # private IP. It does not align with their docs, but we still
+            # allow single-node cluster to proceed with provisioning, by using
+            # 127.0.0.1, as private IP is not critical for single-node case.
+            return '127.0.0.1'
+        msg = f'Failed to retrieve private IP for instance {instance_info}.'
+        logger.error(msg)
+        raise RuntimeError(msg)
+    return private_ip
+
+
 def run_instances(region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster"""
@@ -197,13 +212,14 @@ def get_cluster_info(
 ) -> common.ClusterInfo:
     del region  # unused
     running_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+    single_node = len(running_instances) == 1
     instances: Dict[str, List[common.InstanceInfo]] = {}
     head_instance_id = None
     for instance_id, instance_info in running_instances.items():
         instances[instance_id] = [
             common.InstanceInfo(
                 instance_id=instance_id,
-                internal_ip=instance_info
+                internal_ip=_get_private_ip(instance_info, single_node),
                 external_ip=instance_info['ip'],
                 ssh_port=22,
                 tags={},
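In effect, a single-node Lambda cluster whose API response lacks `private_ip` now falls back to loopback instead of failing provisioning. A self-contained restatement of that fallback, with made-up payloads:

    # Simplified restatement of the helper added above; payloads are illustrative.
    def get_private_ip(instance_info: dict, single_node: bool) -> str:
        private_ip = instance_info.get('private_ip')
        if private_ip is None:
            if single_node:
                # Private IP is not critical when there is only one node.
                return '127.0.0.1'
            raise RuntimeError(
                f'Failed to retrieve private IP for instance {instance_info}.')
        return private_ip

    print(get_private_ip({'ip': '203.0.113.7'}, single_node=True))        # 127.0.0.1
    print(get_private_ip({'ip': '203.0.113.7', 'private_ip': '10.0.0.5'},
                         single_node=False))                               # 10.0.0.5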
sky/provision/vast/__init__.py
ADDED
@@ -0,0 +1,10 @@
+"""Vast provisioner for SkyPilot."""
+
+from sky.provision.vast.config import bootstrap_instances
+from sky.provision.vast.instance import cleanup_ports
+from sky.provision.vast.instance import get_cluster_info
+from sky.provision.vast.instance import query_instances
+from sky.provision.vast.instance import run_instances
+from sky.provision.vast.instance import stop_instances
+from sky.provision.vast.instance import terminate_instances
+from sky.provision.vast.instance import wait_instances
sky/provision/vast/config.py
ADDED
@@ -0,0 +1,11 @@
+"""Vast configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config
sky/provision/vast/instance.py
ADDED
@@ -0,0 +1,247 @@
+"""Vast instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+from sky.provision.vast import utils
+from sky.utils import common_utils
+from sky.utils import ux_utils
+
+POLL_INTERVAL = 10
+
+logger = sky_logging.init_logger(__name__)
+# a much more convenient method
+status_filter = lambda machine_dict, stat_list: {
+    k: v for k, v in machine_dict.items() if v['status'] in stat_list
+}
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]],
+                      head_only: bool = False) -> Dict[str, Any]:
+
+    instances = utils.list_instances()
+    possible_names = [f'{cluster_name_on_cloud}-head']
+    if not head_only:
+        possible_names.append(f'{cluster_name_on_cloud}-worker')
+
+    filtered_instances = {}
+    for instance_id, instance in instances.items():
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        if instance.get('name') in possible_names:
+            filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            return inst_id
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+    pending_status = ['CREATED', 'RESTARTING']
+
+    created_instance_ids = []
+    instances: Dict[str, Any] = {}
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, None)
+        if not status_filter(instances, pending_status):
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready.')
+        time.sleep(POLL_INTERVAL)
+
+    running_instances = status_filter(instances, ['RUNNING'])
+    head_instance_id = _get_head_instance_id(running_instances)
+    stopped_instances = status_filter(instances, ['EXITED', 'STOPPED'])
+
+    if config.resume_stopped_nodes and stopped_instances:
+        for instance in stopped_instances.values():
+            utils.start(instance['id'])
+    else:
+        to_start_count = config.count - (len(running_instances) +
+                                         len(stopped_instances))
+        if to_start_count < 0:
+            raise RuntimeError(f'Cluster {cluster_name_on_cloud} already has '
+                               f'{len(running_instances)} nodes,'
+                               f'but {config.count} are required.')
+        if to_start_count == 0:
+            if head_instance_id is None:
+                raise RuntimeError(
+                    f'Cluster {cluster_name_on_cloud} has no head node.')
+            logger.info(
+                f'Cluster {cluster_name_on_cloud} already has '
+                f'{len(running_instances)} nodes, no need to start more.')
+            return common.ProvisionRecord(provider_name='vast',
+                                          cluster_name=cluster_name_on_cloud,
+                                          region=region,
+                                          zone=None,
+                                          head_instance_id=head_instance_id,
+                                          resumed_instance_ids=[],
+                                          created_instance_ids=[])
+
+        for _ in range(to_start_count):
+            node_type = 'head' if head_instance_id is None else 'worker'
+            try:
+                instance_id = utils.launch(
+                    name=f'{cluster_name_on_cloud}-{node_type}',
+                    instance_type=config.node_config['InstanceType'],
+                    region=region,
+                    disk_size=config.node_config['DiskSize'],
+                    preemptible=config.node_config['Preemptible'],
+                    image_name=config.node_config['ImageId'])
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(f'run_instances error: {e}')
+                raise
+            logger.info(f'Launched instance {instance_id}.')
+            created_instance_ids.append(instance_id)
+            if head_instance_id is None:
+                head_instance_id = instance_id
+
+    # Wait for instances to be ready.
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+        ready_instance_cnt = 0
+        for instance_id, instance in instances.items():
+            if instance.get('ssh_port') is not None:
+                ready_instance_cnt += 1
+        logger.info('Waiting for instances to be ready: '
+                    f'({ready_instance_cnt}/{config.count}).')
+        if ready_instance_cnt == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+
+    head_instance_id = _get_head_instance_id(utils.list_instances())
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(provider_name='vast',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=None,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=[],
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('stop', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('remove', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def action_instances(
+    fn: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    for inst_id, inst in instances.items():
+        logger.debug(f'Instance {fn} {inst_id}: {inst}')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            getattr(utils, fn)(inst_id)
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to {fn} instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['local_ipaddrs'].strip(),
+                external_ip=instance_info['public_ipaddr'],
+                ssh_port=instance_info['ports']['22/tcp'][0]['HostPort'],
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='vast',
+        provider_config=provider_config,
+    )
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    raise NotImplementedError('open_ports is not supported for Vast')
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    # "running", "frozen", "stopped", "unknown", "loading"
+    status_map = {
+        'LOADING': status_lib.ClusterStatus.INIT,
+        'EXITED': status_lib.ClusterStatus.STOPPED,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'RUNNING': status_lib.ClusterStatus.UP,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = status
+    return statuses
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
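The `status_filter` lambda near the top of this file is just a dictionary comprehension keyed on instance status; a toy illustration (the data below is made up, not real Vast API output):

    status_filter = lambda machine_dict, stat_list: {
        k: v for k, v in machine_dict.items() if v['status'] in stat_list
    }

    machines = {
        '1001': {'status': 'RUNNING', 'name': 'mycluster-head'},
        '1002': {'status': 'EXITED', 'name': 'mycluster-worker'},
    }
    print(status_filter(machines, ['RUNNING']))            # only the head node
    print(status_filter(machines, ['EXITED', 'STOPPED']))  # only the stopped worker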
sky/provision/vast/utils.py
ADDED
@@ -0,0 +1,161 @@
+# pylint: disable=assignment-from-no-return
+#
+# The pylint exception above is an accommodation for
+# false positives generated by pylint for the Vast
+# python sdk.
+#
+"""Vast library wrapper for SkyPilot."""
+from typing import Any, Dict, List
+
+from sky import sky_logging
+from sky.adaptors import vast
+
+logger = sky_logging.init_logger(__name__)
+
+
+def list_instances() -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    instances = vast.vast().show_instances()
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances:
+        instance['id'] = str(instance['id'])
+        info = instance
+
+        if isinstance(instance['actual_status'], str):
+            info['status'] = instance['actual_status'].upper()
+        else:
+            info['status'] = 'UNKNOWN'
+        info['name'] = instance['label']
+
+        instance_dict[instance['id']] = info
+
+    return instance_dict
+
+
+def launch(name: str, instance_type: str, region: str, disk_size: int,
+           image_name: str, preemptible: bool) -> str:
+    """Launches an instance with the given parameters.
+
+    Converts the instance_type to the Vast GPU name, finds the specs for the
+    GPU, and launches the instance.
+
+    Notes:
+
+      * `georegion`: This is a feature flag to provide an additional
+        scope of geographical specificity while maintaining backward
+        compatibility.
+
+      * `chunked`: This is a feature flag to give breadth to the
+        snowflake nature of the vast catalog marketplace. It rounds
+        down various specifications of machines to emulate an instance
+        type and make them more interchangeable.
+
+      * `disk_size`: We look for instances that are of the requested
+        size or greater than it. For instance, `disk_size=100` might
+        return something with `disk_size` at 102 or even 1000.
+
+        The allocated disk size {xx} GB may not exactly match the
+        requested size {yy} GB, and the extra disk space may incur
+        additional cost.
+
+      * `geolocation`: Geolocation on Vast can be as specific as the
+        host chooses to be. They can say, for instance, "Yutakachō,
+        Shinagawa District, Tokyo, JP." Such a specific geolocation
+        as ours would fail to return this host in a simple string
+        comparison if a user searched for "JP".
+
+        Since regardless of specificity, all our geolocations end
+        in two-letter country codes we just snip that to conform
+        to how many providers state their geolocation.
+
+      * Since the catalog is cached, we can't guarantee availability
+        of any machine at the point of inquiry. As a consequence we
+        search for the machine again and potentially return a failure
+        if there is no availability.
+
+      * We pass in the cpu_ram here as a guarantor to make sure the
+        instance we match with will be compliant with the requested
+        amount of memory.
+
+      * Vast instance types are an invention for skypilot. Refer to
+        service_catalog/vast_catalog.py for the current construction
+        of the type.
+
+    """
+    cpu_ram = float(instance_type.split('-')[-1]) / 1024
+    gpu_name = instance_type.split('-')[1].replace('_', ' ')
+    num_gpus = int(instance_type.split('-')[0].replace('x', ''))
+
+    query = ' '.join([
+        'chunked=true',
+        'georegion=true',
+        f'geolocation="{region[-2:]}"',
+        f'disk_space>={disk_size}',
+        f'num_gpus={num_gpus}',
+        f'gpu_name="{gpu_name}"',
+        f'cpu_ram>="{cpu_ram}"',
+    ])
+
+    instance_list = vast.vast().search_offers(query=query)
+
+    if isinstance(instance_list, int) or len(instance_list) == 0:
+        raise RuntimeError('Failed to create instances, could not find an '
+                           f'offer that satisfies the requirements "{query}".')
+
+    instance_touse = instance_list[0]
+
+    launch_params = {
+        'id': instance_touse['id'],
+        'direct': True,
+        'ssh': True,
+        'env': '-e __SOURCE=skypilot',
+        'onstart_cmd': ';'.join([
+            'touch ~/.no_auto_tmux',
+            f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
+        ]),
+        'label': name,
+        'image': image_name
+    }
+
+    if preemptible:
+        launch_params['min_bid'] = instance_touse['min_bid']
+
+    new_instance_contract = vast.vast().create_instance(**launch_params)
+
+    new_instance = vast.vast().show_instance(
+        id=new_instance_contract['new_contract'])
+
+    return new_instance['id']
+
+
+def start(instance_id: str) -> None:
+    """Starts the given instance."""
+    vast.vast().start_instance(id=instance_id)
+
+
+def stop(instance_id: str) -> None:
+    """Stops the given instance."""
+    vast.vast().stop_instance(id=instance_id)
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    vast.vast().destroy_instance(id=instance_id)
+
+
+def get_ssh_ports(cluster_name: str) -> List[int]:
+    """Gets the SSH ports for the given cluster."""
+    logger.debug(f'Getting SSH ports for cluster {cluster_name}.')
+
+    instances = list_instances()
+    possible_names = [f'{cluster_name}-head', f'{cluster_name}-worker']
+
+    ssh_ports = []
+
+    for instance in instances.values():
+        if instance['name'] in possible_names:
+            ssh_ports.append(instance['ssh_port'])
+    assert ssh_ports, (
+        f'Could not find any instances for cluster {cluster_name}.')
+
+    return ssh_ports
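To make the string munging in `launch` concrete: judging from the parsing above, the instance type appears to follow the pattern `<num_gpus>x-<GPU_NAME>-<cpu_ram_in_MB>`, which is decomposed into a Vast search query. The type below is hypothetical, not a real catalog entry:

    instance_type = '1x-RTX_4090-65536'                            # hypothetical
    cpu_ram = float(instance_type.split('-')[-1]) / 1024           # 64.0 (GB)
    gpu_name = instance_type.split('-')[1].replace('_', ' ')       # 'RTX 4090'
    num_gpus = int(instance_type.split('-')[0].replace('x', ''))   # 1

    query = ' '.join([
        'chunked=true',
        'georegion=true',
        'geolocation="US"',        # last two letters of the requested region
        'disk_space>=100',
        f'num_gpus={num_gpus}',
        f'gpu_name="{gpu_name}"',
        f'cpu_ram>="{cpu_ram}"',
    ])
    print(query)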
sky/serve/serve_state.py
CHANGED
@@ -55,33 +55,35 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
         PRIMARY KEY (service_name, replica_id))""")
     cursor.execute("""\
         CREATE TABLE IF NOT EXISTS version_specs (
-        version INTEGER,
+        version INTEGER,
         service_name TEXT,
         spec BLOB,
         PRIMARY KEY (service_name, version))""")
     conn.commit()
 
+    # Backward compatibility.
+    db_utils.add_column_to_table(cursor, conn, 'services',
+                                 'requested_resources_str', 'TEXT')
+    # Deprecated: switched to `active_versions` below for the version
+    # considered active by the load balancer. The
+    # authscaler/replica_manager version can be found in the
+    # version_specs table.
+    db_utils.add_column_to_table(
+        cursor, conn, 'services', 'current_version',
+        f'INTEGER DEFAULT {constants.INITIAL_VERSION}')
+    # The versions that is activated for the service. This is a list
+    # of integers in json format.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'active_versions',
+                                 f'TEXT DEFAULT {json.dumps([])!r}')
+    db_utils.add_column_to_table(cursor, conn, 'services',
+                                 'load_balancing_policy', 'TEXT DEFAULT NULL')
+    # Whether the service's load balancer is encrypted with TLS.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'tls_encrypted',
+                                 'INTEGER DEFAULT 0')
+    conn.commit()
+
 
-
-# Backward compatibility.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'requested_resources_str', 'TEXT')
-# Deprecated: switched to `active_versions` below for the version considered
-# active by the load balancer. The authscaler/replica_manager version can be
-# found in the version_specs table.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'current_version',
-                             f'INTEGER DEFAULT {constants.INITIAL_VERSION}')
-# The versions that is activated for the service. This is a list of integers in
-# json format.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'active_versions',
-                             f'TEXT DEFAULT {json.dumps([])!r}')
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'load_balancing_policy', 'TEXT DEFAULT NULL')
-# Whether the service's load balancer is encrypted with TLS.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services', 'tls_encrypted',
-                             'INTEGER DEFAULT 0')
+db_utils.SQLiteConn(_DB_PATH, create_table)
 _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG = 'UNIQUE constraint failed: services.name'
 
 
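The net effect of this change is that the backward-compatibility column additions now run inside `create_table`, on the same cursor and connection that created the tables, instead of at module import time against a `_DB` global. A minimal, self-contained sketch of that pattern with plain `sqlite3` (the helper below is a simplified stand-in for SkyPilot's `db_utils.add_column_to_table`, not its actual implementation):

    import sqlite3

    def add_column_to_table(cursor, conn, table, column, decl):
        # Tolerate re-runs: the column may already exist on upgraded databases.
        try:
            cursor.execute(f'ALTER TABLE {table} ADD COLUMN {column} {decl}')
        except sqlite3.OperationalError as e:
            if 'duplicate column' not in str(e):
                raise
        conn.commit()

    def create_table(cursor, conn):
        cursor.execute('CREATE TABLE IF NOT EXISTS services (name TEXT PRIMARY KEY)')
        conn.commit()
        # Migrations run here, so the table is guaranteed to exist first.
        add_column_to_table(cursor, conn, 'services', 'tls_encrypted',
                            'INTEGER DEFAULT 0')

    conn = sqlite3.connect(':memory:')
    create_table(conn.cursor(), conn)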
sky/setup_files/dependencies.py
CHANGED
@@ -131,6 +131,7 @@ extras_require: Dict[str, List[str]] = {
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
     'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
+    'vast': ['vastai-sdk>=0.1.12'],
     'vsphere': [
         'pyvmomi==8.0.1.0.2',
         # vsphere-automation-sdk is also required, but it does not have
|
@@ -0,0 +1,70 @@
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
2
|
+
|
3
|
+
# The maximum number of workers nodes to launch in addition to the head node.
|
4
|
+
max_workers: {{num_nodes - 1}}
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
6
|
+
idle_timeout_minutes: 60
|
7
|
+
|
8
|
+
provider:
|
9
|
+
type: external
|
10
|
+
module: sky.provision.vast
|
11
|
+
region: "{{region}}"
|
12
|
+
disable_launch_config_check: true
|
13
|
+
|
14
|
+
auth:
|
15
|
+
ssh_user: root
|
16
|
+
ssh_private_key: {{ssh_private_key}}
|
17
|
+
|
18
|
+
available_node_types:
|
19
|
+
ray_head_default:
|
20
|
+
resources: {}
|
21
|
+
node_config:
|
22
|
+
InstanceType: {{instance_type}}
|
23
|
+
DiskSize: {{disk_size}}
|
24
|
+
ImageId: {{image_id}}
|
25
|
+
Preemptible: {{use_spot}}
|
26
|
+
PublicKey: |-
|
27
|
+
skypilot:ssh_public_key_content
|
28
|
+
|
29
|
+
head_node_type: ray_head_default
|
30
|
+
|
31
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
32
|
+
file_mounts: {
|
33
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
34
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
35
|
+
{%- for remote_path, local_path in credentials.items() %}
|
36
|
+
"{{remote_path}}": "{{local_path}}",
|
37
|
+
{%- endfor %}
|
38
|
+
}
|
39
|
+
|
40
|
+
rsync_exclude: []
|
41
|
+
|
42
|
+
initialization_commands: []
|
43
|
+
|
44
|
+
# List of shell commands to run to set up nodes.
|
45
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
46
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
47
|
+
# items!
|
48
|
+
#
|
49
|
+
# Increment the following for catching performance bugs easier:
|
50
|
+
# current num items (num SSH connections): 1
|
51
|
+
setup_commands:
|
52
|
+
# Create ~/.ssh/config file in case the file does not exist in the image.
|
53
|
+
# Line 'rm ..': there is another installation of pip.
|
54
|
+
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
55
|
+
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
|
56
|
+
# Line 'mkdir -p ..': disable host key check
|
57
|
+
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
58
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
59
|
+
{{ initial_setup_command }}
|
60
|
+
{%- endfor %}
|
61
|
+
mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
|
62
|
+
{{ conda_installation_commands }}
|
63
|
+
{{ ray_skypilot_installation_commands }}
|
64
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
65
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
66
|
+
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
67
|
+
|
68
|
+
|
69
|
+
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
70
|
+
# We do not need to list it here anymore.
|
sky/utils/controller_utils.py
CHANGED
@@ -261,6 +261,11 @@ def _get_cloud_dependencies_installation_commands(
             if controller != Controllers.JOBS_CONTROLLER:
                 # We only need IBM deps on the jobs controller.
                 cloud_python_dependencies = []
+        elif isinstance(cloud, clouds.Vast):
+            step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
+            commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
+                            'pip list | grep vastai_sdk > /dev/null 2>&1 || '
+                            'pip install "vastai_sdk>=0.1.12" > /dev/null 2>&1')
 
         python_packages.update(cloud_python_dependencies)
 
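For intuition, with hypothetical values for `prefix_str` and `empty_str` (both defined elsewhere in `controller_utils.py` and not shown in this diff), the appended controller setup command renders roughly as follows:

    # Hypothetical prefix/padding values; only the command structure comes from the diff.
    prefix_str = '[<step>/8] '
    empty_str = ' ' * 10
    commands = []

    step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
    commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
                    'pip list | grep vastai_sdk > /dev/null 2>&1 || '
                    'pip install "vastai_sdk>=0.1.12" > /dev/null 2>&1')
    print(commands[0])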