skypilot-nightly 1.0.0.dev20250203__py3-none-any.whl → 1.0.0.dev20250205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/vast.py +29 -0
  3. sky/authentication.py +18 -0
  4. sky/backends/backend_utils.py +4 -1
  5. sky/backends/cloud_vm_ray_backend.py +1 -0
  6. sky/check.py +2 -2
  7. sky/clouds/__init__.py +2 -0
  8. sky/clouds/service_catalog/constants.py +1 -1
  9. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  10. sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
  11. sky/clouds/service_catalog/vast_catalog.py +104 -0
  12. sky/clouds/vast.py +279 -0
  13. sky/jobs/dashboard/dashboard.py +156 -20
  14. sky/jobs/dashboard/templates/index.html +557 -78
  15. sky/jobs/scheduler.py +14 -5
  16. sky/provision/__init__.py +1 -0
  17. sky/provision/lambda_cloud/instance.py +17 -1
  18. sky/provision/vast/__init__.py +10 -0
  19. sky/provision/vast/config.py +11 -0
  20. sky/provision/vast/instance.py +247 -0
  21. sky/provision/vast/utils.py +161 -0
  22. sky/serve/serve_state.py +23 -21
  23. sky/setup_files/dependencies.py +1 -0
  24. sky/templates/vast-ray.yml.j2 +70 -0
  25. sky/utils/controller_utils.py +5 -0
  26. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/METADATA +4 -1
  27. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/RECORD +31 -22
  28. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -60,6 +60,14 @@ logger = sky_logging.init_logger('sky.jobs.controller')
 _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
+# Based on testing, assume a running job uses 350MB memory.
+JOB_MEMORY_MB = 350
+# Past 2000 simultaneous jobs, we become unstable.
+# See https://github.com/skypilot-org/skypilot/issues/4649.
+MAX_JOB_LIMIT = 2000
+# Number of ongoing launches allowed per CPU.
+LAUNCHES_PER_CPU = 4
+
 
 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
@@ -247,15 +255,16 @@ def _set_alive_waiting(job_id: int) -> None:
 
 
 def _get_job_parallelism() -> int:
-    # Assume a running job uses 350MB memory.
-    # We observe 230-300 in practice.
-    job_memory = 350 * 1024 * 1024
-    return max(psutil.virtual_memory().total // job_memory, 1)
+    job_memory = JOB_MEMORY_MB * 1024 * 1024
+
+    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
+
+    return max(job_limit, 1)
 
 
 def _get_launch_parallelism() -> int:
     cpus = os.cpu_count()
-    return cpus * 4 if cpus is not None else 1
+    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
 
 
 def _can_start_new_job() -> bool:
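For orientation, a minimal self-contained sketch (not part of the package) of how the new scheduler limits interact. The constants mirror the diff above; the 16 GB / 8-CPU controller figures are made-up examples.

```python
# Illustrative only: mirrors the limits introduced in sky/jobs/scheduler.py.
JOB_MEMORY_MB = 350      # assumed per-job memory footprint
MAX_JOB_LIMIT = 2000     # hard cap on simultaneous jobs
LAUNCHES_PER_CPU = 4     # concurrent launches allowed per CPU


def job_parallelism(total_memory_bytes: int) -> int:
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    return max(min(total_memory_bytes // job_memory, MAX_JOB_LIMIT), 1)


def launch_parallelism(cpu_count: int) -> int:
    return cpu_count * LAUNCHES_PER_CPU


# Hypothetical controller with 16 GB of memory and 8 CPUs:
print(job_parallelism(16 * 1024 ** 3))  # 46 jobs (memory-bound, far below 2000)
print(launch_parallelism(8))            # 32 concurrent launches
```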
sky/provision/__init__.py CHANGED
@@ -22,6 +22,7 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
 from sky.utils import timeline
sky/provision/lambda_cloud/instance.py CHANGED
@@ -64,6 +64,21 @@ def _get_ssh_key_name(prefix: str = '') -> str:
     return name
 
 
+def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
+    private_ip = instance_info.get('private_ip')
+    if private_ip is None:
+        if single_node:
+            # The Lambda cloud API may return instance info without a
+            # private IP. This does not align with their docs, but we still
+            # let a single-node cluster proceed with provisioning by using
+            # 127.0.0.1, as the private IP is not critical in that case.
+            return '127.0.0.1'
+        msg = f'Failed to retrieve private IP for instance {instance_info}.'
+        logger.error(msg)
+        raise RuntimeError(msg)
+    return private_ip
+
+
 def run_instances(region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster"""
@@ -197,13 +212,14 @@ def get_cluster_info(
 ) -> common.ClusterInfo:
     del region  # unused
     running_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+    single_node = len(running_instances) == 1
     instances: Dict[str, List[common.InstanceInfo]] = {}
     head_instance_id = None
     for instance_id, instance_info in running_instances.items():
         instances[instance_id] = [
             common.InstanceInfo(
                 instance_id=instance_id,
-                internal_ip=instance_info['private_ip'],
+                internal_ip=_get_private_ip(instance_info, single_node),
                 external_ip=instance_info['ip'],
                 ssh_port=22,
                 tags={},
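As a rough illustration of the fallback above, a self-contained restatement of the logic with hypothetical payloads (real Lambda API responses contain more fields):

```python
from typing import Any, Dict


def get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
    """Minimal restatement of the _get_private_ip logic in the diff above."""
    private_ip = instance_info.get('private_ip')
    if private_ip is None:
        if single_node:
            # Tolerated for a single node: the private IP is never used.
            return '127.0.0.1'
        raise RuntimeError(f'No private IP for instance {instance_info}.')
    return private_ip


print(get_private_ip({'ip': '203.0.113.10'}, single_node=True))       # 127.0.0.1
print(get_private_ip({'ip': '203.0.113.11', 'private_ip': '10.0.0.5'},
                     single_node=False))                               # 10.0.0.5
```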
sky/provision/vast/__init__.py ADDED
@@ -0,0 +1,10 @@
+"""Vast provisioner for SkyPilot."""
+
+from sky.provision.vast.config import bootstrap_instances
+from sky.provision.vast.instance import cleanup_ports
+from sky.provision.vast.instance import get_cluster_info
+from sky.provision.vast.instance import query_instances
+from sky.provision.vast.instance import run_instances
+from sky.provision.vast.instance import stop_instances
+from sky.provision.vast.instance import terminate_instances
+from sky.provision.vast.instance import wait_instances
sky/provision/vast/config.py ADDED
@@ -0,0 +1,11 @@
+"""Vast configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config
sky/provision/vast/instance.py ADDED
@@ -0,0 +1,247 @@
+"""Vast instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+from sky.provision.vast import utils
+from sky.utils import common_utils
+from sky.utils import ux_utils
+
+POLL_INTERVAL = 10
+
+logger = sky_logging.init_logger(__name__)
+# A much more convenient method of filtering instances by status.
+status_filter = lambda machine_dict, stat_list: {
+    k: v for k, v in machine_dict.items() if v['status'] in stat_list
+}
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]],
+                      head_only: bool = False) -> Dict[str, Any]:
+
+    instances = utils.list_instances()
+    possible_names = [f'{cluster_name_on_cloud}-head']
+    if not head_only:
+        possible_names.append(f'{cluster_name_on_cloud}-worker')
+
+    filtered_instances = {}
+    for instance_id, instance in instances.items():
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        if instance.get('name') in possible_names:
+            filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            return inst_id
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+    pending_status = ['CREATED', 'RESTARTING']
+
+    created_instance_ids = []
+    instances: Dict[str, Any] = {}
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, None)
+        if not status_filter(instances, pending_status):
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready.')
+        time.sleep(POLL_INTERVAL)
+
+    running_instances = status_filter(instances, ['RUNNING'])
+    head_instance_id = _get_head_instance_id(running_instances)
+    stopped_instances = status_filter(instances, ['EXITED', 'STOPPED'])
+
+    if config.resume_stopped_nodes and stopped_instances:
+        for instance in stopped_instances.values():
+            utils.start(instance['id'])
+    else:
+        to_start_count = config.count - (len(running_instances) +
+                                         len(stopped_instances))
+        if to_start_count < 0:
+            raise RuntimeError(f'Cluster {cluster_name_on_cloud} already has '
+                               f'{len(running_instances)} nodes, '
+                               f'but {config.count} are required.')
+        if to_start_count == 0:
+            if head_instance_id is None:
+                raise RuntimeError(
+                    f'Cluster {cluster_name_on_cloud} has no head node.')
+            logger.info(
+                f'Cluster {cluster_name_on_cloud} already has '
+                f'{len(running_instances)} nodes, no need to start more.')
+            return common.ProvisionRecord(provider_name='vast',
+                                          cluster_name=cluster_name_on_cloud,
+                                          region=region,
+                                          zone=None,
+                                          head_instance_id=head_instance_id,
+                                          resumed_instance_ids=[],
+                                          created_instance_ids=[])
+
+        for _ in range(to_start_count):
+            node_type = 'head' if head_instance_id is None else 'worker'
+            try:
+                instance_id = utils.launch(
+                    name=f'{cluster_name_on_cloud}-{node_type}',
+                    instance_type=config.node_config['InstanceType'],
+                    region=region,
+                    disk_size=config.node_config['DiskSize'],
+                    preemptible=config.node_config['Preemptible'],
+                    image_name=config.node_config['ImageId'])
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(f'run_instances error: {e}')
+                raise
+            logger.info(f'Launched instance {instance_id}.')
+            created_instance_ids.append(instance_id)
+            if head_instance_id is None:
+                head_instance_id = instance_id
+
+    # Wait for instances to be ready.
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+        ready_instance_cnt = 0
+        for instance_id, instance in instances.items():
+            if instance.get('ssh_port') is not None:
+                ready_instance_cnt += 1
+        logger.info('Waiting for instances to be ready: '
+                    f'({ready_instance_cnt}/{config.count}).')
+        if ready_instance_cnt == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+
+    head_instance_id = _get_head_instance_id(utils.list_instances())
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(provider_name='vast',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=None,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=[],
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('stop', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('remove', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def action_instances(
+    fn: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    for inst_id, inst in instances.items():
+        logger.debug(f'Instance {fn} {inst_id}: {inst}')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            getattr(utils, fn)(inst_id)
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to {fn} instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['local_ipaddrs'].strip(),
+                external_ip=instance_info['public_ipaddr'],
+                ssh_port=instance_info['ports']['22/tcp'][0]['HostPort'],
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='vast',
+        provider_config=provider_config,
+    )
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    raise NotImplementedError('open_ports is not supported for Vast')
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    # Possible statuses: "running", "frozen", "stopped", "unknown", "loading".
+    status_map = {
+        'LOADING': status_lib.ClusterStatus.INIT,
+        'EXITED': status_lib.ClusterStatus.STOPPED,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'RUNNING': status_lib.ClusterStatus.UP,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = status
+    return statuses
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
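The module above leans on the small `status_filter` lambda to bucket instances by status at each step of `run_instances`. A self-contained sketch with toy instance dicts (not real Vast API payloads):

```python
from typing import Any, Dict, List


def filter_by_status(machines: Dict[str, Dict[str, Any]],
                     statuses: List[str]) -> Dict[str, Dict[str, Any]]:
    """Same shape as the status_filter lambda above: keep only the entries
    whose 'status' is in the given list."""
    return {k: v for k, v in machines.items() if v['status'] in statuses}


toy = {
    '111': {'name': 'mycluster-head', 'status': 'RUNNING'},
    '222': {'name': 'mycluster-worker', 'status': 'EXITED'},
}
print(filter_by_status(toy, ['CREATED', 'RESTARTING']))  # {} -> polling stops
print(filter_by_status(toy, ['RUNNING']))                # only the head node
print(filter_by_status(toy, ['EXITED', 'STOPPED']))      # only the worker
```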
sky/provision/vast/utils.py ADDED
@@ -0,0 +1,161 @@
+# pylint: disable=assignment-from-no-return
+#
+# The pylint exception above is an accommodation for
+# false positives generated by pylint for the Vast
+# python sdk.
+#
+"""Vast library wrapper for SkyPilot."""
+from typing import Any, Dict, List
+
+from sky import sky_logging
+from sky.adaptors import vast
+
+logger = sky_logging.init_logger(__name__)
+
+
+def list_instances() -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    instances = vast.vast().show_instances()
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances:
+        instance['id'] = str(instance['id'])
+        info = instance
+
+        if isinstance(instance['actual_status'], str):
+            info['status'] = instance['actual_status'].upper()
+        else:
+            info['status'] = 'UNKNOWN'
+        info['name'] = instance['label']
+
+        instance_dict[instance['id']] = info
+
+    return instance_dict
+
+
+def launch(name: str, instance_type: str, region: str, disk_size: int,
+           image_name: str, preemptible: bool) -> str:
+    """Launches an instance with the given parameters.
+
+    Converts the instance_type to the Vast GPU name, finds the specs for the
+    GPU, and launches the instance.
+
+    Notes:
+
+    * `georegion`: This is a feature flag to provide an additional
+      scope of geographical specificity while maintaining backward
+      compatibility.
+
+    * `chunked`: This is a feature flag to give breadth to the
+      snowflake nature of the Vast catalog marketplace. It rounds
+      down various specifications of machines to emulate an instance
+      type and make them more interchangeable.
+
+    * `disk_size`: We look for instances that are of the requested
+      size or greater. For instance, `disk_size=100` might
+      return something with `disk_size` at 102 or even 1000.
+
+      The disk size {xx} GB may not exactly match the requested
+      size {yy} GB, and the extra disk space may incur extra cost.
+
+    * `geolocation`: Geolocation on Vast can be as specific as the
+      host chooses to be. They can say, for instance, "Yutakachō,
+      Shinagawa District, Tokyo, JP." Such a specific geolocation
+      as ours would fail to return this host in a simple string
+      comparison if a user searched for "JP".
+
+      Since, regardless of specificity, all our geolocations end
+      in two-letter country codes, we just snip that to conform
+      to how many providers state their geolocation.
+
+    * Since the catalog is cached, we can't guarantee availability
+      of any machine at the point of inquiry. As a consequence we
+      search for the machine again and potentially return a failure
+      if there is no availability.
+
+    * We pass in the cpu_ram here as a guarantor to make sure the
+      instance we match with will be compliant with the requested
+      amount of memory.
+
+    * Vast instance types are an invention for SkyPilot. Refer to
+      service_catalog/vast_catalog.py for the current construction
+      of the type.
+
+    """
+    cpu_ram = float(instance_type.split('-')[-1]) / 1024
+    gpu_name = instance_type.split('-')[1].replace('_', ' ')
+    num_gpus = int(instance_type.split('-')[0].replace('x', ''))
+
+    query = ' '.join([
+        'chunked=true',
+        'georegion=true',
+        f'geolocation="{region[-2:]}"',
+        f'disk_space>={disk_size}',
+        f'num_gpus={num_gpus}',
+        f'gpu_name="{gpu_name}"',
+        f'cpu_ram>="{cpu_ram}"',
+    ])
+
+    instance_list = vast.vast().search_offers(query=query)
+
+    if isinstance(instance_list, int) or len(instance_list) == 0:
+        raise RuntimeError('Failed to create instances, could not find an '
+                           f'offer that satisfies the requirements "{query}".')
+
+    instance_touse = instance_list[0]
+
+    launch_params = {
+        'id': instance_touse['id'],
+        'direct': True,
+        'ssh': True,
+        'env': '-e __SOURCE=skypilot',
+        'onstart_cmd': ';'.join([
+            'touch ~/.no_auto_tmux',
+            f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
+        ]),
+        'label': name,
+        'image': image_name
+    }
+
+    if preemptible:
+        launch_params['min_bid'] = instance_touse['min_bid']
+
+    new_instance_contract = vast.vast().create_instance(**launch_params)
+
+    new_instance = vast.vast().show_instance(
+        id=new_instance_contract['new_contract'])
+
+    return new_instance['id']
+
+
+def start(instance_id: str) -> None:
+    """Starts the given instance."""
+    vast.vast().start_instance(id=instance_id)
+
+
+def stop(instance_id: str) -> None:
+    """Stops the given instance."""
+    vast.vast().stop_instance(id=instance_id)
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    vast.vast().destroy_instance(id=instance_id)
+
+
+def get_ssh_ports(cluster_name: str) -> List[int]:
+    """Gets the SSH ports for the given cluster."""
+    logger.debug(f'Getting SSH ports for cluster {cluster_name}.')
+
+    instances = list_instances()
+    possible_names = [f'{cluster_name}-head', f'{cluster_name}-worker']
+
+    ssh_ports = []
+
+    for instance in instances.values():
+        if instance['name'] in possible_names:
+            ssh_ports.append(instance['ssh_port'])
+    assert ssh_ports, (
+        f'Could not find any instances for cluster {cluster_name}.')
+
+    return ssh_ports
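The docstring above notes that Vast "instance types" are a SkyPilot invention. A short sketch of how one such string decomposes into a search query, following the parsing code in `launch()`; the instance type, region, and disk size below are hypothetical:

```python
# Hypothetical values; the format is '<num_gpus>x-<GPU_NAME>-<cpu_ram in MB>'.
instance_type = '1x-RTX_4090-65536'
region = 'Tokyo, JP'
disk_size = 100

cpu_ram = float(instance_type.split('-')[-1]) / 1024          # 64.0 (GB)
gpu_name = instance_type.split('-')[1].replace('_', ' ')      # 'RTX 4090'
num_gpus = int(instance_type.split('-')[0].replace('x', ''))  # 1

query = ' '.join([
    'chunked=true',
    'georegion=true',
    f'geolocation="{region[-2:]}"',  # only the trailing country code: "JP"
    f'disk_space>={disk_size}',
    f'num_gpus={num_gpus}',
    f'gpu_name="{gpu_name}"',
    f'cpu_ram>="{cpu_ram}"',
])
print(query)
```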
sky/serve/serve_state.py CHANGED
@@ -55,33 +55,35 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
         PRIMARY KEY (service_name, replica_id))""")
     cursor.execute("""\
         CREATE TABLE IF NOT EXISTS version_specs (
-            version INTEGER,
+        version INTEGER,
         service_name TEXT,
         spec BLOB,
         PRIMARY KEY (service_name, version))""")
     conn.commit()
 
+    # Backward compatibility.
+    db_utils.add_column_to_table(cursor, conn, 'services',
+                                 'requested_resources_str', 'TEXT')
+    # Deprecated: switched to `active_versions` below for the version
+    # considered active by the load balancer. The
+    # autoscaler/replica_manager version can be found in the
+    # version_specs table.
+    db_utils.add_column_to_table(
+        cursor, conn, 'services', 'current_version',
+        f'INTEGER DEFAULT {constants.INITIAL_VERSION}')
+    # The versions that are activated for the service. This is a list
+    # of integers in json format.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'active_versions',
+                                 f'TEXT DEFAULT {json.dumps([])!r}')
+    db_utils.add_column_to_table(cursor, conn, 'services',
+                                 'load_balancing_policy', 'TEXT DEFAULT NULL')
+    # Whether the service's load balancer is encrypted with TLS.
+    db_utils.add_column_to_table(cursor, conn, 'services', 'tls_encrypted',
+                                 'INTEGER DEFAULT 0')
+    conn.commit()
+
 
-_DB = db_utils.SQLiteConn(_DB_PATH, create_table)
-# Backward compatibility.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'requested_resources_str', 'TEXT')
-# Deprecated: switched to `active_versions` below for the version considered
-# active by the load balancer. The autoscaler/replica_manager version can be
-# found in the version_specs table.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'current_version',
-                             f'INTEGER DEFAULT {constants.INITIAL_VERSION}')
-# The versions that are activated for the service. This is a list of integers
-# in json format.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'active_versions',
-                             f'TEXT DEFAULT {json.dumps([])!r}')
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services',
-                             'load_balancing_policy', 'TEXT DEFAULT NULL')
-# Whether the service's load balancer is encrypted with TLS.
-db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services', 'tls_encrypted',
-                             'INTEGER DEFAULT 0')
+_DB = db_utils.SQLiteConn(_DB_PATH, create_table)
 
 _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG = 'UNIQUE constraint failed: services.name'
 
 
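The net effect of this change is that the backward-compatibility column additions now run inside `create_table`, under the same initialization path as the schema itself, instead of at module import time. For readers unfamiliar with the pattern, a generic sketch of an idempotent add-column helper (an illustration only, not SkyPilot's actual `db_utils.add_column_to_table` implementation):

```python
import sqlite3


def add_column_if_missing(cursor: sqlite3.Cursor, conn: sqlite3.Connection,
                          table: str, column: str, spec: str) -> None:
    """Adds a column only if it is absent, so the call is safe on every startup."""
    existing = {row[1] for row in cursor.execute(f'PRAGMA table_info({table})')}
    if column not in existing:
        cursor.execute(f'ALTER TABLE {table} ADD COLUMN {column} {spec}')
        conn.commit()


conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute('CREATE TABLE services (name TEXT PRIMARY KEY)')
add_column_if_missing(cur, conn, 'services', 'tls_encrypted',
                      'INTEGER DEFAULT 0')
add_column_if_missing(cur, conn, 'services', 'tls_encrypted',
                      'INTEGER DEFAULT 0')  # second call is a no-op
```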
sky/setup_files/dependencies.py CHANGED
@@ -131,6 +131,7 @@ extras_require: Dict[str, List[str]] = {
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
     'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
+    'vast': ['vastai-sdk>=0.1.12'],
     'vsphere': [
         'pyvmomi==8.0.1.0.2',
         # vsphere-automation-sdk is also required, but it does not have
sky/templates/vast-ray.yml.j2 ADDED
@@ -0,0 +1,70 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of worker nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.vast
+  region: "{{region}}"
+  disable_launch_config_check: true
+
+auth:
+  ssh_user: root
+  ssh_private_key: {{ssh_private_key}}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      DiskSize: {{disk_size}}
+      ImageId: {{image_id}}
+      Preemptible: {{use_spot}}
+      PublicKey: |-
+        skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following to make performance bugs easier to catch:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit getting stuck when the number of running ray jobs increases.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+
+
+# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list them here anymore.
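For context, the `{{...}}` placeholders in the template above are Jinja2 expressions filled in at launch time. A minimal rendering sketch with made-up values (an illustration only, not SkyPilot's actual rendering path):

```python
import jinja2

# A trimmed excerpt of the template above, rendered with hypothetical values.
template_text = """\
cluster_name: {{cluster_name_on_cloud}}
max_workers: {{num_nodes - 1}}
provider:
  module: sky.provision.vast
  region: "{{region}}"
"""

print(jinja2.Template(template_text).render(
    cluster_name_on_cloud='mycluster-abc1',  # hypothetical
    num_nodes=2,
    region='US',
))
```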
sky/utils/controller_utils.py CHANGED
@@ -261,6 +261,11 @@ def _get_cloud_dependencies_installation_commands(
             if controller != Controllers.JOBS_CONTROLLER:
                 # We only need IBM deps on the jobs controller.
                 cloud_python_dependencies = []
+        elif isinstance(cloud, clouds.Vast):
+            step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
+            commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
+                            'pip list | grep vastai_sdk > /dev/null 2>&1 || '
+                            'pip install "vastai_sdk>=0.1.12" > /dev/null 2>&1')
 
         python_packages.update(cloud_python_dependencies)