skypilot-nightly 1.0.0.dev20250203__py3-none-any.whl → 1.0.0.dev20250205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/vast.py +29 -0
  3. sky/authentication.py +18 -0
  4. sky/backends/backend_utils.py +4 -1
  5. sky/backends/cloud_vm_ray_backend.py +1 -0
  6. sky/check.py +2 -2
  7. sky/clouds/__init__.py +2 -0
  8. sky/clouds/service_catalog/constants.py +1 -1
  9. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  10. sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
  11. sky/clouds/service_catalog/vast_catalog.py +104 -0
  12. sky/clouds/vast.py +279 -0
  13. sky/jobs/dashboard/dashboard.py +156 -20
  14. sky/jobs/dashboard/templates/index.html +557 -78
  15. sky/jobs/scheduler.py +14 -5
  16. sky/provision/__init__.py +1 -0
  17. sky/provision/lambda_cloud/instance.py +17 -1
  18. sky/provision/vast/__init__.py +10 -0
  19. sky/provision/vast/config.py +11 -0
  20. sky/provision/vast/instance.py +247 -0
  21. sky/provision/vast/utils.py +161 -0
  22. sky/serve/serve_state.py +23 -21
  23. sky/setup_files/dependencies.py +1 -0
  24. sky/templates/vast-ray.yml.j2 +70 -0
  25. sky/utils/controller_utils.py +5 -0
  26. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/METADATA +4 -1
  27. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/RECORD +31 -22
  28. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/top_level.txt +0 -0
sky/clouds/vast.py ADDED
@@ -0,0 +1,279 @@
1
+ """ Vast Cloud. """
2
+
3
+ import typing
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from sky import clouds
7
+ from sky.clouds import service_catalog
8
+ from sky.utils import resources_utils
9
+
10
+ if typing.TYPE_CHECKING:
11
+ from sky import resources as resources_lib
12
+
13
+
14
+ @clouds.CLOUD_REGISTRY.register
15
+ class Vast(clouds.Cloud):
16
+ """ Vast GPU Cloud
17
+
18
+ _REPR | The string representation for the Vast GPU cloud object.
19
+ """
20
+ _REPR = 'Vast'
21
+ _CLOUD_UNSUPPORTED_FEATURES = {
22
+ clouds.CloudImplementationFeatures.MULTI_NODE:
23
+ ('Multi-node not supported yet, as the interconnection among nodes '
24
+ 'are non-trivial on Vast.'),
25
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
26
+ ('Customizing disk tier is not supported yet on Vast.'),
27
+ clouds.CloudImplementationFeatures.OPEN_PORTS:
28
+ ('Opening ports is currently not supported on Vast.'),
29
+ clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
30
+ ('Mounting object stores is not supported on Vast.'),
31
+ }
32
+ #
33
+ # Vast doesn't have a max cluster name limit. This number
34
+ # is reasonably large and exists to play nicely with the
35
+ # other providers
36
+ #
37
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 120
38
+ _regions: List[clouds.Region] = []
39
+
40
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
41
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
42
+
43
+ @classmethod
44
+ def _unsupported_features_for_resources(
45
+ cls, resources: 'resources_lib.Resources'
46
+ ) -> Dict[clouds.CloudImplementationFeatures, str]:
47
+ """The features not supported based on the resources provided.
48
+
49
+ This method is used by check_features_are_supported() to check if the
50
+ cloud implementation supports all the requested features.
51
+
52
+ Returns:
53
+ A dict of {feature: reason} for the features not supported by the
54
+ cloud implementation.
55
+ """
56
+ del resources # unused
57
+ return cls._CLOUD_UNSUPPORTED_FEATURES
58
+
59
+ @classmethod
60
+ def _max_cluster_name_length(cls) -> Optional[int]:
61
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
62
+
63
+ @classmethod
64
+ def regions_with_offering(cls, instance_type: str,
65
+ accelerators: Optional[Dict[str, int]],
66
+ use_spot: bool, region: Optional[str],
67
+ zone: Optional[str]) -> List[clouds.Region]:
68
+ assert zone is None, 'Vast does not support zones.'
69
+ del accelerators, zone # unused
70
+ regions = service_catalog.get_region_zones_for_instance_type(
71
+ instance_type, use_spot, 'vast')
72
+
73
+ if region is not None:
74
+ regions = [r for r in regions if r.name == region]
75
+ return regions
76
+
77
+ @classmethod
78
+ def get_vcpus_mem_from_instance_type(
79
+ cls,
80
+ instance_type: str,
81
+ ) -> Tuple[Optional[float], Optional[float]]:
82
+ return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
83
+ clouds='vast')
84
+
85
+ @classmethod
86
+ def zones_provision_loop(
87
+ cls,
88
+ *,
89
+ region: str,
90
+ num_nodes: int,
91
+ instance_type: str,
92
+ accelerators: Optional[Dict[str, int]] = None,
93
+ use_spot: bool = False,
94
+ ) -> Iterator[None]:
95
+ del num_nodes # unused
96
+ regions = cls.regions_with_offering(instance_type,
97
+ accelerators,
98
+ use_spot,
99
+ region=region,
100
+ zone=None)
101
+ for r in regions:
102
+ assert r.zones is None, r
103
+ yield r.zones
104
+
105
+ def instance_type_to_hourly_cost(self,
106
+ instance_type: str,
107
+ use_spot: bool,
108
+ region: Optional[str] = None,
109
+ zone: Optional[str] = None) -> float:
110
+ return service_catalog.get_hourly_cost(instance_type,
111
+ use_spot=use_spot,
112
+ region=region,
113
+ zone=zone,
114
+ clouds='vast')
115
+
116
+ def accelerators_to_hourly_cost(self,
117
+ accelerators: Dict[str, int],
118
+ use_spot: bool,
119
+ region: Optional[str] = None,
120
+ zone: Optional[str] = None) -> float:
121
+ """Returns the hourly cost of the accelerators, in dollars/hour."""
122
+ del accelerators, use_spot, region, zone # unused
123
+ return 0.0 # Vast includes accelerators in the hourly cost.
124
+
125
+ def get_egress_cost(self, num_gigabytes: float) -> float:
126
+ return 0.0
127
+
128
+ @classmethod
129
+ def get_default_instance_type(
130
+ cls,
131
+ cpus: Optional[str] = None,
132
+ memory: Optional[str] = None,
133
+ disk_tier: Optional[resources_utils.DiskTier] = None
134
+ ) -> Optional[str]:
135
+ """Returns the default instance type for Vast."""
136
+ return service_catalog.get_default_instance_type(cpus=cpus,
137
+ memory=memory,
138
+ disk_tier=disk_tier,
139
+ clouds='vast')
140
+
141
+ @classmethod
142
+ def get_accelerators_from_instance_type(
143
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
144
+ return service_catalog.get_accelerators_from_instance_type(
145
+ instance_type, clouds='vast')
146
+
147
+ @classmethod
148
+ def get_zone_shell_cmd(cls) -> Optional[str]:
149
+ return None
150
+
151
+ def make_deploy_resources_variables(
152
+ self,
153
+ resources: 'resources_lib.Resources',
154
+ cluster_name: resources_utils.ClusterName,
155
+ region: 'clouds.Region',
156
+ zones: Optional[List['clouds.Zone']],
157
+ num_nodes: int,
158
+ dryrun: bool = False) -> Dict[str, Optional[str]]:
159
+ del zones, dryrun, cluster_name, num_nodes # unused
160
+
161
+ r = resources
162
+ acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
163
+ custom_resources = resources_utils.make_ray_custom_resources_str(
164
+ acc_dict)
165
+
166
+ if r.image_id is None:
167
+ image_id = 'vastai/base:0.0.2'
168
+ elif r.extract_docker_image() is not None:
169
+ image_id = r.extract_docker_image()
170
+ else:
171
+ image_id = r.image_id[r.region]
172
+
173
+ return {
174
+ 'instance_type': resources.instance_type,
175
+ 'custom_resources': custom_resources,
176
+ 'region': region.name,
177
+ 'image_id': image_id,
178
+ }
179
+
180
+ def _get_feasible_launchable_resources(
181
+ self, resources: 'resources_lib.Resources'
182
+ ) -> 'resources_utils.FeasibleResources':
183
+ """Returns a list of feasible resources for the given resources."""
184
+ if resources.instance_type is not None:
185
+ assert resources.is_launchable(), resources
186
+ resources = resources.copy(accelerators=None)
187
+ return resources_utils.FeasibleResources([resources], [], None)
188
+
189
+ def _make(instance_list):
190
+ resource_list = []
191
+ for instance_type in instance_list:
192
+ r = resources.copy(
193
+ cloud=Vast(),
194
+ instance_type=instance_type,
195
+ accelerators=None,
196
+ cpus=None,
197
+ )
198
+ resource_list.append(r)
199
+ return resource_list
200
+
201
+ # Currently, handle a filter on accelerators only.
202
+ accelerators = resources.accelerators
203
+ if accelerators is None:
204
+ # Return a default instance type
205
+ default_instance_type = Vast.get_default_instance_type(
206
+ cpus=resources.cpus,
207
+ memory=resources.memory,
208
+ disk_tier=resources.disk_tier)
209
+ if default_instance_type is None:
210
+ # TODO: Add hints to all return values in this method to help
211
+ # users understand why the resources are not launchable.
212
+ return resources_utils.FeasibleResources([], [], None)
213
+ else:
214
+ return resources_utils.FeasibleResources(
215
+ _make([default_instance_type]), [], None)
216
+
217
+ assert len(accelerators) == 1, resources
218
+ acc, acc_count = list(accelerators.items())[0]
219
+ (instance_list, fuzzy_candidate_list
220
+ ) = service_catalog.get_instance_type_for_accelerator(
221
+ acc,
222
+ acc_count,
223
+ use_spot=resources.use_spot,
224
+ cpus=resources.cpus,
225
+ region=resources.region,
226
+ zone=resources.zone,
227
+ memory=resources.memory,
228
+ clouds='vast')
229
+ if instance_list is None:
230
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
231
+ None)
232
+ return resources_utils.FeasibleResources(_make(instance_list),
233
+ fuzzy_candidate_list, None)
234
+
235
+ @classmethod
236
+ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
237
+ """ Verify that the user has valid credentials for Vast. """
238
+ try:
239
+ import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
240
+ vast = _vast.VastAI()
241
+
242
+ # We only support file pased credential passing
243
+ if vast.creds_source != 'FILE':
244
+ return False, (
245
+ 'error \n' # First line is indented by 4 spaces
246
+ ' Credentials can be set up by running: \n'
247
+ ' $ pip install vastai\n'
248
+ ' $ echo [key] > ~/.vast_api_key\n'
249
+ ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
250
+ )
251
+
252
+ return True, None
253
+
254
+ except ImportError:
255
+ return False, ('Failed to import vast. '
256
+ 'To install, run: pip install skypilot[vast]')
257
+
258
+ def get_credential_file_mounts(self) -> Dict[str, str]:
259
+ return {
260
+ '~/.config/vastai/vast_api_key': '~/.config/vastai/vast_api_key'
261
+ }
262
+
263
+ @classmethod
264
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
265
+ # NOTE: used for very advanced SkyPilot functionality
266
+ # Can implement later if desired
267
+ return None
268
+
269
+ def instance_type_exists(self, instance_type: str) -> bool:
270
+ return service_catalog.instance_type_exists(instance_type, 'vast')
271
+
272
+ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
273
+ return service_catalog.validate_region_zone(region, zone, clouds='vast')
274
+
275
+ @classmethod
276
+ def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
277
+ # TODO: use 0.0 for now to allow all images. We should change this to
278
+ # return the docker image size.
279
+ return 0.0
@@ -6,13 +6,17 @@ https://github.com/ray-project/ray/tree/master/dashboard/client/src) and/or get
6
6
  rid of the SSH port-forwarding business (see cli.py's job_dashboard()
7
7
  comment).
8
8
  """
9
+ import collections
9
10
  import datetime
11
+ import enum
12
+ import os
10
13
  import pathlib
11
14
 
12
15
  import flask
13
16
  import yaml
14
17
 
15
18
  from sky import jobs as managed_jobs
19
+ from sky.jobs import constants as managed_job_constants
16
20
  from sky.utils import common_utils
17
21
  from sky.utils import controller_utils
18
22
 
@@ -41,6 +45,92 @@ def _is_running_on_jobs_controller() -> bool:
41
45
  return False
42
46
 
43
47
 
48
# Column indices for job table
class JobTableColumns(enum.IntEnum):
    """Column indices for the jobs table in the dashboard.

    - DROPDOWN (0): Column for expandable dropdown arrow
    - ID (1): Job ID column
    - TASK (2): Task name/number column
    - NAME (3): Job name column
    - RESOURCES (4): Resources used by job
    - SUBMITTED (5): Job submission timestamp
    - TOTAL_DURATION (6): Total time since job submission
    - JOB_DURATION (7): Actual job runtime
    - RECOVERIES (8): Number of job recoveries
    - STATUS (9): Current job status
    - STARTED (10): Job start timestamp
    - CLUSTER (11): Cluster name
    - REGION (12): Cloud region
    - DETAILS (13): Job details
    - FAILOVER (14): Job failover history
    - ACTIONS (15): Available actions column
    """
    DROPDOWN = 0
    ID = 1
    TASK = 2
    NAME = 3
    RESOURCES = 4
    SUBMITTED = 5
    TOTAL_DURATION = 6
    JOB_DURATION = 7
    RECOVERIES = 8
    STATUS = 9
    STARTED = 10
    CLUSTER = 11
    REGION = 12
    DETAILS = 13
    FAILOVER = 14
    ACTIONS = 15


# Column headers matching the indices above. The order MUST agree with
# JobTableColumns: headers are rendered positionally against row cells that
# are indexed via the enum (previously 'Status'/'Recoveries' and
# 'Details'/'Failover' were swapped relative to the enum).
JOB_TABLE_COLUMNS = [
    '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
    'Job Duration', 'Recoveries', 'Status', 'Started', 'Cluster', 'Region',
    'Details', 'Failover', 'Actions'
]
93
+
94
+
95
+ def _extract_launch_history(log_content: str) -> str:
96
+ """Extract launch history from log content.
97
+
98
+ Args:
99
+ log_content: Content of the log file.
100
+ Returns:
101
+ A formatted string containing the launch history.
102
+ """
103
+ launches = []
104
+ current_launch = None
105
+
106
+ for line in log_content.splitlines():
107
+ if 'Launching on' in line:
108
+ try:
109
+ parts = line.split(']')
110
+ if len(parts) >= 2:
111
+ timestamp = parts[0].split()[1:3]
112
+ message = parts[1].replace('[0m⚙︎', '').strip()
113
+ formatted_line = f'{" ".join(timestamp)} {message}'
114
+ if current_launch:
115
+ prev_time, prev_target = current_launch.rsplit(
116
+ ' Launching on ', 1)
117
+ launches.append(
118
+ f'{prev_time} Tried to launch on {prev_target}')
119
+
120
+ # Store the current launch
121
+ current_launch = formatted_line
122
+ except IndexError:
123
+ launches.append(line.strip())
124
+
125
+ # Add the final (successful) launch at the beginning
126
+ if current_launch:
127
+ result = [current_launch]
128
+ result.extend(launches)
129
+ return '\n'.join(result)
130
+
131
+ return 'No launch history found'
132
+
133
+
44
134
  @app.route('/')
45
135
  def home():
46
136
  if not _is_running_on_jobs_controller():
@@ -54,38 +144,84 @@ def home():
54
144
  rows = managed_jobs.format_job_table(all_managed_jobs,
55
145
  show_all=True,
56
146
  return_rows=True)
57
- # Add an empty column for the dropdown button. This will be added in the
58
- # jobs/templates/index.html file.
59
- rows = [[''] + row for row in rows]
60
-
61
- # FIXME(zongheng): make the job table/queue funcs return structured info so
62
- # that we don't have to do things like row[-5] below.
63
- columns = [
64
- '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
65
- 'Job Duration', 'Recoveries', 'Status', 'Started', 'Cluster', 'Region',
66
- 'Failure'
67
- ]
68
- if rows and len(rows[0]) != len(columns):
147
+
148
+ status_counts = collections.defaultdict(int)
149
+ for task in all_managed_jobs:
150
+ if not task['status'].is_terminal():
151
+ status_counts[task['status'].value] += 1
152
+
153
+ # Add an empty column for the dropdown button and actions column
154
+ rows = [[''] + row + [''] + [''] for row in rows
155
+ ] # Add empty cell for failover and actions column
156
+
157
+ # Add log content as failover history for each job
158
+ for row in rows:
159
+ job_id = str(row[JobTableColumns.ID]).strip().replace(' ⤳', '')
160
+ if job_id and job_id != '-':
161
+ try:
162
+ log_path = os.path.join(
163
+ os.path.expanduser(
164
+ managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
165
+ f'{job_id}.log')
166
+ if os.path.exists(log_path):
167
+ with open(log_path, 'r', encoding='utf-8') as f:
168
+ log_content = f.read()
169
+ row[JobTableColumns.FAILOVER] = _extract_launch_history(
170
+ log_content)
171
+ else:
172
+ row[JobTableColumns.FAILOVER] = 'Log file not found'
173
+ except (IOError, OSError) as e:
174
+ row[JobTableColumns.FAILOVER] = f'Error reading log: {str(e)}'
175
+ app.logger.error('All managed jobs:')
176
+
177
+ # Validate column count
178
+ if rows and len(rows[0]) != len(JOB_TABLE_COLUMNS):
69
179
  raise RuntimeError(
70
- 'Dashboard code and managed job queue code are out of sync.')
180
+ f'Dashboard code and managed job queue code are out of sync. '
181
+ f'Expected {(JOB_TABLE_COLUMNS)} columns, got {(rows[0])}')
71
182
 
72
- # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'.
183
+ # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'
73
184
  for row in rows:
74
- row[-5] = common_utils.remove_color(row[-5])
75
- # Remove filler rows ([''], ..., ['-']).
76
- rows = [row for row in rows if ''.join(map(str, row)) != '']
185
+ row[JobTableColumns.STATUS] = common_utils.remove_color(
186
+ row[JobTableColumns.STATUS])
187
+
188
+ # Remove filler rows ([''], ..., ['-'])
189
+ rows = [
190
+ row for row in rows
191
+ if ''.join(map(str, row[:JobTableColumns.ACTIONS])) != ''
192
+ ]
193
+
194
+ # Get all unique status values
195
+ status_values = sorted(
196
+ list(set(row[JobTableColumns.STATUS] for row in rows)))
77
197
 
78
- # Get all unique status values.
79
- status_values = sorted(list(set(row[-5] for row in rows)))
80
198
  rendered_html = flask.render_template(
81
199
  'index.html',
82
- columns=columns,
200
+ columns=JOB_TABLE_COLUMNS,
83
201
  rows=rows,
84
202
  last_updated_timestamp=timestamp,
85
203
  status_values=status_values,
204
+ status_counts=status_counts,
86
205
  )
87
206
  return rendered_html
88
207
 
89
208
 
209
@app.route('/download_log/<job_id>')
def download_log(job_id):
    """Serve a managed job's controller log file as a download.

    Responds 404 when the log file does not exist, 500 on read errors.
    """
    try:
        logs_dir = os.path.expanduser(
            managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
        log_path = os.path.join(logs_dir, f'{job_id}.log')
        if not os.path.exists(log_path):
            # abort() raises an HTTPException, which is not caught by the
            # (IOError, OSError) handler below.
            flask.abort(404)
        return flask.send_file(log_path,
                               mimetype='text/plain',
                               as_attachment=True,
                               download_name=f'job_{job_id}.log')
    except (IOError, OSError) as exc:
        app.logger.error(f'Error downloading log for job {job_id}: {str(exc)}')
        flask.abort(500)
224
+
225
+
90
226
if __name__ == '__main__':
    # Start the Flask development server when this module is run as a script.
    app.run()