skypilot-nightly 1.0.0.dev20250219__py3-none-any.whl → 1.0.0.dev20250221__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +85 -0
  3. sky/backends/backend_utils.py +8 -0
  4. sky/backends/cloud_vm_ray_backend.py +10 -2
  5. sky/client/sdk.py +8 -3
  6. sky/clouds/__init__.py +2 -0
  7. sky/clouds/nebius.py +294 -0
  8. sky/clouds/service_catalog/constants.py +1 -1
  9. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  10. sky/jobs/controller.py +17 -0
  11. sky/jobs/server/core.py +31 -3
  12. sky/provision/__init__.py +1 -0
  13. sky/provision/kubernetes/instance.py +5 -1
  14. sky/provision/kubernetes/utils.py +8 -7
  15. sky/provision/nebius/__init__.py +11 -0
  16. sky/provision/nebius/config.py +11 -0
  17. sky/provision/nebius/instance.py +285 -0
  18. sky/provision/nebius/utils.py +310 -0
  19. sky/server/common.py +5 -7
  20. sky/server/requests/executor.py +94 -87
  21. sky/server/server.py +10 -5
  22. sky/server/stream_utils.py +8 -11
  23. sky/setup_files/dependencies.py +9 -1
  24. sky/skylet/constants.py +3 -6
  25. sky/task.py +6 -0
  26. sky/templates/jobs-controller.yaml.j2 +3 -0
  27. sky/templates/nebius-ray.yml.j2 +79 -0
  28. sky/utils/common_utils.py +38 -0
  29. sky/utils/controller_utils.py +66 -2
  30. {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA +8 -4
  31. {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/RECORD +35 -27
  32. {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/LICENSE +0 -0
  33. {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/WHEEL +0 -0
  34. {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/entry_points.txt +0 -0
  35. {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -19,6 +19,7 @@ from sky import sky_logging
19
19
  from sky import task as task_lib
20
20
  from sky.backends import backend_utils
21
21
  from sky.clouds.service_catalog import common as service_catalog_common
22
+ from sky.data import storage as storage_lib
22
23
  from sky.jobs import constants as managed_job_constants
23
24
  from sky.jobs import utils as managed_job_utils
24
25
  from sky.provision import common as provision_common
@@ -101,9 +102,35 @@ def launch(
101
102
 
102
103
  with rich_utils.safe_status(
103
104
  ux_utils.spinner_message('Initializing managed job')):
104
- for task_ in dag.tasks:
105
- controller_utils.maybe_translate_local_file_mounts_and_sync_up(
106
- task_, task_type='jobs')
105
+
106
+ local_to_controller_file_mounts = {}
107
+
108
+ if storage_lib.get_cached_enabled_storage_clouds_or_refresh():
109
+ for task_ in dag.tasks:
110
+ controller_utils.maybe_translate_local_file_mounts_and_sync_up(
111
+ task_, task_type='jobs')
112
+
113
+ else:
114
+ # We do not have any cloud storage available, so fall back to
115
+ # two-hop file_mount uploading.
116
+ # Note: we can't easily hack sync_storage_mounts() to upload
117
+ # directly to the controller, because the controller may not
118
+ # even be up yet.
119
+ for task_ in dag.tasks:
120
+ if task_.storage_mounts:
121
+ # Technically, we could convert COPY storage_mounts that
122
+ # have a local source and do not specify `store`, but we
123
+ # will not do that for now. Only plain file_mounts are
124
+ # supported.
125
+ raise exceptions.NotSupportedError(
126
+ 'Cloud-based file_mounts are specified, but no cloud '
127
+ 'storage is available. Please specify local '
128
+ 'file_mounts only.')
129
+
130
+ # Merge file mounts from all tasks.
131
+ local_to_controller_file_mounts.update(
132
+ controller_utils.translate_local_file_mounts_to_two_hop(
133
+ task_))
107
134
 
108
135
  with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
109
136
  mode='w') as f:
@@ -120,6 +147,7 @@ def launch(
120
147
  vars_to_fill = {
121
148
  'remote_user_yaml_path': remote_user_yaml_path,
122
149
  'user_yaml_path': f.name,
150
+ 'local_to_controller_file_mounts': local_to_controller_file_mounts,
123
151
  'jobs_controller': controller_name,
124
152
  # Note: actual cluster name will be <task.name>-<managed job ID>
125
153
  'dag_name': dag.name,
sky/provision/__init__.py CHANGED
@@ -20,6 +20,7 @@ from sky.provision import fluidstack
20
20
  from sky.provision import gcp
21
21
  from sky.provision import kubernetes
22
22
  from sky.provision import lambda_cloud
23
+ from sky.provision import nebius
23
24
  from sky.provision import oci
24
25
  from sky.provision import runpod
25
26
  from sky.provision import vast
@@ -797,7 +797,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
797
797
  'value': 'present',
798
798
  'effect': 'NoSchedule'
799
799
  }
800
- pod_spec_copy['spec']['tolerations'] = [tpu_toleration]
800
+ # Preserve existing tolerations if any
801
+ existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
802
+ pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
803
+ tpu_toleration
804
+ ]
801
805
 
802
806
  return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
803
807
  context)
@@ -1302,13 +1302,13 @@ class KubernetesInstanceType:
1302
1302
  - Accelerators
1303
1303
  The name format is "{n}CPU--{k}GB" where n is the number of vCPUs and
1304
1304
  k is the amount of memory in GB. Accelerators can be specified by
1305
- appending "--{a}{type}" where a is the number of accelerators and
1306
- type is the accelerator type.
1305
+ appending "--{type}:{a}" where type is the accelerator type and a
1306
+ is the number of accelerators.
1307
1307
  CPU and memory can be specified as floats. Accelerator count must be int.
1308
1308
  Examples:
1309
1309
  - 4CPU--16GB
1310
1310
  - 0.5CPU--1.5GB
1311
- - 4CPU--16GB--1V100
1311
+ - 4CPU--16GB--V100:1
1312
1312
  """
1313
1313
 
1314
1314
  def __init__(self,
@@ -1333,13 +1333,14 @@ class KubernetesInstanceType:
1333
1333
  # valid logical instance type name.
1334
1334
  assert self.accelerator_type is not None, self.accelerator_count
1335
1335
  acc_name = self.accelerator_type.replace(' ', '_')
1336
- name += f'--{self.accelerator_count}{acc_name}'
1336
+ name += f'--{acc_name}:{self.accelerator_count}'
1337
1337
  return name
1338
1338
 
1339
1339
  @staticmethod
1340
1340
  def is_valid_instance_type(name: str) -> bool:
1341
1341
  """Returns whether the given name is a valid instance type."""
1342
- pattern = re.compile(r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
1342
+ pattern = re.compile(
1343
+ r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
1343
1344
  return bool(pattern.match(name))
1344
1345
 
1345
1346
  @classmethod
@@ -1354,7 +1355,7 @@ class KubernetesInstanceType:
1354
1355
  accelerator_type | str: Type of accelerator
1355
1356
  """
1356
1357
  pattern = re.compile(
1357
- r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$' # pylint: disable=line-too-long
1358
+ r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$' # pylint: disable=line-too-long
1358
1359
  )
1359
1360
  match = pattern.match(name)
1360
1361
  if match:
@@ -1400,7 +1401,7 @@ class KubernetesInstanceType:
1400
1401
  # Round up accelerator_count if it is not an int.
1401
1402
  accelerator_count = math.ceil(accelerator_count)
1402
1403
  if accelerator_count > 0:
1403
- name += f'--{accelerator_count}{accelerator_type}'
1404
+ name += f'--{accelerator_type}:{accelerator_count}'
1404
1405
  return cls(cpus=cpus,
1405
1406
  memory=memory,
1406
1407
  accelerator_count=accelerator_count,
@@ -0,0 +1,11 @@
1
+ """Nebius provisioner for SkyPilot."""
2
+
3
+ from sky.provision.nebius.config import bootstrap_instances
4
+ from sky.provision.nebius.instance import cleanup_ports
5
+ from sky.provision.nebius.instance import get_cluster_info
6
+ from sky.provision.nebius.instance import open_ports
7
+ from sky.provision.nebius.instance import query_instances
8
+ from sky.provision.nebius.instance import run_instances
9
+ from sky.provision.nebius.instance import stop_instances
10
+ from sky.provision.nebius.instance import terminate_instances
11
+ from sky.provision.nebius.instance import wait_instances
@@ -0,0 +1,11 @@
1
+ """Nebius configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
+ def bootstrap_instances(
7
+ region: str, cluster_name: str,
8
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
9
+ """Bootstraps instances for the given cluster."""
10
+ del region, cluster_name # unused
11
+ return config
@@ -0,0 +1,285 @@
1
+ """Nebius instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from sky import sky_logging
6
+ from sky.provision import common
7
+ from sky.provision.nebius import utils
8
+ from sky.utils import common_utils
9
+ from sky.utils import status_lib
10
+ from sky.utils import ux_utils
11
+
12
+ PENDING_STATUS = ['STARTING', 'DELETING', 'STOPPING']
13
+
14
+ MAX_RETRIES_TO_LAUNCH = 120 # Maximum number of retries
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+
19
+ def _filter_instances(region: str,
20
+ cluster_name_on_cloud: str,
21
+ status_filters: Optional[List[str]],
22
+ head_only: bool = False) -> Dict[str, Any]:
23
+ project_id = utils.get_project_by_region(region)
24
+ instances = utils.list_instances(project_id)
25
+ filtered_instances = {}
26
+ for instance_id, instance in instances.items():
27
+ if (status_filters is not None and
28
+ instance['status'] not in status_filters):
29
+ continue
30
+
31
+ if instance['name'] and instance['name'].startswith(
32
+ f'{cluster_name_on_cloud}-'):
33
+ if head_only and instance['name'].endswith('-worker'):
34
+ continue
35
+ else:
36
+ filtered_instances[instance_id] = instance
37
+ return filtered_instances
38
+
39
+
40
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
41
+ head_instance_id = None
42
+ for inst_id, inst in instances.items():
43
+ if inst['name'].endswith('-head'):
44
+ head_instance_id = inst_id
45
+ break
46
+ return head_instance_id
47
+
48
+
49
+ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
50
+ retry_count = 0
51
+ while retry_count < MAX_RETRIES_TO_LAUNCH:
52
+ instances = _filter_instances(region, cluster_name_on_cloud,
53
+ PENDING_STATUS)
54
+ if not instances:
55
+ break
56
+ logger.info(f'Waiting for {len(instances)} instances to be ready '
57
+ f'(Attempt {retry_count + 1}/{MAX_RETRIES_TO_LAUNCH}).')
58
+ time.sleep(utils.POLL_INTERVAL)
59
+ retry_count += 1
60
+
61
+ if retry_count == MAX_RETRIES_TO_LAUNCH:
62
+ raise TimeoutError(f'Exceeded maximum retries '
63
+ f'({MAX_RETRIES_TO_LAUNCH * utils.POLL_INTERVAL}'
64
+ f' seconds) while waiting for instances'
65
+ f' to be ready.')
66
+
67
+
68
+ def run_instances(region: str, cluster_name_on_cloud: str,
69
+ config: common.ProvisionConfig) -> common.ProvisionRecord:
70
+ """Runs instances for the given cluster."""
71
+ _wait_until_no_pending(region, cluster_name_on_cloud)
72
+ running_instances = _filter_instances(region, cluster_name_on_cloud,
73
+ ['RUNNING'])
74
+ head_instance_id = _get_head_instance_id(running_instances)
75
+ to_start_count = config.count - len(running_instances)
76
+ if to_start_count < 0:
77
+ raise RuntimeError(
78
+ f'Cluster {cluster_name_on_cloud} already has '
79
+ f'{len(running_instances)} nodes, but {config.count} are required.')
80
+ if to_start_count == 0:
81
+ if head_instance_id is None:
82
+ raise RuntimeError(
83
+ f'Cluster {cluster_name_on_cloud} has no head node.')
84
+ logger.info(f'Cluster {cluster_name_on_cloud} already has '
85
+ f'{len(running_instances)} nodes, no need to start more.')
86
+ return common.ProvisionRecord(provider_name='nebius',
87
+ cluster_name=cluster_name_on_cloud,
88
+ region=region,
89
+ zone=None,
90
+ head_instance_id=head_instance_id,
91
+ resumed_instance_ids=[],
92
+ created_instance_ids=[])
93
+
94
+ created_instance_ids = []
95
+ resumed_instance_ids = []
96
+ stopped_instances = _filter_instances(region, cluster_name_on_cloud,
97
+ ['STOPPED'])
98
+ if config.resume_stopped_nodes and len(stopped_instances) > to_start_count:
99
+
100
+ raise RuntimeError(
101
+ 'The number of running/stopped/stopping instances combined '
102
+ f'({len(stopped_instances) + len(running_instances)}) in '
103
+ f'cluster "{cluster_name_on_cloud}" is greater than the '
104
+ f'number requested by the user ({config.count}). '
105
+ 'This is likely a resource leak. '
106
+ 'Use "sky down" to terminate the cluster.')
107
+
108
+ for stopped_instance_id, _ in stopped_instances.items():
109
+ if to_start_count > 0:
110
+ try:
111
+ utils.start(stopped_instance_id)
112
+ resumed_instance_ids.append(stopped_instance_id)
113
+ to_start_count -= 1
114
+ if stopped_instances[stopped_instance_id]['name'].endswith(
115
+ '-head'):
116
+ head_instance_id = stopped_instance_id
117
+ except Exception as e: # pylint: disable=broad-except
118
+ logger.warning(f'Start instance error: {e}')
119
+ raise
120
+ time.sleep(utils.POLL_INTERVAL) # to avoid fake STOPPED status
121
+ logger.info(f'Started instance {stopped_instance_id}.')
122
+
123
+ for _ in range(to_start_count):
124
+ node_type = 'head' if head_instance_id is None else 'worker'
125
+ try:
126
+ platform, preset = config.node_config['InstanceType'].split('_')
127
+ instance_id = utils.launch(
128
+ cluster_name_on_cloud=cluster_name_on_cloud,
129
+ node_type=node_type,
130
+ platform=platform,
131
+ preset=preset,
132
+ region=region,
133
+ image_family=config.node_config['ImageId'],
134
+ disk_size=config.node_config['DiskSize'],
135
+ user_data=config.node_config['UserData'])
136
+ except Exception as e: # pylint: disable=broad-except
137
+ logger.warning(f'run_instances error: {e}')
138
+ raise
139
+ logger.info(f'Launched instance {instance_id}.')
140
+ created_instance_ids.append(instance_id)
141
+ if head_instance_id is None:
142
+ head_instance_id = instance_id
143
+ assert head_instance_id is not None, 'head_instance_id should not be None'
144
+ return common.ProvisionRecord(provider_name='nebius',
145
+ cluster_name=cluster_name_on_cloud,
146
+ region=region,
147
+ zone=None,
148
+ head_instance_id=head_instance_id,
149
+ resumed_instance_ids=resumed_instance_ids,
150
+ created_instance_ids=created_instance_ids)
151
+
152
+
153
+ def wait_instances(region: str, cluster_name_on_cloud: str,
154
+ state: Optional[status_lib.ClusterStatus]) -> None:
155
+ _wait_until_no_pending(region, cluster_name_on_cloud)
156
+ if state is not None:
157
+ if state == status_lib.ClusterStatus.UP:
158
+ stopped_instances = _filter_instances(region, cluster_name_on_cloud,
159
+ ['STOPPED'])
160
+ if stopped_instances:
161
+ raise RuntimeError(
162
+ f'Cluster {cluster_name_on_cloud} is in UP state, but '
163
+ f'{len(stopped_instances)} instances are stopped.')
164
+ if state == status_lib.ClusterStatus.STOPPED:
165
+ running_instances = _filter_instances(region, cluster_name_on_cloud,
166
+ ['RUNNING'])
167
+
168
+ if running_instances:
169
+ raise RuntimeError(
170
+ f'Cluster {cluster_name_on_cloud} is in STOPPED state, but '
171
+ f'{len(running_instances)} instances are running.')
172
+
173
+
174
+ def stop_instances(
175
+ cluster_name_on_cloud: str,
176
+ provider_config: Optional[Dict[str, Any]] = None,
177
+ worker_only: bool = False,
178
+ ) -> None:
179
+ assert provider_config is not None
180
+ exist_instances = _filter_instances(provider_config['region'],
181
+ cluster_name_on_cloud, ['RUNNING'])
182
+ for instance in exist_instances:
183
+ if worker_only and instance.endswith('-head'):
184
+ continue
185
+ utils.stop(instance)
186
+
187
+
188
+ def terminate_instances(
189
+ cluster_name_on_cloud: str,
190
+ provider_config: Optional[Dict[str, Any]] = None,
191
+ worker_only: bool = False,
192
+ ) -> None:
193
+ """See sky/provision/__init__.py"""
194
+
195
+ assert provider_config is not None
196
+ instances = _filter_instances(provider_config['region'],
197
+ cluster_name_on_cloud,
198
+ status_filters=None)
199
+ for inst_id, inst in instances.items():
200
+ logger.debug(f'Terminating instance {inst_id}: {inst}')
201
+ if worker_only and inst['name'].endswith('-head'):
202
+ continue
203
+ try:
204
+ utils.remove(inst_id)
205
+ except Exception as e: # pylint: disable=broad-except
206
+ with ux_utils.print_exception_no_traceback():
207
+ raise RuntimeError(
208
+ f'Failed to terminate instance {inst_id}: '
209
+ f'{common_utils.format_exception(e, use_bracket=False)}'
210
+ ) from e
211
+ utils.delete_cluster(cluster_name_on_cloud, provider_config['region'])
212
+
213
+
214
+ def get_cluster_info(
215
+ region: str,
216
+ cluster_name_on_cloud: str,
217
+ provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
218
+ _wait_until_no_pending(region, cluster_name_on_cloud)
219
+ running_instances = _filter_instances(region, cluster_name_on_cloud,
220
+ ['RUNNING'])
221
+ instances: Dict[str, List[common.InstanceInfo]] = {}
222
+ head_instance_id = None
223
+ for instance_id, instance_info in running_instances.items():
224
+ instances[instance_id] = [
225
+ common.InstanceInfo(
226
+ instance_id=instance_id,
227
+ internal_ip=instance_info['internal_ip'],
228
+ external_ip=instance_info['external_ip'],
229
+ tags={},
230
+ )
231
+ ]
232
+ if instance_info['name'].endswith('-head'):
233
+ head_instance_id = instance_id
234
+ assert head_instance_id is not None
235
+ return common.ClusterInfo(
236
+ instances=instances,
237
+ head_instance_id=head_instance_id,
238
+ provider_name='nebius',
239
+ provider_config=provider_config,
240
+ )
241
+
242
+
243
+ def query_instances(
244
+ cluster_name_on_cloud: str,
245
+ provider_config: Optional[Dict[str, Any]] = None,
246
+ non_terminated_only: bool = True,
247
+ ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
248
+ """See sky/provision/__init__.py"""
249
+ assert provider_config is not None, (cluster_name_on_cloud, provider_config)
250
+ instances = _filter_instances(provider_config['region'],
251
+ cluster_name_on_cloud, None)
252
+
253
+ status_map = {
254
+ 'STARTING': status_lib.ClusterStatus.INIT,
255
+ 'RUNNING': status_lib.ClusterStatus.UP,
256
+ 'STOPPED': status_lib.ClusterStatus.STOPPED,
257
+ 'STOPPING': status_lib.ClusterStatus.STOPPED,
258
+ 'DELETING': status_lib.ClusterStatus.STOPPED,
259
+ }
260
+ statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
261
+ for inst_id, inst in instances.items():
262
+ status = status_map[inst['status']]
263
+ if non_terminated_only and status is None:
264
+ continue
265
+ statuses[inst_id] = status
266
+ return statuses
267
+
268
+
269
+ def open_ports(
270
+ cluster_name_on_cloud: str,
271
+ ports: List[str],
272
+ provider_config: Optional[Dict[str, Any]] = None,
273
+ ) -> None:
274
+ """See sky/provision/__init__.py"""
275
+ logger.debug(f'Skip opening ports {ports} for Nebius instances, as all '
276
+ 'ports are open by default.')
277
+ del cluster_name_on_cloud, provider_config, ports
278
+
279
+
280
+ def cleanup_ports(
281
+ cluster_name_on_cloud: str,
282
+ ports: List[str],
283
+ provider_config: Optional[Dict[str, Any]] = None,
284
+ ) -> None:
285
+ del cluster_name_on_cloud, ports, provider_config # Unused.