skypilot-nightly 1.0.0.dev20250916__py3-none-any.whl → 1.0.0.dev20250918__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (67)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/primeintellect.py +1 -0
  3. sky/adaptors/seeweb.py +68 -4
  4. sky/authentication.py +25 -0
  5. sky/backends/__init__.py +3 -2
  6. sky/backends/backend_utils.py +16 -12
  7. sky/backends/cloud_vm_ray_backend.py +57 -0
  8. sky/catalog/primeintellect_catalog.py +95 -0
  9. sky/clouds/__init__.py +2 -0
  10. sky/clouds/primeintellect.py +314 -0
  11. sky/core.py +10 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{webpack-05f82d90d6fd7f82.js → webpack-487697b47d8c5e50.js} +1 -1
  18. sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → k1mo5xWZrV9djgjd0moOT}/_buildManifest.js +1 -1
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  28. sky/dashboard/out/jobs.html +1 -1
  29. sky/dashboard/out/users.html +1 -1
  30. sky/dashboard/out/volumes.html +1 -1
  31. sky/dashboard/out/workspace/new.html +1 -1
  32. sky/dashboard/out/workspaces/[name].html +1 -1
  33. sky/dashboard/out/workspaces.html +1 -1
  34. sky/global_user_state.py +42 -34
  35. sky/jobs/server/server.py +14 -1
  36. sky/jobs/state.py +26 -1
  37. sky/provision/__init__.py +1 -0
  38. sky/provision/docker_utils.py +6 -2
  39. sky/provision/primeintellect/__init__.py +10 -0
  40. sky/provision/primeintellect/config.py +11 -0
  41. sky/provision/primeintellect/instance.py +454 -0
  42. sky/provision/primeintellect/utils.py +398 -0
  43. sky/resources.py +9 -1
  44. sky/schemas/generated/servev1_pb2.py +58 -0
  45. sky/schemas/generated/servev1_pb2.pyi +115 -0
  46. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  47. sky/serve/serve_rpc_utils.py +179 -0
  48. sky/serve/serve_utils.py +29 -12
  49. sky/serve/server/core.py +37 -19
  50. sky/serve/server/impl.py +221 -129
  51. sky/server/requests/executor.py +3 -0
  52. sky/setup_files/dependencies.py +1 -0
  53. sky/skylet/constants.py +5 -3
  54. sky/skylet/services.py +98 -0
  55. sky/skylet/skylet.py +3 -1
  56. sky/templates/kubernetes-ray.yml.j2 +22 -12
  57. sky/templates/primeintellect-ray.yml.j2 +71 -0
  58. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/METADATA +37 -36
  59. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/RECORD +64 -52
  60. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
  61. sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
  62. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
  63. /sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → k1mo5xWZrV9djgjd0moOT}/_ssgManifest.js +0 -0
  64. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/WHEEL +0 -0
  65. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/entry_points.txt +0 -0
  66. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/licenses/LICENSE +0 -0
  67. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -613,7 +613,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
613
613
  """
614
614
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
615
615
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
616
- count = await session.execute(
616
+ result = await session.execute(
617
617
  sqlalchemy.update(spot_table).where(
618
618
  sqlalchemy.and_(
619
619
  spot_table.c.spot_job_id == job_id,
@@ -625,6 +625,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
625
625
  spot_table.c.end_at.is_(None),
626
626
  )).values({spot_table.c.status: ManagedJobStatus.PENDING.value})
627
627
  )
628
+ count = result.rowcount
628
629
  await session.commit()
629
630
  if count != 1:
630
631
  raise exceptions.ManagedJobStatusError(
@@ -712,7 +713,19 @@ def set_failed(
712
713
  where_conditions = [spot_table.c.spot_job_id == job_id]
713
714
  if task_id is not None:
714
715
  where_conditions.append(spot_table.c.task_id == task_id)
716
+
717
+ # Handle failure_reason prepending when override_terminal is True
715
718
  if override_terminal:
719
+ # Get existing failure_reason with row lock to prevent race
720
+ # conditions
721
+ existing_reason_result = session.execute(
722
+ sqlalchemy.select(spot_table.c.failure_reason).where(
723
+ sqlalchemy.and_(*where_conditions)).with_for_update())
724
+ existing_reason_row = existing_reason_result.fetchone()
725
+ if existing_reason_row and existing_reason_row[0]:
726
+ # Prepend new failure reason to existing one
727
+ fields_to_set[spot_table.c.failure_reason] = (
728
+ failure_reason + '. Previously: ' + existing_reason_row[0])
716
729
  # Use COALESCE for end_at to avoid overriding the existing end_at if
717
730
  # it's already set.
718
731
  fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
@@ -1651,7 +1664,19 @@ async def set_failed_async(
1651
1664
  where_conditions = [spot_table.c.spot_job_id == job_id]
1652
1665
  if task_id is not None:
1653
1666
  where_conditions.append(spot_table.c.task_id == task_id)
1667
+
1668
+ # Handle failure_reason prepending when override_terminal is True
1654
1669
  if override_terminal:
1670
+ # Get existing failure_reason with row lock to prevent race
1671
+ # conditions
1672
+ existing_reason_result = await session.execute(
1673
+ sqlalchemy.select(spot_table.c.failure_reason).where(
1674
+ sqlalchemy.and_(*where_conditions)).with_for_update())
1675
+ existing_reason_row = existing_reason_result.fetchone()
1676
+ if existing_reason_row and existing_reason_row[0]:
1677
+ # Prepend new failure reason to existing one
1678
+ fields_to_set[spot_table.c.failure_reason] = (
1679
+ failure_reason + '. Previously: ' + existing_reason_row[0])
1655
1680
  fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
1656
1681
  spot_table.c.end_at, end_time)
1657
1682
  else:
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import kubernetes
24
24
  from sky.provision import lambda_cloud
25
25
  from sky.provision import nebius
26
26
  from sky.provision import oci
27
+ from sky.provision import primeintellect
27
28
  from sky.provision import runpod
28
29
  from sky.provision import scp
29
30
  from sky.provision import seeweb
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
15
15
  # Configure environment variables. A docker image can have environment variables
16
16
  # set in the Dockerfile with `ENV``. We need to export these variables to the
17
17
  # shell environment, so that our ssh session can access them.
18
+ # Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
19
+ # Docker images with Ray 2.48.0+ set this for UV package manager support,
20
+ # but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
21
+ # See: https://github.com/skypilot-org/skypilot/pull/7181
18
22
  SETUP_ENV_VARS_CMD = (
19
23
  'prefix_cmd() '
20
24
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
- 'export -p > ~/container_env_var.sh && '
25
+ 'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
22
26
  '$(prefix_cmd) '
23
27
  'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
24
28
 
@@ -410,7 +414,7 @@ class DockerInitializer:
410
414
  # pylint: disable=anomalous-backslash-in-string
411
415
  self._run(
412
416
  'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
413
- f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
417
+ f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
414
418
  'mkdir -p ~/.ssh;'
415
419
  'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
416
420
  'sudo service ssh start;'
@@ -0,0 +1,10 @@
1
+ """Prime Intellect provisioner for SkyPilot."""
2
+
3
+ from sky.provision.primeintellect.config import bootstrap_instances
4
+ from sky.provision.primeintellect.instance import cleanup_ports
5
+ from sky.provision.primeintellect.instance import get_cluster_info
6
+ from sky.provision.primeintellect.instance import query_instances
7
+ from sky.provision.primeintellect.instance import run_instances
8
+ from sky.provision.primeintellect.instance import stop_instances
9
+ from sky.provision.primeintellect.instance import terminate_instances
10
+ from sky.provision.primeintellect.instance import wait_instances
@@ -0,0 +1,11 @@
1
+ """Prime Intellect configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config unchanged.

    Prime Intellect needs no region- or cluster-specific bootstrapping
    (no security groups, IAM roles, etc. to set up), so this is a
    pass-through.
    """
    del region, cluster_name  # unused
    return config
@@ -0,0 +1,454 @@
1
+ """Prime Intellect instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ from sky import exceptions
6
+ from sky import sky_logging
7
+ from sky.provision import common
8
+ from sky.provision.primeintellect import utils
9
+ from sky.utils import common_utils
10
+ from sky.utils import status_lib
11
+ from sky.utils import ux_utils
12
+
13
+ # The maximum number of times to poll for the status of an operation.
14
+ POLL_INTERVAL = 5
15
+ MAX_POLLS = 60 // POLL_INTERVAL
16
+ # Terminating instances can take several minutes, so we increase the timeout
17
+ MAX_POLLS_FOR_UP_OR_TERMINATE = MAX_POLLS * 16
18
+
19
+ # status filters
20
+ # PROVISIONING, PENDING, ACTIVE, STOPPED, ERROR, DELETING, TERMINATED
21
+
22
+ logger = sky_logging.init_logger(__name__)
23
+
24
+ # SSH connection readiness polling constants
25
+ SSH_CONN_MAX_RETRIES = 6
26
+ SSH_CONN_RETRY_INTERVAL_SECONDS = 10
27
+
28
+
29
def _filter_instances(cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]]) -> Dict[str, Any]:
    """Return this cluster's instances, keyed by instance id.

    Only instances named '<cluster>-head' or '<cluster>-worker' are
    considered part of the cluster. When status_filters is not None,
    instances whose status is not in the list are dropped.
    """
    client = utils.PrimeIntellectAPIClient()
    all_instances = client.list_instances()
    # TODO: verify names are we using it?
    expected_names = {
        f'{cluster_name_on_cloud}-head',
        f'{cluster_name_on_cloud}-worker',
    }

    matched: Dict[str, Any] = {}
    for inst in all_instances:
        if (status_filters is not None and
                inst['status'] not in status_filters):
            continue
        name = inst.get('name')
        if name and name in expected_names:
            matched[inst['id']] = inst
    return matched
49
+
50
+
51
def _get_instance_info(instance_id: str) -> Dict[str, Any]:
    """Fetch the latest details for one instance from the API."""
    return utils.PrimeIntellectAPIClient().get_instance_details(instance_id)
54
+
55
+
56
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
57
+ head_instance_id = None
58
+ for inst_id, inst in instances.items():
59
+ if inst['name'].endswith('-head'):
60
+ head_instance_id = inst_id
61
+ break
62
+ return head_instance_id
63
+
64
+
65
+ # Helper is available as utils.parse_ssh_connection.
66
+
67
+
68
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster.

    Waits for any pending instances to settle, reuses existing ACTIVE
    nodes, launches the remaining nodes needed to reach ``config.count``,
    and polls until every node is ACTIVE.

    Args:
        region: Region to launch instances in.
        cluster_name_on_cloud: Cluster name used to name instances.
        config: Provision config carrying node count, node config and
            provider config.

    Returns:
        A ProvisionRecord describing the launched/reused cluster.

    Raises:
        RuntimeError: If the cluster already has more nodes than requested.
        exceptions.ResourcesUnavailableError: If launching fails or the
            instances do not become ACTIVE within the polling budget.
    """

    pending_status = [
        'PROVISIONING',
        'PENDING',
    ]
    # Instances already pending are being resumed rather than created by
    # this call; remember them for the ProvisionRecord below.
    newly_started_instances = _filter_instances(cluster_name_on_cloud,
                                                pending_status)
    client = utils.PrimeIntellectAPIClient()

    # Wait for in-flight instances to leave the pending states before
    # counting how many nodes the cluster actually has.
    while True:
        instances = _filter_instances(cluster_name_on_cloud, pending_status)
        if not instances:
            break
        instance_statuses = [
            instance['status'] for instance in instances.values()
        ]
        logger.info(f'Waiting for {len(instances)} instances to be ready: '
                    f'{instance_statuses}')
        time.sleep(POLL_INTERVAL)

    exist_instances = _filter_instances(cluster_name_on_cloud,
                                        status_filters=pending_status)
    if len(exist_instances) > config.count:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(exist_instances)} nodes, but {config.count} are required.')

    exist_instances = _filter_instances(cluster_name_on_cloud,
                                        status_filters=['ACTIVE'])
    head_instance_id = _get_head_instance_id(exist_instances)
    to_start_count = config.count - len(exist_instances)
    if to_start_count < 0:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(exist_instances)} nodes, but {config.count} are required.')
    if to_start_count == 0:
        if head_instance_id is None:
            # No node carries the '-head' suffix; promote an arbitrary
            # existing node to head.
            head_instance_id = list(exist_instances.keys())[0]
            # TODO: implement rename pod
            # client.rename(
            #     instance_id=head_instance_id,
            #     name=f'{cluster_name_on_cloud}-head',
            # )
        assert head_instance_id is not None, (
            'head_instance_id should not be None')
        logger.info(f'Cluster {cluster_name_on_cloud} already has '
                    f'{len(exist_instances)} nodes, no need to start more.')
        return common.ProvisionRecord(
            provider_name='primeintellect',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=config.provider_config['zones'],
            head_instance_id=head_instance_id,
            resumed_instance_ids=list(newly_started_instances.keys()),
            created_instance_ids=[],
        )

    created_instance_ids = []
    for _ in range(to_start_count):
        # The first node launched becomes the head node.
        node_type = 'head' if head_instance_id is None else 'worker'
        try:
            # Extract vCPUs and memory from instance type
            # Format: provider__gpu_prefix_base_type__vcpus__memory[_SPOT]
            instance_type = config.node_config['InstanceType']
            disk_size = config.node_config.get('DiskSize')
            vcpus = -1
            memory = -1
            try:
                # Split by '__'
                parts = instance_type.split('__')

                # Format: provider__gpu_info__vcpus__memory[_SPOT]
                # For: primecompute__8xH100_80GB__104__752_SPOT
                # parts[0] = primecompute, parts[1] = 8xH100_80GB,
                # parts[2] = 104, parts[3] = '752' or '752_SPOT'
                if len(parts) >= 4:
                    vcpu_str = parts[2]
                    memory_str = parts[3]
                    # Spot instance types carry a trailing '_SPOT' marker
                    # that stays attached to the memory field (the split is
                    # on '__', not '_'); strip it so int() succeeds instead
                    # of raising ValueError and leaving memory at -1.
                    if memory_str.endswith('_SPOT'):
                        memory_str = memory_str[:-len('_SPOT')]
                    vcpus = int(vcpu_str)
                    memory = int(memory_str)
            except (ValueError, IndexError) as e:
                # If parsing fails, try to get from catalog
                logger.warning(
                    f'Failed to parse vCPUs/memory from instance type '
                    f'{instance_type}: {e}')

            params = {
                'name': f'{cluster_name_on_cloud}-{node_type}',
                'instance_type': config.node_config['InstanceType'],
                'region': region,
                'availability_zone': config.provider_config['zones'],
                'disk_size': disk_size,
                'vcpus': vcpus,
                'memory': memory,
            }

            response = client.launch(**params)
            instance_id = response['id']
        except utils.PrimeintellectResourcesUnavailableError as e:
            # Resource unavailability error - provide specific message
            instance_type = config.node_config['InstanceType']
            region_str = (f' in region {region}'
                          if region != 'PLACEHOLDER' else '')
            error_msg = (
                f'Resources are currently unavailable on Prime Intellect. '
                f'No {instance_type} instances are available{region_str}. '
                f'Please try again later or consider using a different '
                f'instance type or region. Details: {str(e)}')
            logger.warning(f'Resource unavailability error: {e}')
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ResourcesUnavailableError(error_msg) from e
        except utils.PrimeintellectAPIError as e:
            # Other API errors - provide specific message
            instance_type = config.node_config['InstanceType']
            region_str = (f' in region {region}'
                          if region != 'PLACEHOLDER' else '')
            error_msg = (f'Failed to launch {instance_type} instance on Prime '
                         f'Intellect{region_str}. Details: {str(e)}')
            logger.warning(f'API error during instance launch: {e}')
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ResourcesUnavailableError(error_msg) from e
        except Exception as e:  # pylint: disable=broad-except
            # Generic error handling for unexpected errors
            instance_type = config.node_config['InstanceType']
            region_str = (f' in region {region}'
                          if region != 'PLACEHOLDER' else '')
            error_msg = (
                f'Unexpected error while launching {instance_type} instance '
                f'on Prime Intellect{region_str}. Details: '
                f'{common_utils.format_exception(e, use_bracket=False)}')
            logger.warning(f'Unexpected error during instance launch: {e}')
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ResourcesUnavailableError(error_msg) from e
        logger.info(f'Launched instance {instance_id}.')
        created_instance_ids.append(instance_id)
        if head_instance_id is None:
            head_instance_id = instance_id

    # Wait for instances to be ready.
    for _ in range(MAX_POLLS_FOR_UP_OR_TERMINATE):
        instances = _filter_instances(cluster_name_on_cloud, ['ACTIVE'])
        logger.info('Waiting for instances to be ready: '
                    f'({len(instances)}/{config.count}).')
        if len(instances) == config.count:
            break

        time.sleep(POLL_INTERVAL)
    else:
        # Failed to launch config.count of instances after max retries
        # Provide more specific error message
        instance_type = config.node_config['InstanceType']
        region_str = (f' in region {region}' if region != 'PLACEHOLDER' else '')
        active_instances = len(
            _filter_instances(cluster_name_on_cloud, ['ACTIVE']))
        error_msg = (
            f'Timed out waiting for {instance_type} instances to become '
            f'ready on Prime Intellect{region_str}. Only {active_instances} '
            f'out of {config.count} instances became active. This may '
            f'indicate capacity issues or slow provisioning. Please try '
            f'again later or consider using a different instance type or '
            f'region.')
        logger.warning(error_msg)
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesUnavailableError(error_msg)
    assert head_instance_id is not None, 'head_instance_id should not be None'
    return common.ProvisionRecord(
        provider_name='primeintellect',
        cluster_name=cluster_name_on_cloud,
        region=region,
        zone=config.provider_config['zones'],
        head_instance_id=head_instance_id,
        resumed_instance_ids=[],
        created_instance_ids=created_instance_ids,
    )
245
+
246
+
247
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    # No-op: run_instances above already polls until the requested
    # instances are ACTIVE, so there is nothing left to wait for here.
    del region, cluster_name_on_cloud, state
250
+
251
+
252
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Stopping instances is not supported for Prime Intellect."""
    raise NotImplementedError()
258
+
259
+
260
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """See sky/provision/__init__.py

    Terminates the cluster's instances (skipping the head node when
    worker_only is True), then polls until the removed instances
    disappear from the provider's listing or the polling budget
    (MAX_POLLS_FOR_UP_OR_TERMINATE) is exhausted. A timeout only logs a
    warning; it does not raise.
    """
    del provider_config  # unused
    client = utils.PrimeIntellectAPIClient()
    # None status filter: consider every instance belonging to the cluster.
    instances = _filter_instances(cluster_name_on_cloud, None)

    # Log if no instances found
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    # Filter out already terminated instances
    non_terminated_instances = {
        inst_id: inst
        for inst_id, inst in instances.items()
        if inst['status'] not in ['TERMINATED', 'DELETING']
    }

    if not non_terminated_instances:
        logger.info(
            f'All instances for cluster {cluster_name_on_cloud} are already '
            f'terminated or being deleted')
        return

    # Log what we're about to terminate
    instance_names = [
        inst['name'] for inst in non_terminated_instances.values()
    ]
    logger.info(
        f'Terminating {len(non_terminated_instances)} instances for cluster '
        f'{cluster_name_on_cloud}: {instance_names}')

    # Terminate each instance
    terminated_instances = []
    for inst_id, inst in non_terminated_instances.items():
        status = inst['status']
        logger.debug(f'Terminating instance {inst_id} (status: {status})')
        # Keep the head node alive when only workers should be removed.
        if worker_only and inst['name'].endswith('-head'):
            continue
        try:
            client.remove(inst_id)
            terminated_instances.append(inst_id)
            name = inst['name']
            logger.info(
                f'Successfully initiated termination of instance {inst_id} '
                f'({name})')
        except Exception as e:  # pylint: disable=broad-except
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(
                    f'Failed to terminate instance {inst_id}: '
                    f'{common_utils.format_exception(e, use_bracket=False)}'
                ) from e

    # Wait for instances to be terminated
    if not terminated_instances:
        logger.info(
            'No instances were terminated (worker_only=True and only head '
            'node found)')
        return

    logger.info(f'Waiting for {len(terminated_instances)} instances to be '
                f'terminated...')
    for _ in range(MAX_POLLS_FOR_UP_OR_TERMINATE):
        remaining_instances = _filter_instances(cluster_name_on_cloud, None)

        # Check if all terminated instances are gone
        still_exist = [
            inst_id for inst_id in terminated_instances
            if inst_id in remaining_instances
        ]
        if not still_exist:
            logger.info('All instances have been successfully terminated')
            break

        # Log status of remaining instances
        remaining_statuses = [(inst_id, remaining_instances[inst_id]['status'])
                              for inst_id in still_exist]
        logger.info(
            f'Waiting for termination... {len(still_exist)} instances still '
            f'exist: {remaining_statuses}')
        time.sleep(POLL_INTERVAL)
    else:
        # Timeout reached
        remaining_instances = _filter_instances(cluster_name_on_cloud, None)
        still_exist = [
            inst_id for inst_id in terminated_instances
            if inst_id in remaining_instances
        ]
        if still_exist:
            logger.warning(
                f'Timeout reached. {len(still_exist)} instances may still be '
                f'terminating: {still_exist}')
        else:
            logger.info('All instances have been successfully terminated')
358
+
359
+
360
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build a ClusterInfo for the cluster's ACTIVE instances.

    Polls each instance until the provider reports an 'sshConnection'
    string, then parses SSH user/port from it.

    Raises:
        RuntimeError: If an instance's SSH connection does not become
            available within SSH_CONN_MAX_RETRIES attempts.
    """
    del region  # unused
    running_instances = _filter_instances(cluster_name_on_cloud, ['ACTIVE'])
    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance_id = None
    head_ssh_user = None
    for instance_id, instance in running_instances.items():
        retry_count = 0
        max_retries = SSH_CONN_MAX_RETRIES
        while (instance.get('sshConnection') is None and
               retry_count < max_retries):
            name = instance.get('name')
            logger.info(f'SSH connection to {name} is not ready, waiting '
                        f'{SSH_CONN_RETRY_INTERVAL_SECONDS} seconds... '
                        f'(attempt {retry_count + 1}/{max_retries})')
            time.sleep(SSH_CONN_RETRY_INTERVAL_SECONDS)
            retry_count += 1
            # Bug fix: rebind the local `instance` to the refreshed data.
            # Previously only running_instances[instance_id] was updated,
            # so the loop condition and the check below kept reading the
            # stale dict and the wait always failed even when SSH became
            # ready.
            instance = _get_instance_info(instance_id)
            running_instances[instance_id] = instance

        if instance.get('sshConnection') is not None:
            logger.debug('SSH connection is ready!')
        else:
            # RuntimeError is a subclass of Exception, so existing callers
            # catching Exception still work.
            raise RuntimeError(
                f'Failed to establish SSH connection after {max_retries} '
                f'attempts')

        assert instance.get(
            'sshConnection'), 'sshConnection cannot be null anymore'

        ssh_connection = instance['sshConnection']
        _, ssh_port = utils.parse_ssh_connection(ssh_connection)

        # The API may return a single IP or a list of IPs; use the first.
        external_ip = instance['ip']
        if isinstance(external_ip, list):
            external_ip = external_ip[0]

        instances[instance_id] = [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip='NOT_SUPPORTED',
                external_ip=external_ip,
                ssh_port=ssh_port,
                tags={'provider': instance['providerType']},
            )
        ]
        if instance['name'].endswith('-head'):
            head_instance_id = instance_id
            parsed_user_for_user, _ = utils.parse_ssh_connection(ssh_connection)
            # Fall back to 'ubuntu' when the connection string carries no
            # user.
            head_ssh_user = parsed_user_for_user or 'ubuntu'

    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name='primeintellect',
        provider_config=provider_config,
        ssh_user=head_ssh_user,
    )
420
+
421
+
422
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """See sky/provision/__init__.py

    Maps provider statuses to SkyPilot cluster statuses. DELETING and
    TERMINATED map to None and are dropped when non_terminated_only is
    True.
    """
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
    instances = _filter_instances(cluster_name_on_cloud, None)

    status_map = {
        # Bug fix: 'PROVISIONING' is a real provider status (see the status
        # list at the top of this module; run_instances polls on it), but it
        # was missing here, so querying a provisioning cluster raised
        # KeyError below. Treat it as INIT like 'PENDING'.
        'PROVISIONING': status_lib.ClusterStatus.INIT,
        'PENDING': status_lib.ClusterStatus.INIT,
        'ERROR': status_lib.ClusterStatus.INIT,
        'ACTIVE': status_lib.ClusterStatus.UP,
        'STOPPED': status_lib.ClusterStatus.STOPPED,
        'DELETING': None,  # Being deleted - should be filtered out
        'TERMINATED': None,  # Already terminated - should be filtered out
    }
    statuses: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
                              Optional[str]]] = {}
    for inst_id, inst in instances.items():
        status = status_map[inst['status']]
        if non_terminated_only and status is None:
            continue
        statuses[inst_id] = (status, None)
    return statuses
447
+
448
+
449
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """No-op: there are no provider-side port rules to clean up."""
    del cluster_name_on_cloud, ports, provider_config  # Unused.