skypilot-nightly 1.0.0.dev20250916-py3-none-any.whl → 1.0.0.dev20250918-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic by the registry.
- sky/__init__.py +4 -2
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +68 -4
- sky/authentication.py +25 -0
- sky/backends/__init__.py +3 -2
- sky/backends/backend_utils.py +16 -12
- sky/backends/cloud_vm_ray_backend.py +57 -0
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/primeintellect.py +314 -0
- sky/core.py +10 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-05f82d90d6fd7f82.js → webpack-487697b47d8c5e50.js} +1 -1
- sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → k1mo5xWZrV9djgjd0moOT}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +42 -34
- sky/jobs/server/server.py +14 -1
- sky/jobs/state.py +26 -1
- sky/provision/__init__.py +1 -0
- sky/provision/docker_utils.py +6 -2
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/resources.py +9 -1
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_utils.py +29 -12
- sky/serve/server/core.py +37 -19
- sky/serve/server/impl.py +221 -129
- sky/server/requests/executor.py +3 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +5 -3
- sky/skylet/services.py +98 -0
- sky/skylet/skylet.py +3 -1
- sky/templates/kubernetes-ray.yml.j2 +22 -12
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/METADATA +37 -36
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/RECORD +64 -52
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
- /sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → k1mo5xWZrV9djgjd0moOT}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -613,7 +613,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
     """
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-
+        result = await session.execute(
             sqlalchemy.update(spot_table).where(
                 sqlalchemy.and_(
                     spot_table.c.spot_job_id == job_id,
@@ -625,6 +625,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
                     spot_table.c.end_at.is_(None),
                 )).values({spot_table.c.status: ManagedJobStatus.PENDING.value})
         )
+        count = result.rowcount
         await session.commit()
         if count != 1:
             raise exceptions.ManagedJobStatusError(
@@ -712,7 +713,19 @@ def set_failed(
         where_conditions = [spot_table.c.spot_job_id == job_id]
         if task_id is not None:
             where_conditions.append(spot_table.c.task_id == task_id)
+
+        # Handle failure_reason prepending when override_terminal is True
         if override_terminal:
+            # Get existing failure_reason with row lock to prevent race
+            # conditions
+            existing_reason_result = session.execute(
+                sqlalchemy.select(spot_table.c.failure_reason).where(
+                    sqlalchemy.and_(*where_conditions)).with_for_update())
+            existing_reason_row = existing_reason_result.fetchone()
+            if existing_reason_row and existing_reason_row[0]:
+                # Prepend new failure reason to existing one
+                fields_to_set[spot_table.c.failure_reason] = (
+                    failure_reason + '. Previously: ' + existing_reason_row[0])
             # Use COALESCE for end_at to avoid overriding the existing end_at if
             # it's already set.
             fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
@@ -1651,7 +1664,19 @@ async def set_failed_async(
         where_conditions = [spot_table.c.spot_job_id == job_id]
         if task_id is not None:
             where_conditions.append(spot_table.c.task_id == task_id)
+
+        # Handle failure_reason prepending when override_terminal is True
         if override_terminal:
+            # Get existing failure_reason with row lock to prevent race
+            # conditions
+            existing_reason_result = await session.execute(
+                sqlalchemy.select(spot_table.c.failure_reason).where(
+                    sqlalchemy.and_(*where_conditions)).with_for_update())
+            existing_reason_row = existing_reason_result.fetchone()
+            if existing_reason_row and existing_reason_row[0]:
+                # Prepend new failure reason to existing one
+                fields_to_set[spot_table.c.failure_reason] = (
+                    failure_reason + '. Previously: ' + existing_reason_row[0])
             fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
                 spot_table.c.end_at, end_time)
         else:
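Both the synchronous and asynchronous hunks above add the same rule: when a terminal status is overridden, the previous failure_reason is kept by prepending the new reason to it. A minimal standalone sketch of that rule (merge_failure_reason is a hypothetical helper working on plain strings, not a SkyPilot API):

# Sketch of the failure_reason merging behavior added in set_failed and
# set_failed_async; the helper name is illustrative only.
from typing import Optional


def merge_failure_reason(new_reason: str, existing: Optional[str]) -> str:
    if existing:
        return new_reason + '. Previously: ' + existing
    return new_reason


assert merge_failure_reason('Controller lost', None) == 'Controller lost'
assert (merge_failure_reason('Controller lost', 'Spot preemption') ==
        'Controller lost. Previously: Spot preemption')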
sky/provision/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
+from sky.provision import primeintellect
 from sky.provision import runpod
 from sky.provision import scp
 from sky.provision import seeweb
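SkyPilot's provision layer appears to route each provisioning call to the sub-module of sky.provision named after the provider, so the added import is what makes the new primeintellect module discoverable. A hedged sketch of that kind of name-based dispatch (call_provider is a hypothetical helper, not SkyPilot's actual routing code):

# Hypothetical illustration of name-based dispatch; it only works because
# sky/provision/__init__.py imports every provider sub-module, as above.
import sky.provision as provision


def call_provider(provider_name: str, func_name: str, *args, **kwargs):
    impl = getattr(provision, provider_name.lower())  # e.g. sky.provision.primeintellect
    return getattr(impl, func_name)(*args, **kwargs)


# call_provider('primeintellect', 'query_instances', 'my-cluster',
#               provider_config={}) would reach the module added later in this diff.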
sky/provision/docker_utils.py
CHANGED
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
 # Configure environment variables. A docker image can have environment variables
 # set in the Dockerfile with `ENV``. We need to export these variables to the
 # shell environment, so that our ssh session can access them.
+# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
+# Docker images with Ray 2.48.0+ set this for UV package manager support,
+# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
+# See: https://github.com/skypilot-org/skypilot/pull/7181
 SETUP_ENV_VARS_CMD = (
     'prefix_cmd() '
     '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
-    'export -p > ~/container_env_var.sh && '
+    'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
     '$(prefix_cmd) '
     'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')

@@ -410,7 +414,7 @@ class DockerInitializer:
         # pylint: disable=anomalous-backslash-in-string
         self._run(
             'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
-            f'
+            f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
             'mkdir -p ~/.ssh;'
             'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
             'sudo service ssh start;'
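The comment block added above documents the SETUP_ENV_VARS_CMD change: the container's `export -p` dump is now piped through `grep -v RAY_RUNTIME_ENV_HOOK` before being installed as a profile script, so an image-provided Ray runtime-env hook never leaks into SkyPilot's shell sessions. A standalone Python sketch of the same filtering effect (the sample dump and helper name are illustrative, not code from docker_utils.py):

# Reproduce the effect of `export -p | grep -v RAY_RUNTIME_ENV_HOOK` on a
# captured environment dump; purely illustrative.
def filter_env_dump(export_output: str) -> str:
    return '\n'.join(line for line in export_output.splitlines()
                     if 'RAY_RUNTIME_ENV_HOOK' not in line)


dump = ('declare -x HOME="/root"\n'
        'declare -x RAY_RUNTIME_ENV_HOOK="<ray uv hook>"\n'
        'declare -x PATH="/usr/local/bin:/usr/bin"')
print(filter_env_dump(dump))  # HOME and PATH survive; the Ray hook line is dropped.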
sky/provision/primeintellect/__init__.py
ADDED
@@ -0,0 +1,10 @@
+"""Prime Intellect provisioner for SkyPilot."""
+
+from sky.provision.primeintellect.config import bootstrap_instances
+from sky.provision.primeintellect.instance import cleanup_ports
+from sky.provision.primeintellect.instance import get_cluster_info
+from sky.provision.primeintellect.instance import query_instances
+from sky.provision.primeintellect.instance import run_instances
+from sky.provision.primeintellect.instance import stop_instances
+from sky.provision.primeintellect.instance import terminate_instances
+from sky.provision.primeintellect.instance import wait_instances
sky/provision/primeintellect/config.py
ADDED
@@ -0,0 +1,11 @@
+"""Prime Intellect configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config
sky/provision/primeintellect/instance.py
ADDED
@@ -0,0 +1,454 @@
+"""Prime Intellect instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import exceptions
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.primeintellect import utils
+from sky.utils import common_utils
+from sky.utils import status_lib
+from sky.utils import ux_utils
+
+# The maximum number of times to poll for the status of an operation.
+POLL_INTERVAL = 5
+MAX_POLLS = 60 // POLL_INTERVAL
+# Terminating instances can take several minutes, so we increase the timeout
+MAX_POLLS_FOR_UP_OR_TERMINATE = MAX_POLLS * 16
+
+# status filters
+# PROVISIONING, PENDING, ACTIVE, STOPPED, ERROR, DELETING, TERMINATED
+
+logger = sky_logging.init_logger(__name__)
+
+# SSH connection readiness polling constants
+SSH_CONN_MAX_RETRIES = 6
+SSH_CONN_RETRY_INTERVAL_SECONDS = 10
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]]) -> Dict[str, Any]:
+    client = utils.PrimeIntellectAPIClient()
+    instances = client.list_instances()
+    # TODO: verify names are we using it?
+    possible_names = [
+        f'{cluster_name_on_cloud}-head',
+        f'{cluster_name_on_cloud}-worker',
+    ]
+
+    filtered_instances = {}
+    for instance in instances:
+        instance_id = instance['id']
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        instance_name = instance.get('name')
+        if instance_name and instance_name in possible_names:
+            filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_instance_info(instance_id: str) -> Dict[str, Any]:
+    client = utils.PrimeIntellectAPIClient()
+    return client.get_instance_details(instance_id)
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    head_instance_id = None
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            head_instance_id = inst_id
+            break
+    return head_instance_id
+
+
+# Helper is available as utils.parse_ssh_connection.
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+
+    pending_status = [
+        'PROVISIONING',
+        'PENDING',
+    ]
+    newly_started_instances = _filter_instances(cluster_name_on_cloud,
+                                                pending_status)
+    client = utils.PrimeIntellectAPIClient()
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, pending_status)
+        if not instances:
+            break
+        instance_statuses = [
+            instance['status'] for instance in instances.values()
+        ]
+        logger.info(f'Waiting for {len(instances)} instances to be ready: '
+                    f'{instance_statuses}')
+        time.sleep(POLL_INTERVAL)
+
+    exist_instances = _filter_instances(cluster_name_on_cloud,
+                                        status_filters=pending_status)
+    if len(exist_instances) > config.count:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+
+    exist_instances = _filter_instances(cluster_name_on_cloud,
+                                        status_filters=['ACTIVE'])
+    head_instance_id = _get_head_instance_id(exist_instances)
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance_id is None:
+            head_instance_id = list(exist_instances.keys())[0]
+            # TODO: implement rename pod
+            # client.rename(
+            #     instance_id=head_instance_id,
+            #     name=f'{cluster_name_on_cloud}-head',
+            # )
+        assert head_instance_id is not None, (
+            'head_instance_id should not be None')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='primeintellect',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=config.provider_config['zones'],
+            head_instance_id=head_instance_id,
+            resumed_instance_ids=list(newly_started_instances.keys()),
+            created_instance_ids=[],
+        )
+
+    created_instance_ids = []
+    for _ in range(to_start_count):
+        node_type = 'head' if head_instance_id is None else 'worker'
+        try:
+            # Extract vCPUs and memory from instance type
+            # Format: provider__gpu_prefix_base_type__vcpus__memory[_SPOT]
+            instance_type = config.node_config['InstanceType']
+            disk_size = config.node_config.get('DiskSize')
+            vcpus = -1
+            memory = -1
+            try:
+                # Split by '__'
+                parts = instance_type.split('__')
+
+                # Format: provider__gpu_info__vcpus__memory[_SPOT]
+                # For: primecompute__8xH100_80GB__104__752_SPOT
+                # parts[0] = primecompute, parts[1] = 8xH100_80GB,
+                # parts[2] = 104, parts[3] = 752, parts[4] = SPOT
+                if len(parts) >= 4:
+                    vcpu_str = parts[2]
+                    memory_str = parts[3]
+                    vcpus = int(vcpu_str)
+                    memory = int(memory_str)
+            except (ValueError, IndexError) as e:
+                # If parsing fails, try to get from catalog
+                logger.warning(
+                    f'Failed to parse vCPUs/memory from instance type '
+                    f'{instance_type}: {e}')
+
+            params = {
+                'name': f'{cluster_name_on_cloud}-{node_type}',
+                'instance_type': config.node_config['InstanceType'],
+                'region': region,
+                'availability_zone': config.provider_config['zones'],
+                'disk_size': disk_size,
+                'vcpus': vcpus,
+                'memory': memory,
+            }
+
+            response = client.launch(**params)
+            instance_id = response['id']
+        except utils.PrimeintellectResourcesUnavailableError as e:
+            # Resource unavailability error - provide specific message
+            instance_type = config.node_config['InstanceType']
+            region_str = (f' in region {region}'
+                          if region != 'PLACEHOLDER' else '')
+            error_msg = (
+                f'Resources are currently unavailable on Prime Intellect. '
+                f'No {instance_type} instances are available{region_str}. '
+                f'Please try again later or consider using a different '
+                f'instance type or region. Details: {str(e)}')
+            logger.warning(f'Resource unavailability error: {e}')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(error_msg) from e
+        except utils.PrimeintellectAPIError as e:
+            # Other API errors - provide specific message
+            instance_type = config.node_config['InstanceType']
+            region_str = (f' in region {region}'
+                          if region != 'PLACEHOLDER' else '')
+            error_msg = (f'Failed to launch {instance_type} instance on Prime '
+                         f'Intellect{region_str}. Details: {str(e)}')
+            logger.warning(f'API error during instance launch: {e}')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(error_msg) from e
+        except Exception as e:  # pylint: disable=broad-except
+            # Generic error handling for unexpected errors
+            instance_type = config.node_config['InstanceType']
+            region_str = (f' in region {region}'
+                          if region != 'PLACEHOLDER' else '')
+            error_msg = (
+                f'Unexpected error while launching {instance_type} instance '
+                f'on Prime Intellect{region_str}. Details: '
+                f'{common_utils.format_exception(e, use_bracket=False)}')
+            logger.warning(f'Unexpected error during instance launch: {e}')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(error_msg) from e
+        logger.info(f'Launched instance {instance_id}.')
+        created_instance_ids.append(instance_id)
+        if head_instance_id is None:
+            head_instance_id = instance_id
+
+    # Wait for instances to be ready.
+    for _ in range(MAX_POLLS_FOR_UP_OR_TERMINATE):
+        instances = _filter_instances(cluster_name_on_cloud, ['ACTIVE'])
+        logger.info('Waiting for instances to be ready: '
+                    f'({len(instances)}/{config.count}).')
+        if len(instances) == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+    else:
+        # Failed to launch config.count of instances after max retries
+        # Provide more specific error message
+        instance_type = config.node_config['InstanceType']
+        region_str = (f' in region {region}' if region != 'PLACEHOLDER' else '')
+        active_instances = len(
+            _filter_instances(cluster_name_on_cloud, ['ACTIVE']))
+        error_msg = (
+            f'Timed out waiting for {instance_type} instances to become '
+            f'ready on Prime Intellect{region_str}. Only {active_instances} '
+            f'out of {config.count} instances became active. This may '
+            f'indicate capacity issues or slow provisioning. Please try '
+            f'again later or consider using a different instance type or '
+            f'region.')
+        logger.warning(error_msg)
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ResourcesUnavailableError(error_msg)
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(
+        provider_name='primeintellect',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=config.provider_config['zones'],
+        head_instance_id=head_instance_id,
+        resumed_instance_ids=[],
+        created_instance_ids=created_instance_ids,
+    )
+
+
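Aside on the instance-type comments in run_instances above: the names follow provider__gpu_info__vcpus__memory with an optional _SPOT suffix, and parsing falls back to -1 on failure. A standalone sketch of that parsing (parse_vcpus_memory is a hypothetical helper, not part of the package); note that a trailing '_SPOT' makes int() raise, which the try/except treats as a parse failure exactly as the diff does:

# Mirrors the vCPU/memory extraction in run_instances above; the real code
# additionally logs a warning when parsing fails.
from typing import Tuple


def parse_vcpus_memory(instance_type: str) -> Tuple[int, int]:
    vcpus, memory = -1, -1  # -1 means "unknown", matching the diff's fallback
    try:
        parts = instance_type.split('__')
        if len(parts) >= 4:
            vcpus = int(parts[2])
            memory = int(parts[3])
    except (ValueError, IndexError):
        pass
    return vcpus, memory


print(parse_vcpus_memory('primecompute__8xH100_80GB__104__752'))
# -> (104, 752)
print(parse_vcpus_memory('primecompute__8xH100_80GB__104__752_SPOT'))
# -> (104, -1): '752_SPOT' is not an integer, so memory keeps the fallback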
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    raise NotImplementedError()
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    client = utils.PrimeIntellectAPIClient()
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    # Log if no instances found
+    if not instances:
+        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
+        return
+
+    # Filter out already terminated instances
+    non_terminated_instances = {
+        inst_id: inst
+        for inst_id, inst in instances.items()
+        if inst['status'] not in ['TERMINATED', 'DELETING']
+    }
+
+    if not non_terminated_instances:
+        logger.info(
+            f'All instances for cluster {cluster_name_on_cloud} are already '
+            f'terminated or being deleted')
+        return
+
+    # Log what we're about to terminate
+    instance_names = [
+        inst['name'] for inst in non_terminated_instances.values()
+    ]
+    logger.info(
+        f'Terminating {len(non_terminated_instances)} instances for cluster '
+        f'{cluster_name_on_cloud}: {instance_names}')
+
+    # Terminate each instance
+    terminated_instances = []
+    for inst_id, inst in non_terminated_instances.items():
+        status = inst['status']
+        logger.debug(f'Terminating instance {inst_id} (status: {status})')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            client.remove(inst_id)
+            terminated_instances.append(inst_id)
+            name = inst['name']
+            logger.info(
+                f'Successfully initiated termination of instance {inst_id} '
+                f'({name})')
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to terminate instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+
+    # Wait for instances to be terminated
+    if not terminated_instances:
+        logger.info(
+            'No instances were terminated (worker_only=True and only head '
+            'node found)')
+        return
+
+    logger.info(f'Waiting for {len(terminated_instances)} instances to be '
+                f'terminated...')
+    for _ in range(MAX_POLLS_FOR_UP_OR_TERMINATE):
+        remaining_instances = _filter_instances(cluster_name_on_cloud, None)
+
+        # Check if all terminated instances are gone
+        still_exist = [
+            inst_id for inst_id in terminated_instances
+            if inst_id in remaining_instances
+        ]
+        if not still_exist:
+            logger.info('All instances have been successfully terminated')
+            break
+
+        # Log status of remaining instances
+        remaining_statuses = [(inst_id, remaining_instances[inst_id]['status'])
+                              for inst_id in still_exist]
+        logger.info(
+            f'Waiting for termination... {len(still_exist)} instances still '
+            f'exist: {remaining_statuses}')
+        time.sleep(POLL_INTERVAL)
+    else:
+        # Timeout reached
+        remaining_instances = _filter_instances(cluster_name_on_cloud, None)
+        still_exist = [
+            inst_id for inst_id in terminated_instances
+            if inst_id in remaining_instances
+        ]
+        if still_exist:
+            logger.warning(
+                f'Timeout reached. {len(still_exist)} instances may still be '
+                f'terminating: {still_exist}')
+        else:
+            logger.info('All instances have been successfully terminated')
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['ACTIVE'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    head_ssh_user = None
+    for instance_id, instance in running_instances.items():
+        retry_count = 0
+        max_retries = SSH_CONN_MAX_RETRIES
+        while (instance.get('sshConnection') is None and
+               retry_count < max_retries):
+            name = instance.get('name')
+            print(f'SSH connection to {name} is not ready, waiting '
+                  f'{SSH_CONN_RETRY_INTERVAL_SECONDS} seconds... '
+                  f'(attempt {retry_count + 1}/{max_retries})')
+            time.sleep(SSH_CONN_RETRY_INTERVAL_SECONDS)
+            retry_count += 1
+            running_instances[instance_id] = _get_instance_info(instance_id)
+
+        if instance.get('sshConnection') is not None:
+            print('SSH connection is ready!')
+        else:
+            raise Exception(
+                f'Failed to establish SSH connection after {max_retries} '
+                f'attempts')
+
+        assert instance.get(
+            'sshConnection'), 'sshConnection cannot be null anymore'
+
+        ssh_connection = instance['sshConnection']
+        _, ssh_port = utils.parse_ssh_connection(ssh_connection)
+
+        external_ip = instance['ip']
+        if isinstance(external_ip, list):
+            external_ip = external_ip[0]
+
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip='NOT_SUPPORTED',
+                external_ip=external_ip,
+                ssh_port=ssh_port,
+                tags={'provider': instance['providerType']},
+            )
+        ]
+        if instance['name'].endswith('-head'):
+            head_instance_id = instance_id
+            parsed_user_for_user, _ = utils.parse_ssh_connection(ssh_connection)
+            head_ssh_user = parsed_user_for_user or 'ubuntu'
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='primeintellect',
+        provider_config=provider_config,
+        ssh_user=head_ssh_user,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    """See sky/provision/__init__.py"""
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    status_map = {
+        'PENDING': status_lib.ClusterStatus.INIT,
+        'ERROR': status_lib.ClusterStatus.INIT,
+        'ACTIVE': status_lib.ClusterStatus.UP,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'DELETING': None,  # Being deleted - should be filtered out
+        'TERMINATED': None,  # Already terminated - should be filtered out
+    }
+    statuses: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                              Optional[str]]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = (status, None)
+    return statuses
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.