skypilot-nightly 1.0.0.dev20250613__py3-none-any.whl → 1.0.0.dev20250615__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/authentication.py +20 -2
- sky/backends/backend_utils.py +3 -1
- sky/backends/cloud_vm_ray_backend.py +2 -1
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/catalog/hyperbolic_catalog.py +133 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/hyperbolic.py +276 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{UdgJCk2sZFLJgFJW_qiWG → R07f8gwfXT1U0zRznq4Lg}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/37-824c707421f6f003.js +6 -0
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/{856-0776dc6ed6000c39.js → 856-c2c39c0912285e54.js} +1 -1
- sky/dashboard/out/_next/static/chunks/938-385d190b95815e11.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-5c3e6471d04780c6.js → webpack-1b69b196a4dbffef.js} +1 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/provision/__init__.py +1 -0
- sky/provision/hyperbolic/__init__.py +11 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +423 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/setup_files/dependencies.py +2 -1
- sky/skylet/constants.py +1 -1
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/users/permission.py +2 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/RECORD +52 -43
- sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +0 -6
- sky/dashboard/out/_next/static/chunks/600.15a0009177e86b86.js +0 -16
- sky/dashboard/out/_next/static/chunks/938-ab185187a63f9cdb.js +0 -1
- sky/dashboard/out/_next/static/css/5d71bfc09f184bab.css +0 -3
- /sky/dashboard/out/_next/static/{UdgJCk2sZFLJgFJW_qiWG → R07f8gwfXT1U0zRznq4Lg}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-6fcc4bf91ac45b39.js → 843-ab9c4f609239155f.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-7bbd9d39d6f9a98a.js → _app-32b2caae3445bf3b.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-451a14e7e755ebbc.js → [cluster]-59950b2f83b66e48.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{clusters-e56b17fd85d0ba58.js → clusters-82a651dbad53ec6e.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{jobs-fe233baf3d073491.js → jobs-336ab80e270ce2ce.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"R07f8gwfXT1U0zRznq4Lg","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/843-ab9c4f609239155f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c8c2191328532b7d.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"R07f8gwfXT1U0zRznq4Lg","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"R07f8gwfXT1U0zRznq4Lg","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/provision/__init__.py
CHANGED
@@ -18,6 +18,7 @@ from sky.provision import common
|
|
18
18
|
from sky.provision import cudo
|
19
19
|
from sky.provision import fluidstack
|
20
20
|
from sky.provision import gcp
|
21
|
+
from sky.provision import hyperbolic
|
21
22
|
from sky.provision import kubernetes
|
22
23
|
from sky.provision import lambda_cloud
|
23
24
|
from sky.provision import nebius
|
@@ -0,0 +1,11 @@
|
|
1
|
+
"""Hyperbolic provisioner for SkyPilot."""
|
2
|
+
|
3
|
+
from sky.provision.hyperbolic.config import bootstrap_instances
|
4
|
+
from sky.provision.hyperbolic.instance import cleanup_ports
|
5
|
+
from sky.provision.hyperbolic.instance import get_cluster_info
|
6
|
+
from sky.provision.hyperbolic.instance import open_ports
|
7
|
+
from sky.provision.hyperbolic.instance import query_instances
|
8
|
+
from sky.provision.hyperbolic.instance import run_instances
|
9
|
+
from sky.provision.hyperbolic.instance import stop_instances
|
10
|
+
from sky.provision.hyperbolic.instance import terminate_instances
|
11
|
+
from sky.provision.hyperbolic.instance import wait_instances
|
@@ -0,0 +1,10 @@
|
|
1
|
+
"""Hyperbolic Cloud configuration bootstrapping"""
|
2
|
+
|
3
|
+
from sky.provision import common
|
4
|
+
|
5
|
+
|
6
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: 'common.ProvisionConfig') -> 'common.ProvisionConfig':
    """Return the provision config unchanged.

    Hyperbolic requires no cloud-side bootstrapping (no security groups,
    IAM roles, or network setup), so this provisioner hook is a no-op
    pass-through.
    """
    # Both identifiers are part of the provisioner interface but are
    # irrelevant for Hyperbolic.
    del region, cluster_name
    return config
|
@@ -0,0 +1,423 @@
|
|
1
|
+
"""Hyperbolic instance provisioning."""
|
2
|
+
import time
|
3
|
+
from typing import Any, Dict, List, Optional
|
4
|
+
|
5
|
+
from sky import sky_logging
|
6
|
+
from sky.provision import common
|
7
|
+
from sky.provision.hyperbolic import utils
|
8
|
+
from sky.utils import status_lib
|
9
|
+
|
10
|
+
PROVIDER_NAME = 'hyperbolic'
|
11
|
+
POLL_INTERVAL = 5
|
12
|
+
QUERY_PORTS_TIMEOUT_SECONDS = 30
|
13
|
+
#TODO come up with a reasonable value for this timeout
|
14
|
+
TIMEOUT = 300
|
15
|
+
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
def _filter_instances(cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]],
                      head_only: bool = False) -> Dict[str, Dict[str, Any]]:
    """Return this cluster's instances, optionally narrowed by status.

    Args:
        cluster_name_on_cloud: Cluster name stored in instance metadata.
        status_filters: Statuses to keep (matched case-insensitively);
            None keeps every instance.
        head_only: Accepted for interface compatibility; ignored.
    """
    logger.debug(f'Filtering instances: cluster={cluster_name_on_cloud}, '
                 f'status={status_filters}')
    _ = head_only  # Mark as intentionally unused

    # Instances are tagged with the cluster name at launch time.
    all_instances = utils.list_instances(
        metadata={'skypilot': {
            'cluster_name': cluster_name_on_cloud
        }})

    # Lowercase the filters once; the comparison below is case-insensitive.
    wanted = (None
              if status_filters is None else [s.lower() for s in status_filters])

    selected: Dict[str, Dict[str, Any]] = {}
    for inst_id, inst in all_instances.items():
        try:
            status = inst.get('status', '').lower()
            if wanted is not None and status not in wanted:
                logger.debug(f'Skipping instance {inst_id} '
                             f'- status {status} not in {wanted}')
                continue
            selected[inst_id] = inst
            logger.debug(f'Including instance {inst_id} '
                         f'with status {status}')
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: one malformed record must not break filtering.
            logger.warning(f'Error processing instance {inst_id}: {str(e)}')
            continue

    logger.info(f'Found {len(selected)} instances matching filters')
    return selected
|
58
|
+
|
59
|
+
|
60
|
+
def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
61
|
+
"""Get the instance ID from the instances dict."""
|
62
|
+
if not instances:
|
63
|
+
return None
|
64
|
+
return next(iter(instances.keys()))
|
65
|
+
|
66
|
+
|
67
|
+
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Launch or reuse the single instance backing a Hyperbolic cluster.

    Waits for any pending instances to settle, reuses an already-ONLINE
    instance when one exists, and otherwise launches a new instance and
    blocks until it reaches the ONLINE state.

    Fix vs. the original: the outer catch-all ``try/except Exception`` has
    been removed. It double-logged every failure ('Failed to launch
    instance: ...' followed by 'Unexpected error: ...') and added nothing,
    since the inner handlers already wrap errors in RuntimeError.

    Args:
        region: Unused; Hyperbolic exposes a single 'default' region.
        cluster_name_on_cloud: Cluster name used to tag and find instances.
        config: Provision config. ``node_config['InstanceType']`` is
            required, in the form {gpu_count}x-{gpu_model}-{cpu}-{memory},
            e.g. 1x-A100-24-271.

    Returns:
        A ProvisionRecord describing the head (only) instance.

    Raises:
        TimeoutError: If instances do not become ready within TIMEOUT.
        RuntimeError: On invalid configuration or launch failure.
    """
    logger.info(f'Starting run_instances with region={region}, '
                f'cluster={cluster_name_on_cloud}')
    logger.debug(f'Config: {config}')
    start_time = time.time()

    # Statuses that mean an instance is still coming up.
    pending_status = [
        utils.HyperbolicInstanceStatus.CREATING.value,
        utils.HyperbolicInstanceStatus.STARTING.value
    ]
    logger.debug(
        f'Looking for instances with pending statuses: {pending_status}')

    # Wait for any pending instances to settle before deciding what to do.
    while True:
        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instances to be ready')
            raise TimeoutError(
                f'Timed out after {TIMEOUT}s waiting for instances to be ready')

        instances = _filter_instances(cluster_name_on_cloud, pending_status)
        logger.debug(f'Found {len(instances)} instances with pending status')
        if not instances:
            break
        logger.info(
            f'Waiting for instance to be ready. Current instances: {instances}')
        time.sleep(POLL_INTERVAL)

    # Check existing running instance.
    logger.info('Checking for existing running instances')
    exist_instances = _filter_instances(
        cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
    logger.debug(
        f'Found {len(exist_instances)} running instances: {exist_instances}')
    instance_id = _get_head_instance_id(exist_instances)
    logger.debug(f'Head instance ID: {instance_id}')

    # Clusters are single-node, so at most one new instance is ever needed.
    to_start_count = 1 - len(exist_instances)
    logger.info(f'Need to start {to_start_count} new instances')
    if to_start_count < 0:
        logger.error(
            f'Cluster {cluster_name_on_cloud} already has an instance running')
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has an instance running.')
    if to_start_count == 0:
        if instance_id is None:
            logger.error(
                f'Cluster {cluster_name_on_cloud} has no running instance')
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no running instance.')
        logger.info(
            f'Cluster {cluster_name_on_cloud} already has a running instance')
        return common.ProvisionRecord(provider_name=PROVIDER_NAME,
                                      cluster_name=cluster_name_on_cloud,
                                      region='default',
                                      zone=None,
                                      head_instance_id=instance_id,
                                      resumed_instance_ids=[],
                                      created_instance_ids=[])

    # Validate and parse the instance type before touching the API.
    instance_type = config.node_config.get('InstanceType')
    logger.debug(f'Instance type from config: {instance_type}')
    if not instance_type:
        logger.error('InstanceType is not set in node_config')
        raise RuntimeError('InstanceType is not set in node_config. '
                           'Please specify an instance type for Hyperbolic.')

    # Format: {gpu_count}x-{gpu_model}-{cpu}-{memory}
    # Example: 1x-A100-24-271
    parts = instance_type.split('-')
    if len(parts) != 4:
        # Raised directly instead of ValueError-then-rewrap; the message
        # callers see is unchanged.
        raise RuntimeError(
            f'Invalid instance type format: {instance_type}. '
            'Expected format: {gpu_count}x-{gpu_model}-{cpu}-{memory}')
    try:
        gpu_count = int(parts[0].rstrip('x'))
    except ValueError as e:
        logger.error(f'Failed to parse instance type: {e}')
        raise RuntimeError(str(e)) from e
    gpu_model = parts[1]
    logger.info(f'Parsed GPU config from instance type: '
                f'model={gpu_model}, count={gpu_count}')

    try:
        # Launch instance and wait until the API reports it ONLINE.
        instance_id, ssh_command = utils.launch_instance(
            gpu_model, gpu_count, cluster_name_on_cloud)
        logger.info(f'Launched instance {instance_id} with SSH command: '
                    f'{ssh_command}')
        created_instance_ids = [instance_id]

        if not utils.wait_for_instance(
                instance_id, utils.HyperbolicInstanceStatus.ONLINE.value):
            raise RuntimeError(
                f'Instance {instance_id} failed to reach ONLINE state')
    except Exception as e:
        logger.error(f'Failed to launch instance: {e}')
        raise RuntimeError(str(e)) from e

    # Double-check through the listing API that the instance is visible
    # and ONLINE before returning.
    logger.info(f'Waiting for instance {instance_id} to be ready')
    while True:
        instances = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.ONLINE.value])
        logger.debug(f'Current instances: {instances}')
        if len(instances) == 1:
            logger.info(f'Instance {instance_id} is ready')
            break
        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instance to be ready')
            raise TimeoutError(
                f'Timed out after {TIMEOUT}s waiting for instance to be ready')
        logger.info('Waiting for instance to be ready...')
        time.sleep(POLL_INTERVAL)

    logger.info(f'Returning ProvisionRecord for instance {instance_id}')
    return common.ProvisionRecord(provider_name=PROVIDER_NAME,
                                  cluster_name=cluster_name_on_cloud,
                                  region='default',
                                  zone=None,
                                  head_instance_id=instance_id,
                                  resumed_instance_ids=[],
                                  created_instance_ids=created_instance_ids)
|
206
|
+
|
207
|
+
|
208
|
+
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    worker_only: bool = False,
) -> None:
    """Terminate all instances in the cluster."""
    del provider_config, worker_only  # unused
    logger.info(
        f'Terminating all instances for cluster {cluster_name_on_cloud}')

    # Nothing to do when the cluster has no instances at all.
    existing = _filter_instances(cluster_name_on_cloud, None)
    if not existing:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    # Request termination for every instance; a failure on one instance is
    # logged and must not block the others.
    for instance_id in existing:
        try:
            utils.terminate_instance(instance_id)
        except Exception as e:  # pylint: disable=broad-except
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
        else:
            logger.info(f'Terminated instance {instance_id}')

    # Poll until termination completes, giving up after TIMEOUT seconds.
    # NOTE(review): this treats "no instances reported TERMINATED" as done,
    # which presumably means terminated instances drop out of the listing —
    # confirm against the Hyperbolic API.
    deadline = time.time() + TIMEOUT
    while True:
        if time.time() > deadline:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instances to terminate'
            )
            break

        remaining = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.TERMINATED.value])
        if not remaining:
            logger.info('All instances terminated successfully')
            break

        logger.info('Waiting for instances to terminate...')
        time.sleep(POLL_INTERVAL)
|
251
|
+
|
252
|
+
|
253
|
+
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Returns information about the cluster.

    Builds an InstanceInfo entry for each ONLINE instance, deriving the SSH
    endpoint from the instance's ``sshCommand`` field (expected form:
    ``ssh user@hostname -p port``). Falls back to the instance ID and
    port 22 when the command is missing or unparsable.

    Fix vs. the original: ``ssh_user`` was assigned only inside the
    ``'@' in user_host`` branch, so constructing ClusterInfo raised
    UnboundLocalError whenever no instance was running or the SSH command
    carried no user part. It is now initialized to None up front.
    """
    del region  # unused
    running_instances = _filter_instances(
        cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance_id = None
    # None when no user can be parsed; the last parsed user wins otherwise
    # (clusters are single-node, so at most one instance contributes).
    ssh_user: Optional[str] = None

    for instance_id, instance_info in running_instances.items():
        # Defaults used when sshCommand is absent or malformed.
        hostname = instance_id
        port = 22

        # Extract hostname and port from sshCommand.
        # Format: ssh user@hostname -p port
        ssh_command = instance_info.get('sshCommand', '')
        if ssh_command:
            parts = ssh_command.split()
            if len(parts) >= 4:
                user_host = parts[1]  # user@hostname
                if '@' in user_host:
                    ssh_user = user_host.split('@')[0]
                    hostname = user_host.split('@')[1]
                else:
                    hostname = user_host
                port = int(parts[3])

        instances[instance_id] = [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=hostname,
                external_ip=hostname,
                ssh_port=port,
                tags={},
            )
        ]
        if head_instance_id is None:
            head_instance_id = instance_id

    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name=PROVIDER_NAME,
        provider_config=provider_config,
        ssh_user=ssh_user,
    )
|
304
|
+
|
305
|
+
|
306
|
+
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
    """Returns the status of the specified instances for Hyperbolic."""
    del provider_config  # unused

    # Fetch all instances tagged with this cluster name.
    cluster_instances = utils.list_instances(
        metadata={'skypilot': {
            'cluster_name': cluster_name_on_cloud
        }})
    if not cluster_instances:
        # No instances found: return empty dict to indicate fully deleted.
        return {}

    statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
    for inst_id, inst in cluster_instances.items():
        raw_status = inst.get('status', 'unknown').lower()
        try:
            mapped = utils.HyperbolicInstanceStatus.from_raw_status(
                raw_status).to_cluster_status()
        except utils.HyperbolicError as e:
            logger.warning(
                f'Failed to parse status for instance {inst_id}: {e}')
            continue
        # A None cluster status denotes a terminated instance; skip it
        # when the caller only wants live ones.
        if non_terminated_only and mapped is None:
            continue
        statuses[inst_id] = mapped
    return statuses
|
337
|
+
|
338
|
+
|
339
|
+
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Wait for instances to reach the desired state.

    Performs a one-shot consistency check rather than polling: verifies the
    cluster's instances already match *state* and raises otherwise.
    """
    del region  # unused

    def _raise_on_failed_instances() -> None:
        # Shared by both branches: surface FAILED/ERROR instances with a
        # descriptive error before reporting the generic mismatch.
        failed_instances = _filter_instances(cluster_name_on_cloud, [
            utils.HyperbolicInstanceStatus.FAILED.value,
            utils.HyperbolicInstanceStatus.ERROR.value
        ])
        if failed_instances:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has failed instances: '
                f'{failed_instances}')

    if state == status_lib.ClusterStatus.UP:
        # The cluster claims to be UP: expect ONLINE and no TERMINATED.
        online = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.ONLINE.value])
        if not online:
            _raise_on_failed_instances()
            raise RuntimeError(f'No running instances found for cluster '
                               f'{cluster_name_on_cloud}')
        terminated = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.TERMINATED.value])
        if terminated:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} is in UP state, but '
                f'{len(terminated)} instances are terminated.')
    elif state == status_lib.ClusterStatus.STOPPED:
        # Hyperbolic has no stopped state; STOPPED maps to TERMINATED here.
        terminated = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.TERMINATED.value])
        if not terminated:
            _raise_on_failed_instances()
            raise RuntimeError(f'No terminated instances found for cluster '
                               f'{cluster_name_on_cloud}')
        online = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.ONLINE.value])
        if online:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} is in STOPPED state, but '
                f'{len(online)} instances are running.')
    else:
        raise RuntimeError(f'Unsupported state: {state}')
|
397
|
+
|
398
|
+
|
399
|
+
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Stop running instances. Not supported for Hyperbolic."""
    # Hyperbolic offers no stopped state; instances can only be terminated.
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    raise NotImplementedError('stop_instances is not supported for Hyperbolic')
|
406
|
+
|
407
|
+
|
408
|
+
def cleanup_ports(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    ports: Optional[list] = None,
) -> None:
    """Cleanup ports. Not supported for Hyperbolic."""
    # Port management is not exposed by the Hyperbolic API.
    del cluster_name_on_cloud, provider_config, ports  # unused
    raise NotImplementedError('cleanup_ports is not supported for Hyperbolic')
|
415
|
+
|
416
|
+
|
417
|
+
def open_ports(
    cluster_name_on_cloud: str,
    ports: list,
    provider_config: Optional[dict] = None,
) -> None:
    """Open ports. Not supported for Hyperbolic."""
    # Port management is not exposed by the Hyperbolic API.
    del cluster_name_on_cloud, ports, provider_config  # unused
    raise NotImplementedError('open_ports is not supported for Hyperbolic')
|