skypilot-nightly 1.0.0.dev20250613__py3-none-any.whl → 1.0.0.dev20250615__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/hyperbolic.py +8 -0
  3. sky/authentication.py +20 -2
  4. sky/backends/backend_utils.py +3 -1
  5. sky/backends/cloud_vm_ray_backend.py +2 -1
  6. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  7. sky/catalog/hyperbolic_catalog.py +133 -0
  8. sky/clouds/__init__.py +2 -0
  9. sky/clouds/hyperbolic.py +276 -0
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{UdgJCk2sZFLJgFJW_qiWG → R07f8gwfXT1U0zRznq4Lg}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/37-824c707421f6f003.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +16 -0
  14. sky/dashboard/out/_next/static/chunks/{856-0776dc6ed6000c39.js → 856-c2c39c0912285e54.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/938-385d190b95815e11.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-5c3e6471d04780c6.js → webpack-1b69b196a4dbffef.js} +1 -1
  17. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +3 -0
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/workspace/new.html +1 -1
  29. sky/dashboard/out/workspaces/[name].html +1 -1
  30. sky/dashboard/out/workspaces.html +1 -1
  31. sky/provision/__init__.py +1 -0
  32. sky/provision/hyperbolic/__init__.py +11 -0
  33. sky/provision/hyperbolic/config.py +10 -0
  34. sky/provision/hyperbolic/instance.py +423 -0
  35. sky/provision/hyperbolic/utils.py +373 -0
  36. sky/setup_files/dependencies.py +2 -1
  37. sky/skylet/constants.py +1 -1
  38. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  39. sky/users/permission.py +2 -0
  40. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  41. {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/METADATA +2 -1
  42. {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/RECORD +52 -43
  43. sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +0 -6
  44. sky/dashboard/out/_next/static/chunks/600.15a0009177e86b86.js +0 -16
  45. sky/dashboard/out/_next/static/chunks/938-ab185187a63f9cdb.js +0 -1
  46. sky/dashboard/out/_next/static/css/5d71bfc09f184bab.css +0 -3
  47. /sky/dashboard/out/_next/static/{UdgJCk2sZFLJgFJW_qiWG → R07f8gwfXT1U0zRznq4Lg}/_ssgManifest.js +0 -0
  48. /sky/dashboard/out/_next/static/chunks/{843-6fcc4bf91ac45b39.js → 843-ab9c4f609239155f.js} +0 -0
  49. /sky/dashboard/out/_next/static/chunks/pages/{_app-7bbd9d39d6f9a98a.js → _app-32b2caae3445bf3b.js} +0 -0
  50. /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-451a14e7e755ebbc.js → [cluster]-59950b2f83b66e48.js} +0 -0
  51. /sky/dashboard/out/_next/static/chunks/pages/{clusters-e56b17fd85d0ba58.js → clusters-82a651dbad53ec6e.js} +0 -0
  52. /sky/dashboard/out/_next/static/chunks/pages/{jobs-fe233baf3d073491.js → jobs-336ab80e270ce2ce.js} +0 -0
  53. {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/WHEEL +0 -0
  54. {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/entry_points.txt +0 -0
  55. {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/licenses/LICENSE +0 -0
  56. {skypilot_nightly-1.0.0.dev20250613.dist-info → skypilot_nightly-1.0.0.dev20250615.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/5d71bfc09f184bab.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/5d71bfc09f184bab.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-5c3e6471d04780c6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-7bbd9d39d6f9a98a.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/UdgJCk2sZFLJgFJW_qiWG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UdgJCk2sZFLJgFJW_qiWG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"UdgJCk2sZFLJgFJW_qiWG","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"R07f8gwfXT1U0zRznq4Lg","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/5d71bfc09f184bab.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/5d71bfc09f184bab.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-5c3e6471d04780c6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-7bbd9d39d6f9a98a.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-0776dc6ed6000c39.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-ab185187a63f9cdb.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/843-6fcc4bf91ac45b39.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c8c2191328532b7d.js" defer=""></script><script src="/dashboard/_next/static/UdgJCk2sZFLJgFJW_qiWG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UdgJCk2sZFLJgFJW_qiWG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"UdgJCk2sZFLJgFJW_qiWG","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/843-ab9c4f609239155f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c8c2191328532b7d.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"R07f8gwfXT1U0zRznq4Lg","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/5d71bfc09f184bab.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/5d71bfc09f184bab.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-5c3e6471d04780c6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-7bbd9d39d6f9a98a.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/UdgJCk2sZFLJgFJW_qiWG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UdgJCk2sZFLJgFJW_qiWG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"UdgJCk2sZFLJgFJW_qiWG","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/R07f8gwfXT1U0zRznq4Lg/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"R07f8gwfXT1U0zRznq4Lg","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/provision/__init__.py CHANGED
@@ -18,6 +18,7 @@ from sky.provision import common
18
18
  from sky.provision import cudo
19
19
  from sky.provision import fluidstack
20
20
  from sky.provision import gcp
21
+ from sky.provision import hyperbolic
21
22
  from sky.provision import kubernetes
22
23
  from sky.provision import lambda_cloud
23
24
  from sky.provision import nebius
@@ -0,0 +1,11 @@
1
+ """Hyperbolic provisioner for SkyPilot."""
2
+
3
+ from sky.provision.hyperbolic.config import bootstrap_instances
4
+ from sky.provision.hyperbolic.instance import cleanup_ports
5
+ from sky.provision.hyperbolic.instance import get_cluster_info
6
+ from sky.provision.hyperbolic.instance import open_ports
7
+ from sky.provision.hyperbolic.instance import query_instances
8
+ from sky.provision.hyperbolic.instance import run_instances
9
+ from sky.provision.hyperbolic.instance import stop_instances
10
+ from sky.provision.hyperbolic.instance import terminate_instances
11
+ from sky.provision.hyperbolic.instance import wait_instances
@@ -0,0 +1,10 @@
1
+ """Hyperbolic Cloud configuration bootstrapping"""
2
+
3
+ from sky.provision import common
4
+
5
+
6
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config untouched.

    This implementation performs no bootstrapping work: `region` and
    `cluster_name` are accepted but ignored, and `config` is handed back
    as-is.
    """
    # Both arguments are intentionally unused.
    del region
    del cluster_name
    return config
@@ -0,0 +1,423 @@
1
+ """Hyperbolic instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from sky import sky_logging
6
+ from sky.provision import common
7
+ from sky.provision.hyperbolic import utils
8
+ from sky.utils import status_lib
9
+
10
# Provider identifier passed as `provider_name` in the records built below.
PROVIDER_NAME = 'hyperbolic'
# Seconds slept between two consecutive status polls in the wait loops.
POLL_INTERVAL = 5
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm it is still needed.
QUERY_PORTS_TIMEOUT_SECONDS = 30
# Upper bound, in seconds, for the provisioning and termination wait loops.
# TODO: come up with a reasonable value for this timeout.
TIMEOUT = 300

logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def _filter_instances(cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]],
                      head_only: bool = False) -> Dict[str, Dict[str, Any]]:
    """Return the cluster's instances, optionally restricted by status.

    Args:
        cluster_name_on_cloud: Cluster name used as the metadata filter when
            listing instances.
        status_filters: Statuses to keep (compared case-insensitively), or
            None to keep every instance.
        head_only: Accepted but not used.

    Returns:
        Mapping from instance id to the instance dict for every instance
        that passed the status filter.
    """
    logger.debug(f'Filtering instances: cluster={cluster_name_on_cloud}, '
                 f'status={status_filters}')
    del head_only  # Mark as intentionally unused

    # Only instances tagged with this cluster name are requested.
    cluster_metadata = {'skypilot': {'cluster_name': cluster_name_on_cloud}}
    candidates = utils.list_instances(metadata=cluster_metadata)

    # Compare statuses case-insensitively.
    wanted: Optional[List[str]] = None
    if status_filters is not None:
        wanted = [status.lower() for status in status_filters]

    matched: Dict[str, Dict[str, Any]] = {}
    for instance_id, instance in candidates.items():
        try:
            instance_status = instance.get('status', '').lower()
            if wanted is None or instance_status in wanted:
                matched[instance_id] = instance
                logger.debug(f'Including instance {instance_id} '
                             f'with status {instance_status}')
            else:
                logger.debug(
                    f'Skipping instance {instance_id} '
                    f'- status {instance_status} not in {wanted}')
        except Exception as e:  # pylint: disable=broad-except
            # A malformed entry is reported and skipped rather than aborting
            # the whole listing.
            logger.warning(f'Error processing instance {instance_id}: {str(e)}')

    logger.info(f'Found {len(matched)} instances matching filters')
    return matched
58
+
59
+
60
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
61
+ """Get the instance ID from the instances dict."""
62
+ if not instances:
63
+ return None
64
+ return next(iter(instances.keys()))
65
+
66
+
67
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Ensure the cluster has exactly one ONLINE Hyperbolic instance.

    The function first waits until no instance of the cluster is in a pending
    (CREATING/STARTING) state. If one ONLINE instance already exists it is
    reused; otherwise a new instance is launched from the `InstanceType`
    entry of `config.node_config` and the function waits for it to be ONLINE.

    Args:
        region: Only logged; the returned record always reports 'default'.
        cluster_name_on_cloud: Cluster name used to find and tag instances.
        config: Provision config; `node_config['InstanceType']` must have the
            form `{gpu_count}x-{gpu_model}-{cpu}-{memory}`.

    Returns:
        A ProvisionRecord whose head instance is the reused or newly created
        instance.

    Raises:
        TimeoutError: If pending instances do not settle, or the new instance
            is not listed as ONLINE, within TIMEOUT seconds of the call.
        RuntimeError: If more than one instance is already ONLINE, the
            instance type is missing or malformed, or the launch fails.
    """
    logger.info(f'Starting run_instances with region={region}, '
                f'cluster={cluster_name_on_cloud}')
    logger.debug(f'Config: {config}')
    start_time = time.time()

    # Define pending statuses for Hyperbolic
    pending_status = [
        utils.HyperbolicInstanceStatus.CREATING.value,
        utils.HyperbolicInstanceStatus.STARTING.value
    ]
    logger.debug(
        f'Looking for instances with pending statuses: {pending_status}')

    # Wait for any pending instances to be ready
    while True:
        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instances to be ready')
            raise TimeoutError(
                f'Timed out after {TIMEOUT}s waiting for instances to be ready')

        instances = _filter_instances(cluster_name_on_cloud, pending_status)
        logger.debug(f'Found {len(instances)} instances with pending status')
        if not instances:
            break
        logger.info(
            f'Waiting for instance to be ready. Current instances: {instances}')
        time.sleep(POLL_INTERVAL)

    # Check existing running instance
    logger.info('Checking for existing running instances')
    exist_instances = _filter_instances(
        cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
    logger.debug(
        f'Found {len(exist_instances)} running instances: {exist_instances}')
    instance_id = _get_head_instance_id(exist_instances)
    logger.debug(f'Head instance ID: {instance_id}')

    # Calculate if we need to start a new instance
    to_start_count = 1 - len(exist_instances)  # Always 1 for single node
    logger.info(f'Need to start {to_start_count} new instances')
    if to_start_count < 0:
        logger.error(
            f'Cluster {cluster_name_on_cloud} already has an instance running')
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has an instance running.')
    if to_start_count == 0:
        # Also narrows `instance_id` from Optional[str] to str.
        if instance_id is None:
            logger.error(
                f'Cluster {cluster_name_on_cloud} has no running instance')
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no running instance.')
        logger.info(
            f'Cluster {cluster_name_on_cloud} already has a running instance')
        return common.ProvisionRecord(provider_name=PROVIDER_NAME,
                                      cluster_name=cluster_name_on_cloud,
                                      region='default',
                                      zone=None,
                                      head_instance_id=instance_id,
                                      resumed_instance_ids=[],
                                      created_instance_ids=[])

    # Get instance type from node_config
    instance_type = config.node_config.get('InstanceType')
    logger.debug(f'Instance type from config: {instance_type}')
    if not instance_type:
        logger.error('InstanceType is not set in node_config')
        raise RuntimeError('InstanceType is not set in node_config. '
                           'Please specify an instance type for Hyperbolic.')

    # Parse gpu_model configuration from instance type
    # Format: {gpu_count}x-{gpu_model}-{cpu}-{memory}
    # Example: 1x-A100-24-271
    # Parsing has its own handler so that only genuine format problems are
    # reported as parse failures.
    try:
        parts = instance_type.split('-')
        if len(parts) != 4:
            raise ValueError(
                f'Invalid instance type format: {instance_type}. '
                'Expected format: {gpu_count}x-{gpu_model}-{cpu}-{memory}')
        gpu_count = int(parts[0].rstrip('x'))
        gpu_model = parts[1]
    except ValueError as e:
        logger.error(f'Failed to parse instance type: {e}')
        raise RuntimeError(str(e)) from e
    logger.info(f'Parsed GPU config from instance type: '
                f'model={gpu_model}, count={gpu_count}')

    # Launch the instance and wait for it; any failure here (including a
    # ValueError raised by the launch helpers) is reported once, as a launch
    # failure, and surfaced as RuntimeError.
    try:
        instance_id, ssh_command = utils.launch_instance(
            gpu_model, gpu_count, cluster_name_on_cloud)
        logger.info(f'Launched instance {instance_id} with SSH command: '
                    f'{ssh_command}')

        # Wait for instance to be ready
        if not utils.wait_for_instance(
                instance_id, utils.HyperbolicInstanceStatus.ONLINE.value):
            raise RuntimeError(
                f'Instance {instance_id} failed to reach ONLINE state')
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Failed to launch instance: {e}')
        raise RuntimeError(str(e)) from e
    created_instance_ids = [instance_id]

    # Confirm through the cluster listing that exactly one instance is ONLINE.
    # The deadline is still measured from the start of this call.
    logger.info(f'Waiting for instance {instance_id} to be ready')
    while True:
        instances = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.ONLINE.value])
        logger.debug(f'Current instances: {instances}')
        if len(instances) == 1:
            logger.info(f'Instance {instance_id} is ready')
            break
        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instance to be ready')
            raise TimeoutError(
                f'Timed out after {TIMEOUT}s waiting for instance to be ready')
        logger.info('Waiting for instance to be ready...')
        time.sleep(POLL_INTERVAL)

    logger.info(f'Returning ProvisionRecord for instance {instance_id}')
    return common.ProvisionRecord(provider_name=PROVIDER_NAME,
                                  cluster_name=cluster_name_on_cloud,
                                  region='default',
                                  zone=None,
                                  head_instance_id=instance_id,
                                  resumed_instance_ids=[],
                                  created_instance_ids=created_instance_ids)
206
+
207
+
208
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    worker_only: bool = False,
) -> None:
    """Terminate all instances in the cluster and wait for them to go away.

    A termination request is sent for every instance of the cluster. The
    function then polls until each instance whose request succeeded is either
    no longer listed or reports the TERMINATED status, giving up (with an
    error log, not an exception) after TIMEOUT seconds.

    Args:
        cluster_name_on_cloud: Cluster whose instances are terminated.
        provider_config: Unused.
        worker_only: Unused.
    """
    del provider_config, worker_only  # unused
    logger.info(
        f'Terminating all instances for cluster {cluster_name_on_cloud}')

    # First check if instances exist
    instances = _filter_instances(cluster_name_on_cloud, None)
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    # Terminate each instance; remember which requests were accepted so the
    # wait below does not block on instances that could not be terminated.
    requested = set()
    for instance_id in instances:
        try:
            utils.terminate_instance(instance_id)
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: keep going with the remaining instances.
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
            continue
        requested.add(instance_id)
        logger.info(f'Terminated instance {instance_id}')

    if not requested:
        return

    # Wait for instances to be terminated. An instance counts as done once it
    # disappears from the listing or its status is TERMINATED.
    terminated_status = (
        utils.HyperbolicInstanceStatus.TERMINATED.value.lower())
    start_time = time.time()
    while True:
        current = _filter_instances(cluster_name_on_cloud, None)
        still_alive = [
            instance_id for instance_id in requested
            if instance_id in current and
            current[instance_id].get('status', '').lower() != terminated_status
        ]
        if not still_alive:
            logger.info('All instances terminated successfully')
            break

        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instances to terminate'
            )
            break

        logger.info('Waiting for instances to terminate...')
        time.sleep(POLL_INTERVAL)
251
+
252
+
253
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Returns information about the cluster.

    Connection details are taken from each ONLINE instance's `sshCommand`
    field, expected in the form `ssh user@hostname -p port`. When the field
    is missing or has fewer than four tokens, the instance id is used as the
    hostname and the port defaults to 22.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster whose ONLINE instances are described.
        provider_config: Passed through unchanged into the result.

    Returns:
        ClusterInfo with one InstanceInfo per ONLINE instance. `ssh_user` is
        the user parsed from an `sshCommand`, or None when no command
        provided one (for example when there are no ONLINE instances).

    Raises:
        ValueError: If the fourth token of an `sshCommand` is not an integer.
    """
    del region  # unused
    running_instances = _filter_instances(
        cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance_id = None
    # Must be bound before the loop: it is only assigned when some instance
    # has a `user@host` ssh command, yet it is always read at the end.
    ssh_user: Optional[str] = None

    for instance_id, instance_info in running_instances.items():
        # Fallbacks used when no usable sshCommand is available.
        hostname = instance_id
        port = 22

        # Extract hostname and port from sshCommand
        # Format: ssh user@hostname -p port
        ssh_command = instance_info.get('sshCommand', '')
        parts = ssh_command.split() if ssh_command else []
        if len(parts) >= 4:
            user_host = parts[1]  # user@hostname
            if '@' in user_host:
                ssh_user = user_host.split('@')[0]
                hostname = user_host.split('@')[1]
            else:
                hostname = user_host
            port = int(parts[3])

        instances[instance_id] = [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=hostname,
                external_ip=hostname,
                ssh_port=port,
                tags={},
            )
        ]
        # The first instance seen becomes the head.
        if head_instance_id is None:
            head_instance_id = instance_id

    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name=PROVIDER_NAME,
        provider_config=provider_config,
        ssh_user=ssh_user,
    )
304
+
305
+
306
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
    """Map each instance of the cluster to its cluster status.

    Args:
        cluster_name_on_cloud: Cluster name used as the metadata filter.
        provider_config: Unused.
        non_terminated_only: When True, instances whose converted status is
            None are left out of the result.

    Returns:
        Mapping from instance id to converted status; empty when the cluster
        has no instances. Instances whose raw status cannot be converted
        (HyperbolicError) are logged and skipped.
    """
    del provider_config  # unused

    cluster_metadata = {'skypilot': {'cluster_name': cluster_name_on_cloud}}
    cluster_instances = utils.list_instances(metadata=cluster_metadata)
    if not cluster_instances:
        # No instances found: return empty dict to indicate fully deleted
        return {}

    result: Dict[str, Optional['status_lib.ClusterStatus']] = {}
    for instance_id, instance in cluster_instances.items():
        try:
            raw_status = instance.get('status', 'unknown').lower()
            parsed = utils.HyperbolicInstanceStatus.from_raw_status(raw_status)
            cluster_status = parsed.to_cluster_status()
        except utils.HyperbolicError as e:
            logger.warning(
                f'Failed to parse status for instance {instance_id}: {e}')
            continue
        if cluster_status is None and non_terminated_only:
            continue
        result[instance_id] = cluster_status
    return result
337
+
338
+
339
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Check that the cluster's instances are consistent with `state`.

    Despite the name, this performs a single check and does not poll. For UP
    the cluster must have ONLINE instances and no TERMINATED ones; for
    STOPPED it must have TERMINATED instances and no ONLINE ones.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster to inspect.
        state: Desired state; only UP and STOPPED are supported.

    Raises:
        RuntimeError: If `state` is unsupported, no instance is in the
            expected status, some instance is FAILED/ERROR, or an instance is
            in the opposite status.
    """
    del region  # unused

    want_up = state == status_lib.ClusterStatus.UP
    if not want_up and state != status_lib.ClusterStatus.STOPPED:
        raise RuntimeError(f'Unsupported state: {state}')

    online = [utils.HyperbolicInstanceStatus.ONLINE.value]
    terminated = [utils.HyperbolicInstanceStatus.TERMINATED.value]
    if want_up:
        expected, conflicting = online, terminated
    else:
        expected, conflicting = terminated, online

    # At least one instance must be in the expected status.
    if not _filter_instances(cluster_name_on_cloud, expected):
        # Prefer reporting failed instances over a generic "none found".
        failed_instances = _filter_instances(cluster_name_on_cloud, [
            utils.HyperbolicInstanceStatus.FAILED.value,
            utils.HyperbolicInstanceStatus.ERROR.value
        ])
        if failed_instances:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has failed instances: '
                f'{failed_instances}')
        if want_up:
            raise RuntimeError(f'No running instances found for cluster '
                               f'{cluster_name_on_cloud}')
        raise RuntimeError(f'No terminated instances found for cluster '
                           f'{cluster_name_on_cloud}')

    # No instance may be in the opposite status.
    leftover = _filter_instances(cluster_name_on_cloud, conflicting)
    if not leftover:
        return
    if want_up:
        error_msg = (
            f'Cluster {cluster_name_on_cloud} is in UP state, but '
            f'{len(leftover)} instances are terminated.')
    else:
        error_msg = (
            f'Cluster {cluster_name_on_cloud} is in STOPPED state, but '
            f'{len(leftover)} instances are running.')
    raise RuntimeError(error_msg)
397
+
398
+
399
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Stopping instances is not supported for Hyperbolic.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    raise NotImplementedError('stop_instances is not supported for Hyperbolic')
406
+
407
+
408
def cleanup_ports(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    ports: Optional[list] = None,
) -> None:
    """Port cleanup is not supported for Hyperbolic.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, ports  # unused
    raise NotImplementedError('cleanup_ports is not supported for Hyperbolic')
415
+
416
+
417
def open_ports(
    cluster_name_on_cloud: str,
    ports: list,
    provider_config: Optional[dict] = None,
) -> None:
    """Opening ports is not supported for Hyperbolic.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
    raise NotImplementedError('open_ports is not supported for Hyperbolic')