skypilot-nightly 1.0.0.dev20250915__py3-none-any.whl → 1.0.0.dev20250918__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +68 -4
- sky/authentication.py +25 -0
- sky/backends/__init__.py +3 -2
- sky/backends/backend_utils.py +16 -12
- sky/backends/cloud_vm_ray_backend.py +61 -4
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/client/sdk.py +6 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/primeintellect.py +314 -0
- sky/core.py +10 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.4a881570243431a5.js +51 -0
- sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{6990-11c8e9b982e8ffec.js → 6990-f6818c84ed8f1c86.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d1e29b3aa66bf4cf.js → webpack-487697b47d8c5e50.js} +1 -1
- sky/dashboard/out/_next/static/{dG6B0i0HO4jIoKb4ZFYJ_ → k1mo5xWZrV9djgjd0moOT}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +42 -34
- sky/jobs/server/server.py +14 -1
- sky/jobs/state.py +26 -1
- sky/provision/__init__.py +1 -0
- sky/provision/docker_utils.py +50 -3
- sky/provision/instance_setup.py +15 -1
- sky/provision/lambda_cloud/instance.py +12 -11
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/resources.py +9 -1
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_utils.py +29 -12
- sky/serve/server/core.py +37 -19
- sky/serve/server/impl.py +221 -129
- sky/server/common.py +13 -0
- sky/server/constants.py +3 -0
- sky/server/requests/executor.py +23 -6
- sky/server/server.py +10 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +5 -3
- sky/skylet/services.py +98 -0
- sky/skylet/skylet.py +3 -1
- sky/skypilot_config.py +10 -3
- sky/templates/kubernetes-ray.yml.j2 +22 -12
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/METADATA +39 -38
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/RECORD +74 -62
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +0 -51
- sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
- /sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-7528cc0ef8c522c5.js} +0 -0
- /sky/dashboard/out/_next/static/{dG6B0i0HO4jIoKb4ZFYJ_ → k1mo5xWZrV9djgjd0moOT}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
|
@@ -7,7 +7,7 @@ import urllib.request
|
|
|
7
7
|
from sky.utils import directory_utils
|
|
8
8
|
|
|
9
9
|
# Replaced with the current commit when building the wheels.
|
|
10
|
-
_SKYPILOT_COMMIT_SHA = '
|
|
10
|
+
_SKYPILOT_COMMIT_SHA = '59773bc2eb3fb02a2b4bb26921a298c0233e8efd'
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def _get_git_commit():
|
|
@@ -37,7 +37,7 @@ def _get_git_commit():
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
__commit__ = _get_git_commit()
|
|
40
|
-
__version__ = '1.0.0.
|
|
40
|
+
__version__ = '1.0.0.dev20250918'
|
|
41
41
|
__root_dir__ = directory_utils.get_sky_dir()
|
|
42
42
|
|
|
43
43
|
|
|
@@ -143,6 +143,7 @@ Kubernetes = clouds.Kubernetes
|
|
|
143
143
|
K8s = Kubernetes
|
|
144
144
|
OCI = clouds.OCI
|
|
145
145
|
Paperspace = clouds.Paperspace
|
|
146
|
+
PrimeIntellect = clouds.PrimeIntellect
|
|
146
147
|
RunPod = clouds.RunPod
|
|
147
148
|
Vast = clouds.Vast
|
|
148
149
|
Vsphere = clouds.Vsphere
|
|
@@ -163,6 +164,7 @@ __all__ = [
|
|
|
163
164
|
'Lambda',
|
|
164
165
|
'OCI',
|
|
165
166
|
'Paperspace',
|
|
167
|
+
'PrimeIntellect',
|
|
166
168
|
'RunPod',
|
|
167
169
|
'Vast',
|
|
168
170
|
'SCP',
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Prime Intellect cloud adaptor."""
|
sky/adaptors/seeweb.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
""" Seeweb Adaptor """
|
|
2
2
|
import configparser
|
|
3
|
-
|
|
3
|
+
import pathlib
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import pydantic
|
|
7
|
+
import requests # type: ignore
|
|
4
8
|
|
|
5
9
|
from sky.adaptors import common
|
|
6
10
|
from sky.utils import annotations
|
|
@@ -43,7 +47,7 @@ def check_compute_credentials() -> bool:
|
|
|
43
47
|
Returns True if credentials are valid; otherwise raises a SeewebError.
|
|
44
48
|
"""
|
|
45
49
|
# Read API key from standard Seeweb configuration file
|
|
46
|
-
key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
|
|
50
|
+
key_path = pathlib.Path('~/.seeweb_cloud/seeweb_keys').expanduser()
|
|
47
51
|
if not key_path.exists():
|
|
48
52
|
raise SeewebCredentialsFileNotFound(
|
|
49
53
|
'Missing Seeweb API key file ~/.seeweb_cloud/seeweb_keys')
|
|
@@ -84,7 +88,7 @@ def check_storage_credentials() -> bool:
|
|
|
84
88
|
def client():
|
|
85
89
|
"""Returns an authenticated ecsapi.Api object."""
|
|
86
90
|
# Create authenticated client using the same credential pattern
|
|
87
|
-
key_path = Path('~/.seeweb_cloud/seeweb_keys').expanduser()
|
|
91
|
+
key_path = pathlib.Path('~/.seeweb_cloud/seeweb_keys').expanduser()
|
|
88
92
|
if not key_path.exists():
|
|
89
93
|
raise SeewebCredentialsFileNotFound(
|
|
90
94
|
'Missing Seeweb API key file ~/.seeweb_cloud/seeweb_keys')
|
|
@@ -100,4 +104,64 @@ def client():
|
|
|
100
104
|
raise SeewebApiKeyMissing(
|
|
101
105
|
'Empty api_key in ~/.seeweb_cloud/seeweb_keys')
|
|
102
106
|
|
|
103
|
-
|
|
107
|
+
api = ecsapi.Api(token=api_key)
|
|
108
|
+
|
|
109
|
+
# Monkey-patch fetch_servers to be tolerant to API schema mismatches.
|
|
110
|
+
orig_fetch_servers = api.fetch_servers
|
|
111
|
+
orig_delete_server = api.delete_server
|
|
112
|
+
|
|
113
|
+
def _tolerant_fetch_servers(
|
|
114
|
+
timeout: Optional[int] = None): # type: ignore[override]
|
|
115
|
+
try:
|
|
116
|
+
return orig_fetch_servers(timeout=timeout)
|
|
117
|
+
except pydantic.ValidationError:
|
|
118
|
+
# Fallback path: fetch raw JSON, drop snapshot fields, then validate
|
|
119
|
+
# pylint: disable=protected-access
|
|
120
|
+
base_url = api._Api__generate_base_url() # type: ignore
|
|
121
|
+
headers = api._Api__generate_authentication_headers(
|
|
122
|
+
) # type: ignore
|
|
123
|
+
url = f'{base_url}/servers'
|
|
124
|
+
resp = requests.get(url, headers=headers, timeout=timeout or 15)
|
|
125
|
+
resp.raise_for_status()
|
|
126
|
+
data = resp.json()
|
|
127
|
+
try:
|
|
128
|
+
servers = data.get('server', [])
|
|
129
|
+
for s in servers:
|
|
130
|
+
s.pop('last_restored_snapshot', None)
|
|
131
|
+
except (KeyError, TypeError, ValueError):
|
|
132
|
+
pass
|
|
133
|
+
server_list_response_cls = ecsapi._server._ServerListResponse
|
|
134
|
+
servers_response = server_list_response_cls.model_validate(data)
|
|
135
|
+
return servers_response.server
|
|
136
|
+
|
|
137
|
+
api.fetch_servers = _tolerant_fetch_servers # type: ignore[assignment]
|
|
138
|
+
|
|
139
|
+
def _tolerant_delete_server(server_name: str,
|
|
140
|
+
timeout: Optional[int] = None):
|
|
141
|
+
try:
|
|
142
|
+
return orig_delete_server(server_name, timeout=timeout)
|
|
143
|
+
except pydantic.ValidationError:
|
|
144
|
+
# Fallback: perform raw DELETE and interpret not_found as success
|
|
145
|
+
# pylint: disable=protected-access
|
|
146
|
+
base_url = api._Api__generate_base_url() # type: ignore
|
|
147
|
+
headers = api._Api__generate_authentication_headers(
|
|
148
|
+
) # type: ignore
|
|
149
|
+
url = f'{base_url}/servers/{server_name}'
|
|
150
|
+
resp = requests.delete(url, headers=headers, timeout=timeout or 15)
|
|
151
|
+
# Treat 404 as idempotent success
|
|
152
|
+
if resp.status_code == 404:
|
|
153
|
+
return None
|
|
154
|
+
# Some APIs return {status: 'not_found', message: ...}
|
|
155
|
+
try:
|
|
156
|
+
data = resp.json()
|
|
157
|
+
if isinstance(data, dict) and data.get('status') == 'not_found':
|
|
158
|
+
return None
|
|
159
|
+
except (ValueError, TypeError):
|
|
160
|
+
pass
|
|
161
|
+
# If not clearly not_found, re-raise original behavior
|
|
162
|
+
resp.raise_for_status()
|
|
163
|
+
# Best-effort: return None to indicate deletion requested
|
|
164
|
+
return None
|
|
165
|
+
|
|
166
|
+
api.delete_server = _tolerant_delete_server # type: ignore[assignment]
|
|
167
|
+
return api
|
sky/authentication.py
CHANGED
|
@@ -45,6 +45,7 @@ from sky.adaptors import vast
|
|
|
45
45
|
from sky.provision.fluidstack import fluidstack_utils
|
|
46
46
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
47
47
|
from sky.provision.lambda_cloud import lambda_utils
|
|
48
|
+
from sky.provision.primeintellect import utils as primeintellect_utils
|
|
48
49
|
from sky.utils import common_utils
|
|
49
50
|
from sky.utils import config_utils
|
|
50
51
|
from sky.utils import kubernetes_enums
|
|
@@ -604,6 +605,30 @@ def setup_hyperbolic_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
604
605
|
return configure_ssh_info(config)
|
|
605
606
|
|
|
606
607
|
|
|
608
|
+
def setup_primeintellect_authentication(
|
|
609
|
+
config: Dict[str, Any]) -> Dict[str, Any]:
|
|
610
|
+
"""Sets up SSH authentication for Prime Intellect.
|
|
611
|
+
- Generates a new SSH key pair if one does not exist.
|
|
612
|
+
- Adds the public SSH key to the user's Prime Intellect account.
|
|
613
|
+
"""
|
|
614
|
+
# Ensure local SSH keypair exists and fetch public key content
|
|
615
|
+
_, public_key_path = get_or_generate_keys()
|
|
616
|
+
with open(public_key_path, 'r', encoding='utf-8') as f:
|
|
617
|
+
public_key = f.read().strip()
|
|
618
|
+
|
|
619
|
+
# Register the public key with Prime Intellect (no-op if already exists)
|
|
620
|
+
client = primeintellect_utils.PrimeIntellectAPIClient()
|
|
621
|
+
client.get_or_add_ssh_key(public_key)
|
|
622
|
+
|
|
623
|
+
# Set up auth section for Ray template
|
|
624
|
+
config.setdefault('auth', {})
|
|
625
|
+
# Default username for Prime Intellect images
|
|
626
|
+
config['auth']['ssh_user'] = 'ubuntu'
|
|
627
|
+
config['auth']['ssh_public_key'] = public_key_path
|
|
628
|
+
|
|
629
|
+
return configure_ssh_info(config)
|
|
630
|
+
|
|
631
|
+
|
|
607
632
|
def setup_seeweb_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
608
633
|
"""Registers the public key with Seeweb and notes the remote name."""
|
|
609
634
|
# 1. local key pair
|
sky/backends/__init__.py
CHANGED
|
@@ -4,11 +4,12 @@ from sky.backends.backend import ResourceHandle
|
|
|
4
4
|
from sky.backends.cloud_vm_ray_backend import CloudVmRayBackend
|
|
5
5
|
from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
|
|
6
6
|
from sky.backends.cloud_vm_ray_backend import LocalResourcesHandle
|
|
7
|
+
from sky.backends.cloud_vm_ray_backend import SkyletClient
|
|
7
8
|
from sky.backends.local_docker_backend import LocalDockerBackend
|
|
8
9
|
from sky.backends.local_docker_backend import LocalDockerResourceHandle
|
|
9
10
|
|
|
10
11
|
__all__ = [
|
|
11
12
|
'Backend', 'ResourceHandle', 'CloudVmRayBackend',
|
|
12
|
-
'CloudVmRayResourceHandle', '
|
|
13
|
-
'LocalDockerResourceHandle'
|
|
13
|
+
'CloudVmRayResourceHandle', 'SkyletClient', 'LocalResourcesHandle',
|
|
14
|
+
'LocalDockerBackend', 'LocalDockerResourceHandle'
|
|
14
15
|
]
|
sky/backends/backend_utils.py
CHANGED
|
@@ -1116,6 +1116,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
|
|
|
1116
1116
|
config = auth.setup_fluidstack_authentication(config)
|
|
1117
1117
|
elif isinstance(cloud, clouds.Hyperbolic):
|
|
1118
1118
|
config = auth.setup_hyperbolic_authentication(config)
|
|
1119
|
+
elif isinstance(cloud, clouds.PrimeIntellect):
|
|
1120
|
+
config = auth.setup_primeintellect_authentication(config)
|
|
1119
1121
|
elif isinstance(cloud, clouds.Seeweb):
|
|
1120
1122
|
config = auth.setup_seeweb_authentication(config)
|
|
1121
1123
|
else:
|
|
@@ -2331,12 +2333,13 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2331
2333
|
'All nodes up; SkyPilot runtime healthy.',
|
|
2332
2334
|
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2333
2335
|
nop_if_duplicate=True)
|
|
2334
|
-
global_user_state.add_or_update_cluster(
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2336
|
+
global_user_state.add_or_update_cluster(
|
|
2337
|
+
cluster_name,
|
|
2338
|
+
handle,
|
|
2339
|
+
requested_resources=None,
|
|
2340
|
+
ready=True,
|
|
2341
|
+
is_launch=False,
|
|
2342
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2340
2343
|
return global_user_state.get_cluster_from_name(cluster_name)
|
|
2341
2344
|
|
|
2342
2345
|
# All cases below are transitioning the cluster to non-UP states.
|
|
@@ -2542,12 +2545,13 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2542
2545
|
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2543
2546
|
nop_if_duplicate=True,
|
|
2544
2547
|
duplicate_regex=init_reason_regex)
|
|
2545
|
-
global_user_state.add_or_update_cluster(
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
|
|
2548
|
+
global_user_state.add_or_update_cluster(
|
|
2549
|
+
cluster_name,
|
|
2550
|
+
handle,
|
|
2551
|
+
requested_resources=None,
|
|
2552
|
+
ready=False,
|
|
2553
|
+
is_launch=False,
|
|
2554
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2551
2555
|
return global_user_state.get_cluster_from_name(cluster_name)
|
|
2552
2556
|
# Now is_abnormal is False: either node_statuses is empty or all nodes are
|
|
2553
2557
|
# STOPPED.
|
|
@@ -52,6 +52,7 @@ from sky.provision import metadata_utils
|
|
|
52
52
|
from sky.provision import provisioner
|
|
53
53
|
from sky.provision.kubernetes import config as config_lib
|
|
54
54
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
55
|
+
from sky.serve import constants as serve_constants
|
|
55
56
|
from sky.server.requests import requests as requests_lib
|
|
56
57
|
from sky.skylet import autostop_lib
|
|
57
58
|
from sky.skylet import constants
|
|
@@ -90,6 +91,8 @@ if typing.TYPE_CHECKING:
|
|
|
90
91
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
91
92
|
from sky.schemas.generated import jobsv1_pb2
|
|
92
93
|
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
94
|
+
from sky.schemas.generated import servev1_pb2
|
|
95
|
+
from sky.schemas.generated import servev1_pb2_grpc
|
|
93
96
|
else:
|
|
94
97
|
# To avoid requiring grpcio to be installed on the client side.
|
|
95
98
|
grpc = adaptors_common.LazyImport(
|
|
@@ -104,6 +107,10 @@ else:
|
|
|
104
107
|
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
105
108
|
jobsv1_pb2_grpc = adaptors_common.LazyImport(
|
|
106
109
|
'sky.schemas.generated.jobsv1_pb2_grpc')
|
|
110
|
+
servev1_pb2 = adaptors_common.LazyImport(
|
|
111
|
+
'sky.schemas.generated.servev1_pb2')
|
|
112
|
+
servev1_pb2_grpc = adaptors_common.LazyImport(
|
|
113
|
+
'sky.schemas.generated.servev1_pb2_grpc')
|
|
107
114
|
|
|
108
115
|
Path = str
|
|
109
116
|
|
|
@@ -222,6 +229,7 @@ def _get_cluster_config_template(cloud):
|
|
|
222
229
|
clouds.SCP: 'scp-ray.yml.j2',
|
|
223
230
|
clouds.OCI: 'oci-ray.yml.j2',
|
|
224
231
|
clouds.Paperspace: 'paperspace-ray.yml.j2',
|
|
232
|
+
clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
|
|
225
233
|
clouds.DO: 'do-ray.yml.j2',
|
|
226
234
|
clouds.RunPod: 'runpod-ray.yml.j2',
|
|
227
235
|
clouds.Kubernetes: 'kubernetes-ray.yml.j2',
|
|
@@ -3045,6 +3053,7 @@ class SkyletClient:
|
|
|
3045
3053
|
def __init__(self, channel: 'grpc.Channel'):
|
|
3046
3054
|
self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
|
|
3047
3055
|
self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
|
|
3056
|
+
self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
|
|
3048
3057
|
|
|
3049
3058
|
def set_autostop(
|
|
3050
3059
|
self,
|
|
@@ -3131,6 +3140,54 @@ class SkyletClient:
|
|
|
3131
3140
|
) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
|
|
3132
3141
|
return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
|
|
3133
3142
|
|
|
3143
|
+
def get_service_status(
|
|
3144
|
+
self,
|
|
3145
|
+
request: 'servev1_pb2.GetServiceStatusRequest',
|
|
3146
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
3147
|
+
) -> 'servev1_pb2.GetServiceStatusResponse':
|
|
3148
|
+
return self._serve_stub.GetServiceStatus(request, timeout=timeout)
|
|
3149
|
+
|
|
3150
|
+
def add_serve_version(
|
|
3151
|
+
self,
|
|
3152
|
+
request: 'servev1_pb2.AddVersionRequest',
|
|
3153
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
3154
|
+
) -> 'servev1_pb2.AddVersionResponse':
|
|
3155
|
+
return self._serve_stub.AddVersion(request, timeout=timeout)
|
|
3156
|
+
|
|
3157
|
+
def terminate_services(
|
|
3158
|
+
self,
|
|
3159
|
+
request: 'servev1_pb2.TerminateServicesRequest',
|
|
3160
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
3161
|
+
) -> 'servev1_pb2.TerminateServicesResponse':
|
|
3162
|
+
return self._serve_stub.TerminateServices(request, timeout=timeout)
|
|
3163
|
+
|
|
3164
|
+
def terminate_replica(
|
|
3165
|
+
self,
|
|
3166
|
+
request: 'servev1_pb2.TerminateReplicaRequest',
|
|
3167
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
3168
|
+
) -> 'servev1_pb2.TerminateReplicaResponse':
|
|
3169
|
+
return self._serve_stub.TerminateReplica(request, timeout=timeout)
|
|
3170
|
+
|
|
3171
|
+
def wait_service_registration(
|
|
3172
|
+
self,
|
|
3173
|
+
request: 'servev1_pb2.WaitServiceRegistrationRequest',
|
|
3174
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
3175
|
+
) -> 'servev1_pb2.WaitServiceRegistrationResponse':
|
|
3176
|
+
# set timeout to at least 10 seconds more than service register
|
|
3177
|
+
# constant to make sure that timeouts will not occur.
|
|
3178
|
+
if timeout is not None:
|
|
3179
|
+
timeout = max(timeout,
|
|
3180
|
+
serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
|
|
3181
|
+
return self._serve_stub.WaitServiceRegistration(request,
|
|
3182
|
+
timeout=timeout)
|
|
3183
|
+
|
|
3184
|
+
def update_service(
|
|
3185
|
+
self,
|
|
3186
|
+
request: 'servev1_pb2.UpdateServiceRequest',
|
|
3187
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
3188
|
+
) -> 'servev1_pb2.UpdateServiceResponse':
|
|
3189
|
+
return self._serve_stub.UpdateService(request, timeout=timeout)
|
|
3190
|
+
|
|
3134
3191
|
|
|
3135
3192
|
@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
|
|
3136
3193
|
class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
@@ -4394,15 +4451,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4394
4451
|
else:
|
|
4395
4452
|
raise
|
|
4396
4453
|
lock_id = backend_utils.cluster_status_lock_id(cluster_name)
|
|
4397
|
-
lock = locks.get_lock(lock_id)
|
|
4454
|
+
lock = locks.get_lock(lock_id, timeout=1)
|
|
4398
4455
|
# Retry in case new cluster operation comes in and holds the lock
|
|
4399
4456
|
# right after the lock is removed.
|
|
4400
4457
|
n_attempts = 2
|
|
4401
4458
|
while True:
|
|
4402
4459
|
n_attempts -= 1
|
|
4403
|
-
# In case other running cluster operations are still holding the
|
|
4404
|
-
# lock.
|
|
4405
|
-
lock.force_unlock()
|
|
4406
4460
|
# We have to kill the cluster requests, because `down` and `stop`
|
|
4407
4461
|
# should be higher priority than the cluster requests, and we should
|
|
4408
4462
|
# release the lock from other requests.
|
|
@@ -4420,6 +4474,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4420
4474
|
'Failed to kill other launch requests for the '
|
|
4421
4475
|
f'cluster {handle.cluster_name}: '
|
|
4422
4476
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
4477
|
+
# In case other running cluster operations are still holding the
|
|
4478
|
+
# lock.
|
|
4479
|
+
lock.force_unlock()
|
|
4423
4480
|
try:
|
|
4424
4481
|
with lock:
|
|
4425
4482
|
self.teardown_no_lock(
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""PrimeIntellect service catalog.
|
|
2
|
+
|
|
3
|
+
This module loads the service catalog file and can be used to
|
|
4
|
+
query instance types and pricing information for PrimeIntellect.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
from sky.catalog import common
|
|
11
|
+
|
|
12
|
+
if typing.TYPE_CHECKING:
|
|
13
|
+
from sky.clouds import cloud
|
|
14
|
+
|
|
15
|
+
_df = common.read_catalog('primeintellect/vms.csv')
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def instance_type_exists(instance_type: str) -> bool:
|
|
19
|
+
return common.instance_type_exists_impl(_df, instance_type)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_region_zone(
|
|
23
|
+
region: Optional[str],
|
|
24
|
+
zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
|
|
25
|
+
return common.validate_region_zone_impl('primeintellect', _df, region, zone)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_hourly_cost(instance_type: str,
|
|
29
|
+
use_spot: bool = False,
|
|
30
|
+
region: Optional[str] = None,
|
|
31
|
+
zone: Optional[str] = None) -> float:
|
|
32
|
+
"""Returns the cost, or the cheapest cost among all zones for spot."""
|
|
33
|
+
return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
|
|
34
|
+
zone)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_vcpus_mem_from_instance_type(
|
|
38
|
+
instance_type: str) -> Tuple[Optional[float], Optional[float]]:
|
|
39
|
+
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
43
|
+
memory: Optional[str] = None,
|
|
44
|
+
disk_tier: Optional[str] = None,
|
|
45
|
+
region: Optional[str] = None,
|
|
46
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
47
|
+
del disk_tier # no disk tiers
|
|
48
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
|
|
49
|
+
zone)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_accelerators_from_instance_type(
|
|
53
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
|
54
|
+
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_instance_type_for_accelerator(
|
|
58
|
+
acc_name: str,
|
|
59
|
+
acc_count: int,
|
|
60
|
+
cpus: Optional[str] = None,
|
|
61
|
+
memory: Optional[str] = None,
|
|
62
|
+
use_spot: bool = False,
|
|
63
|
+
region: Optional[str] = None,
|
|
64
|
+
zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
|
|
65
|
+
"""Returns a list of instance types that have the given accelerator."""
|
|
66
|
+
return common.get_instance_type_for_accelerator_impl(df=_df,
|
|
67
|
+
acc_name=acc_name,
|
|
68
|
+
acc_count=acc_count,
|
|
69
|
+
cpus=cpus,
|
|
70
|
+
memory=memory,
|
|
71
|
+
use_spot=use_spot,
|
|
72
|
+
region=region,
|
|
73
|
+
zone=zone)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_region_zones_for_instance_type(instance_type: str,
|
|
77
|
+
use_spot: bool) -> List['cloud.Region']:
|
|
78
|
+
df = _df[_df['InstanceType'] == instance_type]
|
|
79
|
+
return common.get_region_zones(df, use_spot)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def list_accelerators(
|
|
83
|
+
gpus_only: bool,
|
|
84
|
+
name_filter: Optional[str],
|
|
85
|
+
region_filter: Optional[str],
|
|
86
|
+
quantity_filter: Optional[int],
|
|
87
|
+
case_sensitive: bool = True,
|
|
88
|
+
all_regions: bool = False,
|
|
89
|
+
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
|
|
90
|
+
"""Returns all instance types in Prime Intellect offering GPUs."""
|
|
91
|
+
del require_price
|
|
92
|
+
return common.list_accelerators_impl('PrimeIntellect', _df, gpus_only,
|
|
93
|
+
name_filter, region_filter,
|
|
94
|
+
quantity_filter, case_sensitive,
|
|
95
|
+
all_regions)
|
sky/client/sdk.py
CHANGED
|
@@ -2064,6 +2064,12 @@ def stream_and_get(
|
|
|
2064
2064
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
|
2065
2065
|
None),
|
|
2066
2066
|
stream=True)
|
|
2067
|
+
stream_request_id: Optional[server_common.RequestId[
|
|
2068
|
+
T]] = server_common.get_stream_request_id(response)
|
|
2069
|
+
if request_id is not None and stream_request_id is not None:
|
|
2070
|
+
assert request_id == stream_request_id
|
|
2071
|
+
if request_id is None:
|
|
2072
|
+
request_id = stream_request_id
|
|
2067
2073
|
if response.status_code in [404, 400]:
|
|
2068
2074
|
detail = response.json().get('detail')
|
|
2069
2075
|
with ux_utils.print_exception_no_traceback():
|
sky/clouds/__init__.py
CHANGED
|
@@ -26,6 +26,7 @@ from sky.clouds.lambda_cloud import Lambda
|
|
|
26
26
|
from sky.clouds.nebius import Nebius
|
|
27
27
|
from sky.clouds.oci import OCI
|
|
28
28
|
from sky.clouds.paperspace import Paperspace
|
|
29
|
+
from sky.clouds.primeintellect import PrimeIntellect
|
|
29
30
|
from sky.clouds.runpod import RunPod
|
|
30
31
|
from sky.clouds.scp import SCP
|
|
31
32
|
from sky.clouds.seeweb import Seeweb
|
|
@@ -44,6 +45,7 @@ __all__ = [
|
|
|
44
45
|
'Lambda',
|
|
45
46
|
'DO',
|
|
46
47
|
'Paperspace',
|
|
48
|
+
'PrimeIntellect',
|
|
47
49
|
'SCP',
|
|
48
50
|
'RunPod',
|
|
49
51
|
'Vast',
|