skypilot-nightly 1.0.0.dev20250916__py3-none-any.whl → 1.0.0.dev20250919__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +68 -4
- sky/authentication.py +25 -0
- sky/backends/__init__.py +3 -2
- sky/backends/backend_utils.py +16 -12
- sky/backends/cloud_vm_ray_backend.py +57 -0
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/primeintellect.py +314 -0
- sky/core.py +77 -48
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → VvaUqYDvHOcHZRnvMBmax}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-05f82d90d6fd7f82.js → webpack-b2a3938c22b6647b.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +99 -62
- sky/jobs/server/server.py +14 -1
- sky/jobs/state.py +26 -1
- sky/metrics/utils.py +174 -8
- sky/provision/__init__.py +1 -0
- sky/provision/docker_utils.py +6 -2
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/resources.py +9 -1
- sky/schemas/generated/jobsv1_pb2.py +40 -40
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_utils.py +29 -12
- sky/serve/server/core.py +37 -19
- sky/serve/server/impl.py +221 -129
- sky/server/metrics.py +52 -158
- sky/server/requests/executor.py +12 -8
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/requests.py +1 -1
- sky/server/requests/serializers/encoders.py +3 -2
- sky/server/server.py +5 -41
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +10 -5
- sky/skylet/job_lib.py +14 -15
- sky/skylet/services.py +98 -0
- sky/skylet/skylet.py +3 -1
- sky/templates/kubernetes-ray.yml.j2 +22 -12
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/utils/locks.py +41 -10
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/RECORD +76 -64
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → VvaUqYDvHOcHZRnvMBmax}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/top_level.txt +0 -0
sky/metrics/utils.py
CHANGED
|
@@ -1,11 +1,165 @@
|
|
|
1
1
|
"""Utilities for processing GPU metrics from Kubernetes clusters."""
|
|
2
|
+
import contextlib
|
|
3
|
+
import functools
|
|
2
4
|
import os
|
|
3
5
|
import re
|
|
6
|
+
import select
|
|
4
7
|
import subprocess
|
|
5
8
|
import time
|
|
6
9
|
from typing import List, Optional, Tuple
|
|
7
10
|
|
|
8
11
|
import httpx
|
|
12
|
+
import prometheus_client as prom
|
|
13
|
+
|
|
14
|
+
from sky.skylet import constants
|
|
15
|
+
from sky.utils import context_utils
|
|
16
|
+
|
|
17
|
+
_SELECT_TIMEOUT = 1
|
|
18
|
+
_SELECT_BUFFER_SIZE = 4096
|
|
19
|
+
|
|
20
|
+
_KB = 2**10
|
|
21
|
+
_MB = 2**20
|
|
22
|
+
_MEM_BUCKETS = [
|
|
23
|
+
_KB,
|
|
24
|
+
256 * _KB,
|
|
25
|
+
512 * _KB,
|
|
26
|
+
_MB,
|
|
27
|
+
2 * _MB,
|
|
28
|
+
4 * _MB,
|
|
29
|
+
8 * _MB,
|
|
30
|
+
16 * _MB,
|
|
31
|
+
32 * _MB,
|
|
32
|
+
64 * _MB,
|
|
33
|
+
128 * _MB,
|
|
34
|
+
256 * _MB,
|
|
35
|
+
float('inf'),
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
39
|
+
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
40
|
+
'false').lower() == 'true'
|
|
41
|
+
|
|
42
|
+
# Time spent processing a piece of code, refer to time_it().
|
|
43
|
+
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
44
|
+
'sky_apiserver_code_duration_seconds',
|
|
45
|
+
'Time spent processing code',
|
|
46
|
+
['name', 'group'],
|
|
47
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
48
|
+
60.0, 120.0, float('inf')),
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Total number of API server requests, grouped by path, method, and status.
|
|
52
|
+
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
53
|
+
'sky_apiserver_requests_total',
|
|
54
|
+
'Total number of API server requests',
|
|
55
|
+
['path', 'method', 'status'],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Time spent processing API server requests, grouped by path, method, and
|
|
59
|
+
# status.
|
|
60
|
+
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
61
|
+
'sky_apiserver_request_duration_seconds',
|
|
62
|
+
'Time spent processing API server requests',
|
|
63
|
+
['path', 'method', 'status'],
|
|
64
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
65
|
+
60.0, 120.0, float('inf')),
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
69
|
+
'sky_apiserver_event_loop_lag_seconds',
|
|
70
|
+
'Scheduling delay of the server event loop',
|
|
71
|
+
['pid'],
|
|
72
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
73
|
+
60.0, float('inf')),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
|
|
77
|
+
'sky_apiserver_websocket_connections',
|
|
78
|
+
'Number of websocket connections',
|
|
79
|
+
['pid'],
|
|
80
|
+
multiprocess_mode='livesum',
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
|
|
84
|
+
'sky_apiserver_websocket_closed_total',
|
|
85
|
+
'Number of websocket closed',
|
|
86
|
+
['pid', 'reason'],
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# The number of execution starts in each worker process, we do not record
|
|
90
|
+
# histogram here as the duration has been measured in
|
|
91
|
+
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
|
|
92
|
+
# Recording histogram WITH worker label will cause high cardinality.
|
|
93
|
+
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
|
|
94
|
+
'sky_apiserver_process_execution_start_total',
|
|
95
|
+
'Total number of execution starts in each worker process',
|
|
96
|
+
['request', 'pid'],
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
|
|
100
|
+
'sky_apiserver_process_peak_rss',
|
|
101
|
+
'Peak RSS we saw in each process in last 30 seconds',
|
|
102
|
+
['pid', 'type'],
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
106
|
+
'sky_apiserver_process_cpu_total',
|
|
107
|
+
'Total CPU times a worker process has been running',
|
|
108
|
+
['pid', 'type', 'mode'],
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
112
|
+
'sky_apiserver_request_memory_usage_bytes',
|
|
113
|
+
'Peak memory usage of requests', ['name'],
|
|
114
|
+
buckets=_MEM_BUCKETS)
|
|
115
|
+
|
|
116
|
+
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
117
|
+
'sky_apiserver_request_rss_incr_bytes',
|
|
118
|
+
'RSS increment after requests', ['name'],
|
|
119
|
+
buckets=_MEM_BUCKETS)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@contextlib.contextmanager
|
|
123
|
+
def time_it(name: str, group: str = 'default'):
|
|
124
|
+
"""Context manager to measure and record code execution duration."""
|
|
125
|
+
if not METRICS_ENABLED:
|
|
126
|
+
yield
|
|
127
|
+
else:
|
|
128
|
+
start_time = time.time()
|
|
129
|
+
try:
|
|
130
|
+
yield
|
|
131
|
+
finally:
|
|
132
|
+
duration = time.time() - start_time
|
|
133
|
+
SKY_APISERVER_CODE_DURATION_SECONDS.labels(
|
|
134
|
+
name=name, group=group).observe(duration)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def time_me(func):
|
|
138
|
+
"""Measure the duration of decorated function."""
|
|
139
|
+
|
|
140
|
+
@functools.wraps(func)
|
|
141
|
+
def wrapper(*args, **kwargs):
|
|
142
|
+
if not METRICS_ENABLED:
|
|
143
|
+
return func(*args, **kwargs)
|
|
144
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
145
|
+
with time_it(name, group='function'):
|
|
146
|
+
return func(*args, **kwargs)
|
|
147
|
+
|
|
148
|
+
return wrapper
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def time_me_async(func):
|
|
152
|
+
"""Measure the duration of decorated async function."""
|
|
153
|
+
|
|
154
|
+
@functools.wraps(func)
|
|
155
|
+
async def async_wrapper(*args, **kwargs):
|
|
156
|
+
if not METRICS_ENABLED:
|
|
157
|
+
return await func(*args, **kwargs)
|
|
158
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
159
|
+
with time_it(name, group='function'):
|
|
160
|
+
return await func(*args, **kwargs)
|
|
161
|
+
|
|
162
|
+
return async_wrapper
|
|
9
163
|
|
|
10
164
|
|
|
11
165
|
def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
44
198
|
local_port = None
|
|
45
199
|
start_time = time.time()
|
|
46
200
|
|
|
201
|
+
buffer = ''
|
|
47
202
|
# wait for the port forward to start and extract the local port
|
|
48
203
|
while time.time() - start_time < start_port_forward_timeout:
|
|
49
204
|
if port_forward_process.poll() is not None:
|
|
@@ -56,10 +211,16 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
56
211
|
|
|
57
212
|
# read output line by line to find the local port
|
|
58
213
|
if port_forward_process.stdout:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
214
|
+
# Wait up to 1s for data to be available without blocking
|
|
215
|
+
r, _, _ = select.select([port_forward_process.stdout], [], [],
|
|
216
|
+
_SELECT_TIMEOUT)
|
|
217
|
+
if r:
|
|
218
|
+
# Read available bytes from the FD without blocking
|
|
219
|
+
fd = port_forward_process.stdout.fileno()
|
|
220
|
+
raw = os.read(fd, _SELECT_BUFFER_SIZE)
|
|
221
|
+
chunk = raw.decode(errors='ignore')
|
|
222
|
+
buffer += chunk
|
|
223
|
+
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
|
|
63
224
|
if match:
|
|
64
225
|
local_port = int(match.group(1))
|
|
65
226
|
break
|
|
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
|
|
|
122
283
|
port_forward_process = None
|
|
123
284
|
try:
|
|
124
285
|
# Start port forward
|
|
125
|
-
port_forward_process, local_port = start_svc_port_forward(
|
|
126
|
-
context, namespace, service, service_port)
|
|
286
|
+
port_forward_process, local_port = await context_utils.to_thread(
|
|
287
|
+
start_svc_port_forward, context, namespace, service, service_port)
|
|
127
288
|
|
|
128
289
|
# Build endpoint URL
|
|
129
290
|
endpoint = f'http://localhost:{local_port}{endpoint_path}'
|
|
@@ -143,7 +304,8 @@ async def send_metrics_request_with_port_forward(
|
|
|
143
304
|
finally:
|
|
144
305
|
# Always clean up port forward
|
|
145
306
|
if port_forward_process:
|
|
146
|
-
stop_svc_port_forward(port_forward_process)
|
|
307
|
+
await context_utils.to_thread(stop_svc_port_forward,
|
|
308
|
+
port_forward_process)
|
|
147
309
|
|
|
148
310
|
|
|
149
311
|
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
|
|
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
|
|
|
193
355
|
"""
|
|
194
356
|
# Query both DCGM metrics and kube_pod_labels metrics
|
|
195
357
|
# This ensures the dashboard can perform joins to filter by skypilot cluster
|
|
196
|
-
match_patterns = [
|
|
358
|
+
match_patterns = [
|
|
359
|
+
'{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}', # pylint: disable=line-too-long
|
|
360
|
+
'kube_pod_labels',
|
|
361
|
+
'node_cpu_seconds_total{mode="idle"}'
|
|
362
|
+
]
|
|
197
363
|
|
|
198
364
|
# TODO(rohan): don't hardcode the namespace and service name
|
|
199
365
|
metrics_text = await send_metrics_request_with_port_forward(
|
sky/provision/__init__.py
CHANGED
|
@@ -24,6 +24,7 @@ from sky.provision import kubernetes
|
|
|
24
24
|
from sky.provision import lambda_cloud
|
|
25
25
|
from sky.provision import nebius
|
|
26
26
|
from sky.provision import oci
|
|
27
|
+
from sky.provision import primeintellect
|
|
27
28
|
from sky.provision import runpod
|
|
28
29
|
from sky.provision import scp
|
|
29
30
|
from sky.provision import seeweb
|
sky/provision/docker_utils.py
CHANGED
|
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
|
|
|
15
15
|
# Configure environment variables. A docker image can have environment variables
|
|
16
16
|
# set in the Dockerfile with `ENV`. We need to export these variables to the
|
|
17
17
|
# shell environment, so that our ssh session can access them.
|
|
18
|
+
# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
|
|
19
|
+
# Docker images with Ray 2.48.0+ set this for UV package manager support,
|
|
20
|
+
# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
|
|
21
|
+
# See: https://github.com/skypilot-org/skypilot/pull/7181
|
|
18
22
|
SETUP_ENV_VARS_CMD = (
|
|
19
23
|
'prefix_cmd() '
|
|
20
24
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
|
21
|
-
'export -p > ~/container_env_var.sh && '
|
|
25
|
+
'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
|
|
22
26
|
'$(prefix_cmd) '
|
|
23
27
|
'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
|
|
24
28
|
|
|
@@ -410,7 +414,7 @@ class DockerInitializer:
|
|
|
410
414
|
# pylint: disable=anomalous-backslash-in-string
|
|
411
415
|
self._run(
|
|
412
416
|
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
|
413
|
-
f'
|
|
417
|
+
f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
|
|
414
418
|
'mkdir -p ~/.ssh;'
|
|
415
419
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
|
416
420
|
'sudo service ssh start;'
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Prime Intellect provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.primeintellect.config import bootstrap_instances
|
|
4
|
+
from sky.provision.primeintellect.instance import cleanup_ports
|
|
5
|
+
from sky.provision.primeintellect.instance import get_cluster_info
|
|
6
|
+
from sky.provision.primeintellect.instance import query_instances
|
|
7
|
+
from sky.provision.primeintellect.instance import run_instances
|
|
8
|
+
from sky.provision.primeintellect.instance import stop_instances
|
|
9
|
+
from sky.provision.primeintellect.instance import terminate_instances
|
|
10
|
+
from sky.provision.primeintellect.instance import wait_instances
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Prime Intellect configuration bootstrapping."""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
|
|
7
|
+
region: str, cluster_name: str,
|
|
8
|
+
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
|
9
|
+
"""Bootstraps instances for the given cluster."""
|
|
10
|
+
del region, cluster_name # unused
|
|
11
|
+
return config
|