skypilot-nightly 1.0.0.dev20250916__py3-none-any.whl → 1.0.0.dev20250919__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries and as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of skypilot-nightly might be problematic. More details are available on the original diff page.

Files changed (81)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/primeintellect.py +1 -0
  3. sky/adaptors/seeweb.py +68 -4
  4. sky/authentication.py +25 -0
  5. sky/backends/__init__.py +3 -2
  6. sky/backends/backend_utils.py +16 -12
  7. sky/backends/cloud_vm_ray_backend.py +57 -0
  8. sky/catalog/primeintellect_catalog.py +95 -0
  9. sky/clouds/__init__.py +2 -0
  10. sky/clouds/primeintellect.py +314 -0
  11. sky/core.py +77 -48
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → VvaUqYDvHOcHZRnvMBmax}/_buildManifest.js +1 -1
  14. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-05f82d90d6fd7f82.js → webpack-b2a3938c22b6647b.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/global_user_state.py +99 -62
  38. sky/jobs/server/server.py +14 -1
  39. sky/jobs/state.py +26 -1
  40. sky/metrics/utils.py +174 -8
  41. sky/provision/__init__.py +1 -0
  42. sky/provision/docker_utils.py +6 -2
  43. sky/provision/primeintellect/__init__.py +10 -0
  44. sky/provision/primeintellect/config.py +11 -0
  45. sky/provision/primeintellect/instance.py +454 -0
  46. sky/provision/primeintellect/utils.py +398 -0
  47. sky/resources.py +9 -1
  48. sky/schemas/generated/jobsv1_pb2.py +40 -40
  49. sky/schemas/generated/servev1_pb2.py +58 -0
  50. sky/schemas/generated/servev1_pb2.pyi +115 -0
  51. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  52. sky/serve/serve_rpc_utils.py +179 -0
  53. sky/serve/serve_utils.py +29 -12
  54. sky/serve/server/core.py +37 -19
  55. sky/serve/server/impl.py +221 -129
  56. sky/server/metrics.py +52 -158
  57. sky/server/requests/executor.py +12 -8
  58. sky/server/requests/payloads.py +6 -0
  59. sky/server/requests/requests.py +1 -1
  60. sky/server/requests/serializers/encoders.py +3 -2
  61. sky/server/server.py +5 -41
  62. sky/setup_files/dependencies.py +1 -0
  63. sky/skylet/constants.py +10 -5
  64. sky/skylet/job_lib.py +14 -15
  65. sky/skylet/services.py +98 -0
  66. sky/skylet/skylet.py +3 -1
  67. sky/templates/kubernetes-ray.yml.j2 +22 -12
  68. sky/templates/primeintellect-ray.yml.j2 +71 -0
  69. sky/utils/locks.py +41 -10
  70. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/METADATA +36 -35
  71. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/RECORD +76 -64
  72. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  73. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
  74. sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
  75. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
  76. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  77. sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → VvaUqYDvHOcHZRnvMBmax}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/top_level.txt +0 -0
sky/metrics/utils.py CHANGED
@@ -1,11 +1,165 @@
1
1
  """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import contextlib
3
+ import functools
2
4
  import os
3
5
  import re
6
+ import select
4
7
  import subprocess
5
8
  import time
6
9
  from typing import List, Optional, Tuple
7
10
 
8
11
  import httpx
12
+ import prometheus_client as prom
13
+
14
+ from sky.skylet import constants
15
+ from sky.utils import context_utils
16
+
17
+ _SELECT_TIMEOUT = 1
18
+ _SELECT_BUFFER_SIZE = 4096
19
+
20
+ _KB = 2**10
21
+ _MB = 2**20
22
+ _MEM_BUCKETS = [
23
+ _KB,
24
+ 256 * _KB,
25
+ 512 * _KB,
26
+ _MB,
27
+ 2 * _MB,
28
+ 4 * _MB,
29
+ 8 * _MB,
30
+ 16 * _MB,
31
+ 32 * _MB,
32
+ 64 * _MB,
33
+ 128 * _MB,
34
+ 256 * _MB,
35
+ float('inf'),
36
+ ]
37
+
38
+ # Whether the metrics are enabled, cannot be changed at runtime.
39
+ METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
40
+ 'false').lower() == 'true'
41
+
42
+ # Time spent processing a piece of code, refer to time_it().
43
+ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
44
+ 'sky_apiserver_code_duration_seconds',
45
+ 'Time spent processing code',
46
+ ['name', 'group'],
47
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
48
+ 60.0, 120.0, float('inf')),
49
+ )
50
+
51
+ # Total number of API server requests, grouped by path, method, and status.
52
+ SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
53
+ 'sky_apiserver_requests_total',
54
+ 'Total number of API server requests',
55
+ ['path', 'method', 'status'],
56
+ )
57
+
58
+ # Time spent processing API server requests, grouped by path, method, and
59
+ # status.
60
+ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
61
+ 'sky_apiserver_request_duration_seconds',
62
+ 'Time spent processing API server requests',
63
+ ['path', 'method', 'status'],
64
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
65
+ 60.0, 120.0, float('inf')),
66
+ )
67
+
68
+ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
69
+ 'sky_apiserver_event_loop_lag_seconds',
70
+ 'Scheduling delay of the server event loop',
71
+ ['pid'],
72
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
73
+ 60.0, float('inf')),
74
+ )
75
+
76
+ SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
77
+ 'sky_apiserver_websocket_connections',
78
+ 'Number of websocket connections',
79
+ ['pid'],
80
+ multiprocess_mode='livesum',
81
+ )
82
+
83
+ SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
84
+ 'sky_apiserver_websocket_closed_total',
85
+ 'Number of websocket closed',
86
+ ['pid', 'reason'],
87
+ )
88
+
89
+ # The number of execution starts in each worker process, we do not record
90
+ # histogram here as the duration has been measured in
91
+ # SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
92
+ # Recording histogram WITH worker label will cause high cardinality.
93
+ SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
94
+ 'sky_apiserver_process_execution_start_total',
95
+ 'Total number of execution starts in each worker process',
96
+ ['request', 'pid'],
97
+ )
98
+
99
+ SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
100
+ 'sky_apiserver_process_peak_rss',
101
+ 'Peak RSS we saw in each process in last 30 seconds',
102
+ ['pid', 'type'],
103
+ )
104
+
105
+ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
106
+ 'sky_apiserver_process_cpu_total',
107
+ 'Total CPU times a worker process has been running',
108
+ ['pid', 'type', 'mode'],
109
+ )
110
+
111
+ SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
112
+ 'sky_apiserver_request_memory_usage_bytes',
113
+ 'Peak memory usage of requests', ['name'],
114
+ buckets=_MEM_BUCKETS)
115
+
116
+ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
117
+ 'sky_apiserver_request_rss_incr_bytes',
118
+ 'RSS increment after requests', ['name'],
119
+ buckets=_MEM_BUCKETS)
120
+
121
+
122
+ @contextlib.contextmanager
123
+ def time_it(name: str, group: str = 'default'):
124
+ """Context manager to measure and record code execution duration."""
125
+ if not METRICS_ENABLED:
126
+ yield
127
+ else:
128
+ start_time = time.time()
129
+ try:
130
+ yield
131
+ finally:
132
+ duration = time.time() - start_time
133
+ SKY_APISERVER_CODE_DURATION_SECONDS.labels(
134
+ name=name, group=group).observe(duration)
135
+
136
+
137
+ def time_me(func):
138
+ """Measure the duration of decorated function."""
139
+
140
+ @functools.wraps(func)
141
+ def wrapper(*args, **kwargs):
142
+ if not METRICS_ENABLED:
143
+ return func(*args, **kwargs)
144
+ name = f'{func.__module__}/{func.__name__}'
145
+ with time_it(name, group='function'):
146
+ return func(*args, **kwargs)
147
+
148
+ return wrapper
149
+
150
+
151
+ def time_me_async(func):
152
+ """Measure the duration of decorated async function."""
153
+
154
+ @functools.wraps(func)
155
+ async def async_wrapper(*args, **kwargs):
156
+ if not METRICS_ENABLED:
157
+ return await func(*args, **kwargs)
158
+ name = f'{func.__module__}/{func.__name__}'
159
+ with time_it(name, group='function'):
160
+ return await func(*args, **kwargs)
161
+
162
+ return async_wrapper
9
163
 
10
164
 
11
165
  def start_svc_port_forward(context: str, namespace: str, service: str,
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
44
198
  local_port = None
45
199
  start_time = time.time()
46
200
 
201
+ buffer = ''
47
202
  # wait for the port forward to start and extract the local port
48
203
  while time.time() - start_time < start_port_forward_timeout:
49
204
  if port_forward_process.poll() is not None:
@@ -56,10 +211,16 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
56
211
 
57
212
  # read output line by line to find the local port
58
213
  if port_forward_process.stdout:
59
- line = port_forward_process.stdout.readline()
60
- if line:
61
- # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
62
- match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
214
+ # Wait up to 1s for data to be available without blocking
215
+ r, _, _ = select.select([port_forward_process.stdout], [], [],
216
+ _SELECT_TIMEOUT)
217
+ if r:
218
+ # Read available bytes from the FD without blocking
219
+ fd = port_forward_process.stdout.fileno()
220
+ raw = os.read(fd, _SELECT_BUFFER_SIZE)
221
+ chunk = raw.decode(errors='ignore')
222
+ buffer += chunk
223
+ match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
63
224
  if match:
64
225
  local_port = int(match.group(1))
65
226
  break
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
122
283
  port_forward_process = None
123
284
  try:
124
285
  # Start port forward
125
- port_forward_process, local_port = start_svc_port_forward(
126
- context, namespace, service, service_port)
286
+ port_forward_process, local_port = await context_utils.to_thread(
287
+ start_svc_port_forward, context, namespace, service, service_port)
127
288
 
128
289
  # Build endpoint URL
129
290
  endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -143,7 +304,8 @@ async def send_metrics_request_with_port_forward(
143
304
  finally:
144
305
  # Always clean up port forward
145
306
  if port_forward_process:
146
- stop_svc_port_forward(port_forward_process)
307
+ await context_utils.to_thread(stop_svc_port_forward,
308
+ port_forward_process)
147
309
 
148
310
 
149
311
  async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
193
355
  """
194
356
  # Query both DCGM metrics and kube_pod_labels metrics
195
357
  # This ensures the dashboard can perform joins to filter by skypilot cluster
196
- match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
358
+ match_patterns = [
359
+ '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}', # pylint: disable=line-too-long
360
+ 'kube_pod_labels',
361
+ 'node_cpu_seconds_total{mode="idle"}'
362
+ ]
197
363
 
198
364
  # TODO(rohan): don't hardcode the namespace and service name
199
365
  metrics_text = await send_metrics_request_with_port_forward(
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import kubernetes
24
24
  from sky.provision import lambda_cloud
25
25
  from sky.provision import nebius
26
26
  from sky.provision import oci
27
+ from sky.provision import primeintellect
27
28
  from sky.provision import runpod
28
29
  from sky.provision import scp
29
30
  from sky.provision import seeweb
sky/provision/docker_utils.py CHANGED
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
15
15
  # Configure environment variables. A docker image can have environment variables
16
16
  # set in the Dockerfile with `ENV``. We need to export these variables to the
17
17
  # shell environment, so that our ssh session can access them.
18
+ # Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
19
+ # Docker images with Ray 2.48.0+ set this for UV package manager support,
20
+ # but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
21
+ # See: https://github.com/skypilot-org/skypilot/pull/7181
18
22
  SETUP_ENV_VARS_CMD = (
19
23
  'prefix_cmd() '
20
24
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
- 'export -p > ~/container_env_var.sh && '
25
+ 'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
22
26
  '$(prefix_cmd) '
23
27
  'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
24
28
 
@@ -410,7 +414,7 @@ class DockerInitializer:
410
414
  # pylint: disable=anomalous-backslash-in-string
411
415
  self._run(
412
416
  'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
413
- f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
417
+ f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
414
418
  'mkdir -p ~/.ssh;'
415
419
  'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
416
420
  'sudo service ssh start;'
sky/provision/primeintellect/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Prime Intellect provisioner for SkyPilot."""
2
+
3
+ from sky.provision.primeintellect.config import bootstrap_instances
4
+ from sky.provision.primeintellect.instance import cleanup_ports
5
+ from sky.provision.primeintellect.instance import get_cluster_info
6
+ from sky.provision.primeintellect.instance import query_instances
7
+ from sky.provision.primeintellect.instance import run_instances
8
+ from sky.provision.primeintellect.instance import stop_instances
9
+ from sky.provision.primeintellect.instance import terminate_instances
10
+ from sky.provision.primeintellect.instance import wait_instances
sky/provision/primeintellect/config.py ADDED
@@ -0,0 +1,11 @@
1
+ """Prime Intellect configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
+ def bootstrap_instances(
7
+ region: str, cluster_name: str,
8
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
9
+ """Bootstraps instances for the given cluster."""
10
+ del region, cluster_name # unused
11
+ return config