skypilot-nightly 1.0.0.dev20250829__py3-none-any.whl → 1.0.0.dev20250901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +24 -2
- sky/backends/backend_utils.py +39 -36
- sky/backends/cloud_vm_ray_backend.py +37 -0
- sky/client/cli/command.py +17 -6
- sky/client/common.py +5 -4
- sky/client/sdk.py +5 -0
- sky/client/sdk_async.py +8 -2
- sky/core.py +8 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +67 -0
- sky/provision/docker_utils.py +1 -1
- sky/provision/kubernetes/utils.py +39 -26
- sky/server/common.py +8 -6
- sky/server/metrics.py +82 -6
- sky/server/requests/executor.py +5 -1
- sky/server/requests/payloads.py +1 -0
- sky/server/requests/requests.py +19 -11
- sky/server/server.py +46 -14
- sky/server/uvicorn.py +7 -0
- sky/setup_files/dependencies.py +23 -8
- sky/setup_files/setup.py +2 -0
- sky/skylet/constants.py +3 -0
- sky/utils/db/db_utils.py +56 -4
- sky/utils/perf_utils.py +22 -0
- sky/utils/schemas.py +6 -0
- {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/METADATA +35 -50
- {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/RECORD +49 -48
- /sky/dashboard/out/_next/static/{hYJYFIxp_ZFONR4wTIJqZ → EqPZ0ygxa__3XPBVJ9dpy}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{hYJYFIxp_ZFONR4wTIJqZ → EqPZ0ygxa__3XPBVJ9dpy}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Kubernetes utilities for SkyPilot."""
|
|
2
|
+
import copy
|
|
2
3
|
import dataclasses
|
|
3
4
|
import datetime
|
|
4
5
|
import enum
|
|
@@ -2715,11 +2716,11 @@ def get_endpoint_debug_message(context: Optional[str] = None) -> str:
|
|
|
2715
2716
|
|
|
2716
2717
|
|
|
2717
2718
|
def combine_pod_config_fields(
|
|
2718
|
-
|
|
2719
|
+
cluster_yaml_obj: Dict[str, Any],
|
|
2719
2720
|
cluster_config_overrides: Dict[str, Any],
|
|
2720
2721
|
cloud: Optional[clouds.Cloud] = None,
|
|
2721
2722
|
context: Optional[str] = None,
|
|
2722
|
-
) ->
|
|
2723
|
+
) -> Dict[str, Any]:
|
|
2723
2724
|
"""Adds or updates fields in the YAML with fields from the
|
|
2724
2725
|
~/.sky/config.yaml's kubernetes.pod_spec dict.
|
|
2725
2726
|
This can be used to add fields to the YAML that are not supported by
|
|
@@ -2758,9 +2759,7 @@ def combine_pod_config_fields(
|
|
|
2758
2759
|
- name: my-secret
|
|
2759
2760
|
```
|
|
2760
2761
|
"""
|
|
2761
|
-
|
|
2762
|
-
yaml_content = f.read()
|
|
2763
|
-
yaml_obj = yaml_utils.safe_load(yaml_content)
|
|
2762
|
+
merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
|
|
2764
2763
|
# We don't use override_configs in `get_effective_region_config`, as merging
|
|
2765
2764
|
# the pod config requires special handling.
|
|
2766
2765
|
if isinstance(cloud, clouds.SSH):
|
|
@@ -2787,26 +2786,20 @@ def combine_pod_config_fields(
|
|
|
2787
2786
|
|
|
2788
2787
|
# Merge the kubernetes config into the YAML for both head and worker nodes.
|
|
2789
2788
|
config_utils.merge_k8s_configs(
|
|
2790
|
-
|
|
2791
|
-
kubernetes_config)
|
|
2789
|
+
merged_cluster_yaml_obj['available_node_types']['ray_head_default']
|
|
2790
|
+
['node_config'], kubernetes_config)
|
|
2791
|
+
return merged_cluster_yaml_obj
|
|
2792
2792
|
|
|
2793
|
-
# Write the updated YAML back to the file
|
|
2794
|
-
yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2795
2793
|
|
|
2796
|
-
|
|
2797
|
-
def combine_metadata_fields(cluster_yaml_path: str,
|
|
2794
|
+
def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
|
|
2798
2795
|
cluster_config_overrides: Dict[str, Any],
|
|
2799
|
-
context: Optional[str] = None) ->
|
|
2796
|
+
context: Optional[str] = None) -> Dict[str, Any]:
|
|
2800
2797
|
"""Updates the metadata for all Kubernetes objects created by SkyPilot with
|
|
2801
2798
|
fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
|
|
2802
2799
|
|
|
2803
2800
|
Obeys the same add or update semantics as combine_pod_config_fields().
|
|
2804
2801
|
"""
|
|
2805
|
-
|
|
2806
|
-
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2807
|
-
yaml_content = f.read()
|
|
2808
|
-
yaml_obj = yaml_utils.safe_load(yaml_content)
|
|
2809
|
-
|
|
2802
|
+
merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
|
|
2810
2803
|
# Get custom_metadata from global config
|
|
2811
2804
|
custom_metadata = skypilot_config.get_effective_region_config(
|
|
2812
2805
|
cloud='kubernetes',
|
|
@@ -2828,22 +2821,42 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2828
2821
|
# List of objects in the cluster YAML to be updated
|
|
2829
2822
|
combination_destinations = [
|
|
2830
2823
|
# Service accounts
|
|
2831
|
-
|
|
2832
|
-
yaml_obj['provider']['autoscaler_role']['metadata'],
|
|
2833
|
-
yaml_obj['provider']['autoscaler_role_binding']['metadata'],
|
|
2834
|
-
yaml_obj['provider']['autoscaler_service_account']['metadata'],
|
|
2835
|
-
# Pod spec
|
|
2836
|
-
yaml_obj['available_node_types']['ray_head_default']['node_config']
|
|
2824
|
+
merged_cluster_yaml_obj['provider']['autoscaler_service_account']
|
|
2837
2825
|
['metadata'],
|
|
2826
|
+
merged_cluster_yaml_obj['provider']['autoscaler_role']['metadata'],
|
|
2827
|
+
merged_cluster_yaml_obj['provider']['autoscaler_role_binding']
|
|
2828
|
+
['metadata'],
|
|
2829
|
+
merged_cluster_yaml_obj['provider']['autoscaler_service_account']
|
|
2830
|
+
['metadata'],
|
|
2831
|
+
# Pod spec
|
|
2832
|
+
merged_cluster_yaml_obj['available_node_types']['ray_head_default']
|
|
2833
|
+
['node_config']['metadata'],
|
|
2838
2834
|
# Services for pods
|
|
2839
|
-
*[
|
|
2835
|
+
*[
|
|
2836
|
+
svc['metadata']
|
|
2837
|
+
for svc in merged_cluster_yaml_obj['provider']['services']
|
|
2838
|
+
]
|
|
2840
2839
|
]
|
|
2841
2840
|
|
|
2842
2841
|
for destination in combination_destinations:
|
|
2843
2842
|
config_utils.merge_k8s_configs(destination, custom_metadata)
|
|
2844
2843
|
|
|
2845
|
-
|
|
2846
|
-
|
|
2844
|
+
return merged_cluster_yaml_obj
|
|
2845
|
+
|
|
2846
|
+
|
|
2847
|
+
def combine_pod_config_fields_and_metadata(
|
|
2848
|
+
cluster_yaml_obj: Dict[str, Any],
|
|
2849
|
+
cluster_config_overrides: Dict[str, Any],
|
|
2850
|
+
cloud: Optional[clouds.Cloud] = None,
|
|
2851
|
+
context: Optional[str] = None) -> Dict[str, Any]:
|
|
2852
|
+
"""Combines pod config fields and metadata fields"""
|
|
2853
|
+
combined_yaml_obj = combine_pod_config_fields(cluster_yaml_obj,
|
|
2854
|
+
cluster_config_overrides,
|
|
2855
|
+
cloud, context)
|
|
2856
|
+
combined_yaml_obj = combine_metadata_fields(combined_yaml_obj,
|
|
2857
|
+
cluster_config_overrides,
|
|
2858
|
+
context)
|
|
2859
|
+
return combined_yaml_obj
|
|
2847
2860
|
|
|
2848
2861
|
|
|
2849
2862
|
def merge_custom_metadata(
|
sky/server/common.py
CHANGED
|
@@ -648,14 +648,16 @@ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
|
|
|
648
648
|
deploy: Whether the server is running in deploy mode, which means
|
|
649
649
|
multiple processes might be running.
|
|
650
650
|
"""
|
|
651
|
+
del deploy
|
|
651
652
|
if metrics or os.getenv(constants.ENV_VAR_SERVER_METRICS_ENABLED) == 'true':
|
|
652
653
|
env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
654
|
+
# Always set the metrics dir since we need to collect metrics from
|
|
655
|
+
# subprocesses like the executor.
|
|
656
|
+
metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
|
|
657
|
+
shutil.rmtree(metrics_dir, ignore_errors=True)
|
|
658
|
+
os.makedirs(metrics_dir, exist_ok=True)
|
|
659
|
+
# Refer to https://prometheus.github.io/client_python/multiprocess/
|
|
660
|
+
env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
|
|
659
661
|
|
|
660
662
|
|
|
661
663
|
def check_server_healthy(
|
sky/server/metrics.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Instrumentation for the API server."""
|
|
2
2
|
|
|
3
|
+
import contextlib
|
|
4
|
+
import functools
|
|
3
5
|
import os
|
|
4
6
|
import time
|
|
5
7
|
|
|
@@ -11,11 +13,16 @@ import starlette.middleware.base
|
|
|
11
13
|
import uvicorn
|
|
12
14
|
|
|
13
15
|
from sky import sky_logging
|
|
16
|
+
from sky.skylet import constants
|
|
17
|
+
|
|
18
|
+
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
19
|
+
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
20
|
+
'false').lower() == 'true'
|
|
14
21
|
|
|
15
22
|
logger = sky_logging.init_logger(__name__)
|
|
16
23
|
|
|
17
24
|
# Total number of API server requests, grouped by path, method, and status.
|
|
18
|
-
|
|
25
|
+
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
19
26
|
'sky_apiserver_requests_total',
|
|
20
27
|
'Total number of API server requests',
|
|
21
28
|
['path', 'method', 'status'],
|
|
@@ -23,14 +30,40 @@ sky_apiserver_requests_total = prom.Counter(
|
|
|
23
30
|
|
|
24
31
|
# Time spent processing API server requests, grouped by path, method, and
|
|
25
32
|
# status.
|
|
26
|
-
|
|
33
|
+
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
27
34
|
'sky_apiserver_request_duration_seconds',
|
|
28
35
|
'Time spent processing API server requests',
|
|
29
36
|
['path', 'method', 'status'],
|
|
30
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
|
|
37
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
38
|
+
60.0, 120.0, float('inf')),
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Time spent processing requests in executor.
|
|
42
|
+
SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
|
|
43
|
+
'sky_apiserver_request_execution_duration_seconds',
|
|
44
|
+
'Time spent executing requests in executor',
|
|
45
|
+
['request', 'worker'],
|
|
46
|
+
buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
|
|
31
47
|
float('inf')),
|
|
32
48
|
)
|
|
33
49
|
|
|
50
|
+
# Time spent processing a piece of code, refer to time_it().
|
|
51
|
+
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
52
|
+
'sky_apiserver_code_duration_seconds',
|
|
53
|
+
'Time spent processing code',
|
|
54
|
+
['name', 'group'],
|
|
55
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
56
|
+
60.0, 120.0, float('inf')),
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
60
|
+
'sky_apiserver_event_loop_lag_seconds',
|
|
61
|
+
'Scheduling delay of the server event loop',
|
|
62
|
+
['pid'],
|
|
63
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
64
|
+
60.0, float('inf')),
|
|
65
|
+
)
|
|
66
|
+
|
|
34
67
|
metrics_app = fastapi.FastAPI()
|
|
35
68
|
|
|
36
69
|
|
|
@@ -76,7 +109,7 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
76
109
|
|
|
77
110
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
78
111
|
path = request.url.path
|
|
79
|
-
logger.
|
|
112
|
+
logger.debug(f'PROM Middleware Request: {request}, {request.url.path}')
|
|
80
113
|
streaming = _is_streaming_api(path)
|
|
81
114
|
if not streaming:
|
|
82
115
|
# Exclude streaming APIs, the duration is not meaningful.
|
|
@@ -92,13 +125,56 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
92
125
|
status_code_group = '5xx'
|
|
93
126
|
raise
|
|
94
127
|
finally:
|
|
95
|
-
|
|
128
|
+
SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
|
|
96
129
|
method=method,
|
|
97
130
|
status=status_code_group).inc()
|
|
98
131
|
if not streaming:
|
|
99
132
|
duration = time.time() - start_time
|
|
100
|
-
|
|
133
|
+
SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
101
134
|
path=path, method=method,
|
|
102
135
|
status=status_code_group).observe(duration)
|
|
103
136
|
|
|
104
137
|
return response
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@contextlib.contextmanager
|
|
141
|
+
def time_it(name: str, group: str = 'default'):
|
|
142
|
+
"""Context manager to measure and record code execution duration."""
|
|
143
|
+
if not METRICS_ENABLED:
|
|
144
|
+
yield
|
|
145
|
+
else:
|
|
146
|
+
start_time = time.time()
|
|
147
|
+
try:
|
|
148
|
+
yield
|
|
149
|
+
finally:
|
|
150
|
+
duration = time.time() - start_time
|
|
151
|
+
SKY_APISERVER_CODE_DURATION_SECONDS.labels(
|
|
152
|
+
name=name, group=group).observe(duration)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def time_me(func):
|
|
156
|
+
"""Measure the duration of decorated function."""
|
|
157
|
+
|
|
158
|
+
@functools.wraps(func)
|
|
159
|
+
def wrapper(*args, **kwargs):
|
|
160
|
+
if not METRICS_ENABLED:
|
|
161
|
+
return func(*args, **kwargs)
|
|
162
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
163
|
+
with time_it(name, group='function'):
|
|
164
|
+
return func(*args, **kwargs)
|
|
165
|
+
|
|
166
|
+
return wrapper
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def time_me_async(func):
|
|
170
|
+
"""Measure the duration of decorated async function."""
|
|
171
|
+
|
|
172
|
+
@functools.wraps(func)
|
|
173
|
+
async def async_wrapper(*args, **kwargs):
|
|
174
|
+
if not METRICS_ENABLED:
|
|
175
|
+
return await func(*args, **kwargs)
|
|
176
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
177
|
+
with time_it(name, group='function'):
|
|
178
|
+
return await func(*args, **kwargs)
|
|
179
|
+
|
|
180
|
+
return async_wrapper
|
sky/server/requests/executor.py
CHANGED
|
@@ -41,6 +41,7 @@ from sky import skypilot_config
|
|
|
41
41
|
from sky.server import common as server_common
|
|
42
42
|
from sky.server import config as server_config
|
|
43
43
|
from sky.server import constants as server_constants
|
|
44
|
+
from sky.server import metrics as metrics_lib
|
|
44
45
|
from sky.server.requests import payloads
|
|
45
46
|
from sky.server.requests import preconditions
|
|
46
47
|
from sky.server.requests import process
|
|
@@ -373,6 +374,7 @@ def _request_execution_wrapper(request_id: str,
|
|
|
373
374
|
request_task.status = api_requests.RequestStatus.RUNNING
|
|
374
375
|
func = request_task.entrypoint
|
|
375
376
|
request_body = request_task.request_body
|
|
377
|
+
request_name = request_task.name
|
|
376
378
|
|
|
377
379
|
# Append to the log file instead of overwriting it since there might be
|
|
378
380
|
# logs from previous retries.
|
|
@@ -390,7 +392,9 @@ def _request_execution_wrapper(request_id: str,
|
|
|
390
392
|
config = skypilot_config.to_dict()
|
|
391
393
|
logger.debug(f'request config: \n'
|
|
392
394
|
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
393
|
-
|
|
395
|
+
with metrics_lib.time_it(name=request_name,
|
|
396
|
+
group='request_execution'):
|
|
397
|
+
return_value = func(**request_body.to_kwargs())
|
|
394
398
|
f.flush()
|
|
395
399
|
except KeyboardInterrupt:
|
|
396
400
|
logger.info(f'Request {request_id} cancelled by user')
|
sky/server/requests/payloads.py
CHANGED
|
@@ -309,6 +309,7 @@ class StatusBody(RequestBody):
|
|
|
309
309
|
cluster_names: Optional[List[str]] = None
|
|
310
310
|
refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
|
|
311
311
|
all_users: bool = True
|
|
312
|
+
include_credentials: bool = False
|
|
312
313
|
|
|
313
314
|
|
|
314
315
|
class StartBody(RequestBody):
|
sky/server/requests/requests.py
CHANGED
|
@@ -26,6 +26,7 @@ from sky import skypilot_config
|
|
|
26
26
|
from sky.server import common as server_common
|
|
27
27
|
from sky.server import constants as server_constants
|
|
28
28
|
from sky.server import daemons
|
|
29
|
+
from sky.server import metrics as metrics_lib
|
|
29
30
|
from sky.server.requests import payloads
|
|
30
31
|
from sky.server.requests.serializers import decoders
|
|
31
32
|
from sky.server.requests.serializers import encoders
|
|
@@ -460,6 +461,7 @@ def request_lock_path(request_id: str) -> str:
|
|
|
460
461
|
|
|
461
462
|
@contextlib.contextmanager
|
|
462
463
|
@init_db
|
|
464
|
+
@metrics_lib.time_me
|
|
463
465
|
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
464
466
|
"""Get and update a SkyPilot API request."""
|
|
465
467
|
request = _get_request_no_lock(request_id)
|
|
@@ -469,6 +471,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
|
469
471
|
|
|
470
472
|
|
|
471
473
|
@init_db
|
|
474
|
+
@metrics_lib.time_me
|
|
472
475
|
def update_request_async(
|
|
473
476
|
request_id: str) -> AsyncContextManager[Optional[Request]]:
|
|
474
477
|
"""Async version of update_request.
|
|
@@ -508,15 +511,16 @@ def _get_request_no_lock(request_id: str) -> Optional[Request]:
|
|
|
508
511
|
async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
|
|
509
512
|
"""Async version of _get_request_no_lock."""
|
|
510
513
|
assert _DB is not None
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
row =
|
|
514
|
+
async with _DB.execute_fetchall_async(_get_request_sql,
|
|
515
|
+
(request_id + '%',)) as rows:
|
|
516
|
+
row = rows[0] if rows else None
|
|
514
517
|
if row is None:
|
|
515
518
|
return None
|
|
516
519
|
return Request.from_row(row)
|
|
517
520
|
|
|
518
521
|
|
|
519
522
|
@init_db
|
|
523
|
+
@metrics_lib.time_me
|
|
520
524
|
def get_latest_request_id() -> Optional[str]:
|
|
521
525
|
"""Get the latest request ID."""
|
|
522
526
|
assert _DB is not None
|
|
@@ -529,6 +533,7 @@ def get_latest_request_id() -> Optional[str]:
|
|
|
529
533
|
|
|
530
534
|
|
|
531
535
|
@init_db
|
|
536
|
+
@metrics_lib.time_me
|
|
532
537
|
def get_request(request_id: str) -> Optional[Request]:
|
|
533
538
|
"""Get a SkyPilot API request."""
|
|
534
539
|
with filelock.FileLock(request_lock_path(request_id)):
|
|
@@ -536,6 +541,7 @@ def get_request(request_id: str) -> Optional[Request]:
|
|
|
536
541
|
|
|
537
542
|
|
|
538
543
|
@init_db_async
|
|
544
|
+
@metrics_lib.time_me_async
|
|
539
545
|
async def get_request_async(request_id: str) -> Optional[Request]:
|
|
540
546
|
"""Async version of get_request."""
|
|
541
547
|
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
@@ -543,6 +549,7 @@ async def get_request_async(request_id: str) -> Optional[Request]:
|
|
|
543
549
|
|
|
544
550
|
|
|
545
551
|
@init_db
|
|
552
|
+
@metrics_lib.time_me
|
|
546
553
|
def create_if_not_exists(request: Request) -> bool:
|
|
547
554
|
"""Create a SkyPilot API request if it does not exist."""
|
|
548
555
|
with filelock.FileLock(request_lock_path(request.request_id)):
|
|
@@ -553,6 +560,7 @@ def create_if_not_exists(request: Request) -> bool:
|
|
|
553
560
|
|
|
554
561
|
|
|
555
562
|
@init_db_async
|
|
563
|
+
@metrics_lib.time_me_async
|
|
556
564
|
async def create_if_not_exists_async(request: Request) -> bool:
|
|
557
565
|
"""Async version of create_if_not_exists."""
|
|
558
566
|
async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
|
|
@@ -563,6 +571,7 @@ async def create_if_not_exists_async(request: Request) -> bool:
|
|
|
563
571
|
|
|
564
572
|
|
|
565
573
|
@init_db
|
|
574
|
+
@metrics_lib.time_me
|
|
566
575
|
def get_request_tasks(
|
|
567
576
|
status: Optional[List[RequestStatus]] = None,
|
|
568
577
|
cluster_names: Optional[List[str]] = None,
|
|
@@ -637,13 +646,13 @@ def get_request_tasks(
|
|
|
637
646
|
|
|
638
647
|
|
|
639
648
|
@init_db_async
|
|
649
|
+
@metrics_lib.time_me_async
|
|
640
650
|
async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
641
651
|
"""Get a list of API request ids for shell completion."""
|
|
642
652
|
assert _DB is not None
|
|
643
|
-
conn = await _DB.async_conn()
|
|
644
653
|
# Prioritize alive requests (PENDING, RUNNING) over finished ones,
|
|
645
654
|
# then order by creation time (newest first) within each category.
|
|
646
|
-
async with
|
|
655
|
+
async with _DB.execute_fetchall_async(
|
|
647
656
|
f"""SELECT request_id FROM {REQUEST_TABLE}
|
|
648
657
|
WHERE request_id LIKE ?
|
|
649
658
|
ORDER BY
|
|
@@ -652,9 +661,8 @@ async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
|
652
661
|
ELSE 1
|
|
653
662
|
END,
|
|
654
663
|
created_at DESC
|
|
655
|
-
LIMIT 1000""", (f'{incomplete}%',)) as
|
|
656
|
-
|
|
657
|
-
if rows is None:
|
|
664
|
+
LIMIT 1000""", (f'{incomplete}%',)) as rows:
|
|
665
|
+
if not rows:
|
|
658
666
|
return []
|
|
659
667
|
return [row[0] for row in rows]
|
|
660
668
|
|
|
@@ -675,9 +683,8 @@ def _add_or_update_request_no_lock(request: Request):
|
|
|
675
683
|
async def _add_or_update_request_no_lock_async(request: Request):
|
|
676
684
|
"""Async version of _add_or_update_request_no_lock."""
|
|
677
685
|
assert _DB is not None
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
await conn.commit()
|
|
686
|
+
await _DB.execute_and_commit_async(_add_or_update_request_sql,
|
|
687
|
+
request.to_row())
|
|
681
688
|
|
|
682
689
|
|
|
683
690
|
def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
@@ -711,6 +718,7 @@ def set_request_cancelled(request_id: str) -> None:
|
|
|
711
718
|
|
|
712
719
|
|
|
713
720
|
@init_db
|
|
721
|
+
@metrics_lib.time_me
|
|
714
722
|
def _delete_requests(requests: List[Request]):
|
|
715
723
|
"""Clean up requests by their IDs."""
|
|
716
724
|
id_list_str = ','.join(repr(req.request_id) for req in requests)
|
sky/server/server.py
CHANGED
|
@@ -68,6 +68,7 @@ from sky.utils import common_utils
|
|
|
68
68
|
from sky.utils import context
|
|
69
69
|
from sky.utils import context_utils
|
|
70
70
|
from sky.utils import dag_utils
|
|
71
|
+
from sky.utils import perf_utils
|
|
71
72
|
from sky.utils import status_lib
|
|
72
73
|
from sky.utils import subprocess_utils
|
|
73
74
|
from sky.volumes.server import server as volumes_rest
|
|
@@ -421,6 +422,28 @@ async def cleanup_upload_ids():
|
|
|
421
422
|
upload_ids_to_cleanup.pop((upload_id, user_hash))
|
|
422
423
|
|
|
423
424
|
|
|
425
|
+
async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
426
|
+
interval: float = 0.1) -> None:
|
|
427
|
+
target = loop.time() + interval
|
|
428
|
+
|
|
429
|
+
pid = str(os.getpid())
|
|
430
|
+
lag_threshold = perf_utils.get_loop_lag_threshold()
|
|
431
|
+
|
|
432
|
+
def tick():
|
|
433
|
+
nonlocal target
|
|
434
|
+
now = loop.time()
|
|
435
|
+
lag = max(0.0, now - target)
|
|
436
|
+
if lag_threshold is not None and lag > lag_threshold:
|
|
437
|
+
logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
|
|
438
|
+
f'{lag_threshold} seconds.')
|
|
439
|
+
metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
|
|
440
|
+
pid=pid).observe(lag)
|
|
441
|
+
target = now + interval
|
|
442
|
+
loop.call_at(target, tick)
|
|
443
|
+
|
|
444
|
+
loop.call_at(target, tick)
|
|
445
|
+
|
|
446
|
+
|
|
424
447
|
@contextlib.asynccontextmanager
|
|
425
448
|
async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
|
|
426
449
|
"""FastAPI lifespan context manager."""
|
|
@@ -446,6 +469,10 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
446
469
|
# can safely ignore the error if the task is already scheduled.
|
|
447
470
|
logger.debug(f'Request {event.id} already exists.')
|
|
448
471
|
asyncio.create_task(cleanup_upload_ids())
|
|
472
|
+
if metrics.METRICS_ENABLED:
|
|
473
|
+
# Start monitoring the event loop lag in each server worker
|
|
474
|
+
# event loop (process).
|
|
475
|
+
asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
|
|
449
476
|
yield
|
|
450
477
|
# Shutdown: Add any cleanup code here if needed
|
|
451
478
|
|
|
@@ -1254,20 +1281,25 @@ async def download(download_body: payloads.DownloadBody,
|
|
|
1254
1281
|
logs_dir_on_api_server).expanduser().resolve() / zip_filename
|
|
1255
1282
|
|
|
1256
1283
|
try:
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1284
|
+
|
|
1285
|
+
def _zip_files_and_folders(folder_paths, zip_path):
|
|
1286
|
+
folders = [
|
|
1287
|
+
str(folder_path.expanduser().resolve())
|
|
1288
|
+
for folder_path in folder_paths
|
|
1289
|
+
]
|
|
1290
|
+
# Check for optional query parameter to control zip entry structure
|
|
1291
|
+
relative = request.query_params.get('relative', 'home')
|
|
1292
|
+
if relative == 'items':
|
|
1293
|
+
# Dashboard-friendly: entries relative to selected folders
|
|
1294
|
+
storage_utils.zip_files_and_folders(folders,
|
|
1295
|
+
zip_path,
|
|
1296
|
+
relative_to_items=True)
|
|
1297
|
+
else:
|
|
1298
|
+
# CLI-friendly (default): entries with full paths for mapping
|
|
1299
|
+
storage_utils.zip_files_and_folders(folders, zip_path)
|
|
1300
|
+
|
|
1301
|
+
await context_utils.to_thread(_zip_files_and_folders, folder_paths,
|
|
1302
|
+
zip_path)
|
|
1271
1303
|
|
|
1272
1304
|
# Add home path to the response headers, so that the client can replace
|
|
1273
1305
|
# the remote path in the zip file to the local path.
|
sky/server/uvicorn.py
CHANGED
|
@@ -24,6 +24,7 @@ from sky.server.requests import requests as requests_lib
|
|
|
24
24
|
from sky.skylet import constants
|
|
25
25
|
from sky.utils import context_utils
|
|
26
26
|
from sky.utils import env_options
|
|
27
|
+
from sky.utils import perf_utils
|
|
27
28
|
from sky.utils import subprocess_utils
|
|
28
29
|
|
|
29
30
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -198,6 +199,12 @@ class Server(uvicorn.Server):
|
|
|
198
199
|
context_utils.hijack_sys_attrs()
|
|
199
200
|
# Use default loop policy of uvicorn (use uvloop if available).
|
|
200
201
|
self.config.setup_event_loop()
|
|
202
|
+
lag_threshold = perf_utils.get_loop_lag_threshold()
|
|
203
|
+
if lag_threshold is not None:
|
|
204
|
+
event_loop = asyncio.get_event_loop()
|
|
205
|
+
# Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
|
|
206
|
+
event_loop.set_debug(True)
|
|
207
|
+
event_loop.slow_callback_duration = lag_threshold
|
|
201
208
|
with self.capture_signals():
|
|
202
209
|
asyncio.run(self.serve(*args, **kwargs))
|
|
203
210
|
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -8,8 +8,12 @@ This file is imported by setup.py, so:
|
|
|
8
8
|
import sys
|
|
9
9
|
from typing import Dict, List
|
|
10
10
|
|
|
11
|
+
clouds_with_ray = ['ibm', 'docker', 'scp']
|
|
12
|
+
|
|
11
13
|
install_requires = [
|
|
12
14
|
'wheel<0.46.0', # https://github.com/skypilot-org/skypilot/issues/5153
|
|
15
|
+
'setuptools', # TODO: match version to pyproject.toml once #5153 is fixed
|
|
16
|
+
'pip',
|
|
13
17
|
'cachetools',
|
|
14
18
|
# NOTE: ray requires click>=7.0.
|
|
15
19
|
# click 8.2.0 has a bug in parsing the command line arguments:
|
|
@@ -148,7 +152,7 @@ extras_require: Dict[str, List[str]] = {
|
|
|
148
152
|
'azure-storage-blob>=12.23.1',
|
|
149
153
|
'msgraph-sdk',
|
|
150
154
|
'msrestazure',
|
|
151
|
-
]
|
|
155
|
+
],
|
|
152
156
|
# We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
|
|
153
157
|
# parameter for stopping instances. Reference:
|
|
154
158
|
# https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
|
|
@@ -169,7 +173,7 @@ extras_require: Dict[str, List[str]] = {
|
|
|
169
173
|
'lambda': [], # No dependencies needed for lambda
|
|
170
174
|
'cloudflare': aws_dependencies,
|
|
171
175
|
'scp': local_ray,
|
|
172
|
-
'oci': ['oci']
|
|
176
|
+
'oci': ['oci'],
|
|
173
177
|
# Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
|
|
174
178
|
'kubernetes': [
|
|
175
179
|
'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
|
|
@@ -200,10 +204,21 @@ extras_require: Dict[str, List[str]] = {
|
|
|
200
204
|
'server': server_dependencies,
|
|
201
205
|
}
|
|
202
206
|
|
|
203
|
-
#
|
|
207
|
+
# Calculate which clouds should be included in the [all] installation.
|
|
208
|
+
clouds_for_all = set(extras_require)
|
|
209
|
+
clouds_for_all.remove('remote')
|
|
210
|
+
|
|
204
211
|
if sys.version_info < (3, 10):
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
212
|
+
# Nebius needs python3.10. If python 3.9 [all] will not install nebius
|
|
213
|
+
clouds_for_all.remove('nebius')
|
|
214
|
+
|
|
215
|
+
if sys.version_info >= (3, 12):
|
|
216
|
+
# The version of ray we use does not work with >= 3.12, so avoid clouds
|
|
217
|
+
# that require ray.
|
|
218
|
+
clouds_for_all -= set(clouds_with_ray)
|
|
219
|
+
# vast requires setuptools==51.1.1 which will not work with python >= 3.12
|
|
220
|
+
# TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
|
|
221
|
+
clouds_for_all.remove('vast')
|
|
222
|
+
|
|
223
|
+
extras_require['all'] = list(
|
|
224
|
+
set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
|
sky/setup_files/setup.py
CHANGED
|
@@ -178,6 +178,8 @@ setuptools.setup(
|
|
|
178
178
|
'Programming Language :: Python :: 3.9',
|
|
179
179
|
'Programming Language :: Python :: 3.10',
|
|
180
180
|
'Programming Language :: Python :: 3.11',
|
|
181
|
+
'Programming Language :: Python :: 3.12',
|
|
182
|
+
'Programming Language :: Python :: 3.13',
|
|
181
183
|
'License :: OSI Approved :: Apache Software License',
|
|
182
184
|
'Operating System :: OS Independent',
|
|
183
185
|
'Topic :: Software Development :: Libraries :: Python Modules',
|