skypilot-nightly 1.0.0.dev20250829__py3-none-any.whl → 1.0.0.dev20250901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +24 -2
  3. sky/backends/backend_utils.py +39 -36
  4. sky/backends/cloud_vm_ray_backend.py +37 -0
  5. sky/client/cli/command.py +17 -6
  6. sky/client/common.py +5 -4
  7. sky/client/sdk.py +5 -0
  8. sky/client/sdk_async.py +8 -2
  9. sky/core.py +8 -3
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/volumes.html +1 -1
  23. sky/dashboard/out/workspace/new.html +1 -1
  24. sky/dashboard/out/workspaces/[name].html +1 -1
  25. sky/dashboard/out/workspaces.html +1 -1
  26. sky/global_user_state.py +67 -0
  27. sky/provision/docker_utils.py +1 -1
  28. sky/provision/kubernetes/utils.py +39 -26
  29. sky/server/common.py +8 -6
  30. sky/server/metrics.py +82 -6
  31. sky/server/requests/executor.py +5 -1
  32. sky/server/requests/payloads.py +1 -0
  33. sky/server/requests/requests.py +19 -11
  34. sky/server/server.py +46 -14
  35. sky/server/uvicorn.py +7 -0
  36. sky/setup_files/dependencies.py +23 -8
  37. sky/setup_files/setup.py +2 -0
  38. sky/skylet/constants.py +3 -0
  39. sky/utils/db/db_utils.py +56 -4
  40. sky/utils/perf_utils.py +22 -0
  41. sky/utils/schemas.py +6 -0
  42. {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/METADATA +35 -50
  43. {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/RECORD +49 -48
  44. /sky/dashboard/out/_next/static/{hYJYFIxp_ZFONR4wTIJqZ → EqPZ0ygxa__3XPBVJ9dpy}/_buildManifest.js +0 -0
  45. /sky/dashboard/out/_next/static/{hYJYFIxp_ZFONR4wTIJqZ → EqPZ0ygxa__3XPBVJ9dpy}/_ssgManifest.js +0 -0
  46. {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20250829.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Kubernetes utilities for SkyPilot."""
2
+ import copy
2
3
  import dataclasses
3
4
  import datetime
4
5
  import enum
@@ -2715,11 +2716,11 @@ def get_endpoint_debug_message(context: Optional[str] = None) -> str:
2715
2716
 
2716
2717
 
2717
2718
  def combine_pod_config_fields(
2718
- cluster_yaml_path: str,
2719
+ cluster_yaml_obj: Dict[str, Any],
2719
2720
  cluster_config_overrides: Dict[str, Any],
2720
2721
  cloud: Optional[clouds.Cloud] = None,
2721
2722
  context: Optional[str] = None,
2722
- ) -> None:
2723
+ ) -> Dict[str, Any]:
2723
2724
  """Adds or updates fields in the YAML with fields from the
2724
2725
  ~/.sky/config.yaml's kubernetes.pod_spec dict.
2725
2726
  This can be used to add fields to the YAML that are not supported by
@@ -2758,9 +2759,7 @@ def combine_pod_config_fields(
2758
2759
  - name: my-secret
2759
2760
  ```
2760
2761
  """
2761
- with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2762
- yaml_content = f.read()
2763
- yaml_obj = yaml_utils.safe_load(yaml_content)
2762
+ merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
2764
2763
  # We don't use override_configs in `get_effective_region_config`, as merging
2765
2764
  # the pod config requires special handling.
2766
2765
  if isinstance(cloud, clouds.SSH):
@@ -2787,26 +2786,20 @@ def combine_pod_config_fields(
2787
2786
 
2788
2787
  # Merge the kubernetes config into the YAML for both head and worker nodes.
2789
2788
  config_utils.merge_k8s_configs(
2790
- yaml_obj['available_node_types']['ray_head_default']['node_config'],
2791
- kubernetes_config)
2789
+ merged_cluster_yaml_obj['available_node_types']['ray_head_default']
2790
+ ['node_config'], kubernetes_config)
2791
+ return merged_cluster_yaml_obj
2792
2792
 
2793
- # Write the updated YAML back to the file
2794
- yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2795
2793
 
2796
-
2797
- def combine_metadata_fields(cluster_yaml_path: str,
2794
+ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
2798
2795
  cluster_config_overrides: Dict[str, Any],
2799
- context: Optional[str] = None) -> None:
2796
+ context: Optional[str] = None) -> Dict[str, Any]:
2800
2797
  """Updates the metadata for all Kubernetes objects created by SkyPilot with
2801
2798
  fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
2802
2799
 
2803
2800
  Obeys the same add or update semantics as combine_pod_config_fields().
2804
2801
  """
2805
-
2806
- with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2807
- yaml_content = f.read()
2808
- yaml_obj = yaml_utils.safe_load(yaml_content)
2809
-
2802
+ merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
2810
2803
  # Get custom_metadata from global config
2811
2804
  custom_metadata = skypilot_config.get_effective_region_config(
2812
2805
  cloud='kubernetes',
@@ -2828,22 +2821,42 @@ def combine_metadata_fields(cluster_yaml_path: str,
2828
2821
  # List of objects in the cluster YAML to be updated
2829
2822
  combination_destinations = [
2830
2823
  # Service accounts
2831
- yaml_obj['provider']['autoscaler_service_account']['metadata'],
2832
- yaml_obj['provider']['autoscaler_role']['metadata'],
2833
- yaml_obj['provider']['autoscaler_role_binding']['metadata'],
2834
- yaml_obj['provider']['autoscaler_service_account']['metadata'],
2835
- # Pod spec
2836
- yaml_obj['available_node_types']['ray_head_default']['node_config']
2824
+ merged_cluster_yaml_obj['provider']['autoscaler_service_account']
2837
2825
  ['metadata'],
2826
+ merged_cluster_yaml_obj['provider']['autoscaler_role']['metadata'],
2827
+ merged_cluster_yaml_obj['provider']['autoscaler_role_binding']
2828
+ ['metadata'],
2829
+ merged_cluster_yaml_obj['provider']['autoscaler_service_account']
2830
+ ['metadata'],
2831
+ # Pod spec
2832
+ merged_cluster_yaml_obj['available_node_types']['ray_head_default']
2833
+ ['node_config']['metadata'],
2838
2834
  # Services for pods
2839
- *[svc['metadata'] for svc in yaml_obj['provider']['services']]
2835
+ *[
2836
+ svc['metadata']
2837
+ for svc in merged_cluster_yaml_obj['provider']['services']
2838
+ ]
2840
2839
  ]
2841
2840
 
2842
2841
  for destination in combination_destinations:
2843
2842
  config_utils.merge_k8s_configs(destination, custom_metadata)
2844
2843
 
2845
- # Write the updated YAML back to the file
2846
- yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2844
+ return merged_cluster_yaml_obj
2845
+
2846
+
2847
+ def combine_pod_config_fields_and_metadata(
2848
+ cluster_yaml_obj: Dict[str, Any],
2849
+ cluster_config_overrides: Dict[str, Any],
2850
+ cloud: Optional[clouds.Cloud] = None,
2851
+ context: Optional[str] = None) -> Dict[str, Any]:
2852
+ """Combines pod config fields and metadata fields"""
2853
+ combined_yaml_obj = combine_pod_config_fields(cluster_yaml_obj,
2854
+ cluster_config_overrides,
2855
+ cloud, context)
2856
+ combined_yaml_obj = combine_metadata_fields(combined_yaml_obj,
2857
+ cluster_config_overrides,
2858
+ context)
2859
+ return combined_yaml_obj
2847
2860
 
2848
2861
 
2849
2862
  def merge_custom_metadata(
sky/server/common.py CHANGED
@@ -648,14 +648,16 @@ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
648
648
  deploy: Whether the server is running in deploy mode, which means
649
649
  multiple processes might be running.
650
650
  """
651
+ del deploy
651
652
  if metrics or os.getenv(constants.ENV_VAR_SERVER_METRICS_ENABLED) == 'true':
652
653
  env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
653
- if deploy:
654
- metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
655
- shutil.rmtree(metrics_dir, ignore_errors=True)
656
- os.makedirs(metrics_dir, exist_ok=True)
657
- # Refer to https://prometheus.github.io/client_python/multiprocess/
658
- env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
654
+ # Always set the metrics dir since we need to collect metrics from
655
+ # subprocesses like the executor.
656
+ metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
657
+ shutil.rmtree(metrics_dir, ignore_errors=True)
658
+ os.makedirs(metrics_dir, exist_ok=True)
659
+ # Refer to https://prometheus.github.io/client_python/multiprocess/
660
+ env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
659
661
 
660
662
 
661
663
  def check_server_healthy(
sky/server/metrics.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """Instrumentation for the API server."""
2
2
 
3
+ import contextlib
4
+ import functools
3
5
  import os
4
6
  import time
5
7
 
@@ -11,11 +13,16 @@ import starlette.middleware.base
11
13
  import uvicorn
12
14
 
13
15
  from sky import sky_logging
16
+ from sky.skylet import constants
17
+
18
+ # Whether the metrics are enabled, cannot be changed at runtime.
19
+ METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
20
+ 'false').lower() == 'true'
14
21
 
15
22
  logger = sky_logging.init_logger(__name__)
16
23
 
17
24
  # Total number of API server requests, grouped by path, method, and status.
18
- sky_apiserver_requests_total = prom.Counter(
25
+ SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
19
26
  'sky_apiserver_requests_total',
20
27
  'Total number of API server requests',
21
28
  ['path', 'method', 'status'],
@@ -23,14 +30,40 @@ sky_apiserver_requests_total = prom.Counter(
23
30
 
24
31
  # Time spent processing API server requests, grouped by path, method, and
25
32
  # status.
26
- sky_apiserver_request_duration_seconds = prom.Histogram(
33
+ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
27
34
  'sky_apiserver_request_duration_seconds',
28
35
  'Time spent processing API server requests',
29
36
  ['path', 'method', 'status'],
30
- buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
37
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
38
+ 60.0, 120.0, float('inf')),
39
+ )
40
+
41
+ # Time spent processing requests in executor.
42
+ SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
43
+ 'sky_apiserver_request_execution_duration_seconds',
44
+ 'Time spent executing requests in executor',
45
+ ['request', 'worker'],
46
+ buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
31
47
  float('inf')),
32
48
  )
33
49
 
50
+ # Time spent processing a piece of code, refer to time_it().
51
+ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
52
+ 'sky_apiserver_code_duration_seconds',
53
+ 'Time spent processing code',
54
+ ['name', 'group'],
55
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
56
+ 60.0, 120.0, float('inf')),
57
+ )
58
+
59
+ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
60
+ 'sky_apiserver_event_loop_lag_seconds',
61
+ 'Scheduling delay of the server event loop',
62
+ ['pid'],
63
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
64
+ 60.0, float('inf')),
65
+ )
66
+
34
67
  metrics_app = fastapi.FastAPI()
35
68
 
36
69
 
@@ -76,7 +109,7 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
76
109
 
77
110
  async def dispatch(self, request: fastapi.Request, call_next):
78
111
  path = request.url.path
79
- logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
112
+ logger.debug(f'PROM Middleware Request: {request}, {request.url.path}')
80
113
  streaming = _is_streaming_api(path)
81
114
  if not streaming:
82
115
  # Exclude streaming APIs, the duration is not meaningful.
@@ -92,13 +125,56 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
92
125
  status_code_group = '5xx'
93
126
  raise
94
127
  finally:
95
- sky_apiserver_requests_total.labels(path=path,
128
+ SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
96
129
  method=method,
97
130
  status=status_code_group).inc()
98
131
  if not streaming:
99
132
  duration = time.time() - start_time
100
- sky_apiserver_request_duration_seconds.labels(
133
+ SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
101
134
  path=path, method=method,
102
135
  status=status_code_group).observe(duration)
103
136
 
104
137
  return response
138
+
139
+
140
+ @contextlib.contextmanager
141
+ def time_it(name: str, group: str = 'default'):
142
+ """Context manager to measure and record code execution duration."""
143
+ if not METRICS_ENABLED:
144
+ yield
145
+ else:
146
+ start_time = time.time()
147
+ try:
148
+ yield
149
+ finally:
150
+ duration = time.time() - start_time
151
+ SKY_APISERVER_CODE_DURATION_SECONDS.labels(
152
+ name=name, group=group).observe(duration)
153
+
154
+
155
+ def time_me(func):
156
+ """Measure the duration of decorated function."""
157
+
158
+ @functools.wraps(func)
159
+ def wrapper(*args, **kwargs):
160
+ if not METRICS_ENABLED:
161
+ return func(*args, **kwargs)
162
+ name = f'{func.__module__}/{func.__name__}'
163
+ with time_it(name, group='function'):
164
+ return func(*args, **kwargs)
165
+
166
+ return wrapper
167
+
168
+
169
+ def time_me_async(func):
170
+ """Measure the duration of decorated async function."""
171
+
172
+ @functools.wraps(func)
173
+ async def async_wrapper(*args, **kwargs):
174
+ if not METRICS_ENABLED:
175
+ return await func(*args, **kwargs)
176
+ name = f'{func.__module__}/{func.__name__}'
177
+ with time_it(name, group='function'):
178
+ return await func(*args, **kwargs)
179
+
180
+ return async_wrapper
sky/server/requests/executor.py CHANGED
@@ -41,6 +41,7 @@ from sky import skypilot_config
41
41
  from sky.server import common as server_common
42
42
  from sky.server import config as server_config
43
43
  from sky.server import constants as server_constants
44
+ from sky.server import metrics as metrics_lib
44
45
  from sky.server.requests import payloads
45
46
  from sky.server.requests import preconditions
46
47
  from sky.server.requests import process
@@ -373,6 +374,7 @@ def _request_execution_wrapper(request_id: str,
373
374
  request_task.status = api_requests.RequestStatus.RUNNING
374
375
  func = request_task.entrypoint
375
376
  request_body = request_task.request_body
377
+ request_name = request_task.name
376
378
 
377
379
  # Append to the log file instead of overwriting it since there might be
378
380
  # logs from previous retries.
@@ -390,7 +392,9 @@ def _request_execution_wrapper(request_id: str,
390
392
  config = skypilot_config.to_dict()
391
393
  logger.debug(f'request config: \n'
392
394
  f'{yaml_utils.dump_yaml_str(dict(config))}')
393
- return_value = func(**request_body.to_kwargs())
395
+ with metrics_lib.time_it(name=request_name,
396
+ group='request_execution'):
397
+ return_value = func(**request_body.to_kwargs())
394
398
  f.flush()
395
399
  except KeyboardInterrupt:
396
400
  logger.info(f'Request {request_id} cancelled by user')
sky/server/requests/payloads.py CHANGED
@@ -309,6 +309,7 @@ class StatusBody(RequestBody):
309
309
  cluster_names: Optional[List[str]] = None
310
310
  refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
311
311
  all_users: bool = True
312
+ include_credentials: bool = False
312
313
 
313
314
 
314
315
  class StartBody(RequestBody):
sky/server/requests/requests.py CHANGED
@@ -26,6 +26,7 @@ from sky import skypilot_config
26
26
  from sky.server import common as server_common
27
27
  from sky.server import constants as server_constants
28
28
  from sky.server import daemons
29
+ from sky.server import metrics as metrics_lib
29
30
  from sky.server.requests import payloads
30
31
  from sky.server.requests.serializers import decoders
31
32
  from sky.server.requests.serializers import encoders
@@ -460,6 +461,7 @@ def request_lock_path(request_id: str) -> str:
460
461
 
461
462
  @contextlib.contextmanager
462
463
  @init_db
464
+ @metrics_lib.time_me
463
465
  def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
464
466
  """Get and update a SkyPilot API request."""
465
467
  request = _get_request_no_lock(request_id)
@@ -469,6 +471,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
469
471
 
470
472
 
471
473
  @init_db
474
+ @metrics_lib.time_me
472
475
  def update_request_async(
473
476
  request_id: str) -> AsyncContextManager[Optional[Request]]:
474
477
  """Async version of update_request.
@@ -508,15 +511,16 @@ def _get_request_no_lock(request_id: str) -> Optional[Request]:
508
511
  async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
509
512
  """Async version of _get_request_no_lock."""
510
513
  assert _DB is not None
511
- conn = await _DB.async_conn()
512
- async with conn.execute(_get_request_sql, (request_id + '%',)) as cursor:
513
- row = await cursor.fetchone()
514
+ async with _DB.execute_fetchall_async(_get_request_sql,
515
+ (request_id + '%',)) as rows:
516
+ row = rows[0] if rows else None
514
517
  if row is None:
515
518
  return None
516
519
  return Request.from_row(row)
517
520
 
518
521
 
519
522
  @init_db
523
+ @metrics_lib.time_me
520
524
  def get_latest_request_id() -> Optional[str]:
521
525
  """Get the latest request ID."""
522
526
  assert _DB is not None
@@ -529,6 +533,7 @@ def get_latest_request_id() -> Optional[str]:
529
533
 
530
534
 
531
535
  @init_db
536
+ @metrics_lib.time_me
532
537
  def get_request(request_id: str) -> Optional[Request]:
533
538
  """Get a SkyPilot API request."""
534
539
  with filelock.FileLock(request_lock_path(request_id)):
@@ -536,6 +541,7 @@ def get_request(request_id: str) -> Optional[Request]:
536
541
 
537
542
 
538
543
  @init_db_async
544
+ @metrics_lib.time_me_async
539
545
  async def get_request_async(request_id: str) -> Optional[Request]:
540
546
  """Async version of get_request."""
541
547
  async with filelock.AsyncFileLock(request_lock_path(request_id)):
@@ -543,6 +549,7 @@ async def get_request_async(request_id: str) -> Optional[Request]:
543
549
 
544
550
 
545
551
  @init_db
552
+ @metrics_lib.time_me
546
553
  def create_if_not_exists(request: Request) -> bool:
547
554
  """Create a SkyPilot API request if it does not exist."""
548
555
  with filelock.FileLock(request_lock_path(request.request_id)):
@@ -553,6 +560,7 @@ def create_if_not_exists(request: Request) -> bool:
553
560
 
554
561
 
555
562
  @init_db_async
563
+ @metrics_lib.time_me_async
556
564
  async def create_if_not_exists_async(request: Request) -> bool:
557
565
  """Async version of create_if_not_exists."""
558
566
  async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
@@ -563,6 +571,7 @@ async def create_if_not_exists_async(request: Request) -> bool:
563
571
 
564
572
 
565
573
  @init_db
574
+ @metrics_lib.time_me
566
575
  def get_request_tasks(
567
576
  status: Optional[List[RequestStatus]] = None,
568
577
  cluster_names: Optional[List[str]] = None,
@@ -637,13 +646,13 @@ def get_request_tasks(
637
646
 
638
647
 
639
648
  @init_db_async
649
+ @metrics_lib.time_me_async
640
650
  async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
641
651
  """Get a list of API request ids for shell completion."""
642
652
  assert _DB is not None
643
- conn = await _DB.async_conn()
644
653
  # Prioritize alive requests (PENDING, RUNNING) over finished ones,
645
654
  # then order by creation time (newest first) within each category.
646
- async with conn.execute(
655
+ async with _DB.execute_fetchall_async(
647
656
  f"""SELECT request_id FROM {REQUEST_TABLE}
648
657
  WHERE request_id LIKE ?
649
658
  ORDER BY
@@ -652,9 +661,8 @@ async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
652
661
  ELSE 1
653
662
  END,
654
663
  created_at DESC
655
- LIMIT 1000""", (f'{incomplete}%',)) as cursor:
656
- rows = await cursor.fetchall()
657
- if rows is None:
664
+ LIMIT 1000""", (f'{incomplete}%',)) as rows:
665
+ if not rows:
658
666
  return []
659
667
  return [row[0] for row in rows]
660
668
 
@@ -675,9 +683,8 @@ def _add_or_update_request_no_lock(request: Request):
675
683
  async def _add_or_update_request_no_lock_async(request: Request):
676
684
  """Async version of _add_or_update_request_no_lock."""
677
685
  assert _DB is not None
678
- conn = await _DB.async_conn()
679
- await conn.execute(_add_or_update_request_sql, request.to_row())
680
- await conn.commit()
686
+ await _DB.execute_and_commit_async(_add_or_update_request_sql,
687
+ request.to_row())
681
688
 
682
689
 
683
690
  def set_request_failed(request_id: str, e: BaseException) -> None:
@@ -711,6 +718,7 @@ def set_request_cancelled(request_id: str) -> None:
711
718
 
712
719
 
713
720
  @init_db
721
+ @metrics_lib.time_me
714
722
  def _delete_requests(requests: List[Request]):
715
723
  """Clean up requests by their IDs."""
716
724
  id_list_str = ','.join(repr(req.request_id) for req in requests)
sky/server/server.py CHANGED
@@ -68,6 +68,7 @@ from sky.utils import common_utils
68
68
  from sky.utils import context
69
69
  from sky.utils import context_utils
70
70
  from sky.utils import dag_utils
71
+ from sky.utils import perf_utils
71
72
  from sky.utils import status_lib
72
73
  from sky.utils import subprocess_utils
73
74
  from sky.volumes.server import server as volumes_rest
@@ -421,6 +422,28 @@ async def cleanup_upload_ids():
421
422
  upload_ids_to_cleanup.pop((upload_id, user_hash))
422
423
 
423
424
 
425
+ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
426
+ interval: float = 0.1) -> None:
427
+ target = loop.time() + interval
428
+
429
+ pid = str(os.getpid())
430
+ lag_threshold = perf_utils.get_loop_lag_threshold()
431
+
432
+ def tick():
433
+ nonlocal target
434
+ now = loop.time()
435
+ lag = max(0.0, now - target)
436
+ if lag_threshold is not None and lag > lag_threshold:
437
+ logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
438
+ f'{lag_threshold} seconds.')
439
+ metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
440
+ pid=pid).observe(lag)
441
+ target = now + interval
442
+ loop.call_at(target, tick)
443
+
444
+ loop.call_at(target, tick)
445
+
446
+
424
447
  @contextlib.asynccontextmanager
425
448
  async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
426
449
  """FastAPI lifespan context manager."""
@@ -446,6 +469,10 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
446
469
  # can safely ignore the error if the task is already scheduled.
447
470
  logger.debug(f'Request {event.id} already exists.')
448
471
  asyncio.create_task(cleanup_upload_ids())
472
+ if metrics.METRICS_ENABLED:
473
+ # Start monitoring the event loop lag in each server worker
474
+ # event loop (process).
475
+ asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
449
476
  yield
450
477
  # Shutdown: Add any cleanup code here if needed
451
478
 
@@ -1254,20 +1281,25 @@ async def download(download_body: payloads.DownloadBody,
1254
1281
  logs_dir_on_api_server).expanduser().resolve() / zip_filename
1255
1282
 
1256
1283
  try:
1257
- folders = [
1258
- str(folder_path.expanduser().resolve())
1259
- for folder_path in folder_paths
1260
- ]
1261
- # Check for optional query parameter to control zip entry structure
1262
- relative = request.query_params.get('relative', 'home')
1263
- if relative == 'items':
1264
- # Dashboard-friendly: entries relative to selected folders
1265
- storage_utils.zip_files_and_folders(folders,
1266
- zip_path,
1267
- relative_to_items=True)
1268
- else:
1269
- # CLI-friendly (default): entries with full paths for mapping
1270
- storage_utils.zip_files_and_folders(folders, zip_path)
1284
+
1285
+ def _zip_files_and_folders(folder_paths, zip_path):
1286
+ folders = [
1287
+ str(folder_path.expanduser().resolve())
1288
+ for folder_path in folder_paths
1289
+ ]
1290
+ # Check for optional query parameter to control zip entry structure
1291
+ relative = request.query_params.get('relative', 'home')
1292
+ if relative == 'items':
1293
+ # Dashboard-friendly: entries relative to selected folders
1294
+ storage_utils.zip_files_and_folders(folders,
1295
+ zip_path,
1296
+ relative_to_items=True)
1297
+ else:
1298
+ # CLI-friendly (default): entries with full paths for mapping
1299
+ storage_utils.zip_files_and_folders(folders, zip_path)
1300
+
1301
+ await context_utils.to_thread(_zip_files_and_folders, folder_paths,
1302
+ zip_path)
1271
1303
 
1272
1304
  # Add home path to the response headers, so that the client can replace
1273
1305
  # the remote path in the zip file to the local path.
sky/server/uvicorn.py CHANGED
@@ -24,6 +24,7 @@ from sky.server.requests import requests as requests_lib
24
24
  from sky.skylet import constants
25
25
  from sky.utils import context_utils
26
26
  from sky.utils import env_options
27
+ from sky.utils import perf_utils
27
28
  from sky.utils import subprocess_utils
28
29
 
29
30
  logger = sky_logging.init_logger(__name__)
@@ -198,6 +199,12 @@ class Server(uvicorn.Server):
198
199
  context_utils.hijack_sys_attrs()
199
200
  # Use default loop policy of uvicorn (use uvloop if available).
200
201
  self.config.setup_event_loop()
202
+ lag_threshold = perf_utils.get_loop_lag_threshold()
203
+ if lag_threshold is not None:
204
+ event_loop = asyncio.get_event_loop()
205
+ # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
206
+ event_loop.set_debug(True)
207
+ event_loop.slow_callback_duration = lag_threshold
201
208
  with self.capture_signals():
202
209
  asyncio.run(self.serve(*args, **kwargs))
203
210
 
sky/setup_files/dependencies.py CHANGED
@@ -8,8 +8,12 @@ This file is imported by setup.py, so:
8
8
  import sys
9
9
  from typing import Dict, List
10
10
 
11
+ clouds_with_ray = ['ibm', 'docker', 'scp']
12
+
11
13
  install_requires = [
12
14
  'wheel<0.46.0', # https://github.com/skypilot-org/skypilot/issues/5153
15
+ 'setuptools', # TODO: match version to pyproject.toml once #5153 is fixed
16
+ 'pip',
13
17
  'cachetools',
14
18
  # NOTE: ray requires click>=7.0.
15
19
  # click 8.2.0 has a bug in parsing the command line arguments:
@@ -148,7 +152,7 @@ extras_require: Dict[str, List[str]] = {
148
152
  'azure-storage-blob>=12.23.1',
149
153
  'msgraph-sdk',
150
154
  'msrestazure',
151
- ] + local_ray,
155
+ ],
152
156
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
153
157
  # parameter for stopping instances. Reference:
154
158
  # https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
@@ -169,7 +173,7 @@ extras_require: Dict[str, List[str]] = {
169
173
  'lambda': [], # No dependencies needed for lambda
170
174
  'cloudflare': aws_dependencies,
171
175
  'scp': local_ray,
172
- 'oci': ['oci'] + local_ray,
176
+ 'oci': ['oci'],
173
177
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
174
178
  'kubernetes': [
175
179
  'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
@@ -200,10 +204,21 @@ extras_require: Dict[str, List[str]] = {
200
204
  'server': server_dependencies,
201
205
  }
202
206
 
203
- # Nebius needs python3.10. If python 3.9 [all] will not install nebius
207
+ # Calculate which clouds should be included in the [all] installation.
208
+ clouds_for_all = set(extras_require)
209
+ clouds_for_all.remove('remote')
210
+
204
211
  if sys.version_info < (3, 10):
205
- filtered_keys = [k for k in extras_require if k != 'nebius']
206
- extras_require['all'] = sum(
207
- [v for k, v in extras_require.items() if k != 'nebius'], [])
208
- else:
209
- extras_require['all'] = sum(extras_require.values(), [])
212
+ # Nebius needs python3.10. If python 3.9 [all] will not install nebius
213
+ clouds_for_all.remove('nebius')
214
+
215
+ if sys.version_info >= (3, 12):
216
+ # The version of ray we use does not work with >= 3.12, so avoid clouds
217
+ # that require ray.
218
+ clouds_for_all -= set(clouds_with_ray)
219
+ # vast requires setuptools==51.1.1 which will not work with python >= 3.12
220
+ # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
221
+ clouds_for_all.remove('vast')
222
+
223
+ extras_require['all'] = list(
224
+ set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
sky/setup_files/setup.py CHANGED
@@ -178,6 +178,8 @@ setuptools.setup(
178
178
  'Programming Language :: Python :: 3.9',
179
179
  'Programming Language :: Python :: 3.10',
180
180
  'Programming Language :: Python :: 3.11',
181
+ 'Programming Language :: Python :: 3.12',
182
+ 'Programming Language :: Python :: 3.13',
181
183
  'License :: OSI Approved :: Apache Software License',
182
184
  'Operating System :: OS Independent',
183
185
  'Topic :: Software Development :: Libraries :: Python Modules',
sky/skylet/constants.py CHANGED
@@ -505,3 +505,6 @@ COST_REPORT_DEFAULT_DAYS = 30
505
505
 
506
506
  # The directory for file locks.
507
507
  SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
508
+
509
+ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
510
+ 'DEBUG_LOOP_LAG_THRESHOLD_MS')