skypilot-nightly 1.0.0.dev20250731__py3-none-any.whl → 1.0.0.dev20250802__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +6 -1
  3. sky/backends/cloud_vm_ray_backend.py +2 -1
  4. sky/catalog/data_fetchers/fetch_nebius.py +31 -7
  5. sky/client/cli/command.py +40 -14
  6. sky/client/cli/flags.py +15 -0
  7. sky/client/sdk.py +80 -10
  8. sky/client/sdk.pyi +4 -0
  9. sky/core.py +10 -2
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{oKqDxFQ88cquF4nQGE_0w → 2JNCZ4daQBotwWRNGi6aE}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +11 -0
  15. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-13145516b19858fb.js} +1 -1
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/data/data_utils.py +21 -1
  31. sky/data/storage.py +12 -0
  32. sky/execution.py +5 -3
  33. sky/jobs/client/sdk.py +5 -1
  34. sky/provision/runpod/utils.py +27 -12
  35. sky/resources.py +17 -4
  36. sky/server/constants.py +1 -1
  37. sky/server/daemons.py +164 -0
  38. sky/server/requests/payloads.py +3 -0
  39. sky/server/requests/requests.py +2 -124
  40. sky/server/server.py +2 -1
  41. sky/server/uvicorn.py +2 -1
  42. sky/setup_files/dependencies.py +1 -1
  43. sky/sky_logging.py +30 -0
  44. sky/skylet/autostop_lib.py +96 -8
  45. sky/skylet/constants.py +4 -3
  46. sky/skylet/events.py +27 -13
  47. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  48. sky/utils/schemas.py +29 -0
  49. {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/METADATA +4 -3
  50. {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/RECORD +55 -54
  51. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +0 -6
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +0 -1
  53. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +0 -11
  54. /sky/dashboard/out/_next/static/{oKqDxFQ88cquF4nQGE_0w → 2JNCZ4daQBotwWRNGi6aE}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py CHANGED
@@ -2,11 +2,11 @@
  import json
  import typing
  from typing import Dict, List, Optional, Union
- import webbrowser

  import click

  from sky import sky_logging
+ from sky.adaptors import common as adaptors_common
  from sky.client import common as client_common
  from sky.client import sdk
  from sky.serve.client import impl
@@ -23,9 +23,13 @@ from sky.utils import dag_utils

  if typing.TYPE_CHECKING:
      import io
+     import webbrowser

      import sky
      from sky.serve import serve_utils
+ else:
+     # only used in dashboard()
+     webbrowser = adaptors_common.LazyImport('webbrowser')

  logger = sky_logging.init_logger(__name__)

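The change above replaces the eager `import webbrowser` with a lazy import that is only resolved when the dashboard is opened. A minimal sketch of the pattern, assuming `adaptors_common.LazyImport` defers the real import until the module is first used (as its usage in this diff suggests); `open_dashboard` is a hypothetical helper, not part of the package:

import typing

from sky.adaptors import common as adaptors_common

if typing.TYPE_CHECKING:
    import webbrowser  # real module is visible to type checkers only
else:
    # Deferred at import time; the module is loaded on first attribute access.
    webbrowser = adaptors_common.LazyImport('webbrowser')


def open_dashboard(url: str) -> None:
    # Hypothetical helper: 'webbrowser' is actually imported here, on first use.
    webbrowser.open(url)
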
sky/provision/runpod/utils.py CHANGED
@@ -270,18 +270,17 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
             docker_login_config: Optional[Dict[str, str]]) -> str:
      """Launches an instance with the given parameters.

-     Converts the instance_type to the RunPod GPU name, finds the specs for the
-     GPU, and launches the instance.
+     For CPU instances, we directly use the instance_type for launching the
+     instance.
+
+     For GPU instances, we convert the instance_type to the RunPod GPU name,
+     and finds the specs for the GPU, before launching the instance.

      Returns:
          instance_id: The instance ID.
      """
      name = f'{cluster_name}-{node_type}'
-     gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
-     gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
-     cloud_type = instance_type.split('_')[2]

-     gpu_specs = runpod.runpod.get_gpu(gpu_type)
      # TODO(zhwu): keep this align with setups in
      # `provision.kuberunetes.instance.py`
      setup_cmd = (
@@ -329,12 +328,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
      params = {
          'name': name,
          'image_name': image_name_formatted,
-         'gpu_type_id': gpu_type,
-         'cloud_type': cloud_type,
          'container_disk_in_gb': disk_size,
-         'min_vcpu_count': 4 * gpu_quantity,
-         'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
-         'gpu_count': gpu_quantity,
          'country_code': region,
          'data_center_id': zone,
          'ports': ports_str,
@@ -343,12 +337,33 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
          'template_id': template_id,
      }

+     # GPU instance types start with f'{gpu_count}x',
+     # CPU instance types start with 'cpu'.
+     is_cpu_instance = instance_type.startswith('cpu')
+     if is_cpu_instance:
+         # RunPod CPU instances can be uniquely identified by the instance_id.
+         params.update({
+             'instance_id': instance_type,
+         })
+     else:
+         gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
+         gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
+         cloud_type = instance_type.split('_')[2]
+         gpu_specs = runpod.runpod.get_gpu(gpu_type)
+         params.update({
+             'gpu_type_id': gpu_type,
+             'cloud_type': cloud_type,
+             'min_vcpu_count': 4 * gpu_quantity,
+             'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
+             'gpu_count': gpu_quantity,
+         })
+
      if preemptible is None or not preemptible:
          new_instance = runpod.runpod.create_pod(**params)
      else:
          new_instance = runpod_commands.create_spot_pod(
              bid_per_gpu=bid_per_gpu,
-             **params,
+             **params,  # type: ignore[arg-type]
          )

      return new_instance['id']
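For context on the branch added above: GPU instance types encode the GPU count, GPU name, and cloud type in an underscore-separated string, while CPU instance types start with 'cpu' and are passed through to RunPod as the instance_id. A standalone sketch of that parsing; the example strings are assumptions for illustration, not values taken from the SkyPilot catalog:

from typing import Any, Dict

# Assumed example strings; real values come from the SkyPilot RunPod catalog.
GPU_INSTANCE_TYPE = '1x_A100-80GB_SECURE'
CPU_INSTANCE_TYPE = 'cpu3c-4-8'


def instance_params(instance_type: str) -> Dict[str, Any]:
    """Mirror the CPU/GPU branch in launch(), for illustration only."""
    if instance_type.startswith('cpu'):
        # CPU pods are identified directly by the instance_id.
        return {'instance_id': instance_type}
    gpu_count_str, gpu_name, cloud_type = instance_type.split('_')
    return {
        'gpu_count': int(gpu_count_str.replace('x', '')),
        'gpu_name': gpu_name,  # mapped through GPU_NAME_MAP in the real code
        'cloud_type': cloud_type,
    }


print(instance_params(GPU_INSTANCE_TYPE))
print(instance_params(CPU_INSTANCE_TYPE))
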
sky/resources.py CHANGED
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
  from sky.provision.gcp import constants as gcp_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.provision.nebius import constants as nebius_constants
+ from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.utils import accelerator_registry
  from sky.utils import annotations
@@ -69,14 +70,18 @@ class AutostopConfig:
      # flags.
      idle_minutes: int = 0
      down: bool = False
+     wait_for: Optional[autostop_lib.AutostopWaitFor] = None

      def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
          if not self.enabled:
              return False
-         return {
+         config: Dict[str, Any] = {
              'idle_minutes': self.idle_minutes,
              'down': self.down,
          }
+         if self.wait_for is not None:
+             config['wait_for'] = self.wait_for.value
+         return config

      @classmethod
      def from_yaml_config(
@@ -104,6 +109,9 @@ class AutostopConfig:
              autostop_config.idle_minutes = config['idle_minutes']
              if 'down' in config:
                  autostop_config.down = config['down']
+             if 'wait_for' in config:
+                 autostop_config.wait_for = (
+                     autostop_lib.AutostopWaitFor.from_str(config['wait_for']))
              return autostop_config

          return None
@@ -958,15 +966,18 @@ class Resources:
              valid_volumes.append(volume)
          self._volumes = valid_volumes

-     def override_autostop_config(self,
-                                  down: bool = False,
-                                  idle_minutes: Optional[int] = None) -> None:
+     def override_autostop_config(
+             self,
+             down: bool = False,
+             idle_minutes: Optional[int] = None,
+             wait_for: Optional[autostop_lib.AutostopWaitFor] = None) -> None:
          """Override autostop config to the resource.

          Args:
              down: If true, override the autostop config to use autodown.
              idle_minutes: If not None, override the idle minutes to autostop or
                  autodown.
+             wait_for: If not None, override the wait mode.
          """
          if not down and idle_minutes is None:
              return
@@ -976,6 +987,8 @@ class Resources:
          self._autostop_config.down = down
          if idle_minutes is not None:
              self._autostop_config.idle_minutes = idle_minutes
+         if wait_for is not None:
+             self._autostop_config.wait_for = wait_for

      def is_launchable(self) -> bool:
          """Returns whether the resource is launchable."""
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
  # based on version info is needed.
  # For more details and code guidelines, refer to:
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
- API_VERSION = 12
+ API_VERSION = 13

  # The minimum peer API version that the code should still work with.
  # Notes (dev):
sky/server/daemons.py ADDED
@@ -0,0 +1,164 @@
+ """Internal server daemons that run in the background."""
+ import dataclasses
+ import os
+ import time
+ from typing import Callable
+
+ from sky import sky_logging
+ from sky import skypilot_config
+ from sky.server import constants as server_constants
+ from sky.utils import common
+ from sky.utils import env_options
+ from sky.utils import ux_utils
+
+ logger = sky_logging.init_logger(__name__)
+
+
+ @dataclasses.dataclass
+ class InternalRequestDaemon:
+     """Internal daemon that runs an event in the background."""
+
+     id: str
+     name: str
+     event_fn: Callable[[], None]
+     default_log_level: str = 'INFO'
+
+     def refresh_log_level(self) -> int:
+         # pylint: disable=import-outside-toplevel
+         import logging
+
+         try:
+             # Refresh config within the while loop.
+             # Since this is a long running daemon,
+             # reload_config_for_new_request()
+             # is not called in between the event runs.
+             skypilot_config.safe_reload_config()
+             # Get the configured log level for the daemon inside the event loop
+             # in case the log level changes after the API server is started.
+             level_str = skypilot_config.get_nested(
+                 ('daemons', self.id, 'log_level'), self.default_log_level)
+             return getattr(logging, level_str.upper())
+         except AttributeError:
+             # Bad level should be rejected by
+             # schema validation, just in case.
+             logger.warning(f'Invalid log level: {level_str}, using DEBUG')
+             return logging.DEBUG
+         except Exception as e:  # pylint: disable=broad-except
+             logger.exception(f'Error refreshing log level for {self.id}: {e}')
+             return logging.DEBUG
+
+     def run_event(self):
+         """Run the event."""
+
+         # Disable logging for periodic refresh to avoid the usage message being
+         # sent multiple times.
+         os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+         level = self.refresh_log_level()
+         while True:
+             try:
+                 with ux_utils.enable_traceback(), \
+                         sky_logging.set_sky_logging_levels(level):
+                     sky_logging.reload_logger()
+                     level = self.refresh_log_level()
+                     self.event_fn()
+             except Exception:  # pylint: disable=broad-except
+                 # It is OK to fail to run the event, as the event is not
+                 # critical, but we should log the error.
+                 logger.exception(
+                     f'Error running {self.name} event. '
+                     f'Restarting in '
+                     f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
+                     'seconds...')
+                 time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
+
+
+ def refresh_cluster_status_event():
+     """Periodically refresh the cluster status."""
+     # pylint: disable=import-outside-toplevel
+     from sky import core
+
+     logger.info('=== Refreshing cluster status ===')
+     # This periodically refresh will hold the lock for the cluster being
+     # refreshed, but it is OK because other operations will just wait for
+     # the lock and get the just refreshed status without refreshing again.
+     core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
+     logger.info('Status refreshed. Sleeping '
+                 f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                 ' seconds for the next refresh...\n')
+     time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
+ def refresh_volume_status_event():
+     """Periodically refresh the volume status."""
+     # pylint: disable=import-outside-toplevel
+     from sky.volumes.server import core
+
+     # Disable logging for periodic refresh to avoid the usage message being
+     # sent multiple times.
+     os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+     logger.info('=== Refreshing volume status ===')
+     core.volume_refresh()
+     logger.info('Volume status refreshed. Sleeping '
+                 f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                 ' seconds for the next refresh...\n')
+     time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
+ def managed_job_status_refresh_event():
+     """Refresh the managed job status for controller consolidation mode."""
+     # pylint: disable=import-outside-toplevel
+     from sky.jobs import utils as managed_job_utils
+     if not managed_job_utils.is_consolidation_mode():
+         return
+     # We run the recovery logic before starting the event loop as those two are
+     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
+     from sky.utils import controller_utils
+     if controller_utils.high_availability_specified(
+             controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
+         managed_job_utils.ha_recovery_for_consolidation_mode()
+     # After recovery, we start the event loop.
+     from sky.skylet import events
+     refresh_event = events.ManagedJobEvent()
+     scheduling_event = events.ManagedJobSchedulingEvent()
+     logger.info('=== Running managed job event ===')
+     refresh_event.run()
+     scheduling_event.run()
+     time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+
+
+ def sky_serve_status_refresh_event():
+     """Refresh the sky serve status for controller consolidation mode."""
+     # pylint: disable=import-outside-toplevel
+     from sky.serve import serve_utils
+     if not serve_utils.is_consolidation_mode():
+         return
+     # TODO(tian): Add HA recovery logic.
+     from sky.skylet import events
+     event = events.ServiceUpdateEvent()
+     logger.info('=== Running serve status refresh event ===')
+     event.run()
+     time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+
+
+ # Register the events to run in the background.
+ INTERNAL_REQUEST_DAEMONS = [
+     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
+     # set to updated status automatically, without showing users the hint of
+     # cluster being stopped or down when `sky status -r` is called.
+     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
+                           name='status',
+                           event_fn=refresh_cluster_status_event,
+                           default_log_level='DEBUG'),
+     # Volume status refresh daemon to update the volume status periodically.
+     InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
+                           name='volume',
+                           event_fn=refresh_volume_status_event),
+     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
+                           name='managed-job-status',
+                           event_fn=managed_job_status_refresh_event),
+     InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
+                           name='sky-serve-status',
+                           event_fn=sky_serve_status_refresh_event),
+ ]
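The new module keeps the retry loop inside `InternalRequestDaemon.run_event()` and leaves each `event_fn` responsible for a single iteration, including its own sleep. A hypothetical sketch of wiring up an additional daemon against this module; the daemon id, name, and event function below are made up for illustration and not part of the package:

import time

from sky.server import daemons


def my_refresh_event() -> None:
    # One iteration of work; run_event() supplies the outer loop, the
    # per-iteration log-level refresh, and restart-on-failure handling.
    time.sleep(60)


daemons.INTERNAL_REQUEST_DAEMONS.append(
    daemons.InternalRequestDaemon(id='my-refresh-daemon',
                                  name='my-refresh',
                                  event_fn=my_refresh_event,
                                  default_log_level='DEBUG'))

Per the refresh_log_level() logic above, each daemon's log level can then be tuned at runtime under the ('daemons', <id>, 'log_level') config key.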
sky/server/requests/payloads.py CHANGED
@@ -33,6 +33,7 @@ from sky import sky_logging
  from sky import skypilot_config
  from sky.adaptors import common as adaptors_common
  from sky.server import common
+ from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.usage import constants as usage_constants
  from sky.usage import usage_lib
@@ -312,6 +313,7 @@ class StartBody(RequestBody):
      """The request body for the start endpoint."""
      cluster_name: str
      idle_minutes_to_autostop: Optional[int] = None
+     wait_for: Optional[autostop_lib.AutostopWaitFor] = None
      retry_until_up: bool = False
      down: bool = False
      force: bool = False
@@ -321,6 +323,7 @@ class AutostopBody(RequestBody):
      """The request body for the autostop endpoint."""
      cluster_name: str
      idle_minutes: int
+     wait_for: Optional[autostop_lib.AutostopWaitFor] = None
      down: bool = False


sky/server/requests/requests.py CHANGED
@@ -24,12 +24,11 @@ from sky import sky_logging
  from sky import skypilot_config
  from sky.server import common as server_common
  from sky.server import constants as server_constants
+ from sky.server import daemons
  from sky.server.requests import payloads
  from sky.server.requests.serializers import decoders
  from sky.server.requests.serializers import encoders
- from sky.utils import common
  from sky.utils import common_utils
- from sky.utils import env_options
  from sky.utils import subprocess_utils
  from sky.utils import ux_utils
  from sky.utils.db import db_utils
@@ -307,127 +306,6 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
      kill_requests(request_ids)


- def refresh_cluster_status_event():
-     """Periodically refresh the cluster status."""
-     # pylint: disable=import-outside-toplevel
-     from sky import core
-
-     # Disable logging for periodic refresh to avoid the usage message being
-     # sent multiple times.
-     os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-     while True:
-         logger.info('=== Refreshing cluster status ===')
-         # This periodically refresh will hold the lock for the cluster being
-         # refreshed, but it is OK because other operations will just wait for
-         # the lock and get the just refreshed status without refreshing again.
-         core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
-         logger.info(
-             'Status refreshed. Sleeping '
-             f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
-             ' seconds for the next refresh...\n')
-         time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
- def refresh_volume_status_event():
-     """Periodically refresh the volume status."""
-     # pylint: disable=import-outside-toplevel
-     from sky.volumes.server import core
-
-     # Disable logging for periodic refresh to avoid the usage message being
-     # sent multiple times.
-     os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-     while True:
-         logger.info('=== Refreshing volume status ===')
-         core.volume_refresh()
-         logger.info('Volume status refreshed. Sleeping '
-                     f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
-                     ' seconds for the next refresh...\n')
-         time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
- def managed_job_status_refresh_event():
-     """Refresh the managed job status for controller consolidation mode."""
-     # pylint: disable=import-outside-toplevel
-     from sky.jobs import utils as managed_job_utils
-     if not managed_job_utils.is_consolidation_mode():
-         return
-     # We run the recovery logic before starting the event loop as those two are
-     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-     from sky.utils import controller_utils
-     if controller_utils.high_availability_specified(
-             controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
-         managed_job_utils.ha_recovery_for_consolidation_mode()
-     # After recovery, we start the event loop.
-     from sky.skylet import events
-     refresh_event = events.ManagedJobEvent()
-     scheduling_event = events.ManagedJobSchedulingEvent()
-     while True:
-         logger.info('=== Running managed job event ===')
-         refresh_event.run()
-         scheduling_event.run()
-         time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
-
-
- def sky_serve_status_refresh_event():
-     """Refresh the managed job status for controller consolidation mode."""
-     # pylint: disable=import-outside-toplevel
-     from sky.serve import serve_utils
-     if not serve_utils.is_consolidation_mode():
-         return
-     # TODO(tian): Add HA recovery logic.
-     from sky.skylet import events
-     event = events.ServiceUpdateEvent()
-     while True:
-         time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
-         event.run()
-
-
- @dataclasses.dataclass
- class InternalRequestDaemon:
-     """Internal daemon that runs an event in the background."""
-
-     id: str
-     name: str
-     event_fn: Callable[[], None]
-
-     def run_event(self):
-         """Run the event."""
-         while True:
-             with ux_utils.enable_traceback():
-                 try:
-                     self.event_fn()
-                     break
-                 except Exception:  # pylint: disable=broad-except
-                     # It is OK to fail to run the event, as the event is not
-                     # critical, but we should log the error.
-                     logger.exception(
-                         f'Error running {self.name} event. '
-                         f'Restarting in '
-                         f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
-                         'seconds...')
-                     time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
-
-
- # Register the events to run in the background.
- INTERNAL_REQUEST_DAEMONS = [
-     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
-     # set to updated status automatically, without showing users the hint of
-     # cluster being stopped or down when `sky status -r` is called.
-     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                           name='status',
-                           event_fn=refresh_cluster_status_event),
-     # Volume status refresh daemon to update the volume status periodically.
-     InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                           name='volume',
-                           event_fn=refresh_volume_status_event),
-     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                           name='managed-job-status',
-                           event_fn=managed_job_status_refresh_event),
- ]
-
-
  def kill_requests(request_ids: Optional[List[str]] = None,
                    user_id: Optional[str] = None) -> List[str]:
      """Kill a SkyPilot API request and set its status to cancelled.
@@ -458,7 +336,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
          # Skip internal requests. The internal requests are scheduled with
          # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
          if request_record.request_id in set(
-                 event.id for event in INTERNAL_REQUEST_DAEMONS):
+                 event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
              continue
          if request_record.status > RequestStatus.RUNNING:
              logger.debug(f'Request {request_id} already finished')
sky/server/server.py CHANGED
@@ -46,6 +46,7 @@ from sky.serve.server import server as serve_rest
  from sky.server import common
  from sky.server import config as server_config
  from sky.server import constants as server_constants
+ from sky.server import daemons
  from sky.server import metrics
  from sky.server import state
  from sky.server import stream_utils
@@ -482,7 +483,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-nam
      """FastAPI lifespan context manager."""
      del app  # unused
      # Startup: Run background tasks
-     for event in requests_lib.INTERNAL_REQUEST_DAEMONS:
+     for event in daemons.INTERNAL_REQUEST_DAEMONS:
          try:
              executor.schedule_request(
                  request_id=event.id,
sky/server/uvicorn.py CHANGED
@@ -16,6 +16,7 @@ import uvicorn
  from uvicorn.supervisors import multiprocess

  from sky import sky_logging
+ from sky.server import daemons
  from sky.server import state
  from sky.server.requests import requests as requests_lib
  from sky.skylet import constants
@@ -120,7 +121,7 @@ class Server(uvicorn.Server):
              # Proactively cancel internal requests and logs requests since
              # they can run for infinite time.
              internal_request_ids = [
-                 d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
+                 d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
              ]
              if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
                  logger.warning('Timeout waiting for on-going requests to '
sky/setup_files/dependencies.py CHANGED
@@ -177,7 +177,7 @@ extras_require: Dict[str, List[str]] = {
          # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
      ],
      'nebius': [
-         'nebius>=0.2.37',
+         'nebius>=0.2.47',
      ] + aws_dependencies,
      'hyperbolic': [],  # No dependencies needed for hyperbolic
      'server': server_dependencies,
sky/sky_logging.py CHANGED
@@ -171,6 +171,36 @@ def set_logging_level(logger: str, level: int):
          logger.setLevel(original_level)


+ @contextlib.contextmanager
+ def set_sky_logging_levels(level: int):
+     """Set the logging level for all loggers."""
+     # Turn off logger
+     previous_levels = {}
+     for logger_name in logging.Logger.manager.loggerDict:
+         if logger_name.startswith('sky'):
+             logger = logging.getLogger(logger_name)
+             previous_levels[logger_name] = logger.level
+             logger.setLevel(level)
+     if level == logging.DEBUG:
+         previous_show_debug_info = env_options.Options.SHOW_DEBUG_INFO.get()
+         os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
+     try:
+         yield
+     finally:
+         # Restore logger
+         for logger_name in logging.Logger.manager.loggerDict:
+             if logger_name.startswith('sky'):
+                 logger = logging.getLogger(logger_name)
+                 try:
+                     logger.setLevel(previous_levels[logger_name])
+                 except KeyError:
+                     # New loggers maybe initialized after the context manager,
+                     # no need to restore the level.
+                     pass
+         if level == logging.DEBUG and not previous_show_debug_info:
+             os.environ.pop(env_options.Options.SHOW_DEBUG_INFO.env_key)
+
+
  def logging_enabled(logger: logging.Logger, level: int) -> bool:
      return logger.level <= level

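A usage sketch for the new context manager, assuming it is importable as `sky.sky_logging.set_sky_logging_levels` per the diff above:

import logging

from sky import sky_logging

logger = sky_logging.init_logger(__name__)

# Temporarily raise every 'sky.*' logger to DEBUG; previous per-logger levels
# (and the SHOW_DEBUG_INFO option, when DEBUG is requested) are restored on exit.
with sky_logging.set_sky_logging_levels(logging.DEBUG):
    logger.debug('visible only while the context manager is active')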