skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged by the registry as potentially problematic.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +452 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/data_utils.py +21 -1
  26. sky/data/storage.py +12 -0
  27. sky/jobs/__init__.py +3 -0
  28. sky/jobs/client/sdk.py +80 -3
  29. sky/jobs/controller.py +76 -25
  30. sky/jobs/recovery_strategy.py +80 -34
  31. sky/jobs/scheduler.py +68 -20
  32. sky/jobs/server/core.py +228 -136
  33. sky/jobs/server/server.py +40 -0
  34. sky/jobs/state.py +129 -24
  35. sky/jobs/utils.py +109 -51
  36. sky/provision/nebius/constants.py +3 -0
  37. sky/provision/runpod/utils.py +27 -12
  38. sky/py.typed +0 -0
  39. sky/resources.py +16 -12
  40. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  41. sky/serve/autoscalers.py +8 -0
  42. sky/serve/client/impl.py +188 -0
  43. sky/serve/client/sdk.py +12 -82
  44. sky/serve/constants.py +5 -1
  45. sky/serve/controller.py +5 -0
  46. sky/serve/replica_managers.py +112 -37
  47. sky/serve/serve_state.py +16 -6
  48. sky/serve/serve_utils.py +274 -77
  49. sky/serve/server/core.py +8 -525
  50. sky/serve/server/impl.py +709 -0
  51. sky/serve/service.py +13 -9
  52. sky/serve/service_spec.py +74 -4
  53. sky/server/constants.py +1 -1
  54. sky/server/daemons.py +164 -0
  55. sky/server/requests/payloads.py +33 -0
  56. sky/server/requests/requests.py +2 -107
  57. sky/server/requests/serializers/decoders.py +12 -3
  58. sky/server/requests/serializers/encoders.py +13 -2
  59. sky/server/server.py +2 -1
  60. sky/server/uvicorn.py +2 -1
  61. sky/sky_logging.py +30 -0
  62. sky/skylet/constants.py +2 -1
  63. sky/skylet/events.py +9 -0
  64. sky/skypilot_config.py +24 -21
  65. sky/task.py +41 -11
  66. sky/templates/jobs-controller.yaml.j2 +3 -0
  67. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  68. sky/users/server.py +1 -1
  69. sky/utils/command_runner.py +4 -2
  70. sky/utils/controller_utils.py +14 -10
  71. sky/utils/dag_utils.py +4 -2
  72. sky/utils/db/migration_utils.py +2 -4
  73. sky/utils/schemas.py +47 -19
  74. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
  75. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
  76. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/serve/service.py CHANGED
@@ -222,7 +222,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
         requested_resources_str=backend_utils.get_task_resources_str(task),
         load_balancing_policy=service_spec.load_balancing_policy,
         status=serve_state.ServiceStatus.CONTROLLER_INIT,
-        tls_encrypted=service_spec.tls_credential is not None)
+        tls_encrypted=service_spec.tls_credential is not None,
+        pool=service_spec.pool)
     # Directly throw an error here. See sky/serve/api.py::up
     # for more details.
     if not success:
@@ -292,14 +293,17 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     # TODO(tian): Probably we could enable multiple ports specified in
     # service spec and we could start multiple load balancers.
     # After that, we will have a mapping from replica port to endpoint.
-    load_balancer_process = multiprocessing.Process(
-        target=ux_utils.RedirectOutputForProcess(
-            load_balancer.run_load_balancer,
-            load_balancer_log_file).run,
-        args=(controller_addr, load_balancer_port,
-              service_spec.load_balancing_policy,
-              service_spec.tls_credential))
-    load_balancer_process.start()
+    # NOTE(tian): We don't need the load balancer for cluster pool.
+    # Skip the load balancer process for cluster pool.
+    if not service_spec.pool:
+        load_balancer_process = multiprocessing.Process(
+            target=ux_utils.RedirectOutputForProcess(
+                load_balancer.run_load_balancer,
+                load_balancer_log_file).run,
+            args=(controller_addr, load_balancer_port,
+                  service_spec.load_balancing_policy,
+                  service_spec.tls_credential))
+        load_balancer_process.start()

     if not is_recovery:
         serve_state.set_service_load_balancer_port(
sky/serve/service_spec.py CHANGED
@@ -43,7 +43,33 @@ class SkyServiceSpec:
         upscale_delay_seconds: Optional[int] = None,
         downscale_delay_seconds: Optional[int] = None,
         load_balancing_policy: Optional[str] = None,
+        pool: Optional[bool] = None,
     ) -> None:
+        if pool:
+            for unsupported_field in [
+                    'max_replicas',
+                    'num_overprovision',
+                    'target_qps_per_replica',
+                    'upscale_delay_seconds',
+                    'downscale_delay_seconds',
+                    'base_ondemand_fallback_replicas',
+                    'dynamic_ondemand_fallback',
+                    'spot_placer',
+                    'load_balancing_policy',
+                    'ports',
+                    'post_data',
+                    'tls_credential',
+                    'readiness_headers',
+            ]:
+                if locals()[unsupported_field] is not None:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'{unsupported_field} is not supported for pool.')
+            if max_replicas is not None and max_replicas != min_replicas:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Autoscaling is not supported for pool '
+                                     'for now.')
+
         if max_replicas is not None and max_replicas < min_replicas:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('max_replicas must be greater than or '
@@ -96,6 +122,7 @@ class SkyServiceSpec:
         self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
         self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
         self._load_balancing_policy: Optional[str] = load_balancing_policy
+        self._pool: Optional[bool] = pool

         self._use_ondemand_fallback: bool = (
             self.dynamic_ondemand_fallback is not None and
@@ -115,7 +142,7 @@ class SkyServiceSpec:

         service_config: Dict[str, Any] = {}

-        readiness_section = config['readiness_probe']
+        readiness_section = config.get('readiness_probe', '/')
         if isinstance(readiness_section, str):
             service_config['readiness_path'] = readiness_section
             initial_delay_seconds = None
@@ -157,8 +184,29 @@ class SkyServiceSpec:
                 raise ValueError('Port must be between 1 and 65535.')
         service_config['ports'] = str(ports) if ports is not None else None

+        pool_config = config.get('pool', None)
+        if pool_config is not None:
+            service_config['pool'] = pool_config
+
         policy_section = config.get('replica_policy', None)
+        if policy_section is not None and pool_config:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify `replica_policy` for cluster '
+                                 'pool. Only `workers: <num>` is supported '
+                                 'for cluster pool now.')
+
         simplified_policy_section = config.get('replicas', None)
+        workers_config = config.get('workers', None)
+        if simplified_policy_section is not None and workers_config is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify both `replicas` and `workers`.'
+                                 ' Please use one of them.')
+        if simplified_policy_section is not None and pool_config:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify `replicas` for cluster pool. '
+                                 'Please use `workers` instead.')
+        if simplified_policy_section is None:
+            simplified_policy_section = workers_config
         if policy_section is None or simplified_policy_section is not None:
             if simplified_policy_section is not None:
                 min_replicas = simplified_policy_section
@@ -239,6 +287,13 @@ class SkyServiceSpec:
                 config[section] = dict()
             config[section][key] = value

+        add_if_not_none('pool', None, self._pool)
+
+        if self.pool:
+            # For pool, currently only `workers: <num>` is supported.
+            add_if_not_none('workers', None, self.min_replicas)
+            return config
+
         add_if_not_none('readiness_probe', 'path', self.readiness_path)
         add_if_not_none('readiness_probe', 'initial_delay_seconds',
                         self.initial_delay_seconds)
@@ -306,10 +361,14 @@ class SkyServiceSpec:
         return ' '.join(policy_strs)

     def autoscaling_policy_str(self):
+        if self.pool:
+            # We only support fixed-size pool for now.
+            return f'Fixed-size ({self.min_replicas} workers)'
         # TODO(MaoZiming): Update policy_str
+        noun = 'worker' if self.pool else 'replica'
         min_plural = '' if self.min_replicas == 1 else 's'
         if self.max_replicas == self.min_replicas or self.max_replicas is None:
-            return f'Fixed {self.min_replicas} replica{min_plural}'
+            return f'Fixed {self.min_replicas} {noun}{min_plural}'
         # Already checked in __init__.
         assert self.target_qps_per_replica is not None
         # TODO(tian): Refactor to contain more information
@@ -319,8 +378,8 @@ class SkyServiceSpec:
             overprovision_str = (
                 f' with {self.num_overprovision} overprovisioned replicas')
         return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
-                f'replica{max_plural}{overprovision_str} (target QPS per '
-                f'replica: {self.target_qps_per_replica})')
+                f'{noun}{max_plural}{overprovision_str} (target QPS per '
+                f'{noun}: {self.target_qps_per_replica})')

     def set_ports(self, ports: str) -> None:
         self._ports = ports
@@ -332,6 +391,10 @@ class SkyServiceSpec:
                     f'Certfile: {self.tls_credential.certfile}')

     def __repr__(self) -> str:
+        if self.pool:
+            return textwrap.dedent(f"""\
+                Worker policy: {self.autoscaling_policy_str()}
+            """)
         return textwrap.dedent(f"""\
             Readiness probe method: {self.probe_str()}
             Readiness initial delay seconds: {self.initial_delay_seconds}
@@ -420,3 +483,10 @@ class SkyServiceSpec:
     def load_balancing_policy(self) -> str:
         return lb_policies.LoadBalancingPolicy.make_policy_name(
             self._load_balancing_policy)
+
+    @property
+    def pool(self) -> bool:
+        # This can happen for backward compatibility.
+        if not hasattr(self, '_pool'):
+            return False
+        return bool(self._pool)
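
Note: taken together, the service_spec.py hunks above mean a pool is declared with `pool: true` plus a fixed `workers: <num>` count, and the serve-only knobs are rejected at construction time. A minimal sketch of the resulting behavior, assuming the parsing hunks above live in `SkyServiceSpec.from_yaml_config` (the enclosing method name is not visible in this diff):

# Hypothetical usage sketch; only `pool`, `workers`, and the error strings
# below are taken from the diff. `from_yaml_config` is an assumed entry point.
from sky.serve.service_spec import SkyServiceSpec

spec = SkyServiceSpec.from_yaml_config({'pool': True, 'workers': 4})
print(spec.pool)                      # True
print(spec.autoscaling_policy_str())  # 'Fixed-size (4 workers)'

# Serve-only fields are rejected for pools by the __init__ check above:
try:
    SkyServiceSpec.from_yaml_config({'pool': True, 'workers': 4,
                                     'ports': 8080})
except ValueError as e:
    print(e)  # 'ports is not supported for pool.'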
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 11
+API_VERSION = 12

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/daemons.py ADDED
@@ -0,0 +1,164 @@
+"""Internal server daemons that run in the background."""
+import dataclasses
+import os
+import time
+from typing import Callable
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.server import constants as server_constants
+from sky.utils import common
+from sky.utils import env_options
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+@dataclasses.dataclass
+class InternalRequestDaemon:
+    """Internal daemon that runs an event in the background."""
+
+    id: str
+    name: str
+    event_fn: Callable[[], None]
+    default_log_level: str = 'INFO'
+
+    def refresh_log_level(self) -> int:
+        # pylint: disable=import-outside-toplevel
+        import logging
+
+        try:
+            # Refresh config within the while loop.
+            # Since this is a long running daemon,
+            # reload_config_for_new_request()
+            # is not called in between the event runs.
+            skypilot_config.safe_reload_config()
+            # Get the configured log level for the daemon inside the event loop
+            # in case the log level changes after the API server is started.
+            level_str = skypilot_config.get_nested(
+                ('daemons', self.id, 'log_level'), self.default_log_level)
+            return getattr(logging, level_str.upper())
+        except AttributeError:
+            # Bad level should be rejected by
+            # schema validation, just in case.
+            logger.warning(f'Invalid log level: {level_str}, using DEBUG')
+            return logging.DEBUG
+        except Exception as e:  # pylint: disable=broad-except
+            logger.exception(f'Error refreshing log level for {self.id}: {e}')
+            return logging.DEBUG
+
+    def run_event(self):
+        """Run the event."""
+
+        # Disable logging for periodic refresh to avoid the usage message being
+        # sent multiple times.
+        os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+        level = self.refresh_log_level()
+        while True:
+            try:
+                with ux_utils.enable_traceback(), \
+                        sky_logging.set_sky_logging_levels(level):
+                    sky_logging.reload_logger()
+                    level = self.refresh_log_level()
+                    self.event_fn()
+            except Exception:  # pylint: disable=broad-except
+                # It is OK to fail to run the event, as the event is not
+                # critical, but we should log the error.
+                logger.exception(
+                    f'Error running {self.name} event. '
+                    f'Restarting in '
+                    f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
+                    'seconds...')
+                time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
+
+
+def refresh_cluster_status_event():
+    """Periodically refresh the cluster status."""
+    # pylint: disable=import-outside-toplevel
+    from sky import core
+
+    logger.info('=== Refreshing cluster status ===')
+    # This periodically refresh will hold the lock for the cluster being
+    # refreshed, but it is OK because other operations will just wait for
+    # the lock and get the just refreshed status without refreshing again.
+    core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
+    logger.info('Status refreshed. Sleeping '
+                f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                ' seconds for the next refresh...\n')
+    time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
+def refresh_volume_status_event():
+    """Periodically refresh the volume status."""
+    # pylint: disable=import-outside-toplevel
+    from sky.volumes.server import core
+
+    # Disable logging for periodic refresh to avoid the usage message being
+    # sent multiple times.
+    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+    logger.info('=== Refreshing volume status ===')
+    core.volume_refresh()
+    logger.info('Volume status refreshed. Sleeping '
+                f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                ' seconds for the next refresh...\n')
+    time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
+def managed_job_status_refresh_event():
+    """Refresh the managed job status for controller consolidation mode."""
+    # pylint: disable=import-outside-toplevel
+    from sky.jobs import utils as managed_job_utils
+    if not managed_job_utils.is_consolidation_mode():
+        return
+    # We run the recovery logic before starting the event loop as those two are
+    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
+    from sky.utils import controller_utils
+    if controller_utils.high_availability_specified(
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
+        managed_job_utils.ha_recovery_for_consolidation_mode()
+    # After recovery, we start the event loop.
+    from sky.skylet import events
+    refresh_event = events.ManagedJobEvent()
+    scheduling_event = events.ManagedJobSchedulingEvent()
+    logger.info('=== Running managed job event ===')
+    refresh_event.run()
+    scheduling_event.run()
+    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+
+
+def sky_serve_status_refresh_event():
+    """Refresh the sky serve status for controller consolidation mode."""
+    # pylint: disable=import-outside-toplevel
+    from sky.serve import serve_utils
+    if not serve_utils.is_consolidation_mode():
+        return
+    # TODO(tian): Add HA recovery logic.
+    from sky.skylet import events
+    event = events.ServiceUpdateEvent()
+    logger.info('=== Running serve status refresh event ===')
+    event.run()
+    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+
+
+# Register the events to run in the background.
+INTERNAL_REQUEST_DAEMONS = [
+    # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
+    # set to updated status automatically, without showing users the hint of
+    # cluster being stopped or down when `sky status -r` is called.
+    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
+                          name='status',
+                          event_fn=refresh_cluster_status_event,
+                          default_log_level='DEBUG'),
+    # Volume status refresh daemon to update the volume status periodically.
+    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
+                          name='volume',
+                          event_fn=refresh_volume_status_event),
+    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
+                          name='managed-job-status',
+                          event_fn=managed_job_status_refresh_event),
+    InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
+                          name='sky-serve-status',
+                          event_fn=sky_serve_status_refresh_event),
+]
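
Note: each daemon now re-resolves its log level from the `daemons.<id>.log_level` config key on every loop iteration, mapping the configured string onto the stdlib logging module and falling back to DEBUG on bad input. A self-contained sketch of that lookup (`resolve_level` is an illustrative stand-in, not a SkyPilot API):

# Self-contained sketch of the getattr-based lookup in refresh_log_level().
import logging

def resolve_level(level_str: str) -> int:
    try:
        # 'info' -> logging.INFO (20), mirroring getattr(logging, upper()).
        return getattr(logging, level_str.upper())
    except AttributeError:
        # Unknown level names fall back to DEBUG, as in the daemon code.
        return logging.DEBUG

assert resolve_level('info') == logging.INFO
assert resolve_level('WARNING') == logging.WARNING
assert resolve_level('not_a_level') == logging.DEBUG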
sky/server/requests/payloads.py CHANGED
@@ -478,6 +478,8 @@ class JobsLaunchBody(RequestBody):
     """The request body for the jobs launch endpoint."""
     task: str
     name: Optional[str]
+    pool: Optional[str] = None
+    num_jobs: Optional[int] = None

     def to_kwargs(self) -> Dict[str, Any]:
         kwargs = super().to_kwargs()
@@ -500,6 +502,7 @@ class JobsCancelBody(RequestBody):
     job_ids: Optional[List[int]] = None
     all: bool = False
     all_users: bool = False
+    pool: Optional[str] = None


 class JobsLogsBody(RequestBody):
@@ -671,6 +674,36 @@ class JobsDownloadLogsBody(RequestBody):
     local_dir: str = constants.SKY_LOGS_DIRECTORY


+class JobsPoolApplyBody(RequestBody):
+    """The request body for the jobs pool apply endpoint."""
+    task: str
+    pool_name: str
+    mode: serve.UpdateMode
+
+    def to_kwargs(self) -> Dict[str, Any]:
+        kwargs = super().to_kwargs()
+        dag = common.process_mounts_in_task_on_api_server(self.task,
+                                                          self.env_vars,
+                                                          workdir_only=False)
+        assert len(
+            dag.tasks) == 1, ('Must only specify one task in the DAG for '
+                              'a pool.', dag)
+        kwargs['task'] = dag.tasks[0]
+        return kwargs
+
+
+class JobsPoolDownBody(RequestBody):
+    """The request body for the jobs pool down endpoint."""
+    pool_names: Optional[Union[str, List[str]]]
+    all: bool = False
+    purge: bool = False
+
+
+class JobsPoolStatusBody(RequestBody):
+    """The request body for the jobs pool status endpoint."""
+    pool_names: Optional[Union[str, List[str]]]
+
+
 class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
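
Note: the new pool request bodies are ordinary pydantic models, and `pool_names` accepts either a single name or a list. A standalone sketch of that field shape (the real classes derive from SkyPilot's `RequestBody`, which adds common fields such as `env_vars`, used by `to_kwargs` above):

# Standalone pydantic sketch mirroring JobsPoolDownBody's field shapes;
# RequestBody's inherited fields are intentionally omitted here.
from typing import List, Optional, Union

import pydantic

class PoolDownBodySketch(pydantic.BaseModel):
    pool_names: Optional[Union[str, List[str]]]
    all: bool = False
    purge: bool = False

print(PoolDownBodySketch(pool_names='my-pool'))                 # single name
print(PoolDownBodySketch(pool_names=['p1', 'p2'], purge=True))  # list of names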
sky/server/requests/requests.py CHANGED
@@ -24,12 +24,11 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.server import common as server_common
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
-from sky.utils import common
 from sky.utils import common_utils
-from sky.utils import env_options
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils.db import db_utils
@@ -307,110 +306,6 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
     kill_requests(request_ids)


-def refresh_cluster_status_event():
-    """Periodically refresh the cluster status."""
-    # pylint: disable=import-outside-toplevel
-    from sky import core
-
-    # Disable logging for periodic refresh to avoid the usage message being
-    # sent multiple times.
-    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-    while True:
-        logger.info('=== Refreshing cluster status ===')
-        # This periodically refresh will hold the lock for the cluster being
-        # refreshed, but it is OK because other operations will just wait for
-        # the lock and get the just refreshed status without refreshing again.
-        core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
-        logger.info(
-            'Status refreshed. Sleeping '
-            f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
-            ' seconds for the next refresh...\n')
-        time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
-def refresh_volume_status_event():
-    """Periodically refresh the volume status."""
-    # pylint: disable=import-outside-toplevel
-    from sky.volumes.server import core
-
-    # Disable logging for periodic refresh to avoid the usage message being
-    # sent multiple times.
-    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-    while True:
-        logger.info('=== Refreshing volume status ===')
-        core.volume_refresh()
-        logger.info('Volume status refreshed. Sleeping '
-                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
-                    ' seconds for the next refresh...\n')
-        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
-def managed_job_status_refresh_event():
-    """Refresh the managed job status for controller consolidation mode."""
-    # pylint: disable=import-outside-toplevel
-    from sky.jobs import utils as managed_job_utils
-    if not managed_job_utils.is_consolidation_mode():
-        return
-    # We run the recovery logic before starting the event loop as those two are
-    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    from sky.utils import controller_utils
-    if controller_utils.high_availability_specified(
-            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
-        managed_job_utils.ha_recovery_for_consolidation_mode()
-    # After recovery, we start the event loop.
-    from sky.skylet import events
-    event = events.ManagedJobEvent()
-    while True:
-        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
-        event.run()
-
-
-@dataclasses.dataclass
-class InternalRequestDaemon:
-    """Internal daemon that runs an event in the background."""
-
-    id: str
-    name: str
-    event_fn: Callable[[], None]
-
-    def run_event(self):
-        """Run the event."""
-        while True:
-            with ux_utils.enable_traceback():
-                try:
-                    self.event_fn()
-                    break
-                except Exception:  # pylint: disable=broad-except
-                    # It is OK to fail to run the event, as the event is not
-                    # critical, but we should log the error.
-                    logger.exception(
-                        f'Error running {self.name} event. '
-                        f'Restarting in '
-                        f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
-                        'seconds...')
-                    time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
-
-
-# Register the events to run in the background.
-INTERNAL_REQUEST_DAEMONS = [
-    # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
-    # set to updated status automatically, without showing users the hint of
-    # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status',
-                          event_fn=refresh_cluster_status_event),
-    # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume',
-                          event_fn=refresh_volume_status_event),
-    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status',
-                          event_fn=managed_job_status_refresh_event),
-]
-
-
 def kill_requests(request_ids: Optional[List[str]] = None,
                   user_id: Optional[str] = None) -> List[str]:
     """Kill a SkyPilot API request and set its status to cancelled.
@@ -441,7 +336,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
         # Skip internal requests. The internal requests are scheduled with
         # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
         if request_record.request_id in set(
-                event.id for event in INTERNAL_REQUEST_DAEMONS):
+                event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
             continue
         if request_record.status > RequestStatus.RUNNING:
             logger.debug(f'Request {request_id} already finished')
sky/server/requests/serializers/decoders.py CHANGED
@@ -109,9 +109,8 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
     return jobs


-@register_decoders('serve.status')
-def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
-    service_statuses = return_value
+def _decode_serve_status(
+        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     for service_status in service_statuses:
         service_status['status'] = serve_state.ServiceStatus(
             service_status['status'])
@@ -122,6 +121,16 @@ def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
     return service_statuses


+@register_decoders('serve.status')
+def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
+    return _decode_serve_status(return_value)
+
+
+@register_decoders('jobs.pool_status')
+def decode_jobs_pool_status(return_value: List[dict]) -> List[Dict[str, Any]]:
+    return _decode_serve_status(return_value)
+
+
 @register_decoders('cost_report')
 def decode_cost_report(
         return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
sky/server/requests/serializers/encoders.py CHANGED
@@ -112,8 +112,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
     return jobs


-@register_encoder('serve.status')
-def encode_serve_status(
+def _encode_serve_status(
         service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     for service_status in service_statuses:
         service_status['status'] = service_status['status'].value
@@ -123,6 +122,18 @@ def encode_serve_status(
     return service_statuses


+@register_encoder('serve.status')
+def encode_serve_status(
+        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    return _encode_serve_status(service_statuses)
+
+
+@register_encoder('jobs.pool_status')
+def encode_jobs_pool_status(
+        pool_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    return _encode_serve_status(pool_statuses)
+
+
 @register_encoder('cost_report')
 def encode_cost_report(
         cost_report: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
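
Note: the decoder and encoder changes are symmetric: the body of the `serve.status` codec becomes a private helper, and `jobs.pool_status` registers a second name onto it, since pool status reuses the serve-status wire format. A self-contained sketch of that one-helper-two-names pattern (the registry here is an illustrative stand-in, not SkyPilot's serializer internals):

# Illustrative registry sketch; register_encoder/_ENCODERS are local stand-ins.
import enum
from typing import Any, Callable, Dict, List

class Status(enum.Enum):  # stand-in for serve_state.ServiceStatus
    READY = 'READY'

_ENCODERS: Dict[str, Callable[..., Any]] = {}

def register_encoder(name: str):
    def decorator(fn):
        _ENCODERS[name] = fn
        return fn
    return decorator

def _encode_status(statuses: List[dict]) -> List[dict]:
    # Shared body: enum -> wire string, as in _encode_serve_status above.
    return [{**s, 'status': s['status'].value} for s in statuses]

# One helper, two registered names: pools reuse the serve-status format.
register_encoder('serve.status')(_encode_status)
register_encoder('jobs.pool_status')(_encode_status)

print(_ENCODERS['jobs.pool_status']([{'status': Status.READY}]))
# -> [{'status': 'READY'}]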
sky/server/server.py CHANGED
@@ -46,6 +46,7 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.server import metrics
 from sky.server import state
 from sky.server import stream_utils
@@ -482,7 +483,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
     """FastAPI lifespan context manager."""
     del app  # unused
     # Startup: Run background tasks
-    for event in requests_lib.INTERNAL_REQUEST_DAEMONS:
+    for event in daemons.INTERNAL_REQUEST_DAEMONS:
         try:
             executor.schedule_request(
                 request_id=event.id,
sky/server/uvicorn.py CHANGED
@@ -16,6 +16,7 @@ import uvicorn
 from uvicorn.supervisors import multiprocess

 from sky import sky_logging
+from sky.server import daemons
 from sky.server import state
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -120,7 +121,7 @@ class Server(uvicorn.Server):
         # Proactively cancel internal requests and logs requests since
         # they can run for infinite time.
         internal_request_ids = [
-            d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
+            d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
         ]
         if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
             logger.warning('Timeout waiting for on-going requests to '
sky/sky_logging.py CHANGED
@@ -171,6 +171,36 @@ def set_logging_level(logger: str, level: int):
         logger.setLevel(original_level)


+@contextlib.contextmanager
+def set_sky_logging_levels(level: int):
+    """Set the logging level for all loggers."""
+    # Turn off logger
+    previous_levels = {}
+    for logger_name in logging.Logger.manager.loggerDict:
+        if logger_name.startswith('sky'):
+            logger = logging.getLogger(logger_name)
+            previous_levels[logger_name] = logger.level
+            logger.setLevel(level)
+    if level == logging.DEBUG:
+        previous_show_debug_info = env_options.Options.SHOW_DEBUG_INFO.get()
+        os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
+    try:
+        yield
+    finally:
+        # Restore logger
+        for logger_name in logging.Logger.manager.loggerDict:
+            if logger_name.startswith('sky'):
+                logger = logging.getLogger(logger_name)
+                try:
+                    logger.setLevel(previous_levels[logger_name])
+                except KeyError:
+                    # New loggers maybe initialized after the context manager,
+                    # no need to restore the level.
+                    pass
+        if level == logging.DEBUG and not previous_show_debug_info:
+            os.environ.pop(env_options.Options.SHOW_DEBUG_INFO.env_key)
+
+
 def logging_enabled(logger: logging.Logger, level: int) -> bool:
     return logger.level <= level
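
Note: this context manager is what `InternalRequestDaemon.run_event` in the new sky/server/daemons.py wraps around each daemon iteration. A minimal usage sketch:

# Minimal usage sketch of the new context manager; mirrors how run_event()
# wraps each daemon iteration.
import logging

from sky import sky_logging

with sky_logging.set_sky_logging_levels(logging.DEBUG):
    # Loggers whose names start with 'sky' run at DEBUG inside this block;
    # at DEBUG the SHOW_DEBUG_INFO env option is also set for the duration.
    logging.getLogger('sky').debug('visible inside the block')
# Previous per-logger levels (and the env option) are restored on exit.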