skypilot-nightly 1.0.0.dev20250407__py3-none-any.whl → 1.0.0.dev20250410__py3-none-any.whl

This diff shows the changes between two publicly released package versions, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (37)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +1 -1
  3. sky/adaptors/nebius.py +5 -27
  4. sky/backends/backend.py +9 -7
  5. sky/backends/cloud_vm_ray_backend.py +8 -11
  6. sky/backends/local_docker_backend.py +3 -3
  7. sky/cloud_stores.py +0 -4
  8. sky/clouds/do.py +4 -5
  9. sky/clouds/gcp.py +5 -3
  10. sky/clouds/nebius.py +22 -12
  11. sky/clouds/service_catalog/data_fetchers/fetch_ibm.py +1 -2
  12. sky/clouds/service_catalog/gcp_catalog.py +37 -10
  13. sky/core.py +6 -6
  14. sky/data/data_utils.py +5 -9
  15. sky/data/mounting_utils.py +1 -1
  16. sky/data/storage.py +25 -31
  17. sky/data/storage_utils.py +36 -20
  18. sky/execution.py +11 -4
  19. sky/jobs/server/server.py +5 -1
  20. sky/provision/do/utils.py +19 -16
  21. sky/provision/gcp/config.py +30 -20
  22. sky/server/requests/executor.py +204 -126
  23. sky/server/requests/process.py +212 -0
  24. sky/server/requests/queues/local_queue.py +16 -0
  25. sky/setup_files/dependencies.py +1 -1
  26. sky/skylet/log_lib.py +4 -0
  27. sky/task.py +27 -7
  28. sky/utils/atomic.py +52 -0
  29. sky/utils/common_utils.py +2 -2
  30. sky/utils/schemas.py +25 -7
  31. sky/utils/validator.py +1 -8
  32. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/METADATA +2 -2
  33. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/RECORD +37 -34
  34. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/WHEEL +0 -0
  35. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/entry_points.txt +0 -0
  36. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/licenses/LICENSE +0 -0
  37. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py CHANGED
@@ -18,9 +18,7 @@ The number of the workers is determined by the system resources.
 
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
-import concurrent.futures
 import contextlib
-import dataclasses
 import enum
 import multiprocessing
 import os
@@ -42,7 +40,9 @@ from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import process
 from sky.server.requests import requests as api_requests
+from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
@@ -101,22 +101,23 @@ _MIN_LONG_WORKERS = 1
 # workers so at least 2 workers are needed to ensure responsiveness.
 _MIN_SHORT_WORKERS = 2
 
+# Default number of burstable workers for local API server. A heuristic number
+# that is large enough for most local cases.
+# TODO(aylei): the number of burstable workers should be auto-tuned based on the
+# system usage stats.
+_BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
 
 class QueueBackend(enum.Enum):
+    # Local queue backend serves queues in each process locally, which has
+    # lower resource usage but the consumer must be in the same process, i.e.
+    # this only works in single-process mode.
+    LOCAL = 'local'
+    # Multi-process queue backend starts a dedicated process for serving queues.
     MULTIPROCESSING = 'multiprocessing'
     # TODO(zhwu): we can add redis backend in the future.
 
 
-@dataclasses.dataclass
-class RequestWorker:
-    id: int
-    # The type of queue this worker works on.
-    schedule_type: api_requests.ScheduleType
-
-    def __str__(self) -> str:
-        return f'Worker(id={self.id}, schedule_type={self.schedule_type.value})'
-
-
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
 
@@ -128,9 +129,12 @@ class RequestQueue:
                  backend: Optional[QueueBackend] = None) -> None:
         self.name = schedule_type.value
         self.backend = backend
-        assert (backend is None or
-                backend == QueueBackend.MULTIPROCESSING), backend
-        self.queue = mp_queue.get_queue(self.name)
+        if backend == QueueBackend.MULTIPROCESSING:
+            self.queue = mp_queue.get_queue(self.name)
+        elif backend == QueueBackend.LOCAL:
+            self.queue = local_queue.get_queue(self.name)
+        else:
+            raise RuntimeError(f'Invalid queue backend: {backend}')
 
     def put(self, request: Tuple[str, bool]) -> None:
         """Put and request to the queue.
@@ -161,6 +165,104 @@ class RequestQueue:
 queue_backend = QueueBackend.MULTIPROCESSING
 
 
+def executor_initializer(proc_group: str):
+    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
+                              f'{multiprocessing.current_process().pid}')
+
+
+class RequestWorker:
+    """A worker that polls requests from the queue and runs them.
+
+    The worker can run at least `garanteed_parallelism` requests in parallel.
+    If there are more resources available, it can spin up extra workers up to
+    `garanteed_parallelism + burstable_parallelism`.
+    """
+
+    # The type of queue this worker works on.
+    schedule_type: api_requests.ScheduleType
+    # The least number of requests that this worker can run in parallel.
+    garanteed_parallelism: int
+    # The extra number of requests that this worker can run in parallel
+    # if there are available CPU/memory resources.
+    burstable_parallelism: int = 0
+
+    def __init__(self,
+                 schedule_type: api_requests.ScheduleType,
+                 garanteed_parallelism: int,
+                 burstable_parallelism: int = 0) -> None:
+        self.schedule_type = schedule_type
+        self.garanteed_parallelism = garanteed_parallelism
+        self.burstable_parallelism = burstable_parallelism
+
+    def __str__(self) -> str:
+        return f'Worker(schedule_type={self.schedule_type.value})'
+
+    def process_request(self, executor: process.BurstableExecutor,
+                        queue: RequestQueue) -> None:
+        try:
+            request_element = queue.get()
+            if request_element is None:
+                time.sleep(0.1)
+                return
+            request_id, ignore_return_value = request_element
+            request = api_requests.get_request(request_id)
+            assert request is not None, f'Request with ID {request_id} is None'
+            if request.status == api_requests.RequestStatus.CANCELLED:
+                return
+            logger.info(f'[{self}] Submitting request: {request_id}')
+            # Start additional process to run the request, so that it can be
+            # cancelled when requested by a user.
+            # TODO(zhwu): since the executor is reusing the request process,
+            # multiple requests can share the same process pid, which may cause
+            # issues with SkyPilot core functions if they rely on the exit of
+            # the process, such as subprocess_daemon.py.
+            executor.submit_until_success(_request_execution_wrapper,
+                                          request_id, ignore_return_value)
+
+            logger.info(f'[{self}] Submitted request: {request_id}')
+        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+            # Catch any other exceptions to avoid crashing the worker process.
+            logger.error(
+                f'[{self}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
+    def run(self) -> None:
+        # Handle the SIGTERM signal to abort the executor process gracefully.
+        proc_group = f'{self.schedule_type.value}'
+        if threading.current_thread() is threading.main_thread():
+            signal.signal(signal.SIGTERM, _sigterm_handler)
+        setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
+        queue = _get_queue(self.schedule_type)
+
+        # Use concurrent.futures.ProcessPoolExecutor instead of
+        # multiprocessing.Pool because the former is more efficient with the
+        # support of lazy creation of worker processes.
+        # We use executor instead of individual multiprocessing.Process to avoid
+        # the overhead of forking a new process for each request, which can be
+        # about 1s delay.
+        try:
+            executor = process.BurstableExecutor(
+                garanteed_workers=self.garanteed_parallelism,
+                burst_workers=self.burstable_parallelism,
+                initializer=executor_initializer,
+                initargs=(proc_group,))
+            while True:
+                self.process_request(executor, queue)
+        # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
+        except KeyboardInterrupt:
+            pass
+        finally:
+            # In most cases, here we receive either ctrl-c in foreground
+            # execution or SIGTERM on server exiting. Gracefully exit the
+            # worker process and the executor.
+            # TODO(aylei): worker may also be killed by system daemons like
+            # OOM killer, crash the API server or recreate the worker process
+            # to avoid broken state in such cases.
+            logger.info(f'[{self}] Worker process interrupted')
+            executor.shutdown()
+
+
 @annotations.lru_cache(scope='global', maxsize=None)
 def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
     return RequestQueue(schedule_type, backend=queue_backend)
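
To make the new worker lifecycle concrete, a hedged sketch of wiring up a RequestWorker the way start() does further below (the parallelism numbers here are invented for illustration):

    import threading

    from sky.server.requests import requests as api_requests
    from sky.server.requests.executor import RequestWorker

    # Guarantee 4 concurrent long requests; burst up to 16 more when
    # CPU/memory allow (illustrative numbers only).
    worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
                           garanteed_parallelism=4,
                           burstable_parallelism=16)
    # run() polls the queue forever, so dispatch it to a daemon thread.
    threading.Thread(target=worker.run, daemon=True).start()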
@@ -349,110 +451,77 @@ def schedule_request(
     enqueue()
 
 
-def executor_initializer(proc_group: str):
-    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
-                              f'{multiprocessing.current_process().pid}')
-
-
-def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
-    """Worker for the requests.
-
-    Args:
-        max_parallel_size: Maximum number of parallel jobs this worker can run.
-    """
-    # Handle the SIGTERM signal to abort the executor process gracefully.
-    signal.signal(signal.SIGTERM, _sigterm_handler)
-    proc_group = f'{worker.schedule_type.value}-{worker.id}'
-    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
-    queue = _get_queue(worker.schedule_type)
-
-    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
-        try:
-            request_element = queue.get()
-            if request_element is None:
-                time.sleep(0.1)
-                return
-            request_id, ignore_return_value = request_element
-            request = api_requests.get_request(request_id)
-            assert request is not None, f'Request with ID {request_id} is None'
-            if request.status == api_requests.RequestStatus.CANCELLED:
-                return
-            logger.info(f'[{worker}] Submitting request: {request_id}')
-            # Start additional process to run the request, so that it can be
-            # cancelled when requested by a user.
-            # TODO(zhwu): since the executor is reusing the request process,
-            # multiple requests can share the same process pid, which may cause
-            # issues with SkyPilot core functions if they rely on the exit of
-            # the process, such as subprocess_daemon.py.
-            future = executor.submit(_request_execution_wrapper, request_id,
-                                     ignore_return_value)
-
-            if worker.schedule_type == api_requests.ScheduleType.LONG:
-                try:
-                    future.result(timeout=None)
-                except Exception as e:  # pylint: disable=broad-except
-                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
-                logger.info(f'[{worker}] Finished request: {request_id}')
-            else:
-                logger.info(f'[{worker}] Submitted request: {request_id}')
-        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            # Catch any other exceptions to avoid crashing the worker process.
-            logger.error(
-                f'[{worker}] Error processing request: '
-                f'{request_id if "request_id" in locals() else ""} '
-                f'{common_utils.format_exception(e, use_bracket=True)}')
-
-    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
-    # because the former is more efficient with the support of lazy creation of
-    # worker processes.
-    # We use executor instead of individual multiprocessing.Process to avoid
-    # the overhead of forking a new process for each request, which can be about
-    # 1s delay.
-    try:
-        executor = concurrent.futures.ProcessPoolExecutor(
-            max_workers=max_parallel_size,
-            initializer=executor_initializer,
-            initargs=(proc_group,))
-        while True:
-            process_request(executor)
-    # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
-    except KeyboardInterrupt:
-        pass
-    finally:
-        # In most cases, here we receive either ctrl-c in foreground execution
-        # or SIGTERM on server exiting. Gracefully exit the worker process and
-        # the executor.
-        # TODO(aylei): worker may also be killed by system daemons like OOM
-        # killer, crash the API server or recreate the worker process to avoid
-        # broken state in such cases.
-        logger.info(f'[{worker}] Worker process interrupted')
-        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
-        # Shutdown the executor so that executor process can exit once the
-        # running task is finished or interrupted.
-        executor.shutdown(wait=False)
-        # Proactively interrupt the running task to avoid indefinite waiting.
-        subprocess_utils.run_in_parallel(
-            subprocess_utils.kill_process_with_grace_period,
-            executor_processes,
-            num_threads=len(executor_processes))
-
-
 def start(deploy: bool) -> List[multiprocessing.Process]:
-    """Start the request workers."""
+    """Start the request workers.
+
+    Request workers run in background, schedule the requests and delegate the
+    request execution to executor processes. We have different assumptions for
+    the resources in different deployment modes, which leads to different
+    worker setups:
+
+    - Deployment mode (deploy=True), we assume the resources are dedicated to
+      the API server and the resources will be tuned for serious use cases, so:
+      - Use multiprocessing queue backend and dedicated workers processes to
+        avoid GIL contention.
+      - Parallelism (number of executor processes) is fixed and executor
+        processes have same lifecycle with the server, which ensures
+        best-effort cache reusing and stable resources consumption.
+      - Reject to start in low resource environments, to avoid flaky
+        deployments.
+    - Local mode (deploy=False), we assume the server is running in a shared
+      environment (e.g. laptop) and users typically do not pay attention to
+      the resource setup of the server. Moreover, existing users may expect
+      some consistent behaviors with old versions, i.e. before API server was
+      introduced, so:
+      - The max number of long-running executor processes are limited, to avoid
+        high memory consumption when the server is idle.
+      - Allow burstable workers to handle requests when all long-running
+        workers are busy, which mimics the behavior of local sky CLI before
+        API server was introduced.
+      - Works in low resources environments, and further reduce the memory
+        consumption in low resource environments.
+
+    Note that there is still significant overhead for SDK users when migrate to
+    local API server. Since the users are free to run sky operations in Threads
+    when using SDK but all client operations will occupy at least one worker
+    process after API server was introduced.
+    """
     # Determine the job capacity of the workers based on the system resources.
     cpu_count = common_utils.get_cpu_count()
     mem_size_gb = common_utils.get_mem_size_gb()
     mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # Runs in low resource mode if the available memory is less than
+    # server_constants.MIN_AVAIL_MEM_GB.
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)
     max_parallel_for_short = _max_short_worker_parallism(
         mem_size_gb, max_parallel_for_long)
-    logger.info(
-        f'SkyPilot API server will start {max_parallel_for_long} workers for '
-        f'long requests and will allow at max '
-        f'{max_parallel_for_short} short requests in parallel.')
-
+    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # Permanent worker process may have significant memory consumption
+        # (~350MB per worker) after running commands like `sky check`, so we
+        # don't start any permanent workers in low resource local mode. This
+        # mimics the behavior of local sky CLI before API server was
+        # introduced, where the CLI will start new process everytime and
+        # never reject to start due to resource constraints.
+        # Note that the refresh daemon will still occupy one worker
+        # permanently because it never exits.
+        max_parallel_for_long = 0
+        max_parallel_for_short = 0
+        logger.warning(
+            'SkyPilot API server will run in low resource mode because '
+            'the available memory is less than '
+            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    else:
+        logger.info(
+            f'SkyPilot API server will start {max_parallel_for_long} workers '
+            f'for long requests and will allow at max '
+            f'{max_parallel_for_short} short requests in parallel.')
+    if not deploy:
+        # For local mode, use local queue backend since we only run 1 uvicorn
+        # worker in local mode.
+        global queue_backend
+        queue_backend = QueueBackend.LOCAL
     sub_procs = []
     # Setup the queues.
     if queue_backend == QueueBackend.MULTIPROCESSING:
@@ -471,28 +540,37 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
         sub_procs.append(queue_server)
-        mp_queue.wait_for_queues_to_be_ready(queue_names, queue_server, port)
+        mp_queue.wait_for_queues_to_be_ready(queue_names,
+                                             queue_server,
+                                             port=port)
+    elif queue_backend == QueueBackend.LOCAL:
+        # No setup is needed for local queue backend.
+        pass
+    else:
+        # Should be checked earlier, but just in case.
+        raise RuntimeError(f'Invalid queue backend: {queue_backend}')
 
     logger.info('Request queues created')
 
-    long_workers = []
-    for worker_id in range(max_parallel_for_long):
-        worker = RequestWorker(id=worker_id,
-                               schedule_type=api_requests.ScheduleType.LONG)
-        worker_proc = multiprocessing.Process(target=request_worker,
-                                              args=(worker, 1))
-        long_workers.append(worker_proc)
-        sub_procs.append(worker_proc)
-    threading.Thread(target=subprocess_utils.slow_start_processes,
-                     args=(long_workers,),
-                     daemon=True).start()
+    def run_worker_in_background(worker: RequestWorker):
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=worker.run, daemon=True)
+        thread.start()
+
+    burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
+    # Start a worker for long requests.
+    long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
+                                garanteed_parallelism=max_parallel_for_long,
+                                burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(long_worker)
 
     # Start a worker for short requests.
-    worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
-    worker_proc = multiprocessing.Process(target=request_worker,
-                                          args=(worker, max_parallel_for_short))
-    worker_proc.start()
-    sub_procs.append(worker_proc)
+    short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
+                                 garanteed_parallelism=max_parallel_for_short,
+                                 burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(short_worker)
     return sub_procs
 
 
sky/server/requests/process.py ADDED
@@ -0,0 +1,212 @@
+"""ProcessPoolExecutor with additional supports for skypilot."""
+import concurrent.futures
+import logging
+import multiprocessing
+import threading
+import time
+from typing import Callable, Dict, Optional, Tuple
+
+from sky.utils import atomic
+from sky.utils import subprocess_utils
+
+logger = logging.getLogger(__name__)
+
+
+class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
+    """A custom ProcessPoolExecutor with additional supports for skypilot.
+
+    The additional supports include:
+    1. Disposable workers: support control whether the worker process should
+       exit after complete a task.
+    2. Idle check: support check if there are any idle workers.
+    3. Proactive shutdown: SIGTERM worker processes when the executor is
+       shutting down instead of indefinitely waiting.
+    """
+
+    def __init__(self, max_workers: int, **kwargs):
+        super().__init__(max_workers=max_workers, **kwargs)
+        self.max_workers: int = max_workers
+        # The number of workers that are handling tasks, atomicity across
+        # multiple threads is sufficient since the idleness check is
+        # best-effort and does not affect the correctness.
+        # E.g. the following case is totally fine:
+        # 1. Thread 1 checks running == max_workers
+        # 2. Thread 2 decrements running
+        # 3. Thread 1 schedules the task to other pool even if the pool is
+        #    currently idle.
+        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
+
+    def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
+        """Submit a task for execution.
+
+        If reuse_worker is False, wraps the function to exit after completion.
+        """
+        self.running.increment()
+        future = super().submit(fn, *args, **kwargs)
+        future.add_done_callback(lambda _: self.running.decrement())
+        return future
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        return self.running.get() < self.max_workers
+
+    def shutdown(self, wait: bool = True) -> None:
+        """Shutdown the executor."""
+        # Here wait means wait for the proactive cancellation complete.
+        # TODO(aylei): we may support wait=True in the future if needed.
+        assert wait is True, 'wait=False is not supported'
+        executor_processes = list(self._processes.values())
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        super().shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
+
+
+# Define the worker function outside of the class to avoid pickling self
+def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
+                       args, kwargs):
+    try:
+        if initializer is not None:
+            initializer(*initargs)
+        fn(*args, **kwargs)
+    except BaseException as e:  # pylint: disable=broad-except
+        return e
+
+
+class DisposableExecutor:
+    """A simple wrapper that creates a new process for each task.
+
+    This is a workaround for Python 3.10 since `max_tasks_per_child` of
+    ProcessPoolExecutor was introduced in 3.11. There is no way to control
+    the worker lifetime in 3.10.
+    Ref: https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor # pylint: disable=line-too-long
+    TODO(aylei): use the official `max_tasks_per_child` when upgrade to 3.11
+    """
+
+    def __init__(self,
+                 max_workers: Optional[int] = None,
+                 initializer: Optional[Callable] = None,
+                 initargs: Tuple = ()):
+        self.max_workers: Optional[int] = max_workers
+        self.workers: Dict[int, multiprocessing.Process] = {}
+        self._shutdown: bool = False
+        self._lock: threading.Lock = threading.Lock()
+        self._initializer: Optional[Callable] = initializer
+        self._initargs: Tuple = initargs
+
+    def _monitor_worker(self, process: multiprocessing.Process) -> None:
+        """Monitor the worker process and cleanup when it's done."""
+        process.join()
+        if process.pid:
+            with self._lock:
+                if process.pid in self.workers:
+                    del self.workers[process.pid]
+
+    # Submit is not compatible with ProcessPoolExecutor because we does not
+    # bother to return a Future. Can be improved if needed.
+    def submit(self, fn, *args, **kwargs) -> bool:
+        """Submit a task for execution."""
+        if self._shutdown:
+            return False
+        with self._lock:
+            if (self.max_workers is not None and
+                    len(self.workers) >= self.max_workers):
+                return False
+
+        process = multiprocessing.Process(target=_disposable_worker,
+                                          args=(fn, self._initializer,
+                                                self._initargs, args, kwargs))
+        process.start()
+
+        with self._lock:
+            pid = process.pid or 0
+            if pid == 0:
+                raise RuntimeError('Failed to start process')
+            self.workers[pid] = process
+
+        # Start monitor thread to cleanup the worker process when it's done.
+        monitor_thread = threading.Thread(target=self._monitor_worker,
+                                          args=(process,),
+                                          daemon=True)
+        monitor_thread.start()
+
+        return True
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        if self.max_workers is None:
+            return True
+        with self._lock:
+            return len(self.workers) < self.max_workers
+
+    def shutdown(self):
+        """Shutdown the executor."""
+        with self._lock:
+            self._shutdown = True
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            list(self.workers.values()),  # Convert dict values to list
+            num_threads=len(self.workers))
+
+
+class BurstableExecutor:
+    """An multiprocessing executor that supports bursting worker processes."""
+
+    # _executor is a PoolExecutor that is used to run guaranteed requests.
+    _executor: Optional[PoolExecutor] = None
+    # _burst_executor is a ProcessPoolExecutor that is used to run burst
+    # requests.
+    _burst_executor: Optional[DisposableExecutor] = None
+
+    def __init__(self,
+                 garanteed_workers: int,
+                 burst_workers: int = 0,
+                 **kwargs):
+        if garanteed_workers > 0:
+            self._executor = PoolExecutor(max_workers=garanteed_workers,
+                                          **kwargs)
+        if burst_workers > 0:
+            self._burst_executor = DisposableExecutor(max_workers=burst_workers,
+                                                      **kwargs)
+
+    def submit_until_success(self, fn, *args, **kwargs):
+        """Submit a task for execution until success.
+
+        Prioritizes submitting to the guaranteed pool. If no idle workers
+        are available in the guaranteed pool, it will submit to the burst
+        pool.
+        TODO(aylei): this is coupled with executor.RequestWorker since we
+        know the worker is dedicated to request scheduling and it either
+        blocks on request polling or request submitting. So it is no harm
+        to make submit blocking here. But for general cases, we need an
+        internal queue to decouple submit and run.
+        """
+
+        while True:
+            if self._executor is not None and self._executor.has_idle_workers():
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            if (self._burst_executor is not None and
+                    self._burst_executor.has_idle_workers()):
+                self._burst_executor.submit(fn, *args, **kwargs)
+                break
+            if self._executor is not None:
+                # No idle workers in either pool, still queue the request
+                # to the guaranteed pool to keep behavior consistent.
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            logger.debug('No guaranteed pool set and the burst pool is full, '
+                         'retry later.')
+            time.sleep(0.1)
+
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+
+        if self._burst_executor is not None:
+            self._burst_executor.shutdown()
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
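
A hedged usage sketch of the new module (the task function and worker counts are invented; submitted functions must be module-level so they can be pickled for the guaranteed ProcessPoolExecutor pool):

    import time

    from sky.server.requests import process


    def _task(name: str) -> None:
        # Stand-in for _request_execution_wrapper in the executor.
        print(f'running {name}')
        time.sleep(1)


    if __name__ == '__main__':
        executor = process.BurstableExecutor(garanteed_workers=2,
                                             burst_workers=4)
        # Blocks until some pool accepts the task: the guaranteed pool
        # first, then a disposable burst worker, then the guaranteed
        # pool's internal queue as a last resort.
        for i in range(10):
            executor.submit_until_success(_task, f'task-{i}')
        executor.shutdown()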
sky/server/requests/queues/local_queue.py ADDED
@@ -0,0 +1,16 @@
+"""Process-local queue implementation."""
+import queue
+import threading
+from typing import Dict
+
+# Global dict to store queues
+_queues: Dict[str, queue.Queue] = {}
+_lock = threading.Lock()
+
+
+def get_queue(queue_name: str) -> queue.Queue:
+    """Get or create a queue by name."""
+    with _lock:
+        if queue_name not in _queues:
+            _queues[queue_name] = queue.Queue()
+        return _queues[queue_name]
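
Because the queues live in module-level state guarded by a lock, every caller in the same process gets the same queue object for a given name; a quick sketch (the queue name below is arbitrary):

    from sky.server.requests.queues import local_queue

    q1 = local_queue.get_queue('short')
    q2 = local_queue.get_queue('short')
    assert q1 is q2  # same name, same process -> same queue.Queue
    q1.put(('req-abc', True))
    print(q2.get())  # ('req-abc', True)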
sky/setup_files/dependencies.py CHANGED
@@ -9,7 +9,7 @@ import sys
 from typing import Dict, List
 
 install_requires = [
-    'wheel',
+    'wheel<0.46.0',  # https://github.com/skypilot-org/skypilot/issues/5153
     'cachetools',
     # NOTE: ray requires click>=7.0.
     'click >= 7.0',
sky/skylet/log_lib.py CHANGED
@@ -149,6 +149,7 @@ def run_with_log(
     process_stream: bool = True,
     line_processor: Optional[log_utils.LineProcessor] = None,
     streaming_prefix: Optional[str] = None,
+    log_cmd: bool = False,
     **kwargs,
 ) -> Union[int, Tuple[int, str, str]]:
     """Runs a command and logs its output to a file.
@@ -182,6 +183,9 @@ def run_with_log(
     # the terminal output when typing in the terminal that starts the API
     # server.
     stdin = kwargs.pop('stdin', subprocess.DEVNULL)
+    if log_cmd:
+        with open(log_path, 'a', encoding='utf-8') as f:
+            print(f'Running command: {cmd}', file=f)
     with subprocess.Popen(cmd,
                           stdout=stdout_arg,
                           stderr=stderr_arg,
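
Finally, a hedged sketch of the new log_cmd flag, assuming run_with_log's leading positional parameters are the command and the log path (as the hunk's use of cmd and log_path suggests):

    from sky.skylet import log_lib

    returncode = log_lib.run_with_log(['echo', 'hello'],
                                      '/tmp/echo.log',
                                      log_cmd=True)
    # With log_cmd=True the log file now begins with:
    #   Running command: ['echo', 'hello']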