skypilot-nightly 1.0.0.dev20250407__py3-none-any.whl → 1.0.0.dev20250408__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'c5039370280815a3f347e76622dc154ede36d6c3'
+_SKYPILOT_COMMIT_SHA = 'e0674be528e87191ade88961c44c6449d01232fa'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250407'
+__version__ = '1.0.0.dev20250408'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -655,12 +655,9 @@ class RayCodeGen:
             rclone_flush_script = {rclone_flush_script!r}
         if run_fn is not None:
             script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-            if script is not None:
-                script += rclone_flush_script
-            else:
-                script = rclone_flush_script
 
         if script is not None:
+            script += rclone_flush_script
             sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
             # Backward compatibility: Environment starting with `SKY_` is
             # deprecated. Remove it in v0.9.0.
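
Note: this hunk slightly changes behavior. Previously, when `run_fn` returned `None`, `script` was set to the flush script alone and the flush still ran; after the change, the flush script is only appended when a script already exists. A standalone sketch of the two control flows (the string values are stand-ins, not the generated Ray program):

```python
# Standalone sketch of the control-flow change above; values are stand-ins.
RCLONE_FLUSH = 'rclone_flush'

def old_behavior(script):
    # Before: the flush script ran even when run_fn produced no script.
    if script is not None:
        script += RCLONE_FLUSH
    else:
        script = RCLONE_FLUSH
    return script

def new_behavior(script):
    # After: the flush script is appended only when a script exists, so a
    # None script now skips the flush entirely.
    if script is not None:
        script += RCLONE_FLUSH
    return script

assert old_behavior(None) == RCLONE_FLUSH
assert new_behavior(None) is None
assert old_behavior('run;') == new_behavior('run;') == 'run;' + RCLONE_FLUSH
```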
sky/data/storage_utils.py CHANGED
@@ -227,6 +227,9 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
     expand_src_dir_path = os.path.expanduser(src_dir_path)
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
+    # Fail fast if the source is a file.
+    if os.path.isfile(expand_src_dir_path):
+        raise ValueError(f'{src_dir_path} is a file, not a directory.')
     if os.path.exists(skyignore_path):
         logger.debug(f' {colorama.Style.DIM}'
                      f'Excluded files to sync to cluster based on '
@@ -267,11 +270,15 @@ def zip_files_and_folders(items: List[str],
             item = os.path.expanduser(item)
             if not os.path.isfile(item) and not os.path.isdir(item):
                 raise ValueError(f'{item} does not exist.')
-            excluded_files = set(
-                [os.path.join(item, f) for f in get_excluded_files(item)])
-            if os.path.isfile(item) and item not in excluded_files:
+            if os.path.isfile(item):
+                # Add the file to the zip archive even if it matches
+                # patterns in dot ignore files, as it was explicitly
+                # specified by user.
                 zipf.write(item)
             elif os.path.isdir(item):
+                excluded_files = set([
+                    os.path.join(item, f) for f in get_excluded_files(item)
+                ])
                 for root, dirs, files in os.walk(item, followlinks=False):
                     # Modify dirs in-place to control os.walk()'s traversal
                     # behavior. This filters out excluded directories BEFORE
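
Note: the rewrite above moves the exclusion computation under the directory branch and always archives explicitly listed files. A minimal sketch of the resulting semantics, using a throwaway directory so it is self-contained (assumes an environment where these SkyPilot modules are importable):

```python
import os
import tempfile

from sky.data import storage_utils

tmp = tempfile.mkdtemp()
file_path = os.path.join(tmp, 'explicit.log')
with open(file_path, 'w', encoding='utf-8') as f:
    f.write('archived even if an ignore pattern matches it\n')

# Directories: exclusions are computed once, then honored during the walk.
excluded = storage_utils.get_excluded_files(tmp)

# Files: get_excluded_files() now fails fast instead of walking a
# non-directory.
try:
    storage_utils.get_excluded_files(file_path)
except ValueError as e:
    print(e)  # '<path> is a file, not a directory.'
```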
sky/server/requests/executor.py CHANGED
@@ -18,9 +18,7 @@ The number of the workers is determined by the system resources.
 
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
-import concurrent.futures
 import contextlib
-import dataclasses
 import enum
 import multiprocessing
 import os
@@ -42,7 +40,9 @@ from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import process
 from sky.server.requests import requests as api_requests
+from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
@@ -101,22 +101,23 @@ _MIN_LONG_WORKERS = 1
 # workers so at least 2 workers are needed to ensure responsiveness.
 _MIN_SHORT_WORKERS = 2
 
+# Default number of burstable workers for local API server. A heuristic number
+# that is large enough for most local cases.
+# TODO(aylei): the number of burstable workers should be auto-tuned based on the
+# system usage stats.
+_BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
 
 class QueueBackend(enum.Enum):
+    # Local queue backend serves queues in each process locally, which has
+    # lower resource usage but the consumer must be in the same process, i.e.
+    # this only works in single-process mode.
+    LOCAL = 'local'
+    # Multi-process queue backend starts a dedicated process for serving queues.
     MULTIPROCESSING = 'multiprocessing'
     # TODO(zhwu): we can add redis backend in the future.
 
 
-@dataclasses.dataclass
-class RequestWorker:
-    id: int
-    # The type of queue this worker works on.
-    schedule_type: api_requests.ScheduleType
-
-    def __str__(self) -> str:
-        return f'Worker(id={self.id}, schedule_type={self.schedule_type.value})'
-
-
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
 
@@ -128,9 +129,12 @@ class RequestQueue:
                  backend: Optional[QueueBackend] = None) -> None:
         self.name = schedule_type.value
         self.backend = backend
-        assert (backend is None or
-                backend == QueueBackend.MULTIPROCESSING), backend
-        self.queue = mp_queue.get_queue(self.name)
+        if backend == QueueBackend.MULTIPROCESSING:
+            self.queue = mp_queue.get_queue(self.name)
+        elif backend == QueueBackend.LOCAL:
+            self.queue = local_queue.get_queue(self.name)
+        else:
+            raise RuntimeError(f'Invalid queue backend: {backend}')
 
     def put(self, request: Tuple[str, bool]) -> None:
         """Put and request to the queue.
@@ -161,6 +165,104 @@ class RequestQueue:
 queue_backend = QueueBackend.MULTIPROCESSING
 
 
+def executor_initializer(proc_group: str):
+    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
+                              f'{multiprocessing.current_process().pid}')
+
+
+class RequestWorker:
+    """A worker that polls requests from the queue and runs them.
+
+    The worker can run at least `garanteed_parallelism` requests in parallel.
+    If there are more resources available, it can spin up extra workers up to
+    `garanteed_parallelism + burstable_parallelism`.
+    """
+
+    # The type of queue this worker works on.
+    schedule_type: api_requests.ScheduleType
+    # The least number of requests that this worker can run in parallel.
+    garanteed_parallelism: int
+    # The extra number of requests that this worker can run in parallel
+    # if there are available CPU/memory resources.
+    burstable_parallelism: int = 0
+
+    def __init__(self,
+                 schedule_type: api_requests.ScheduleType,
+                 garanteed_parallelism: int,
+                 burstable_parallelism: int = 0) -> None:
+        self.schedule_type = schedule_type
+        self.garanteed_parallelism = garanteed_parallelism
+        self.burstable_parallelism = burstable_parallelism
+
+    def __str__(self) -> str:
+        return f'Worker(schedule_type={self.schedule_type.value})'
+
+    def process_request(self, executor: process.BurstableExecutor,
+                        queue: RequestQueue) -> None:
+        try:
+            request_element = queue.get()
+            if request_element is None:
+                time.sleep(0.1)
+                return
+            request_id, ignore_return_value = request_element
+            request = api_requests.get_request(request_id)
+            assert request is not None, f'Request with ID {request_id} is None'
+            if request.status == api_requests.RequestStatus.CANCELLED:
+                return
+            logger.info(f'[{self}] Submitting request: {request_id}')
+            # Start additional process to run the request, so that it can be
+            # cancelled when requested by a user.
+            # TODO(zhwu): since the executor is reusing the request process,
+            # multiple requests can share the same process pid, which may cause
+            # issues with SkyPilot core functions if they rely on the exit of
+            # the process, such as subprocess_daemon.py.
+            executor.submit_until_success(_request_execution_wrapper,
+                                          request_id, ignore_return_value)
+
+            logger.info(f'[{self}] Submitted request: {request_id}')
+        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+            # Catch any other exceptions to avoid crashing the worker process.
+            logger.error(
+                f'[{self}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
+    def run(self) -> None:
+        # Handle the SIGTERM signal to abort the executor process gracefully.
+        proc_group = f'{self.schedule_type.value}'
+        if threading.current_thread() is threading.main_thread():
+            signal.signal(signal.SIGTERM, _sigterm_handler)
+        setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
+        queue = _get_queue(self.schedule_type)
+
+        # Use concurrent.futures.ProcessPoolExecutor instead of
+        # multiprocessing.Pool because the former is more efficient with the
+        # support of lazy creation of worker processes.
+        # We use executor instead of individual multiprocessing.Process to avoid
+        # the overhead of forking a new process for each request, which can be
+        # about 1s delay.
+        try:
+            executor = process.BurstableExecutor(
+                garanteed_workers=self.garanteed_parallelism,
+                burst_workers=self.burstable_parallelism,
+                initializer=executor_initializer,
+                initargs=(proc_group,))
+            while True:
+                self.process_request(executor, queue)
+        # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
+        except KeyboardInterrupt:
+            pass
+        finally:
+            # In most cases, here we receive either ctrl-c in foreground
+            # execution or SIGTERM on server exiting. Gracefully exit the
+            # worker process and the executor.
+            # TODO(aylei): worker may also be killed by system daemons like
+            # OOM killer, crash the API server or recreate the worker process
+            # to avoid broken state in such cases.
+            logger.info(f'[{self}] Worker process interrupted')
+            executor.shutdown()
+
+
 @annotations.lru_cache(scope='global', maxsize=None)
 def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
     return RequestQueue(schedule_type, backend=queue_backend)
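
Note: `RequestWorker.run()` above combines a main-thread-only SIGTERM hook, a polling loop with a 0.1s idle backoff, and a `finally` block that tears the executor down. A condensed, standalone sketch of the same pattern (the queue and handler are hypothetical stand-ins, not SkyPilot APIs):

```python
import queue
import signal
import threading
import time

work_queue: 'queue.Queue[str]' = queue.Queue()

def _sigterm_to_interrupt(_signum, _frame):
    # Route SIGTERM into the same exit path as ctrl-c.
    raise KeyboardInterrupt

def run_worker(handle) -> None:
    # signal.signal() is only legal on the main thread, hence the guard
    # mirrored from RequestWorker.run().
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, _sigterm_to_interrupt)
    try:
        while True:
            try:
                item = work_queue.get_nowait()
            except queue.Empty:
                time.sleep(0.1)  # Idle backoff, as in the real worker.
                continue
            handle(item)
    except KeyboardInterrupt:
        pass  # Graceful exit; the real worker also shuts down its executor.
```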
@@ -349,110 +451,77 @@ def schedule_request(
     enqueue()
 
 
-def executor_initializer(proc_group: str):
-    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
-                              f'{multiprocessing.current_process().pid}')
-
-
-def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
-    """Worker for the requests.
-
-    Args:
-        max_parallel_size: Maximum number of parallel jobs this worker can run.
-    """
-    # Handle the SIGTERM signal to abort the executor process gracefully.
-    signal.signal(signal.SIGTERM, _sigterm_handler)
-    proc_group = f'{worker.schedule_type.value}-{worker.id}'
-    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
-    queue = _get_queue(worker.schedule_type)
-
-    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
-        try:
-            request_element = queue.get()
-            if request_element is None:
-                time.sleep(0.1)
-                return
-            request_id, ignore_return_value = request_element
-            request = api_requests.get_request(request_id)
-            assert request is not None, f'Request with ID {request_id} is None'
-            if request.status == api_requests.RequestStatus.CANCELLED:
-                return
-            logger.info(f'[{worker}] Submitting request: {request_id}')
-            # Start additional process to run the request, so that it can be
-            # cancelled when requested by a user.
-            # TODO(zhwu): since the executor is reusing the request process,
-            # multiple requests can share the same process pid, which may cause
-            # issues with SkyPilot core functions if they rely on the exit of
-            # the process, such as subprocess_daemon.py.
-            future = executor.submit(_request_execution_wrapper, request_id,
-                                     ignore_return_value)
-
-            if worker.schedule_type == api_requests.ScheduleType.LONG:
-                try:
-                    future.result(timeout=None)
-                except Exception as e:  # pylint: disable=broad-except
-                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
-                logger.info(f'[{worker}] Finished request: {request_id}')
-            else:
-                logger.info(f'[{worker}] Submitted request: {request_id}')
-        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            # Catch any other exceptions to avoid crashing the worker process.
-            logger.error(
-                f'[{worker}] Error processing request: '
-                f'{request_id if "request_id" in locals() else ""} '
-                f'{common_utils.format_exception(e, use_bracket=True)}')
-
-    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
-    # because the former is more efficient with the support of lazy creation of
-    # worker processes.
-    # We use executor instead of individual multiprocessing.Process to avoid
-    # the overhead of forking a new process for each request, which can be about
-    # 1s delay.
-    try:
-        executor = concurrent.futures.ProcessPoolExecutor(
-            max_workers=max_parallel_size,
-            initializer=executor_initializer,
-            initargs=(proc_group,))
-        while True:
-            process_request(executor)
-    # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
-    except KeyboardInterrupt:
-        pass
-    finally:
-        # In most cases, here we receive either ctrl-c in foreground execution
-        # or SIGTERM on server exiting. Gracefully exit the worker process and
-        # the executor.
-        # TODO(aylei): worker may also be killed by system daemons like OOM
-        # killer, crash the API server or recreate the worker process to avoid
-        # broken state in such cases.
-        logger.info(f'[{worker}] Worker process interrupted')
-        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
-        # Shutdown the executor so that executor process can exit once the
-        # running task is finished or interrupted.
-        executor.shutdown(wait=False)
-        # Proactively interrupt the running task to avoid indefinite waiting.
-        subprocess_utils.run_in_parallel(
-            subprocess_utils.kill_process_with_grace_period,
-            executor_processes,
-            num_threads=len(executor_processes))
-
-
 def start(deploy: bool) -> List[multiprocessing.Process]:
-    """Start the request workers."""
+    """Start the request workers.
+
+    Request workers run in background, schedule the requests and delegate the
+    request execution to executor processes. We have different assumptions for
+    the resources in different deployment modes, which leads to different
+    worker setups:
+
+    - Deployment mode (deploy=True), we assume the resources are dedicated to
+      the API server and the resources will be tuned for serious use cases, so:
+      - Use multiprocessing queue backend and dedicated workers processes to
+        avoid GIL contention.
+      - Parallelism (number of executor processes) is fixed and executor
+        processes have same lifecycle with the server, which ensures
+        best-effort cache reusing and stable resources consumption.
+      - Reject to start in low resource environments, to avoid flaky
+        deployments.
+    - Local mode (deploy=False), we assume the server is running in a shared
+      environment (e.g. laptop) and users typically do not pay attention to
+      the resource setup of the server. Moreover, existing users may expect
+      some consistent behaviors with old versions, i.e. before API server was
+      introduced, so:
+      - The max number of long-running executor processes are limited, to avoid
+        high memory consumption when the server is idle.
+      - Allow burstable workers to handle requests when all long-running
+        workers are busy, which mimics the behavior of local sky CLI before
+        API server was introduced.
+      - Works in low resources environments, and further reduce the memory
+        consumption in low resource environments.
+
+    Note that there is still significant overhead for SDK users when migrate to
+    local API server. Since the users are free to run sky operations in Threads
+    when using SDK but all client operations will occupy at least one worker
+    process after API server was introduced.
+    """
     # Determine the job capacity of the workers based on the system resources.
     cpu_count = common_utils.get_cpu_count()
     mem_size_gb = common_utils.get_mem_size_gb()
     mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # Runs in low resource mode if the available memory is less than
+    # server_constants.MIN_AVAIL_MEM_GB.
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)
     max_parallel_for_short = _max_short_worker_parallism(
         mem_size_gb, max_parallel_for_long)
-    logger.info(
-        f'SkyPilot API server will start {max_parallel_for_long} workers for '
-        f'long requests and will allow at max '
-        f'{max_parallel_for_short} short requests in parallel.')
-
+    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # Permanent worker process may have significant memory consumption
+        # (~350MB per worker) after running commands like `sky check`, so we
+        # don't start any permanent workers in low resource local mode. This
+        # mimics the behavior of local sky CLI before API server was
+        # introduced, where the CLI will start new process everytime and
+        # never reject to start due to resource constraints.
+        # Note that the refresh daemon will still occupy one worker
+        # permanently because it never exits.
+        max_parallel_for_long = 0
+        max_parallel_for_short = 0
+        logger.warning(
+            'SkyPilot API server will run in low resource mode because '
+            'the available memory is less than '
+            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    else:
+        logger.info(
+            f'SkyPilot API server will start {max_parallel_for_long} workers '
+            f'for long requests and will allow at max '
+            f'{max_parallel_for_short} short requests in parallel.')
+    if not deploy:
+        # For local mode, use local queue backend since we only run 1 uvicorn
+        # worker in local mode.
+        global queue_backend
+        queue_backend = QueueBackend.LOCAL
     sub_procs = []
     # Setup the queues.
     if queue_backend == QueueBackend.MULTIPROCESSING:
@@ -471,28 +540,37 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
         sub_procs.append(queue_server)
-        mp_queue.wait_for_queues_to_be_ready(queue_names, queue_server, port)
+        mp_queue.wait_for_queues_to_be_ready(queue_names,
+                                             queue_server,
+                                             port=port)
+    elif queue_backend == QueueBackend.LOCAL:
+        # No setup is needed for local queue backend.
+        pass
+    else:
+        # Should be checked earlier, but just in case.
+        raise RuntimeError(f'Invalid queue backend: {queue_backend}')
 
     logger.info('Request queues created')
 
-    long_workers = []
-    for worker_id in range(max_parallel_for_long):
-        worker = RequestWorker(id=worker_id,
-                               schedule_type=api_requests.ScheduleType.LONG)
-        worker_proc = multiprocessing.Process(target=request_worker,
-                                              args=(worker, 1))
-        long_workers.append(worker_proc)
-        sub_procs.append(worker_proc)
-    threading.Thread(target=subprocess_utils.slow_start_processes,
-                     args=(long_workers,),
-                     daemon=True).start()
+    def run_worker_in_background(worker: RequestWorker):
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=worker.run, daemon=True)
+        thread.start()
+
+    burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
+    # Start a worker for long requests.
+    long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
+                                garanteed_parallelism=max_parallel_for_long,
+                                burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(long_worker)
 
     # Start a worker for short requests.
-    worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
-    worker_proc = multiprocessing.Process(target=request_worker,
-                                          args=(worker, max_parallel_for_short))
-    worker_proc.start()
-    sub_procs.append(worker_proc)
+    short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
+                                 garanteed_parallelism=max_parallel_for_short,
+                                 burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(short_worker)
     return sub_procs
 
 
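Note: a usage sketch of the two modes described in the new `start()` docstring (the surrounding uvicorn/server wiring is not part of this diff):

```python
from sky.server.requests import executor

# Local mode: LOCAL queue backend, workers run as daemon threads, and up to
# _BURSTABLE_WORKERS_FOR_LOCAL disposable processes absorb load spikes.
sub_procs = executor.start(deploy=False)

# Deployment mode: multiprocessing queue backend and a fixed pool sized from
# CPU/memory; burstable parallelism is 0.
# sub_procs = executor.start(deploy=True)
```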
sky/server/requests/process.py ADDED
@@ -0,0 +1,212 @@
+"""ProcessPoolExecutor with additional supports for skypilot."""
+import concurrent.futures
+import logging
+import multiprocessing
+import threading
+import time
+from typing import Callable, Dict, Optional, Tuple
+
+from sky.utils import atomic
+from sky.utils import subprocess_utils
+
+logger = logging.getLogger(__name__)
+
+
+class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
+    """A custom ProcessPoolExecutor with additional supports for skypilot.
+
+    The additional supports include:
+    1. Disposable workers: support control whether the worker process should
+       exit after complete a task.
+    2. Idle check: support check if there are any idle workers.
+    3. Proactive shutdown: SIGTERM worker processes when the executor is
+       shutting down instead of indefinitely waiting.
+    """
+
+    def __init__(self, max_workers: int, **kwargs):
+        super().__init__(max_workers=max_workers, **kwargs)
+        self.max_workers: int = max_workers
+        # The number of workers that are handling tasks, atomicity across
+        # multiple threads is sufficient since the idleness check is
+        # best-effort and does not affect the correctness.
+        # E.g. the following case is totally fine:
+        # 1. Thread 1 checks running == max_workers
+        # 2. Thread 2 decrements running
+        # 3. Thread 1 schedules the task to other pool even if the pool is
+        #    currently idle.
+        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
+
+    def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
+        """Submit a task for execution.
+
+        If reuse_worker is False, wraps the function to exit after completion.
+        """
+        self.running.increment()
+        future = super().submit(fn, *args, **kwargs)
+        future.add_done_callback(lambda _: self.running.decrement())
+        return future
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        return self.running.get() < self.max_workers
+
+    def shutdown(self, wait: bool = True) -> None:
+        """Shutdown the executor."""
+        # Here wait means wait for the proactive cancellation complete.
+        # TODO(aylei): we may support wait=True in the future if needed.
+        assert wait is True, 'wait=False is not supported'
+        executor_processes = list(self._processes.values())
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        super().shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
+
+
+# Define the worker function outside of the class to avoid pickling self
+def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
+                       args, kwargs):
+    try:
+        if initializer is not None:
+            initializer(*initargs)
+        fn(*args, **kwargs)
+    except BaseException as e:  # pylint: disable=broad-except
+        return e
+
+
+class DisposableExecutor:
+    """A simple wrapper that creates a new process for each task.
+
+    This is a workaround for Python 3.10 since `max_tasks_per_child` of
+    ProcessPoolExecutor was introduced in 3.11. There is no way to control
+    the worker lifetime in 3.10.
+    Ref: https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor # pylint: disable=line-too-long
+    TODO(aylei): use the official `max_tasks_per_child` when upgrade to 3.11
+    """
+
+    def __init__(self,
+                 max_workers: Optional[int] = None,
+                 initializer: Optional[Callable] = None,
+                 initargs: Tuple = ()):
+        self.max_workers: Optional[int] = max_workers
+        self.workers: Dict[int, multiprocessing.Process] = {}
+        self._shutdown: bool = False
+        self._lock: threading.Lock = threading.Lock()
+        self._initializer: Optional[Callable] = initializer
+        self._initargs: Tuple = initargs
+
+    def _monitor_worker(self, process: multiprocessing.Process) -> None:
+        """Monitor the worker process and cleanup when it's done."""
+        process.join()
+        if process.pid:
+            with self._lock:
+                if process.pid in self.workers:
+                    del self.workers[process.pid]
+
+    # Submit is not compatible with ProcessPoolExecutor because we does not
+    # bother to return a Future. Can be improved if needed.
+    def submit(self, fn, *args, **kwargs) -> bool:
+        """Submit a task for execution."""
+        if self._shutdown:
+            return False
+        with self._lock:
+            if (self.max_workers is not None and
+                    len(self.workers) >= self.max_workers):
+                return False
+
+        process = multiprocessing.Process(target=_disposable_worker,
+                                          args=(fn, self._initializer,
+                                                self._initargs, args, kwargs))
+        process.start()
+
+        with self._lock:
+            pid = process.pid or 0
+            if pid == 0:
+                raise RuntimeError('Failed to start process')
+            self.workers[pid] = process
+
+        # Start monitor thread to cleanup the worker process when it's done.
+        monitor_thread = threading.Thread(target=self._monitor_worker,
+                                          args=(process,),
+                                          daemon=True)
+        monitor_thread.start()
+
+        return True
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        if self.max_workers is None:
+            return True
+        with self._lock:
+            return len(self.workers) < self.max_workers
+
+    def shutdown(self):
+        """Shutdown the executor."""
+        with self._lock:
+            self._shutdown = True
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            list(self.workers.values()),  # Convert dict values to list
+            num_threads=len(self.workers))
+
+
+class BurstableExecutor:
+    """An multiprocessing executor that supports bursting worker processes."""
+
+    # _executor is a PoolExecutor that is used to run guaranteed requests.
+    _executor: Optional[PoolExecutor] = None
+    # _burst_executor is a ProcessPoolExecutor that is used to run burst
+    # requests.
+    _burst_executor: Optional[DisposableExecutor] = None
+
+    def __init__(self,
+                 garanteed_workers: int,
+                 burst_workers: int = 0,
+                 **kwargs):
+        if garanteed_workers > 0:
+            self._executor = PoolExecutor(max_workers=garanteed_workers,
+                                          **kwargs)
+        if burst_workers > 0:
+            self._burst_executor = DisposableExecutor(max_workers=burst_workers,
+                                                      **kwargs)
+
+    def submit_until_success(self, fn, *args, **kwargs):
+        """Submit a task for execution until success.
+
+        Prioritizes submitting to the guaranteed pool. If no idle workers
+        are available in the guaranteed pool, it will submit to the burst
+        pool.
+        TODO(aylei): this is coupled with executor.RequestWorker since we
+        know the worker is dedicated to request scheduling and it either
+        blocks on request polling or request submitting. So it is no harm
+        to make submit blocking here. But for general cases, we need an
+        internal queue to decouple submit and run.
+        """
+
+        while True:
+            if self._executor is not None and self._executor.has_idle_workers():
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            if (self._burst_executor is not None and
+                    self._burst_executor.has_idle_workers()):
+                self._burst_executor.submit(fn, *args, **kwargs)
+                break
+            if self._executor is not None:
+                # No idle workers in either pool, still queue the request
+                # to the guaranteed pool to keep behavior consistent.
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            logger.debug('No guaranteed pool set and the burst pool is full, '
+                         'retry later.')
+            time.sleep(0.1)
+
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+
+        if self._burst_executor is not None:
+            self._burst_executor.shutdown()
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
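
Note: a usage sketch for `BurstableExecutor`, the core of the new file: submissions go to the guaranteed pool first, then the disposable burst pool, and finally queue on the guaranteed pool when both are busy. Names mirror the code above; the task function is hypothetical:

```python
from sky.server.requests import process

def task(name: str) -> None:
    print(f'running {name}')

if __name__ == '__main__':  # Needed for multiprocessing on spawn platforms.
    executor = process.BurstableExecutor(garanteed_workers=2, burst_workers=4)
    try:
        # submit_until_success() only blocks (polling every 0.1s) when there
        # is no guaranteed pool and the burst pool is saturated.
        for i in range(8):
            executor.submit_until_success(task, f'req-{i}')
    finally:
        # shutdown() SIGTERMs burst workers and proactively interrupts the
        # guaranteed pool instead of waiting indefinitely.
        executor.shutdown()
```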
sky/server/requests/queues/local_queue.py ADDED
@@ -0,0 +1,16 @@
+"""Process-local queue implementation."""
+import queue
+import threading
+from typing import Dict
+
+# Global dict to store queues
+_queues: Dict[str, queue.Queue] = {}
+_lock = threading.Lock()
+
+
+def get_queue(queue_name: str) -> queue.Queue:
+    """Get or create a queue by name."""
+    with _lock:
+        if queue_name not in _queues:
+            _queues[queue_name] = queue.Queue()
+        return _queues[queue_name]
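
Note: a usage sketch; as the `QueueBackend.LOCAL` comment above states, producer and consumer must live in the same process:

```python
from sky.server.requests.queues import local_queue

q = local_queue.get_queue('long')
q.put(('request-id-123', False))
assert local_queue.get_queue('long') is q  # Same name, same queue object.
print(q.get())  # ('request-id-123', False)
```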
sky/utils/atomic.py ADDED
@@ -0,0 +1,52 @@
+"""Atomic structures and utilties."""
+
+import threading
+
+
+class AtomicInt:
+    """A thread-safe atomic integer implementation."""
+
+    def __init__(self, initial_value: int = 0):
+        self._value = initial_value
+        self._lock = threading.Lock()
+
+    def get(self) -> int:
+        """Get the current value atomically.
+
+        Returns:
+            The current integer value.
+        """
+        with self._lock:
+            return self._value
+
+    def increment(self, delta: int = 1) -> int:
+        """Atomically increment by delta and return new value.
+
+        Args:
+            delta: Amount to increment by (default: 1)
+
+        Returns:
+            The new value after incrementing.
+        """
+        with self._lock:
+            self._value += delta
+            return self._value
+
+    def decrement(self, delta: int = 1) -> int:
+        """Atomically decrement by delta and return new value.
+
+        Args:
+            delta: Amount to decrement by (default: 1)
+
+        Returns:
+            The new value after decrementing.
+        """
+        with self._lock:
+            self._value -= delta
+            return self._value
+
+    def __str__(self) -> str:
+        return str(self.get())
+
+    def __repr__(self) -> str:
+        return f'AtomicInt({self.get()})'
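
Note: a usage sketch for `AtomicInt`, mirroring how `PoolExecutor` in sky/server/requests/process.py tracks in-flight tasks:

```python
import threading

from sky.utils import atomic

running = atomic.AtomicInt(0)

def work() -> None:
    running.increment()
    try:
        pass  # ... run the task ...
    finally:
        running.decrement()

threads = [threading.Thread(target=work) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert running.get() == 0  # Every increment was matched by a decrement.
```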
sky/utils/common_utils.py CHANGED
@@ -17,6 +17,8 @@ import typing
 from typing import Any, Callable, Dict, List, Optional, Union
 import uuid
 
+import jsonschema
+
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -28,12 +30,10 @@ from sky.utils import validator
 
 if typing.TYPE_CHECKING:
     import jinja2
-    import jsonschema
     import psutil
     import yaml
 else:
     jinja2 = adaptors_common.LazyImport('jinja2')
-    jsonschema = adaptors_common.LazyImport('jsonschema')
     psutil = adaptors_common.LazyImport('psutil')
     yaml = adaptors_common.LazyImport('yaml')
 
sky/utils/validator.py CHANGED
@@ -4,14 +4,7 @@ The main motivation behind extending the existing JSON Schema validator is to
 allow for case-insensitive enum matching since this is currently not supported
 by the JSON Schema specification.
 """
-import typing
-
-from sky.adaptors import common as adaptors_common
-
-if typing.TYPE_CHECKING:
-    import jsonschema
-else:
-    jsonschema = adaptors_common.LazyImport('jsonschema')
+import jsonschema
 
 
 def case_insensitive_enum(validator, enums, instance, schema):
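
Note: this hunk and the common_utils.py one above both drop the lazy-import indirection for jsonschema in favor of a plain eager import. For reference, a minimal sketch of the removed pattern, assuming `sky.adaptors.common.LazyImport` defers the real import until first attribute access (consistent with its use elsewhere in the codebase):

```python
import typing

from sky.adaptors import common as adaptors_common

if typing.TYPE_CHECKING:
    import jsonschema  # Real module, visible to type checkers only.
else:
    # Deferred: the actual `import jsonschema` happens on first use.
    jsonschema = adaptors_common.LazyImport('jsonschema')
```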
{skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250408.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250407
+Version: 1.0.0.dev20250408
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
{skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250408.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=-f-rcPq-1NRczFhvTmgyY0eGeL4xNdnjClhgY-sPx5I,6428
+sky/__init__.py,sha256=q1bqMlklbkN76ppGuGrZUg38yFnoTcFONAreuXS5ffY,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=ND011K_-Ud1dVZF37A9KrwYir_ihJXcHc7iDWmuBc8Q,22872
 sky/check.py,sha256=PPNQnaaZBA9_aogJpN4gnG4XWnTqkd74c-rBYDkDRDY,16101
@@ -34,7 +34,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=4BOqKZ-bwBTpjNnZF4JAHX2m2Iga7EmEn8Ao3tEivaM,7527
 sky/backends/backend_utils.py,sha256=ndY4IPs1F9QovyiKAnB1FNYGWm52_ylwf_K7wY50cv0,134922
-sky/backends/cloud_vm_ray_backend.py,sha256=ICo21xsKd1Ipy_nBHbP2FUWllOmdS0Pvr4mfypSYhXI,252012
+sky/backends/cloud_vm_ray_backend.py,sha256=mjedyasnvINYz9pIFThBqscIvjqiXs1DKZyVD8twnc0,251926
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
 sky/backends/wheel_utils.py,sha256=meypuMaygSXXjGdXfq6dhWl-OrpAybg9KVRoup4D0wU,9098
@@ -106,7 +106,7 @@ sky/data/data_transfer.py,sha256=-JcnVa_LT0kQejcSCnBwYtxhuuaNDPf_Q5oz62p186c,119
 sky/data/data_utils.py,sha256=ryKUPgNBdeDmGIttqK-J7AKdfc70INTuYH5GOWm3C9g,33581
 sky/data/mounting_utils.py,sha256=ph2p8cYB28FODgxK5ibiD4B4iMD7T3or99zNQaD9HLs,20162
 sky/data/storage.py,sha256=85LcC64yxfd5bzTijGZVyMZV41NyzUhOn0xJZieK2Dc,236652
-sky/data/storage_utils.py,sha256=fDEEErxu97XhOtwPdnNBqRukWcfRT4eTBUhrSGrAvsY,13255
+sky/data/storage_utils.py,sha256=_0NYCWPSjyEGiLNckOl8NzclO5Rd03jRS-hgbQMofBs,13597
 sky/jobs/__init__.py,sha256=qoI53-xXE0-SOkrLWigvhgFXjk7dWE0OTqGPYIk-kmM,1458
 sky/jobs/constants.py,sha256=1XiIqdR5dEgGgepLKWkZCRT3MYSsMBR-dO7N4RTsjwg,3088
 sky/jobs/controller.py,sha256=d5qQYHadesfFgU7-dYtt2trZwyd5IzvlVJeNh5O8OiA,31386
@@ -238,11 +238,13 @@ sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
 sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
 sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
-sky/server/requests/executor.py,sha256=txzvCUKLafRzEoY4Snk9xVFbIdw5cnu7_wkHTldQdmE,22085
+sky/server/requests/executor.py,sha256=z9DaLJOy__7BUddMhXCODmxqD3iAblo6-siEsmO9DiU,26495
 sky/server/requests/payloads.py,sha256=3sF36Z9_PLzpEncW0AplJtOz-_nsn5PJaM5lS-3Y8bw,16558
 sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
+sky/server/requests/process.py,sha256=uv6JmqdT1vR6S5j3a0CEmxz3fUoKQoZCryQsjZpZE7E,8734
 sky/server/requests/requests.py,sha256=9ovdQE-zv_Mvc6IbGATHVyQlOxSKjg_OankZbgDVGeE,21338
 sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sky/server/requests/queues/local_queue.py,sha256=X6VkBiUmgd_kfqIK1hCtMWG1b8GiZbY70TBiBR6c6GY,416
 sky/server/requests/queues/mp_queue.py,sha256=jDqP4Jd28U3ibSFyMR1DF9I2OWZrPZqFJrG5S6RFpyw,3403
 sky/server/requests/serializers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/serializers/decoders.py,sha256=0cpg80uAqkdK_LqcQPkpKswhcNUUztG9luDLm_0eUow,6811
@@ -311,11 +313,12 @@ sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/accelerator_registry.py,sha256=rZniDbqqPAF-vjkrwxGwEErFSAp6puOimkRj3ppOSRY,3905
 sky/utils/admin_policy_utils.py,sha256=y_do0VH6qh163EqSuRW1uGeKvTnJhiYNrHUs77uoOcA,6013
 sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
+sky/utils/atomic.py,sha256=vrw-7XCnckF0xCx-ttamao7evPdGtVsnjaTtgMlBXIE,1280
 sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
 sky/utils/command_runner.py,sha256=aEBs4Km8b6PqDklNc63tVYMK0w3PBGQEEP21_wmhG1k,39191
 sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
 sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
-sky/utils/common_utils.py,sha256=s5YIo9wtFCwWLfLRW7fCjlC9BzqQKPGatWQjrEyYqpc,31680
+sky/utils/common_utils.py,sha256=UM2eSQNdXRvAzlbfC839E7-7DXC9BMMUkquLsmYpu8w,31619
 sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
 sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
 sky/utils/controller_utils.py,sha256=mrmkerYyeu7gsCQ56cB3AjCz0r9WaN7teqXUItA47oQ,49805
@@ -334,7 +337,7 @@ sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
 sky/utils/subprocess_utils.py,sha256=yM2WumV49gSKuZs0v6E3R8XKl5Q9b6veIzi6us5ORU8,15927
 sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
 sky/utils/ux_utils.py,sha256=R-ddrqcwKngziZz5haHufxiUnABaMMbmRVsaUljrPBg,10181
-sky/utils/validator.py,sha256=moqe3T_PBKmri_SEtpgoJiKuf_PbdSJxsa8CQlcTbxI,1016
+sky/utils/validator.py,sha256=yo5cPUjGxqfa0ZxGyEYZMCWZ8O35G-k3VOEAtAoA_3w,856
 sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/cli_utils/status_utils.py,sha256=LwGXzMgvnQeGR1fCC24q38hRLuAPeeSDkQ387eG6YSs,13495
 sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -350,9 +353,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=HPVgNt-wbCVPd9dpDFiA7t2mzQLpjXHJ61eiwRbEr-c,10378
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.dev20250407.dist-info/licenses/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20250407.dist-info/METADATA,sha256=hqvdfiv3pR-AR3iUrwYaDHD9U1Qra2EFlv8mwLdtAmk,18552
-skypilot_nightly-1.0.0.dev20250407.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-skypilot_nightly-1.0.0.dev20250407.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20250407.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20250407.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20250408.dist-info/licenses/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250408.dist-info/METADATA,sha256=EO_QBlBiR_CcaunlS8EDv2fOBCqiy0SQACbeUa6Pd88,18552
+skypilot_nightly-1.0.0.dev20250408.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+skypilot_nightly-1.0.0.dev20250408.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250408.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250408.dist-info/RECORD,,