skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (57)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +25 -4
  3. sky/backends/cloud_vm_ray_backend.py +151 -36
  4. sky/client/cli/command.py +2 -1
  5. sky/client/cli/table_utils.py +34 -0
  6. sky/client/sdk.py +7 -5
  7. sky/client/sdk_async.py +5 -5
  8. sky/core.py +3 -4
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-7340bc0f0dd8ae74.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/execution.py +0 -1
  30. sky/global_user_state.py +3 -3
  31. sky/jobs/server/core.py +96 -26
  32. sky/jobs/server/utils.py +65 -32
  33. sky/jobs/state.py +145 -3
  34. sky/jobs/utils.py +85 -7
  35. sky/schemas/api/responses.py +18 -0
  36. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  37. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  38. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  39. sky/serve/serve_utils.py +16 -0
  40. sky/serve/server/core.py +1 -1
  41. sky/serve/server/impl.py +6 -6
  42. sky/server/requests/serializers/decoders.py +2 -2
  43. sky/server/requests/serializers/encoders.py +7 -3
  44. sky/skylet/constants.py +1 -1
  45. sky/skylet/job_lib.py +2 -32
  46. sky/skylet/log_lib.py +211 -0
  47. sky/skylet/log_lib.pyi +30 -1
  48. sky/skylet/services.py +208 -2
  49. sky/skylet/skylet.py +3 -0
  50. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +32 -32
  51. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +56 -52
  52. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  53. sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
  54. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
  55. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
  56. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
  57. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py CHANGED
@@ -8,11 +8,13 @@ import functools
8
8
  import io
9
9
  import multiprocessing.pool
10
10
  import os
11
+ import queue as queue_lib
11
12
  import shlex
12
13
  import subprocess
13
14
  import sys
14
15
  import tempfile
15
16
  import textwrap
17
+ import threading
16
18
  import time
17
19
  from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
18
20
  Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
39
41
 
40
42
  LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
41
43
 
44
# Default flush threshold for LogBuffer, counted in characters (not bytes).
# 16-64KiB seems to be the sweet spot:
# https://github.com/grpc/grpc.github.io/issues/371
# TODO(kevin): Benchmark this ourselves and verify.
DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
48
+
42
49
 
43
50
  class _ProcessingArgs:
44
51
  """Arguments for processing logs."""
@@ -563,3 +570,207 @@ def tail_logs(job_id: Optional[int],
563
570
  except FileNotFoundError:
564
571
  print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
565
572
  f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
573
+
574
+
575
def tail_logs_iter(job_id: Optional[int],
                   log_dir: Optional[str],
                   managed_job_id: Optional[int] = None,
                   follow: bool = True,
                   tail: int = 0) -> Iterator[str]:
    """Tail the logs of a job, yielding the output instead of printing it.

    This is mostly the same as tail_logs, but returns an iterator instead of
    printing to stdout/stderr.

    Args:
        job_id: The job whose `run.log` should be read. If None, nothing is
            yielded.
        log_dir: Directory containing `run.log`. If None, a single
            "not found" message is yielded.
        managed_job_id: If set, user-facing messages refer to this managed
            job id instead of `job_id`.
        follow: If True and the job is SETTING_UP, PENDING or RUNNING, keep
            following the log file via `_follow_job_logs`; otherwise the
            current content is read once.
        tail: If > 0, only the last `tail` lines of the existing content are
            considered.

    Yields:
        Log lines (with their line endings), plus newline-terminated
        informational / error messages.
    """
    if job_id is None:
        # This only happens when job_lib.get_latest_job_id() returns None,
        # which means no job has been submitted to this cluster. See
        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
        logger.info('Skip streaming logs as no job has been submitted.')
        return
    job_str = f'job {job_id}'
    if managed_job_id is not None:
        job_str = f'managed job {managed_job_id}'
    if log_dir is None:
        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
        yield msg + '\n'
        return
    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
                 f'{managed_job_id}.')
    log_path = os.path.join(log_dir, 'run.log')
    log_path = os.path.expanduser(log_path)

    status = job_lib.update_job_status([job_id], silent=True)[0]

    # Wait for the log to be written. This is needed due to the `ray submit`
    # will take some time to start the job and write the log.
    retry_cnt = 0
    while status is not None and not status.is_terminal():
        retry_cnt += 1
        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
            break
        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
            err = (f'{colorama.Fore.RED}ERROR: Logs for '
                   f'{job_str} (status: {status.value}) does not exist '
                   f'after retrying {retry_cnt} times.'
                   f'{colorama.Style.RESET_ALL}')
            yield err + '\n'
            return
        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
                   'to be written...')
        yield waiting + '\n'
        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
        status = job_lib.update_job_status([job_id], silent=True)[0]

    start_stream_at = LOG_FILE_START_STREAMING_AT
    # Explicitly declare the type to avoid mypy warning.
    lines: Iterable[str] = []
    if follow and status in [
            job_lib.JobStatus.SETTING_UP,
            job_lib.JobStatus.PENDING,
            job_lib.JobStatus.RUNNING,
    ]:
        # Not using `ray job logs` because it will put progress bar in
        # multiple lines.
        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
            # Using `_follow` instead of `tail -f` to streaming the whole
            # log and creating a new process for tail.
            start_streaming = False
            if tail > 0:
                head_lines_of_log_file = _peek_head_lines(log_file)
                lines = collections.deque(log_file, maxlen=tail)
                start_streaming = _should_stream_the_whole_tail_lines(
                    head_lines_of_log_file, lines, start_stream_at)
            for line in lines:
                if start_stream_at in line:
                    start_streaming = True
                if start_streaming:
                    yield line
            # Now, the cursor is at the end of the last lines
            # if tail > 0
            for line in _follow_job_logs(log_file,
                                         job_id=job_id,
                                         start_streaming=start_streaming,
                                         start_streaming_at=start_stream_at):
                yield line
    else:
        # `status` may be None on this branch (the waiting loop above exits
        # when it is), so resolve the printable form once, up front, and use
        # it in both the success and the error message.
        status_str = status.value if status is not None else 'None'
        try:
            start_streaming = False
            with open(log_path, 'r', encoding='utf-8') as log_file:
                if tail > 0:
                    # If tail > 0, we need to read the last n lines.
                    # We use double ended queue to rotate the last n lines.
                    head_lines_of_log_file = _peek_head_lines(log_file)
                    lines = collections.deque(log_file, maxlen=tail)
                    start_streaming = _should_stream_the_whole_tail_lines(
                        head_lines_of_log_file, lines, start_stream_at)
                else:
                    lines = log_file
                for line in lines:
                    if start_stream_at in line:
                        start_streaming = True
                    if start_streaming:
                        yield line
            # Only show "Job finished" for actually terminal states
            if status is not None and status.is_terminal():
                finish = ux_utils.finishing_message(
                    f'Job finished (status: {status_str}).')
                yield finish + '\n'
            return
        except FileNotFoundError:
            err = (
                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
                f' {status_str}) does not exist.{colorama.Style.RESET_ALL}')
            yield err + '\n'
+
684
+
685
class LogBuffer:
    """Accumulates log text in memory so it can be streamed in chunks."""

    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
        """Create an empty buffer.

        Args:
            max_chars: Flush threshold, measured in characters rather than
                bytes. Once UTF-8 encoded, a chunk may be larger than this
                (ASCII characters take 1 byte, others may take 2-4 bytes).
                That is acceptable because the default chunk size is well
                below the default grpc.max_receive_message_length of 4MB.
        """
        self._buffer = io.StringIO()
        self.max_chars = max_chars

    def _should_flush(self) -> bool:
        # The stream position after the last write is compared against the
        # configured threshold.
        buffered = self._buffer.tell()
        return buffered >= self.max_chars

    def flush(self) -> str:
        """Return everything buffered so far and reset the buffer.

        Returns:
            The buffered log lines as a single string; '' if nothing is
            buffered.
        """
        if self._buffer.tell() == 0:
            return ''
        pending = self._buffer.getvalue()
        # Rewind and drop the content so the next write starts from scratch.
        self._buffer.seek(0)
        self._buffer.truncate(0)
        return pending

    def write(self, line: str) -> bool:
        """Append a log line to the buffer.

        Args:
            line: The log line to add

        Returns:
            True if the buffer has reached its threshold and should be
            flushed.
        """
        self._buffer.write(line)
        needs_flush = self._should_flush()
        return needs_flush

    def close(self):
        """Close the underlying in-memory stream."""
        self._buffer.close()
+
734
+
735
def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
                               timeout: float) -> Iterable[str]:
    """Iterate over `iterable`, yielding its items batched through `buffer`.

    Each item is written to the buffer; the buffer is flushed (and the chunk
    yielded) when it reports being full, when no new item arrives within
    `timeout`, and once more when the iterable is exhausted.

    The iterable is consumed in a background thread. An exception raised by
    the iterable is re-raised here, after the remaining buffered content has
    been yielded, so that callers do not mistake a failure for a normal end
    of stream.

    Args:
        buffer: Buffer used to accumulate items; only its `write()` and
            `flush()` methods are used here.
        iterable: Source of items (log lines).
        timeout: Maximum time in seconds to wait for the next item before
            flushing whatever is buffered.

    Yields:
        Non-empty chunks returned by `buffer.flush()`.

    Raises:
        Exception: Whatever exception `iterable` raised while being consumed.
    """
    # TODO(kevin): Simplify this using asyncio.timeout, once we move
    # the skylet event loop and gRPC server to asyncio.
    # https://docs.python.org/3/library/asyncio-task.html#timeouts

    queue: queue_lib.Queue = queue_lib.Queue()
    sentinel = object()
    # Set when the consumer is done (exhausted, failed, or closed early) so
    # that the producer stops feeding a queue nobody reads anymore.
    stop_event = threading.Event()
    producer_errors: List[BaseException] = []

    def producer():
        try:
            for item in iterable:
                if stop_event.is_set():
                    break
                queue.put(item)
        except Exception as e:  # pylint: disable=broad-except
            # Hand the failure over to the consumer instead of letting it die
            # with this thread.
            producer_errors.append(e)
        finally:
            queue.put(sentinel)

    thread = threading.Thread(target=producer, daemon=True)
    thread.start()

    try:
        while True:
            try:
                item = queue.get(timeout=timeout)
            except queue_lib.Empty:
                # Nothing new within the timeout: flush for responsiveness.
                out = buffer.flush()
                if out:
                    yield out
                continue

            if item is sentinel:
                thread.join()
                out = buffer.flush()
                if out:
                    yield out
                if producer_errors:
                    raise producer_errors[0]
                return

            if buffer.write(item):
                out = buffer.flush()
                if out:
                    yield out
    finally:
        stop_event.set()
sky/skylet/log_lib.pyi CHANGED
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
4
4
  the return type based on the value of require_outputs.
5
5
  """
6
6
  import typing
7
- from typing import Dict, List, Optional, Tuple, Union
7
+ from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
8
8
 
9
9
  from typing_extensions import Literal
10
10
 
@@ -143,3 +143,32 @@ def tail_logs(job_id: int,
143
143
  managed_job_id: Optional[int] = ...,
144
144
  follow: bool = ...) -> None:
145
145
  ...
146
+
147
+
148
# Iterator counterpart of tail_logs(): yields the log output instead of
# printing it.
def tail_logs_iter(job_id: Optional[int],
                   log_dir: Optional[str],
                   managed_job_id: Optional[int] = ...,
                   follow: bool = ...,
                   tail: int = ...) -> Iterator[str]:
    ...
154
+
155
+
156
# In-memory buffer for chunking log lines for streaming.
class LogBuffer:
    # Flush threshold, in characters.
    max_chars: int

    def __init__(self, max_chars: int = ...):
        ...

    # Returns the buffered content and clears the buffer.
    def flush(self) -> str:
        ...

    # Returns True if the buffer should be flushed after adding the line.
    def write(self, line: str) -> bool:
        ...

    def close(self):
        ...
170
+
171
+
172
# Yields chunks flushed from `buffer` while consuming `iterable`; a flush also
# happens when no item arrives within `timeout`.
def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
                               timeout: float) -> Iterable[str]:
    ...
sky/skylet/services.py CHANGED
@@ -1,15 +1,20 @@
1
1
  """gRPC service implementations for skylet."""
2
2
 
3
3
  import os
4
+ from typing import List, Optional
4
5
 
5
6
  import grpc
6
7
 
8
+ from sky import exceptions
7
9
  from sky import sky_logging
8
10
  from sky.jobs import state as managed_job_state
11
+ from sky.jobs import utils as managed_job_utils
9
12
  from sky.schemas.generated import autostopv1_pb2
10
13
  from sky.schemas.generated import autostopv1_pb2_grpc
11
14
  from sky.schemas.generated import jobsv1_pb2
12
15
  from sky.schemas.generated import jobsv1_pb2_grpc
16
+ from sky.schemas.generated import managed_jobsv1_pb2
17
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
13
18
  from sky.schemas.generated import servev1_pb2
14
19
  from sky.schemas.generated import servev1_pb2_grpc
15
20
  from sky.serve import serve_rpc_utils
@@ -18,9 +23,14 @@ from sky.serve import serve_utils
18
23
  from sky.skylet import autostop_lib
19
24
  from sky.skylet import constants
20
25
  from sky.skylet import job_lib
26
+ from sky.skylet import log_lib
21
27
 
22
28
  logger = sky_logging.init_logger(__name__)
23
29
 
30
# In the worst case, flush the log buffer every 50ms,
# to ensure responsiveness.
# Passed as the timeout (in seconds) to log_lib.buffered_iter_with_timeout.
DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
33
+
24
34
 
25
35
  class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
26
36
  """Implementation of the AutostopService gRPC service."""
@@ -275,8 +285,39 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
275
285
  self,
276
286
  request: jobsv1_pb2.TailLogsRequest, # type: ignore[return]
277
287
  context: grpc.ServicerContext):
278
- # TODO(kevin): implement this
279
- raise NotImplementedError('TailLogs is not implemented')
288
+ buffer = log_lib.LogBuffer()
289
+ try:
290
+ job_id = request.job_id if request.HasField(
291
+ 'job_id') else job_lib.get_latest_job_id()
292
+ managed_job_id = request.managed_job_id if request.HasField(
293
+ 'managed_job_id') else None
294
+ log_dir = job_lib.get_log_dir_for_job(job_id)
295
+ if log_dir is None:
296
+ run_timestamp = job_lib.get_run_timestamp(job_id)
297
+ log_dir = None if run_timestamp is None else os.path.join(
298
+ constants.SKY_LOGS_DIRECTORY, run_timestamp)
299
+
300
+ for line in log_lib.buffered_iter_with_timeout(
301
+ buffer,
302
+ log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
303
+ request.follow, request.tail),
304
+ DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
305
+ yield jobsv1_pb2.TailLogsResponse(log_line=line)
306
+
307
+ job_status = job_lib.get_status(job_id)
308
+ exit_code = exceptions.JobExitCode.from_job_status(job_status)
309
+ # Fix for dashboard: When follow=False and job is still running
310
+ # (NOT_FINISHED=101), exit with success (0) since fetching current
311
+ # logs is a successful operation.
312
+ # This prevents shell wrappers from printing "command terminated
313
+ # with exit code 101".
314
+ exit_code_int = 0 if not request.follow and int(
315
+ exit_code) == 101 else int(exit_code)
316
+ yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
317
+ except Exception as e: # pylint: disable=broad-except
318
+ context.abort(grpc.StatusCode.INTERNAL, str(e))
319
+ finally:
320
+ buffer.close()
280
321
 
281
322
  def GetJobStatus( # type: ignore[return]
282
323
  self, request: jobsv1_pb2.GetJobStatusRequest,
@@ -343,3 +384,168 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
343
384
  job_log_dirs=job_log_dirs)
344
385
  except Exception as e: # pylint: disable=broad-except
345
386
  context.abort(grpc.StatusCode.INTERNAL, str(e))
387
+
388
+
389
class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                            ):
    """Implementation of the ManagedJobsService gRPC service."""

    def GetVersion(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetVersionRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetVersionResponse:
        """Return the skylet version as the controller version."""
        try:
            return managed_jobsv1_pb2.GetVersionResponse(
                controller_version=constants.SKYLET_VERSION)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobTable(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetJobTableRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetJobTableResponse:
        """Query the managed job queue and convert the rows to protobuf.

        The request's filters are forwarded to
        managed_job_utils.get_managed_job_queue; each returned job dict is
        converted to a ManagedJobInfo message. Any failure aborts the RPC
        with INTERNAL.
        """
        try:
            accessible_workspaces = list(request.accessible_workspaces)
            # NOTE(review): these checks rely on the truthiness of protobuf
            # sub-messages; confirm that this distinguishes "unset" from
            # "set" as intended (HasField may be what is wanted).
            job_ids = list(request.job_ids.ids) if request.job_ids else None
            user_hashes: Optional[List[Optional[str]]] = None
            if request.user_hashes:
                user_hashes = list(request.user_hashes.hashes)
                # For backwards compatibility, we show jobs that do not have a
                # user_hash. TODO: Remove before 0.12.0.
                if request.show_jobs_without_user_hash:
                    user_hashes.append(None)
            statuses = list(
                request.statuses.statuses) if request.statuses else None

            job_queue = managed_job_utils.get_managed_job_queue(
                skip_finished=request.skip_finished,
                accessible_workspaces=accessible_workspaces,
                job_ids=job_ids,
                workspace_match=request.workspace_match
                if request.HasField('workspace_match') else None,
                name_match=request.name_match
                if request.HasField('name_match') else None,
                pool_match=request.pool_match
                if request.HasField('pool_match') else None,
                page=request.page if request.HasField('page') else None,
                limit=request.limit if request.HasField('limit') else None,
                user_hashes=user_hashes,
                statuses=statuses)
            jobs = job_queue['jobs']
            total = job_queue['total']
            total_no_filter = job_queue['total_no_filter']
            status_counts = job_queue['status_counts']

            jobs_info = []
            for job in jobs:
                job_info = managed_jobsv1_pb2.ManagedJobInfo(
                    job_id=job.get('job_id'),
                    task_id=job.get('task_id'),
                    job_name=job.get('job_name'),
                    task_name=job.get('task_name'),
                    job_duration=job.get('job_duration'),
                    workspace=job.get('workspace'),
                    status=managed_job_state.ManagedJobStatus(
                        job.get('status')).to_protobuf(),
                    schedule_state=managed_job_state.ManagedJobScheduleState(
                        job.get('schedule_state')).to_protobuf(),
                    resources=job.get('resources'),
                    cluster_resources=job.get('cluster_resources'),
                    cluster_resources_full=job.get('cluster_resources_full'),
                    cloud=job.get('cloud'),
                    region=job.get('region'),
                    infra=job.get('infra'),
                    accelerators=job.get('accelerators'),
                    recovery_count=job.get('recovery_count'),
                    details=job.get('details'),
                    failure_reason=job.get('failure_reason'),
                    user_name=job.get('user_name'),
                    user_hash=job.get('user_hash'),
                    submitted_at=job.get('submitted_at'),
                    start_at=job.get('start_at'),
                    end_at=job.get('end_at'),
                    user_yaml=job.get('user_yaml'),
                    entrypoint=job.get('entrypoint'),
                    # Entries whose value is None are left out of the
                    # metadata map.
                    metadata={
                        k: v
                        for k, v in job.get('metadata', {}).items()
                        if v is not None
                    },
                    pool=job.get('pool'),
                    pool_hash=job.get('pool_hash'))
                jobs_info.append(job_info)

            return managed_jobsv1_pb2.GetJobTableResponse(
                jobs=jobs_info,
                total=total,
                total_no_filter=total_no_filter,
                status_counts=status_counts)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetAllJobIdsByName(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
        """Return the managed job ids for `job_name` (None if unset)."""
        try:
            job_name = request.job_name if request.HasField(
                'job_name') else None
            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
                job_ids=job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.CancelJobsResponse:
        """Cancel managed jobs selected by the request's oneof criteria.

        Supported criteria are `all_users`, `job_ids`, `job_name` and
        `pool_name`. Invalid requests abort the RPC with INVALID_ARGUMENT;
        unexpected failures abort it with INTERNAL.
        """
        # context.abort() terminates the RPC by raising. Calling it for a
        # validation failure inside the `try` below would let the broad
        # `except` catch that exception and abort again with INTERNAL,
        # hiding the INVALID_ARGUMENT status. So validation failures are only
        # recorded inside the `try` and reported after it.
        invalid_argument: Optional[str] = None
        try:
            message = ''
            cancellation_criteria = request.WhichOneof('cancellation_criteria')
            if cancellation_criteria is None:
                invalid_argument = (
                    'exactly one cancellation criteria must be specified.')
            elif cancellation_criteria == 'all_users':
                user_hash = request.user_hash if request.HasField(
                    'user_hash') else None
                all_users = request.all_users
                if not all_users and user_hash is None:
                    invalid_argument = (
                        'user_hash is required when all_users is False')
                else:
                    message = managed_job_utils.cancel_jobs_by_id(
                        job_ids=None,
                        all_users=all_users,
                        current_workspace=request.current_workspace,
                        user_hash=user_hash)
            elif cancellation_criteria == 'job_ids':
                job_ids = list(request.job_ids.ids)
                message = managed_job_utils.cancel_jobs_by_id(
                    job_ids=job_ids,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'job_name':
                message = managed_job_utils.cancel_job_by_name(
                    job_name=request.job_name,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'pool_name':
                message = managed_job_utils.cancel_jobs_by_pool(
                    pool_name=request.pool_name,
                    current_workspace=request.current_workspace)
            else:
                invalid_argument = (
                    f'invalid cancellation criteria: {cancellation_criteria}')
            if invalid_argument is None:
                return managed_jobsv1_pb2.CancelJobsResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        if invalid_argument is not None:
            context.abort(grpc.StatusCode.INVALID_ARGUMENT, invalid_argument)

    def StreamLogs(
            self,
            request: managed_jobsv1_pb2.
        StreamLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Not implemented yet; always aborts with UNIMPLEMENTED."""
        # TODO(kevin): implement this
        context.abort(grpc.StatusCode.UNIMPLEMENTED,
                      'StreamLogs is not implemented')
sky/skylet/skylet.py CHANGED
@@ -10,6 +10,7 @@ import sky
10
10
  from sky import sky_logging
11
11
  from sky.schemas.generated import autostopv1_pb2_grpc
12
12
  from sky.schemas.generated import jobsv1_pb2_grpc
13
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
13
14
  from sky.schemas.generated import servev1_pb2_grpc
14
15
  from sky.skylet import constants
15
16
  from sky.skylet import events
@@ -55,6 +56,8 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
55
56
  services.JobsServiceImpl(), server)
56
57
  servev1_pb2_grpc.add_ServeServiceServicer_to_server(
57
58
  services.ServeServiceImpl(), server)
59
+ managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server(
60
+ services.ManagedJobsServiceImpl(), server)
58
61
 
59
62
  listen_addr = f'127.0.0.1:{port}'
60
63
  server.add_insecure_port(listen_addr)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250926
3
+ Version: 1.0.0.dev20250927
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -154,51 +154,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
154
154
  Requires-Dist: aiosqlite; extra == "server"
155
155
  Requires-Dist: greenlet; extra == "server"
156
156
  Provides-Extra: all
157
- Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
158
157
  Requires-Dist: azure-core>=1.31.0; extra == "all"
159
- Requires-Dist: docker; extra == "all"
158
+ Requires-Dist: runpod>=1.6.1; extra == "all"
160
159
  Requires-Dist: ibm-cos-sdk; extra == "all"
161
- Requires-Dist: azure-identity>=1.19.0; extra == "all"
162
- Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
163
- Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
164
- Requires-Dist: azure-cli>=2.65.0; extra == "all"
160
+ Requires-Dist: botocore>=1.29.10; extra == "all"
161
+ Requires-Dist: msgraph-sdk; extra == "all"
162
+ Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
165
163
  Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
164
+ Requires-Dist: websockets; extra == "all"
165
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
166
+ Requires-Dist: sqlalchemy_adapter; extra == "all"
166
167
  Requires-Dist: boto3>=1.26.1; extra == "all"
167
- Requires-Dist: msrestazure; extra == "all"
168
+ Requires-Dist: ecsapi>=0.2.0; extra == "all"
168
169
  Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
169
- Requires-Dist: aiohttp; extra == "all"
170
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
171
+ Requires-Dist: google-cloud-storage; extra == "all"
172
+ Requires-Dist: ibm-cloud-sdk-core; extra == "all"
170
173
  Requires-Dist: colorama<0.4.5; extra == "all"
171
- Requires-Dist: websockets; extra == "all"
172
- Requires-Dist: ecsapi>=0.2.0; extra == "all"
173
- Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
174
+ Requires-Dist: docker; extra == "all"
174
175
  Requires-Dist: tomli; python_version < "3.11" and extra == "all"
175
- Requires-Dist: pydo>=0.3.0; extra == "all"
176
- Requires-Dist: casbin; extra == "all"
177
176
  Requires-Dist: ray[default]>=2.6.1; extra == "all"
178
- Requires-Dist: azure-core>=1.24.0; extra == "all"
179
- Requires-Dist: cudo-compute>=0.1.10; extra == "all"
180
- Requires-Dist: sqlalchemy_adapter; extra == "all"
181
- Requires-Dist: ibm-vpc; extra == "all"
182
- Requires-Dist: runpod>=1.6.1; extra == "all"
183
- Requires-Dist: passlib; extra == "all"
184
- Requires-Dist: anyio; extra == "all"
185
- Requires-Dist: grpcio>=1.63.0; extra == "all"
186
- Requires-Dist: python-dateutil; extra == "all"
187
- Requires-Dist: nebius>=0.2.47; extra == "all"
177
+ Requires-Dist: pyjwt; extra == "all"
178
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
188
179
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
180
+ Requires-Dist: grpcio>=1.63.0; extra == "all"
181
+ Requires-Dist: msrestazure; extra == "all"
189
182
  Requires-Dist: oci; extra == "all"
190
- Requires-Dist: pyjwt; extra == "all"
191
- Requires-Dist: msgraph-sdk; extra == "all"
192
- Requires-Dist: azure-common; extra == "all"
183
+ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
184
+ Requires-Dist: pydo>=0.3.0; extra == "all"
185
+ Requires-Dist: casbin; extra == "all"
186
+ Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
187
+ Requires-Dist: cudo-compute>=0.1.10; extra == "all"
193
188
  Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
194
- Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
189
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
195
190
  Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
196
- Requires-Dist: ibm-cloud-sdk-core; extra == "all"
191
+ Requires-Dist: greenlet; extra == "all"
192
+ Requires-Dist: azure-common; extra == "all"
197
193
  Requires-Dist: aiosqlite; extra == "all"
194
+ Requires-Dist: anyio; extra == "all"
195
+ Requires-Dist: python-dateutil; extra == "all"
198
196
  Requires-Dist: awscli>=1.27.10; extra == "all"
199
- Requires-Dist: botocore>=1.29.10; extra == "all"
200
- Requires-Dist: google-cloud-storage; extra == "all"
201
- Requires-Dist: greenlet; extra == "all"
197
+ Requires-Dist: azure-core>=1.24.0; extra == "all"
198
+ Requires-Dist: ibm-vpc; extra == "all"
199
+ Requires-Dist: nebius>=0.2.47; extra == "all"
200
+ Requires-Dist: passlib; extra == "all"
201
+ Requires-Dist: aiohttp; extra == "all"
202
202
  Dynamic: author
203
203
  Dynamic: classifier
204
204
  Dynamic: description