skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic.

Files changed (78)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +38 -14
  3. sky/backends/cloud_vm_ray_backend.py +151 -36
  4. sky/client/cli/command.py +18 -9
  5. sky/client/cli/table_utils.py +34 -0
  6. sky/client/common.py +4 -2
  7. sky/client/sdk.py +11 -7
  8. sky/client/sdk_async.py +5 -5
  9. sky/core.py +6 -6
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  17. sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/volumes.html +1 -1
  30. sky/dashboard/out/workspace/new.html +1 -1
  31. sky/dashboard/out/workspaces/[name].html +1 -1
  32. sky/dashboard/out/workspaces.html +1 -1
  33. sky/execution.py +0 -1
  34. sky/global_user_state.py +57 -34
  35. sky/jobs/constants.py +2 -0
  36. sky/jobs/controller.py +4 -0
  37. sky/jobs/server/core.py +98 -26
  38. sky/jobs/server/utils.py +65 -32
  39. sky/jobs/state.py +145 -3
  40. sky/jobs/utils.py +85 -7
  41. sky/provision/runpod/__init__.py +2 -0
  42. sky/schemas/api/responses.py +18 -0
  43. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  44. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  45. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  46. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  47. sky/serve/serve_utils.py +16 -0
  48. sky/serve/server/core.py +1 -1
  49. sky/serve/server/impl.py +6 -6
  50. sky/server/requests/payloads.py +2 -1
  51. sky/server/requests/serializers/decoders.py +2 -2
  52. sky/server/requests/serializers/encoders.py +7 -3
  53. sky/setup_files/dependencies.py +1 -1
  54. sky/skylet/constants.py +4 -1
  55. sky/skylet/events.py +42 -0
  56. sky/skylet/job_lib.py +2 -32
  57. sky/skylet/log_lib.py +211 -0
  58. sky/skylet/log_lib.pyi +30 -1
  59. sky/skylet/services.py +208 -2
  60. sky/skylet/skylet.py +3 -0
  61. sky/templates/jobs-controller.yaml.j2 +3 -0
  62. sky/templates/kubernetes-ray.yml.j2 +8 -3
  63. sky/utils/db/db_utils.py +5 -1
  64. sky/utils/db/migration_utils.py +1 -1
  65. sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
  66. sky/volumes/server/core.py +1 -0
  67. sky/volumes/volume.py +16 -17
  68. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
  69. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
  70. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  73. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
  74. /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
  75. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
  76. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
  77. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
  78. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -19,7 +19,9 @@ from sky import provision as provision_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky import task as task_lib
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
+ from sky.backends import cloud_vm_ray_backend
  from sky.catalog import common as service_catalog_common
  from sky.data import storage as storage_lib
  from sky.jobs import constants as managed_job_constants
@@ -44,8 +46,15 @@ from sky.utils import ux_utils
  from sky.workspaces import core as workspaces_core

  if typing.TYPE_CHECKING:
+ from google.protobuf import json_format
+
  import sky
- from sky.backends import cloud_vm_ray_backend
+ from sky.schemas.generated import managed_jobsv1_pb2
+ else:
+ json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')

  logger = sky_logging.init_logger(__name__)

@@ -368,6 +377,8 @@ def launch(
  'priority': priority,
  'consolidation_mode_job_id': consolidation_mode_job_id,
  'pool': pool,
+ 'job_controller_indicator_file':
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
  **controller_utils.shared_controller_vars_to_fill(
  controller,
  remote_user_config_path=remote_user_config_path,
@@ -699,11 +710,13 @@ def queue_v2(
  assert isinstance(backend, backends.CloudVmRayBackend)

  user_hashes: Optional[List[Optional[str]]] = None
+ show_jobs_without_user_hash = False
  if not all_users:
  user_hashes = [common_utils.get_user_hash()]
  # For backwards compatibility, we show jobs that do not have a
  # user_hash. TODO(cooperc): Remove before 0.12.0.
  user_hashes.append(None)
+ show_jobs_without_user_hash = True
  elif user_match is not None:
  users = global_user_state.get_user_by_name_match(user_match)
  if not users:
@@ -711,6 +724,38 @@ def queue_v2(
  user_hashes = [user.id for user in users]

  accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = managed_jobsv1_pb2.GetJobTableRequest(
+ skip_finished=skip_finished,
+ accessible_workspaces=accessible_workspaces,
+ job_ids=managed_jobsv1_pb2.JobIds(
+ ids=job_ids) if job_ids is not None else None,
+ workspace_match=workspace_match,
+ name_match=name_match,
+ pool_match=pool_match,
+ page=page,
+ limit=limit,
+ # Remove None from user_hashes, as the gRPC server uses the
+ # show_jobs_without_user_hash flag instead.
+ user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+ user_hash for user_hash in user_hashes
+ if user_hash is not None
+ ]) if user_hashes is not None else None,
+ statuses=managed_jobsv1_pb2.Statuses(
+ statuses=statuses) if statuses is not None else None,
+ show_jobs_without_user_hash=show_jobs_without_user_hash,
+ )
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel()).get_managed_job_table(request))
+ jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+ return jobs, response.total, dict(
+ response.status_counts), response.total_no_filter
+ except exceptions.SkyletMethodNotImplementedError:
+ pass
+
  code = managed_job_utils.ManagedJobCodeGen.get_job_table(
  skip_finished, accessible_workspaces, job_ids, workspace_match,
  name_match, pool_match, page, limit, user_hashes, statuses)
@@ -817,33 +862,60 @@ def cancel(name: Optional[str] = None,
  'Can only specify one of JOB_IDS, name, pool, or all/'
  f'all_users. Provided {" ".join(arguments)!r}.')

+ job_ids = None if (all_users or all) else job_ids
+
  backend = backend_utils.get_backend_from_handle(handle)
  assert isinstance(backend, backends.CloudVmRayBackend)
- if all_users:
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
- None, all_users=True)
- elif all:
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
- elif job_ids:
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
- job_ids)
- elif name is not None:
- code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
- else:
- assert pool is not None, (job_ids, name, pool, all)
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(pool)
- # The stderr is redirected to stdout
- returncode, stdout, stderr = backend.run_on_head(handle,
- code,
- require_outputs=True,
- stream_logs=False)
- try:
- subprocess_utils.handle_returncode(returncode, code,
- 'Failed to cancel managed job',
- stdout + stderr)
- except exceptions.CommandError as e:
- with ux_utils.print_exception_no_traceback():
- raise RuntimeError(e.error_msg) from e
+
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ current_workspace = skypilot_config.get_active_workspace()
+ try:
+ request = managed_jobsv1_pb2.CancelJobsRequest(
+ current_workspace=current_workspace)
+
+ if all_users or all or job_ids:
+ request.all_users = all_users
+ if all:
+ request.user_hash = common_utils.get_user_hash()
+ if job_ids is not None:
+ request.job_ids.CopyFrom(
+ managed_jobsv1_pb2.JobIds(ids=job_ids))
+ elif name is not None:
+ request.job_name = name
+ else:
+ assert pool is not None, (job_ids, name, pool, all)
+ request.pool_name = pool
+
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel()).cancel_managed_jobs(request))
+ stdout = response.message
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ if all_users or all or job_ids:
+ code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+ job_ids, all_users=all_users)
+ elif name is not None:
+ code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+ name)
+ else:
+ assert pool is not None, (job_ids, name, pool, all)
+ code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+ pool)
+ # The stderr is redirected to stdout
+ returncode, stdout, stderr = backend.run_on_head(
+ handle, code, require_outputs=True, stream_logs=False)
+ try:
+ subprocess_utils.handle_returncode(
+ returncode, code, 'Failed to cancel managed job',
+ stdout + stderr)
+ except exceptions.CommandError as e:
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(e.error_msg) from e

  logger.info(stdout)
  if 'Multiple jobs found with name' in stdout:
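
Both the queue_v2 and cancel changes above follow the same shape: try the new Skylet gRPC path when the handle reports gRPC support, and fall back to the legacy codegen-over-SSH path if the remote Skylet does not implement the method yet. Below is a minimal, self-contained sketch of that pattern; call_grpc, call_legacy_codegen, and MethodNotImplemented are hypothetical stand-ins, not SkyPilot APIs.

```python
# Hypothetical stand-ins for the real SkyPilot pieces (SkyletClient,
# ManagedJobCodeGen, exceptions.SkyletMethodNotImplementedError).
class MethodNotImplemented(Exception):
    """Raised when the remote skylet is too old to know this RPC."""


def call_grpc(job_ids):
    # Placeholder for building a protobuf request and invoking the Skylet
    # gRPC client with retries.
    raise MethodNotImplemented()


def call_legacy_codegen(job_ids):
    # Placeholder for generating Python code and running it on the
    # controller head node over SSH.
    return f'cancelled {job_ids} via codegen'


def cancel_with_fallback(job_ids, grpc_enabled: bool) -> str:
    use_legacy = not grpc_enabled
    if not use_legacy:
        try:
            return call_grpc(job_ids)
        except MethodNotImplemented:
            # Older controllers: silently fall back to the legacy path.
            use_legacy = True
    return call_legacy_codegen(job_ids)


if __name__ == '__main__':
    print(cancel_with_fallback([1, 2], grpc_enabled=True))
```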
sky/jobs/server/utils.py CHANGED
@@ -1,13 +1,24 @@
  """Utility functions for managed jobs."""
+ import typing
+
  from sky import backends
+ from sky import exceptions
  from sky import sky_logging
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
+ from sky.backends import cloud_vm_ray_backend
  from sky.jobs import utils as managed_job_utils
  from sky.skylet import constants as skylet_constants
  from sky.utils import controller_utils

  logger = sky_logging.init_logger(__name__)

+ if typing.TYPE_CHECKING:
+ from sky.schemas.generated import managed_jobsv1_pb2
+ else:
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')
+

  def check_version_mismatch_and_non_terminal_jobs() -> None:
  """Check if controller has version mismatch and non-terminal jobs exist.
@@ -28,42 +39,64 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
  backend = backend_utils.get_backend_from_handle(handle)
  assert isinstance(backend, backends.CloudVmRayBackend)

- # Get controller version and raw job table
- code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
-
- returncode, output, stderr = backend.run_on_head(handle,
- code,
- require_outputs=True,
- stream_logs=False,
- separate_stderr=True)
-
- if returncode != 0:
- logger.error(output + stderr)
- raise ValueError('Failed to check controller version and jobs with '
- f'returncode: {returncode}.\n{output + stderr}')
-
- # Parse the output to extract controller version (split only on first
- # newline)
- output_parts = output.strip().split('\n', 1)
-
- # Extract controller version from first line
- if len(output_parts) < 2 or not output_parts[0].startswith(
- 'controller_version:'):
- raise ValueError(
- f'Expected controller version in first line, got: {output}')
-
- controller_version = output_parts[0].split(':', 1)[1]
-
- # Rest is job table payload (preserving any newlines within it)
- job_table_payload = output_parts[1]
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ try:
+ version_request = managed_jobsv1_pb2.GetVersionRequest()
+ version_response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel(
+ )).get_managed_job_controller_version(version_request))
+ controller_version = version_response.controller_version
+
+ job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
+ job_table_response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel()).get_managed_job_table(
+ job_table_request))
+ jobs = managed_job_utils.decode_managed_job_protos(
+ job_table_response.jobs)
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ # Get controller version and raw job table
+ code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
+
+ returncode, output, stderr = backend.run_on_head(handle,
+ code,
+ require_outputs=True,
+ stream_logs=False,
+ separate_stderr=True)
+
+ if returncode != 0:
+ logger.error(output + stderr)
+ raise ValueError('Failed to check controller version and jobs with '
+ f'returncode: {returncode}.\n{output + stderr}')
+
+ # Parse the output to extract controller version (split only on first
+ # newline)
+ output_parts = output.strip().split('\n', 1)
+
+ # Extract controller version from first line
+ if len(output_parts) < 2 or not output_parts[0].startswith(
+ 'controller_version:'):
+ raise ValueError(
+ f'Expected controller version in first line, got: {output}')
+
+ controller_version = output_parts[0].split(':', 1)[1]
+
+ # Rest is job table payload (preserving any newlines within it)
+ job_table_payload = output_parts[1]
+
+ # Load and filter jobs locally using existing method
+ jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+ job_table_payload)

  # Process locally: check version match and filter non-terminal jobs
  version_matches = (controller_version == local_version or
  int(controller_version) > 17)
-
- # Load and filter jobs locally using existing method
- jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
- job_table_payload)
  non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
  has_non_terminal_jobs = len(non_terminal_jobs) > 0

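For reference, the legacy branch retained above parses the codegen output by hand: the first line carries controller_version:<version> and everything after the first newline is the encoded job-table payload. A small standalone sketch of that parsing, using made-up sample output:

```python
def parse_version_and_payload(output: str):
    # Split on the first newline only; the payload may itself contain newlines.
    parts = output.strip().split('\n', 1)
    if len(parts) < 2 or not parts[0].startswith('controller_version:'):
        raise ValueError(f'Expected controller version in first line, got: {output}')
    controller_version = parts[0].split(':', 1)[1]
    job_table_payload = parts[1]
    return controller_version, job_table_payload


# Made-up sample output for illustration only.
version, payload = parse_version_and_payload(
    'controller_version:18\n<encoded job table payload>')
assert version == '18'
```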
sky/jobs/state.py CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.ext import declarative
  from sky import exceptions
  from sky import sky_logging
  from sky import skypilot_config
+ from sky.adaptors import common as adaptors_common
  from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import context_utils
@@ -34,6 +35,11 @@ from sky.utils.db import migration_utils
  if typing.TYPE_CHECKING:
  from sqlalchemy.engine import row

+ from sky.schemas.generated import managed_jobsv1_pb2
+ else:
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')
+
  # Separate callback types for sync and async contexts
  SyncCallbackType = Callable[[str], None]
  AsyncCallbackType = Callable[[str], Awaitable[Any]]
@@ -448,6 +454,75 @@ class ManagedJobStatus(enum.Enum):
  cls.RECOVERING,
  ]

+ @classmethod
+ def from_protobuf(
+ cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobStatus'
+ ) -> Optional['ManagedJobStatus']:
+ """Convert protobuf ManagedJobStatus enum to Python enum value."""
+ protobuf_to_enum = {
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_UNSPECIFIED: None,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING: cls.PENDING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED:
+ cls.DEPRECATED_SUBMITTED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING: cls.STARTING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING: cls.RUNNING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED: cls.FAILED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER:
+ cls.FAILED_CONTROLLER,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP:
+ cls.FAILED_SETUP,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED: cls.CANCELLED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING: cls.RECOVERING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING: cls.CANCELLING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS:
+ cls.FAILED_PRECHECKS,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE:
+ cls.FAILED_NO_RESOURCE,
+ }
+
+ if protobuf_value not in protobuf_to_enum:
+ raise ValueError(
+ f'Unknown protobuf ManagedJobStatus value: {protobuf_value}')
+
+ return protobuf_to_enum[protobuf_value]
+
+ def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobStatus':
+ """Convert this Python enum value to protobuf enum value."""
+ enum_to_protobuf = {
+ ManagedJobStatus.PENDING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING,
+ ManagedJobStatus.DEPRECATED_SUBMITTED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED,
+ ManagedJobStatus.STARTING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING,
+ ManagedJobStatus.RUNNING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING,
+ ManagedJobStatus.SUCCEEDED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED,
+ ManagedJobStatus.FAILED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED,
+ ManagedJobStatus.FAILED_CONTROLLER:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER,
+ ManagedJobStatus.FAILED_SETUP:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP,
+ ManagedJobStatus.CANCELLED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED,
+ ManagedJobStatus.RECOVERING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING,
+ ManagedJobStatus.CANCELLING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING,
+ ManagedJobStatus.FAILED_PRECHECKS:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS,
+ ManagedJobStatus.FAILED_NO_RESOURCE:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE,
+ }
+
+ if self not in enum_to_protobuf:
+ raise ValueError(f'Unknown ManagedJobStatus value: {self}')
+
+ return enum_to_protobuf[self]
+

  _SPOT_STATUS_TO_COLOR = {
  ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -537,6 +612,60 @@ class ManagedJobScheduleState(enum.Enum):
  # The job is in a terminal state. (Not necessarily SUCCEEDED.)
  DONE = 'DONE'

+ @classmethod
+ def from_protobuf(
+ cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobScheduleState'
+ ) -> Optional['ManagedJobScheduleState']:
+ """Convert protobuf ManagedJobScheduleState enum to Python enum value.
+ """
+ protobuf_to_enum = {
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: None,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID: cls.INVALID,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE:
+ cls.INACTIVE,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING: cls.WAITING,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING:
+ cls.ALIVE_WAITING,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING:
+ cls.LAUNCHING,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF:
+ cls.ALIVE_BACKOFF,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE: cls.ALIVE,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE: cls.DONE,
+ }
+
+ if protobuf_value not in protobuf_to_enum:
+ raise ValueError('Unknown protobuf ManagedJobScheduleState value: '
+ f'{protobuf_value}')
+
+ return protobuf_to_enum[protobuf_value]
+
+ def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobScheduleState':
+ """Convert this Python enum value to protobuf enum value."""
+ enum_to_protobuf = {
+ ManagedJobScheduleState.INVALID:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID,
+ ManagedJobScheduleState.INACTIVE:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE,
+ ManagedJobScheduleState.WAITING:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING,
+ ManagedJobScheduleState.ALIVE_WAITING:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING,
+ ManagedJobScheduleState.LAUNCHING:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING,
+ ManagedJobScheduleState.ALIVE_BACKOFF:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF,
+ ManagedJobScheduleState.ALIVE:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE,
+ ManagedJobScheduleState.DONE:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE,
+ }
+
+ if self not in enum_to_protobuf:
+ raise ValueError(f'Unknown ManagedJobScheduleState value: {self}')
+
+ return enum_to_protobuf[self]
+

  # === Status transition functions ===
  @_init_db
@@ -792,8 +921,14 @@ def set_local_log_file(job_id: int, task_id: Optional[int],
  # ======== utility functions ========
  @_init_db
  def get_nonterminal_job_ids_by_name(name: Optional[str],
+ user_hash: Optional[str] = None,
  all_users: bool = False) -> List[int]:
- """Get non-terminal job ids by name."""
+ """Get non-terminal job ids by name.
+
+ If name is None:
+ 1. if all_users is False, get for the given user_hash
+ 2. otherwise, get for all users
+ """
  assert _SQLALCHEMY_ENGINE is not None

  with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -810,8 +945,15 @@ def get_nonterminal_job_ids_by_name(name: Optional[str],
  ])
  ]
  if name is None and not all_users:
- where_conditions.append(
- job_info_table.c.user_hash == common_utils.get_user_hash())
+ if user_hash is None:
+ # For backwards compatibility. With codegen, USER_ID_ENV_VAR
+ # was set to the correct value by the jobs controller, as
+ # part of ManagedJobCodeGen._build(). This is no longer the
+ # case for the Skylet gRPC server, which is why we need to
+ # pass it explicitly through the request body.
+ logger.debug('user_hash is None, using current user hash')
+ user_hash = common_utils.get_user_hash()
+ where_conditions.append(job_info_table.c.user_hash == user_hash)
  if name is not None:
  # We match the job name from `job_info` for the jobs submitted after
  # #1982, and from `spot` for the jobs submitted before #1982, whose
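
The new from_protobuf/to_protobuf helpers above are plain lookup tables; the only subtleties are that the proto UNSPECIFIED value maps to Python None and that unknown values raise. A reduced, self-contained sketch of the same pattern (the integer constants and enum members below are illustrative stand-ins, not the real managed_jobsv1_pb2 values):

```python
import enum
from typing import Optional

# Stand-in integer constants; the real values live in managed_jobsv1_pb2.
STATUS_UNSPECIFIED, STATUS_PENDING, STATUS_RUNNING = 0, 1, 2


class JobStatus(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'

    @classmethod
    def from_protobuf(cls, value: int) -> Optional['JobStatus']:
        # UNSPECIFIED maps to None; unknown values are an error.
        mapping = {
            STATUS_UNSPECIFIED: None,
            STATUS_PENDING: cls.PENDING,
            STATUS_RUNNING: cls.RUNNING,
        }
        if value not in mapping:
            raise ValueError(f'Unknown protobuf status value: {value}')
        return mapping[value]

    def to_protobuf(self) -> int:
        return {
            JobStatus.PENDING: STATUS_PENDING,
            JobStatus.RUNNING: STATUS_RUNNING,
        }[self]


assert JobStatus.from_protobuf(STATUS_PENDING) is JobStatus.PENDING
assert JobStatus.from_protobuf(STATUS_UNSPECIFIED) is None
assert JobStatus.PENDING.to_protobuf() == STATUS_PENDING
```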
sky/jobs/utils.py CHANGED
@@ -16,8 +16,8 @@ import textwrap
  import time
  import traceback
  import typing
- from typing import (Any, Deque, Dict, List, Literal, Optional, Set, TextIO,
- Tuple, Union)
+ from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
+ TextIO, Tuple, Union)

  import colorama
  import filelock
@@ -51,16 +51,23 @@ from sky.utils import subprocess_utils
  from sky.utils import ux_utils

  if typing.TYPE_CHECKING:
+ from google.protobuf import descriptor
+ from google.protobuf import json_format
  import grpc
  import psutil

  import sky
  from sky import dag as dag_lib
  from sky.schemas.generated import jobsv1_pb2
+ from sky.schemas.generated import managed_jobsv1_pb2
  else:
+ json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+ descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
  psutil = adaptors_common.LazyImport('psutil')
  grpc = adaptors_common.LazyImport('grpc')
  jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')

  logger = sky_logging.init_logger(__name__)

@@ -169,7 +176,7 @@ def _validate_consolidation_mode_config(
  if all_jobs:
  nonterminal_jobs = (
  managed_job_state.get_nonterminal_job_ids_by_name(
- None, all_users=True))
+ None, None, all_users=True))
  if nonterminal_jobs:
  with ux_utils.print_exception_no_traceback():
  raise exceptions.InconsistentConsolidationModeError(
@@ -698,14 +705,15 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:

  def cancel_jobs_by_id(job_ids: Optional[List[int]],
  all_users: bool = False,
- current_workspace: Optional[str] = None) -> str:
+ current_workspace: Optional[str] = None,
+ user_hash: Optional[str] = None) -> str:
  """Cancel jobs by id.

  If job_ids is None, cancel all jobs.
  """
  if job_ids is None:
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
- None, all_users)
+ None, user_hash, all_users)
  job_ids = list(set(job_ids))
  if not job_ids:
  return 'No job to cancel.'
@@ -1241,6 +1249,24 @@ def dump_managed_job_queue(
  user_hashes: Optional[List[Optional[str]]] = None,
  statuses: Optional[List[str]] = None,
  ) -> str:
+ return message_utils.encode_payload(
+ get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
+ workspace_match, name_match, pool_match, page,
+ limit, user_hashes, statuses))
+
+
+ def get_managed_job_queue(
+ skip_finished: bool = False,
+ accessible_workspaces: Optional[List[str]] = None,
+ job_ids: Optional[List[int]] = None,
+ workspace_match: Optional[str] = None,
+ name_match: Optional[str] = None,
+ pool_match: Optional[str] = None,
+ page: Optional[int] = None,
+ limit: Optional[int] = None,
+ user_hashes: Optional[List[Optional[str]]] = None,
+ statuses: Optional[List[str]] = None,
+ ) -> Dict[str, Any]:
  # Make sure to get all jobs - some logic below (e.g. high priority job
  # detection) requires a full view of the jobs table.
  jobs = managed_job_state.get_managed_jobs()
@@ -1371,12 +1397,12 @@ def dump_managed_job_queue(
  else:
  job['details'] = None

- return message_utils.encode_payload({
+ return {
  'jobs': jobs,
  'total': total,
  'total_no_filter': total_no_filter,
  'status_counts': status_counts
- })
+ }


  def filter_jobs(
@@ -1824,6 +1850,58 @@ def format_job_table(
  return output


+ def decode_managed_job_protos(
+ job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
+ ) -> List[Dict[str, Any]]:
+ """Decode job protos to dicts. Similar to load_managed_job_queue."""
+ user_hash_to_user = global_user_state.get_users(
+ set(job.user_hash for job in job_protos if job.user_hash))
+
+ jobs = []
+ for job_proto in job_protos:
+ job_dict = _job_proto_to_dict(job_proto)
+ user_hash = job_dict.get('user_hash', None)
+ if user_hash is not None:
+ # Skip jobs that do not have user_hash info.
+ # TODO(cooperc): Remove check before 0.12.0.
+ user = user_hash_to_user.get(user_hash, None)
+ job_dict['user_name'] = user.name if user is not None else None
+ jobs.append(job_dict)
+ return jobs
+
+
+ def _job_proto_to_dict(
+ job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
+ job_dict = json_format.MessageToDict(
+ job_proto,
+ always_print_fields_with_no_presence=True,
+ # Our API returns fields in snake_case.
+ preserving_proto_field_name=True,
+ use_integers_for_enums=True)
+ for field in job_proto.DESCRIPTOR.fields:
+ # Ensure optional fields are present with None values for
+ # backwards compatibility with older clients.
+ if field.has_presence and field.name not in job_dict:
+ job_dict[field.name] = None
+ # json_format.MessageToDict is meant for encoding to JSON,
+ # and Protobuf encodes int64 as decimal strings in JSON,
+ # so we need to convert them back to ints.
+ # https://protobuf.dev/programming-guides/json/#field-representation
+ if field.type == descriptor.FieldDescriptor.TYPE_INT64:
+ job_dict[field.name] = int(job_dict[field.name])
+ job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
+ job_dict['status'])
+ # For backwards compatibility, convert schedule_state to a string,
+ # as we don't have the logic to handle it in our request
+ # encoder/decoder, unlike status.
+ schedule_state_enum = (
+ managed_job_state.ManagedJobScheduleState.from_protobuf(
+ job_dict['schedule_state']))
+ job_dict['schedule_state'] = (schedule_state_enum.value
+ if schedule_state_enum is not None else None)
+ return job_dict
+
+
  class ManagedJobCodeGen:
  """Code generator for managed job utility functions.

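The _job_proto_to_dict helper above leans on two protobuf JSON-mapping details: int64 fields are rendered as decimal strings, and unset optional fields are omitted. A standalone sketch of the resulting dict clean-up, using made-up field names rather than the real ManagedJobInfo schema:

```python
from typing import Any, Dict

INT64_FIELDS = {'job_id', 'submitted_at'}      # assumed int64 fields
OPTIONAL_FIELDS = {'pool', 'user_hash'}        # assumed optional fields


def normalize_job_dict(raw: Dict[str, Any]) -> Dict[str, Any]:
    job = dict(raw)
    # Back-fill optional fields so older clients always see the key.
    for name in OPTIONAL_FIELDS:
        job.setdefault(name, None)
    # Convert JSON-encoded int64 strings back to Python ints.
    for name in INT64_FIELDS:
        if job.get(name) is not None:
            job[name] = int(job[name])
    return job


print(normalize_job_dict({'job_id': '42', 'submitted_at': '1758844800'}))
# -> {'job_id': 42, 'submitted_at': 1758844800, 'pool': None, 'user_hash': None}
```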
sky/provision/runpod/__init__.py CHANGED
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
  from sky.provision.runpod.instance import wait_instances
  from sky.provision.runpod.volume import apply_volume
  from sky.provision.runpod.volume import delete_volume
+ from sky.provision.runpod.volume import get_all_volumes_usedby
  from sky.provision.runpod.volume import get_volume_usedby
+ from sky.provision.runpod.volume import map_all_volumes_usedby