skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly might be problematic.

Files changed (57)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +25 -4
  3. sky/backends/cloud_vm_ray_backend.py +151 -36
  4. sky/client/cli/command.py +2 -1
  5. sky/client/cli/table_utils.py +34 -0
  6. sky/client/sdk.py +7 -5
  7. sky/client/sdk_async.py +5 -5
  8. sky/core.py +3 -4
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-7340bc0f0dd8ae74.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/execution.py +0 -1
  30. sky/global_user_state.py +3 -3
  31. sky/jobs/server/core.py +96 -26
  32. sky/jobs/server/utils.py +65 -32
  33. sky/jobs/state.py +145 -3
  34. sky/jobs/utils.py +85 -7
  35. sky/schemas/api/responses.py +18 -0
  36. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  37. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  38. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  39. sky/serve/serve_utils.py +16 -0
  40. sky/serve/server/core.py +1 -1
  41. sky/serve/server/impl.py +6 -6
  42. sky/server/requests/serializers/decoders.py +2 -2
  43. sky/server/requests/serializers/encoders.py +7 -3
  44. sky/skylet/constants.py +1 -1
  45. sky/skylet/job_lib.py +2 -32
  46. sky/skylet/log_lib.py +211 -0
  47. sky/skylet/log_lib.pyi +30 -1
  48. sky/skylet/services.py +208 -2
  49. sky/skylet/skylet.py +3 -0
  50. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +32 -32
  51. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +56 -52
  52. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  53. /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
  54. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
  55. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
  56. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
  57. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7340bc0f0dd8ae74.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"UDSEoDB67vwFMZyCJ4HWU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/execution.py CHANGED
@@ -673,7 +673,6 @@ def launch(
  # see the setup logs when inspecting the launch process to know
  # excatly what the job is waiting for.
  detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
-
  return _execute(
  entrypoint=entrypoint,
  dryrun=dryrun,
sky/global_user_state.py CHANGED
@@ -483,7 +483,7 @@ def get_user(user_id: str) -> Optional[models.User]:

  @_init_db
  @metrics_lib.time_me
- def _get_users(user_ids: Set[str]) -> Dict[str, models.User]:
+ def get_users(user_ids: Set[str]) -> Dict[str, models.User]:
  assert _SQLALCHEMY_ENGINE is not None
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
  rows = session.query(user_table).filter(
@@ -1659,7 +1659,7 @@ def get_clusters(

  # get all users needed for the rows at once
  user_hashes = set(row_to_user_hash.values())
- user_hash_to_user = _get_users(user_hashes)
+ user_hash_to_user = get_users(user_hashes)

  # get last cluster event for each row
  cluster_hashes = set(row_to_user_hash.keys())
@@ -1807,7 +1807,7 @@ def get_clusters_from_history(
  row_to_user_hash[row.cluster_hash] = user_hash

  user_hashes = set(row_to_user_hash.values())
- user_hash_to_user = _get_users(user_hashes)
+ user_hash_to_user = get_users(user_hashes)
  cluster_hashes = set(row_to_user_hash.keys())
  if not abbreviate_response:
  last_cluster_event_dict = _get_last_cluster_event_multiple(
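The only change here is that the batched user lookup drops its leading underscore so callers outside global_user_state can use it. A hypothetical sketch of the call shape (this in-memory get_users is a stand-in, not the real SQLAlchemy-backed implementation):

from typing import Dict, NamedTuple, Set

class User(NamedTuple):
    id: str
    name: str

# Toy data standing in for the users table.
_FAKE_DB = {'abc123': User('abc123', 'alice'), 'def456': User('def456', 'bob')}

def get_users(user_ids: Set[str]) -> Dict[str, User]:
    """One call resolves a whole set of user hashes, instead of one lookup per row."""
    return {uid: _FAKE_DB[uid] for uid in user_ids if uid in _FAKE_DB}

user_hash_to_user = get_users({'abc123', 'missing'})
assert 'abc123' in user_hash_to_user and 'missing' not in user_hash_to_user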
sky/jobs/server/core.py CHANGED
@@ -19,7 +19,9 @@ from sky import provision as provision_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky import task as task_lib
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
+ from sky.backends import cloud_vm_ray_backend
  from sky.catalog import common as service_catalog_common
  from sky.data import storage as storage_lib
  from sky.jobs import constants as managed_job_constants
@@ -44,8 +46,15 @@ from sky.utils import ux_utils
  from sky.workspaces import core as workspaces_core

  if typing.TYPE_CHECKING:
+ from google.protobuf import json_format
+
  import sky
- from sky.backends import cloud_vm_ray_backend
+ from sky.schemas.generated import managed_jobsv1_pb2
+ else:
+ json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')

  logger = sky_logging.init_logger(__name__)

@@ -701,11 +710,13 @@ def queue_v2(
  assert isinstance(backend, backends.CloudVmRayBackend)

  user_hashes: Optional[List[Optional[str]]] = None
+ show_jobs_without_user_hash = False
  if not all_users:
  user_hashes = [common_utils.get_user_hash()]
  # For backwards compatibility, we show jobs that do not have a
  # user_hash. TODO(cooperc): Remove before 0.12.0.
  user_hashes.append(None)
+ show_jobs_without_user_hash = True
  elif user_match is not None:
  users = global_user_state.get_user_by_name_match(user_match)
  if not users:
@@ -713,6 +724,38 @@ def queue_v2(
  user_hashes = [user.id for user in users]

  accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = managed_jobsv1_pb2.GetJobTableRequest(
+ skip_finished=skip_finished,
+ accessible_workspaces=accessible_workspaces,
+ job_ids=managed_jobsv1_pb2.JobIds(
+ ids=job_ids) if job_ids is not None else None,
+ workspace_match=workspace_match,
+ name_match=name_match,
+ pool_match=pool_match,
+ page=page,
+ limit=limit,
+ # Remove None from user_hashes, as the gRPC server uses the
+ # show_jobs_without_user_hash flag instead.
+ user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+ user_hash for user_hash in user_hashes
+ if user_hash is not None
+ ]) if user_hashes is not None else None,
+ statuses=managed_jobsv1_pb2.Statuses(
+ statuses=statuses) if statuses is not None else None,
+ show_jobs_without_user_hash=show_jobs_without_user_hash,
+ )
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel()).get_managed_job_table(request))
+ jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+ return jobs, response.total, dict(
+ response.status_counts), response.total_no_filter
+ except exceptions.SkyletMethodNotImplementedError:
+ pass
+
  code = managed_job_utils.ManagedJobCodeGen.get_job_table(
  skip_finished, accessible_workspaces, job_ids, workspace_match,
  name_match, pool_match, page, limit, user_hashes, statuses)
@@ -819,33 +862,60 @@ def cancel(name: Optional[str] = None,
  'Can only specify one of JOB_IDS, name, pool, or all/'
  f'all_users. Provided {" ".join(arguments)!r}.')

+ job_ids = None if (all_users or all) else job_ids
+
  backend = backend_utils.get_backend_from_handle(handle)
  assert isinstance(backend, backends.CloudVmRayBackend)
- if all_users:
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
- None, all_users=True)
- elif all:
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
- elif job_ids:
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
- job_ids)
- elif name is not None:
- code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
- else:
- assert pool is not None, (job_ids, name, pool, all)
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(pool)
- # The stderr is redirected to stdout
- returncode, stdout, stderr = backend.run_on_head(handle,
- code,
- require_outputs=True,
- stream_logs=False)
- try:
- subprocess_utils.handle_returncode(returncode, code,
- 'Failed to cancel managed job',
- stdout + stderr)
- except exceptions.CommandError as e:
- with ux_utils.print_exception_no_traceback():
- raise RuntimeError(e.error_msg) from e
+
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ current_workspace = skypilot_config.get_active_workspace()
+ try:
+ request = managed_jobsv1_pb2.CancelJobsRequest(
+ current_workspace=current_workspace)
+
+ if all_users or all or job_ids:
+ request.all_users = all_users
+ if all:
+ request.user_hash = common_utils.get_user_hash()
+ if job_ids is not None:
+ request.job_ids.CopyFrom(
+ managed_jobsv1_pb2.JobIds(ids=job_ids))
+ elif name is not None:
+ request.job_name = name
+ else:
+ assert pool is not None, (job_ids, name, pool, all)
+ request.pool_name = pool
+
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel()).cancel_managed_jobs(request))
+ stdout = response.message
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ if all_users or all or job_ids:
+ code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+ job_ids, all_users=all_users)
+ elif name is not None:
+ code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+ name)
+ else:
+ assert pool is not None, (job_ids, name, pool, all)
+ code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+ pool)
+ # The stderr is redirected to stdout
+ returncode, stdout, stderr = backend.run_on_head(
+ handle, code, require_outputs=True, stream_logs=False)
+ try:
+ subprocess_utils.handle_returncode(
+ returncode, code, 'Failed to cancel managed job',
+ stdout + stderr)
+ except exceptions.CommandError as e:
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(e.error_msg) from e

  logger.info(stdout)
  if 'Multiple jobs found with name' in stdout:
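Both queue_v2 and cancel now follow the same shape: if the cluster handle reports gRPC support, try the Skylet RPC first and fall back to the legacy ManagedJobCodeGen path when the controller's Skylet does not implement the method yet. A minimal, self-contained sketch of that fallback shape (MethodNotImplementedError, rpc_call and legacy_call are illustrative stand-ins, not SkyPilot APIs):

from typing import Callable, List

class MethodNotImplementedError(Exception):
    """Stand-in for exceptions.SkyletMethodNotImplementedError."""

def fetch_job_table(grpc_enabled: bool,
                    rpc_call: Callable[[], List[dict]],
                    legacy_call: Callable[[], List[dict]]) -> List[dict]:
    """Prefer the RPC path; fall back to codegen when the server lacks the method."""
    if grpc_enabled:
        try:
            return rpc_call()
        except MethodNotImplementedError:
            pass  # Older Skylet: the RPC does not exist yet, use the legacy path.
    return legacy_call()

def _old_server() -> List[dict]:
    raise MethodNotImplementedError()

def _legacy() -> List[dict]:
    return [{'job_id': 1, 'status': 'RUNNING'}]

# Even with gRPC enabled, an old server transparently falls back to codegen.
assert fetch_job_table(True, _old_server, _legacy) == [{'job_id': 1, 'status': 'RUNNING'}]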
sky/jobs/server/utils.py CHANGED
@@ -1,13 +1,24 @@
  """Utility functions for managed jobs."""
+ import typing
+
  from sky import backends
+ from sky import exceptions
  from sky import sky_logging
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
+ from sky.backends import cloud_vm_ray_backend
  from sky.jobs import utils as managed_job_utils
  from sky.skylet import constants as skylet_constants
  from sky.utils import controller_utils

  logger = sky_logging.init_logger(__name__)

+ if typing.TYPE_CHECKING:
+ from sky.schemas.generated import managed_jobsv1_pb2
+ else:
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')
+

  def check_version_mismatch_and_non_terminal_jobs() -> None:
  """Check if controller has version mismatch and non-terminal jobs exist.
@@ -28,42 +39,64 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
  backend = backend_utils.get_backend_from_handle(handle)
  assert isinstance(backend, backends.CloudVmRayBackend)

- # Get controller version and raw job table
- code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
-
- returncode, output, stderr = backend.run_on_head(handle,
- code,
- require_outputs=True,
- stream_logs=False,
- separate_stderr=True)
-
- if returncode != 0:
- logger.error(output + stderr)
- raise ValueError('Failed to check controller version and jobs with '
- f'returncode: {returncode}.\n{output + stderr}')
-
- # Parse the output to extract controller version (split only on first
- # newline)
- output_parts = output.strip().split('\n', 1)
-
- # Extract controller version from first line
- if len(output_parts) < 2 or not output_parts[0].startswith(
- 'controller_version:'):
- raise ValueError(
- f'Expected controller version in first line, got: {output}')
-
- controller_version = output_parts[0].split(':', 1)[1]
-
- # Rest is job table payload (preserving any newlines within it)
- job_table_payload = output_parts[1]
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ try:
+ version_request = managed_jobsv1_pb2.GetVersionRequest()
+ version_response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel(
+ )).get_managed_job_controller_version(version_request))
+ controller_version = version_response.controller_version
+
+ job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
+ job_table_response = backend_utils.invoke_skylet_with_retries(
+ lambda: cloud_vm_ray_backend.SkyletClient(
+ handle.get_grpc_channel()).get_managed_job_table(
+ job_table_request))
+ jobs = managed_job_utils.decode_managed_job_protos(
+ job_table_response.jobs)
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ # Get controller version and raw job table
+ code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
+
+ returncode, output, stderr = backend.run_on_head(handle,
+ code,
+ require_outputs=True,
+ stream_logs=False,
+ separate_stderr=True)
+
+ if returncode != 0:
+ logger.error(output + stderr)
+ raise ValueError('Failed to check controller version and jobs with '
+ f'returncode: {returncode}.\n{output + stderr}')
+
+ # Parse the output to extract controller version (split only on first
+ # newline)
+ output_parts = output.strip().split('\n', 1)
+
+ # Extract controller version from first line
+ if len(output_parts) < 2 or not output_parts[0].startswith(
+ 'controller_version:'):
+ raise ValueError(
+ f'Expected controller version in first line, got: {output}')
+
+ controller_version = output_parts[0].split(':', 1)[1]
+
+ # Rest is job table payload (preserving any newlines within it)
+ job_table_payload = output_parts[1]
+
+ # Load and filter jobs locally using existing method
+ jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+ job_table_payload)

  # Process locally: check version match and filter non-terminal jobs
  version_matches = (controller_version == local_version or
  int(controller_version) > 17)
-
- # Load and filter jobs locally using existing method
- jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
- job_table_payload)
  non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
  has_non_terminal_jobs = len(non_terminal_jobs) > 0

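The fallback branch above still speaks the legacy plain-text protocol: the first line of the codegen output is controller_version:<version>, and everything after the first newline is the job-table payload. A small sketch of that parse, under the same format assumption (parse_version_and_payload is an illustrative helper, not a SkyPilot function):

def parse_version_and_payload(output: str) -> tuple:
    """Split the first line (controller_version:<v>) from the remaining payload."""
    parts = output.strip().split('\n', 1)
    if len(parts) < 2 or not parts[0].startswith('controller_version:'):
        raise ValueError(f'Expected controller version in first line, got: {output}')
    version = parts[0].split(':', 1)[1]
    return version, parts[1]  # the payload may itself contain newlines

assert parse_version_and_payload('controller_version:18\n{"jobs": []}') == (
    '18', '{"jobs": []}')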
sky/jobs/state.py CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.ext import declarative
  from sky import exceptions
  from sky import sky_logging
  from sky import skypilot_config
+ from sky.adaptors import common as adaptors_common
  from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import context_utils
@@ -34,6 +35,11 @@ from sky.utils.db import migration_utils
  if typing.TYPE_CHECKING:
  from sqlalchemy.engine import row

+ from sky.schemas.generated import managed_jobsv1_pb2
+ else:
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')
+
  # Separate callback types for sync and async contexts
  SyncCallbackType = Callable[[str], None]
  AsyncCallbackType = Callable[[str], Awaitable[Any]]
@@ -448,6 +454,75 @@ class ManagedJobStatus(enum.Enum):
  cls.RECOVERING,
  ]

+ @classmethod
+ def from_protobuf(
+ cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobStatus'
+ ) -> Optional['ManagedJobStatus']:
+ """Convert protobuf ManagedJobStatus enum to Python enum value."""
+ protobuf_to_enum = {
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_UNSPECIFIED: None,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING: cls.PENDING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED:
+ cls.DEPRECATED_SUBMITTED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING: cls.STARTING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING: cls.RUNNING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED: cls.FAILED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER:
+ cls.FAILED_CONTROLLER,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP:
+ cls.FAILED_SETUP,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED: cls.CANCELLED,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING: cls.RECOVERING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING: cls.CANCELLING,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS:
+ cls.FAILED_PRECHECKS,
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE:
+ cls.FAILED_NO_RESOURCE,
+ }
+
+ if protobuf_value not in protobuf_to_enum:
+ raise ValueError(
+ f'Unknown protobuf ManagedJobStatus value: {protobuf_value}')
+
+ return protobuf_to_enum[protobuf_value]
+
+ def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobStatus':
+ """Convert this Python enum value to protobuf enum value."""
+ enum_to_protobuf = {
+ ManagedJobStatus.PENDING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING,
+ ManagedJobStatus.DEPRECATED_SUBMITTED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED,
+ ManagedJobStatus.STARTING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING,
+ ManagedJobStatus.RUNNING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING,
+ ManagedJobStatus.SUCCEEDED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED,
+ ManagedJobStatus.FAILED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED,
+ ManagedJobStatus.FAILED_CONTROLLER:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER,
+ ManagedJobStatus.FAILED_SETUP:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP,
+ ManagedJobStatus.CANCELLED:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED,
+ ManagedJobStatus.RECOVERING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING,
+ ManagedJobStatus.CANCELLING:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING,
+ ManagedJobStatus.FAILED_PRECHECKS:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS,
+ ManagedJobStatus.FAILED_NO_RESOURCE:
+ managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE,
+ }
+
+ if self not in enum_to_protobuf:
+ raise ValueError(f'Unknown ManagedJobStatus value: {self}')
+
+ return enum_to_protobuf[self]
+

  _SPOT_STATUS_TO_COLOR = {
  ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -537,6 +612,60 @@ class ManagedJobScheduleState(enum.Enum):
  # The job is in a terminal state. (Not necessarily SUCCEEDED.)
  DONE = 'DONE'

+ @classmethod
+ def from_protobuf(
+ cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobScheduleState'
+ ) -> Optional['ManagedJobScheduleState']:
+ """Convert protobuf ManagedJobScheduleState enum to Python enum value.
+ """
+ protobuf_to_enum = {
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: None,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID: cls.INVALID,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE:
+ cls.INACTIVE,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING: cls.WAITING,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING:
+ cls.ALIVE_WAITING,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING:
+ cls.LAUNCHING,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF:
+ cls.ALIVE_BACKOFF,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE: cls.ALIVE,
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE: cls.DONE,
+ }
+
+ if protobuf_value not in protobuf_to_enum:
+ raise ValueError('Unknown protobuf ManagedJobScheduleState value: '
+ f'{protobuf_value}')
+
+ return protobuf_to_enum[protobuf_value]
+
+ def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobScheduleState':
+ """Convert this Python enum value to protobuf enum value."""
+ enum_to_protobuf = {
+ ManagedJobScheduleState.INVALID:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID,
+ ManagedJobScheduleState.INACTIVE:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE,
+ ManagedJobScheduleState.WAITING:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING,
+ ManagedJobScheduleState.ALIVE_WAITING:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING,
+ ManagedJobScheduleState.LAUNCHING:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING,
+ ManagedJobScheduleState.ALIVE_BACKOFF:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF,
+ ManagedJobScheduleState.ALIVE:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE,
+ ManagedJobScheduleState.DONE:
+ managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE,
+ }
+
+ if self not in enum_to_protobuf:
+ raise ValueError(f'Unknown ManagedJobScheduleState value: {self}')
+
+ return enum_to_protobuf[self]
+

  # === Status transition functions ===
  @_init_db
@@ -792,8 +921,14 @@ def set_local_log_file(job_id: int, task_id: Optional[int],
  # ======== utility functions ========
  @_init_db
  def get_nonterminal_job_ids_by_name(name: Optional[str],
+ user_hash: Optional[str] = None,
  all_users: bool = False) -> List[int]:
- """Get non-terminal job ids by name."""
+ """Get non-terminal job ids by name.
+
+ If name is None:
+ 1. if all_users is False, get for the given user_hash
+ 2. otherwise, get for all users
+ """
  assert _SQLALCHEMY_ENGINE is not None

  with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -810,8 +945,15 @@ def get_nonterminal_job_ids_by_name(name: Optional[str],
  ])
  ]
  if name is None and not all_users:
- where_conditions.append(
- job_info_table.c.user_hash == common_utils.get_user_hash())
+ if user_hash is None:
+ # For backwards compatibility. With codegen, USER_ID_ENV_VAR
+ # was set to the correct value by the jobs controller, as
+ # part of ManagedJobCodeGen._build(). This is no longer the
+ # case for the Skylet gRPC server, which is why we need to
+ # pass it explicitly through the request body.
+ logger.debug('user_hash is None, using current user hash')
+ user_hash = common_utils.get_user_hash()
+ where_conditions.append(job_info_table.c.user_hash == user_hash)
  if name is not None:
  # We match the job name from `job_info` for the jobs submitted after
  # #1982, and from `spot` for the jobs submitted before #1982, whose
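The from_protobuf/to_protobuf helpers added to ManagedJobStatus and ManagedJobScheduleState in sky/jobs/state.py are straight dictionary lookups in each direction, with the UNSPECIFIED value mapping to None and unknown values raising ValueError. A toy round-trip with a hypothetical two-value enum shows the shape (the real converters map the generated managed_jobsv1_pb2 constants):

import enum
from typing import Optional

# Hypothetical stand-ins for generated protobuf enum constants.
PB_UNSPECIFIED, PB_PENDING, PB_RUNNING = 0, 1, 2

class Status(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'

    @classmethod
    def from_protobuf(cls, value: int) -> Optional['Status']:
        mapping = {PB_UNSPECIFIED: None, PB_PENDING: cls.PENDING, PB_RUNNING: cls.RUNNING}
        if value not in mapping:
            raise ValueError(f'Unknown protobuf status value: {value}')
        return mapping[value]

    def to_protobuf(self) -> int:
        return {Status.PENDING: PB_PENDING, Status.RUNNING: PB_RUNNING}[self]

# Round-trip preserves the enum member; an unset protobuf field decodes to None.
assert Status.from_protobuf(Status.RUNNING.to_protobuf()) is Status.RUNNING
assert Status.from_protobuf(PB_UNSPECIFIED) is None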