skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +38 -14
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +18 -9
- sky/client/cli/table_utils.py +34 -0
- sky/client/common.py +4 -2
- sky/client/sdk.py +11 -7
- sky/client/sdk_async.py +5 -5
- sky/core.py +6 -6
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +57 -34
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +98 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/provision/runpod/__init__.py +2 -0
- sky/schemas/api/responses.py +18 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/payloads.py +2 -1
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -3
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
- sky/volumes/server/core.py +1 -0
- sky/volumes/volume.py +16 -17
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
- /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -19,7 +19,9 @@ from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
@@ -44,8 +46,15 @@ from sky.utils import ux_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import json_format
+
     import sky
-    from sky.
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -368,6 +377,8 @@ def launch(
         'priority': priority,
         'consolidation_mode_job_id': consolidation_mode_job_id,
         'pool': pool,
+        'job_controller_indicator_file':
+            managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
         **controller_utils.shared_controller_vars_to_fill(
             controller,
             remote_user_config_path=remote_user_config_path,
@@ -699,11 +710,13 @@ def queue_v2(
     assert isinstance(backend, backends.CloudVmRayBackend)
 
     user_hashes: Optional[List[Optional[str]]] = None
+    show_jobs_without_user_hash = False
     if not all_users:
         user_hashes = [common_utils.get_user_hash()]
         # For backwards compatibility, we show jobs that do not have a
         # user_hash. TODO(cooperc): Remove before 0.12.0.
         user_hashes.append(None)
+        show_jobs_without_user_hash = True
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:
@@ -711,6 +724,38 @@ def queue_v2(
         user_hashes = [user.id for user in users]
 
     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            request = managed_jobsv1_pb2.GetJobTableRequest(
+                skip_finished=skip_finished,
+                accessible_workspaces=accessible_workspaces,
+                job_ids=managed_jobsv1_pb2.JobIds(
+                    ids=job_ids) if job_ids is not None else None,
+                workspace_match=workspace_match,
+                name_match=name_match,
+                pool_match=pool_match,
+                page=page,
+                limit=limit,
+                # Remove None from user_hashes, as the gRPC server uses the
+                # show_jobs_without_user_hash flag instead.
+                user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+                    user_hash for user_hash in user_hashes
+                    if user_hash is not None
+                ]) if user_hashes is not None else None,
+                statuses=managed_jobsv1_pb2.Statuses(
+                    statuses=statuses) if statuses is not None else None,
+                show_jobs_without_user_hash=show_jobs_without_user_hash,
+            )
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(request))
+            jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+            return jobs, response.total, dict(
+                response.status_counts), response.total_no_filter
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
         skip_finished, accessible_workspaces, job_ids, workspace_match,
         name_match, pool_match, page, limit, user_hashes, statuses)
@@ -817,33 +862,60 @@ def cancel(name: Optional[str] = None,
             'Can only specify one of JOB_IDS, name, pool, or all/'
             f'all_users. Provided {" ".join(arguments)!r}.')
 
+    job_ids = None if (all_users or all) else job_ids
+
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
  [... 25 removed lines (old 822-846) are collapsed in the source diff view ...]
+
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        current_workspace = skypilot_config.get_active_workspace()
+        try:
+            request = managed_jobsv1_pb2.CancelJobsRequest(
+                current_workspace=current_workspace)
+
+            if all_users or all or job_ids:
+                request.all_users = all_users
+                if all:
+                    request.user_hash = common_utils.get_user_hash()
+                if job_ids is not None:
+                    request.job_ids.CopyFrom(
+                        managed_jobsv1_pb2.JobIds(ids=job_ids))
+            elif name is not None:
+                request.job_name = name
+            else:
+                assert pool is not None, (job_ids, name, pool, all)
+                request.pool_name = pool
+
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).cancel_managed_jobs(request))
+            stdout = response.message
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        if all_users or all or job_ids:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+                job_ids, all_users=all_users)
+        elif name is not None:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+                name)
+        else:
+            assert pool is not None, (job_ids, name, pool, all)
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+                pool)
+        # The stderr is redirected to stdout
+        returncode, stdout, stderr = backend.run_on_head(
+            handle, code, require_outputs=True, stream_logs=False)
+        try:
+            subprocess_utils.handle_returncode(
+                returncode, code, 'Failed to cancel managed job',
+                stdout + stderr)
+        except exceptions.CommandError as e:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(e.error_msg) from e
 
     logger.info(stdout)
     if 'Multiple jobs found with name' in stdout:
sky/jobs/server/utils.py
CHANGED
@@ -1,13 +1,24 @@
 """Utility functions for managed jobs."""
+import typing
+
 from sky import backends
+from sky import exceptions
 from sky import sky_logging
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants as skylet_constants
 from sky.utils import controller_utils
 
 logger = sky_logging.init_logger(__name__)
 
+if typing.TYPE_CHECKING:
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
@@ -28,42 +39,64 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
  [... 28 removed lines (old 31-58) are collapsed in the source diff view ...]
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        try:
+            version_request = managed_jobsv1_pb2.GetVersionRequest()
+            version_response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel(
+                    )).get_managed_job_controller_version(version_request))
+            controller_version = version_response.controller_version
+
+            job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
+            job_table_response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(
+                        job_table_request))
+            jobs = managed_job_utils.decode_managed_job_protos(
+                job_table_response.jobs)
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        # Get controller version and raw job table
+        code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
+
+        returncode, output, stderr = backend.run_on_head(handle,
+                                                         code,
+                                                         require_outputs=True,
+                                                         stream_logs=False,
+                                                         separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(output + stderr)
+            raise ValueError('Failed to check controller version and jobs with '
+                             f'returncode: {returncode}.\n{output + stderr}')
+
+        # Parse the output to extract controller version (split only on first
+        # newline)
+        output_parts = output.strip().split('\n', 1)
+
+        # Extract controller version from first line
+        if len(output_parts) < 2 or not output_parts[0].startswith(
+                'controller_version:'):
+            raise ValueError(
+                f'Expected controller version in first line, got: {output}')
+
+        controller_version = output_parts[0].split(':', 1)[1]
+
+        # Rest is job table payload (preserving any newlines within it)
+        job_table_payload = output_parts[1]
+
+        # Load and filter jobs locally using existing method
+        jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+            job_table_payload)
 
     # Process locally: check version match and filter non-terminal jobs
     version_matches = (controller_version == local_version or
                        int(controller_version) > 17)
-
-    # Load and filter jobs locally using existing method
-    jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
-        job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
    has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
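On the legacy path, `check_version_mismatch_and_non_terminal_jobs()` now parses a combined stdout of the form `controller_version:<version>` on the first line followed by the job-table payload. A small self-contained illustration of that split; the sample output string is invented for the example:

```python
# Invented sample; the real payload is produced by
# ManagedJobCodeGen.get_version_and_job_table() on the controller.
output = 'controller_version:18\n<encoded job table payload\nwith newlines>'

output_parts = output.strip().split('\n', 1)  # split only on first newline
if len(output_parts) < 2 or not output_parts[0].startswith(
        'controller_version:'):
    raise ValueError(f'Expected controller version in first line: {output}')

controller_version = output_parts[0].split(':', 1)[1]  # -> '18'
job_table_payload = output_parts[1]  # keeps internal newlines intact
```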
sky/jobs/state.py
CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.ext import declarative
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import context_utils
@@ -34,6 +35,11 @@ from sky.utils.db import migration_utils
 if typing.TYPE_CHECKING:
     from sqlalchemy.engine import row
 
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+
 # Separate callback types for sync and async contexts
 SyncCallbackType = Callable[[str], None]
 AsyncCallbackType = Callable[[str], Awaitable[Any]]
@@ -448,6 +454,75 @@ class ManagedJobStatus(enum.Enum):
             cls.RECOVERING,
         ]
 
+    @classmethod
+    def from_protobuf(
+        cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobStatus'
+    ) -> Optional['ManagedJobStatus']:
+        """Convert protobuf ManagedJobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_UNSPECIFIED: None,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING: cls.PENDING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED:
+                cls.DEPRECATED_SUBMITTED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING: cls.STARTING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING: cls.RUNNING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED: cls.FAILED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER:
+                cls.FAILED_CONTROLLER,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP:
+                cls.FAILED_SETUP,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED: cls.CANCELLED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING: cls.RECOVERING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING: cls.CANCELLING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS:
+                cls.FAILED_PRECHECKS,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE:
+                cls.FAILED_NO_RESOURCE,
+        }
+
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf ManagedJobStatus value: {protobuf_value}')
+
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            ManagedJobStatus.PENDING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING,
+            ManagedJobStatus.DEPRECATED_SUBMITTED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED,
+            ManagedJobStatus.STARTING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING,
+            ManagedJobStatus.RUNNING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING,
+            ManagedJobStatus.SUCCEEDED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED,
+            ManagedJobStatus.FAILED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED,
+            ManagedJobStatus.FAILED_CONTROLLER:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER,
+            ManagedJobStatus.FAILED_SETUP:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP,
+            ManagedJobStatus.CANCELLED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED,
+            ManagedJobStatus.RECOVERING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING,
+            ManagedJobStatus.CANCELLING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING,
+            ManagedJobStatus.FAILED_PRECHECKS:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS,
+            ManagedJobStatus.FAILED_NO_RESOURCE:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE,
+        }
+
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown ManagedJobStatus value: {self}')
+
+        return enum_to_protobuf[self]
+
 
 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -537,6 +612,60 @@ class ManagedJobScheduleState(enum.Enum):
     # The job is in a terminal state. (Not necessarily SUCCEEDED.)
     DONE = 'DONE'
 
+    @classmethod
+    def from_protobuf(
+        cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobScheduleState'
+    ) -> Optional['ManagedJobScheduleState']:
+        """Convert protobuf ManagedJobScheduleState enum to Python enum value.
+        """
+        protobuf_to_enum = {
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: None,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID: cls.INVALID,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE:
+                cls.INACTIVE,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING: cls.WAITING,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING:
+                cls.ALIVE_WAITING,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING:
+                cls.LAUNCHING,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF:
+                cls.ALIVE_BACKOFF,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE: cls.ALIVE,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE: cls.DONE,
+        }
+
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError('Unknown protobuf ManagedJobScheduleState value: '
+                             f'{protobuf_value}')
+
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobScheduleState':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            ManagedJobScheduleState.INVALID:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID,
+            ManagedJobScheduleState.INACTIVE:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE,
+            ManagedJobScheduleState.WAITING:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING,
+            ManagedJobScheduleState.ALIVE_WAITING:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING,
+            ManagedJobScheduleState.LAUNCHING:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING,
+            ManagedJobScheduleState.ALIVE_BACKOFF:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF,
+            ManagedJobScheduleState.ALIVE:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE,
+            ManagedJobScheduleState.DONE:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE,
+        }
+
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown ManagedJobScheduleState value: {self}')
+
+        return enum_to_protobuf[self]
+
 
 # === Status transition functions ===
 @_init_db
@@ -792,8 +921,14 @@ def set_local_log_file(job_id: int, task_id: Optional[int],
 # ======== utility functions ========
 @_init_db
 def get_nonterminal_job_ids_by_name(name: Optional[str],
+                                    user_hash: Optional[str] = None,
                                     all_users: bool = False) -> List[int]:
-    """Get non-terminal job ids by name.
+    """Get non-terminal job ids by name.
+
+    If name is None:
+    1. if all_users is False, get for the given user_hash
+    2. otherwise, get for all users
+    """
     assert _SQLALCHEMY_ENGINE is not None
 
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -810,8 +945,15 @@ get_nonterminal_job_ids_by_name(name: Optional[str],
             ])
         ]
         if name is None and not all_users:
  [... 2 removed lines (old 813-814) are collapsed in the source diff view ...]
+            if user_hash is None:
+                # For backwards compatibility. With codegen, USER_ID_ENV_VAR
+                # was set to the correct value by the jobs controller, as
+                # part of ManagedJobCodeGen._build(). This is no longer the
+                # case for the Skylet gRPC server, which is why we need to
+                # pass it explicitly through the request body.
+                logger.debug('user_hash is None, using current user hash')
+                user_hash = common_utils.get_user_hash()
+            where_conditions.append(job_info_table.c.user_hash == user_hash)
         if name is not None:
             # We match the job name from `job_info` for the jobs submitted after
             # #1982, and from `spot` for the jobs submitted before #1982, whose
sky/jobs/utils.py
CHANGED
@@ -16,8 +16,8 @@ import textwrap
 import time
 import traceback
 import typing
-from typing import (Any, Deque, Dict, List, Literal, Optional, Set,
-                    Tuple, Union)
+from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
+                    TextIO, Tuple, Union)
 
 import colorama
 import filelock
@@ -51,16 +51,23 @@ from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import descriptor
+    from google.protobuf import json_format
     import grpc
     import psutil
 
     import sky
     from sky import dag as dag_lib
     from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2
 else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+    descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
     psutil = adaptors_common.LazyImport('psutil')
     grpc = adaptors_common.LazyImport('grpc')
     jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -169,7 +176,7 @@ def _validate_consolidation_mode_config(
     if all_jobs:
         nonterminal_jobs = (
             managed_job_state.get_nonterminal_job_ids_by_name(
-                None, all_users=True))
+                None, None, all_users=True))
         if nonterminal_jobs:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.InconsistentConsolidationModeError(
@@ -698,14 +705,15 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
 
 def cancel_jobs_by_id(job_ids: Optional[List[int]],
                       all_users: bool = False,
-                      current_workspace: Optional[str] = None
+                      current_workspace: Optional[str] = None,
+                      user_hash: Optional[str] = None) -> str:
     """Cancel jobs by id.
 
     If job_ids is None, cancel all jobs.
     """
     if job_ids is None:
         job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
-            None, all_users)
+            None, user_hash, all_users)
     job_ids = list(set(job_ids))
     if not job_ids:
         return 'No job to cancel.'
@@ -1241,6 +1249,24 @@ def dump_managed_job_queue(
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
 ) -> str:
+    return message_utils.encode_payload(
+        get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
+                              workspace_match, name_match, pool_match, page,
+                              limit, user_hashes, statuses))
+
+
+def get_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+) -> Dict[str, Any]:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
     jobs = managed_job_state.get_managed_jobs()
@@ -1371,12 +1397,12 @@ def dump_managed_job_queue(
     else:
         job['details'] = None
 
-    return
+    return {
         'jobs': jobs,
         'total': total,
         'total_no_filter': total_no_filter,
         'status_counts': status_counts
-    }
+    }
@@ -1824,6 +1850,58 @@ def format_job_table(
     return output
 
 
+def decode_managed_job_protos(
+        job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
+) -> List[Dict[str, Any]]:
+    """Decode job protos to dicts. Similar to load_managed_job_queue."""
+    user_hash_to_user = global_user_state.get_users(
+        set(job.user_hash for job in job_protos if job.user_hash))
+
+    jobs = []
+    for job_proto in job_protos:
+        job_dict = _job_proto_to_dict(job_proto)
+        user_hash = job_dict.get('user_hash', None)
+        if user_hash is not None:
+            # Skip jobs that do not have user_hash info.
+            # TODO(cooperc): Remove check before 0.12.0.
+            user = user_hash_to_user.get(user_hash, None)
+            job_dict['user_name'] = user.name if user is not None else None
+        jobs.append(job_dict)
+    return jobs
+
+
+def _job_proto_to_dict(
+        job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
+    job_dict = json_format.MessageToDict(
+        job_proto,
+        always_print_fields_with_no_presence=True,
+        # Our API returns fields in snake_case.
+        preserving_proto_field_name=True,
+        use_integers_for_enums=True)
+    for field in job_proto.DESCRIPTOR.fields:
+        # Ensure optional fields are present with None values for
+        # backwards compatibility with older clients.
+        if field.has_presence and field.name not in job_dict:
+            job_dict[field.name] = None
+        # json_format.MessageToDict is meant for encoding to JSON,
+        # and Protobuf encodes int64 as decimal strings in JSON,
+        # so we need to convert them back to ints.
+        # https://protobuf.dev/programming-guides/json/#field-representation
+        if field.type == descriptor.FieldDescriptor.TYPE_INT64:
+            job_dict[field.name] = int(job_dict[field.name])
+    job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
+        job_dict['status'])
+    # For backwards compatibility, convert schedule_state to a string,
+    # as we don't have the logic to handle it in our request
+    # encoder/decoder, unlike status.
+    schedule_state_enum = (
+        managed_job_state.ManagedJobScheduleState.from_protobuf(
+            job_dict['schedule_state']))
+    job_dict['schedule_state'] = (schedule_state_enum.value
+                                  if schedule_state_enum is not None else None)
+    return job_dict
+
+
 class ManagedJobCodeGen:
     """Code generator for managed job utility functions.
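`_job_proto_to_dict()` above casts int64 fields back to Python ints because protobuf's JSON mapping serializes (s/u)int64 and fixed64 values as decimal strings. A self-contained demonstration using a dynamically built message; `demo.Job` and its `submitted_at` field are invented for the example, and `message_factory.GetMessageClass` assumes a reasonably recent protobuf runtime:

```python
from google.protobuf import descriptor_pb2
from google.protobuf import descriptor_pool
from google.protobuf import json_format
from google.protobuf import message_factory

# Dynamically define a proto3 message with a single int64 field.
file_proto = descriptor_pb2.FileDescriptorProto(name='demo.proto',
                                                package='demo',
                                                syntax='proto3')
msg_proto = file_proto.message_type.add(name='Job')
msg_proto.field.add(name='submitted_at',
                    number=1,
                    type=descriptor_pb2.FieldDescriptorProto.TYPE_INT64,
                    label=descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL)

pool = descriptor_pool.DescriptorPool()
pool.Add(file_proto)
Job = message_factory.GetMessageClass(pool.FindMessageTypeByName('demo.Job'))

d = json_format.MessageToDict(Job(submitted_at=1727400000),
                              preserving_proto_field_name=True)
print(d)  # {'submitted_at': '1727400000'} -- the int64 arrives as a string
assert int(d['submitted_at']) == 1727400000
```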
sky/provision/runpod/__init__.py
CHANGED
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
 from sky.provision.runpod.volume import apply_volume
 from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_all_volumes_usedby
 from sky.provision.runpod.volume import get_volume_usedby
+from sky.provision.runpod.volume import map_all_volumes_usedby