skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +25 -4
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +2 -1
- sky/client/cli/table_utils.py +34 -0
- sky/client/sdk.py +7 -5
- sky/client/sdk_async.py +5 -5
- sky/core.py +3 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/server/core.py +96 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/schemas/api/responses.py +18 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +32 -32
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +56 -52
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces.html
CHANGED

@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7340bc0f0dd8ae74.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"UDSEoDB67vwFMZyCJ4HWU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/execution.py
CHANGED
@@ -673,7 +673,6 @@ def launch(
     # see the setup logs when inspecting the launch process to know
     # excatly what the job is waiting for.
     detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
-
     return _execute(
         entrypoint=entrypoint,
         dryrun=dryrun,
sky/global_user_state.py
CHANGED
@@ -483,7 +483,7 @@ def get_user(user_id: str) -> Optional[models.User]:

 @_init_db
 @metrics_lib.time_me
-def
+def get_users(user_ids: Set[str]) -> Dict[str, models.User]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.query(user_table).filter(

@@ -1659,7 +1659,7 @@ def get_clusters(

     # get all users needed for the rows at once
     user_hashes = set(row_to_user_hash.values())
-    user_hash_to_user =
+    user_hash_to_user = get_users(user_hashes)

     # get last cluster event for each row
     cluster_hashes = set(row_to_user_hash.keys())

@@ -1807,7 +1807,7 @@ def get_clusters_from_history(
     row_to_user_hash[row.cluster_hash] = user_hash

     user_hashes = set(row_to_user_hash.values())
-    user_hash_to_user =
+    user_hash_to_user = get_users(user_hashes)

     cluster_hashes = set(row_to_user_hash.keys())
     if not abbreviate_response:
         last_cluster_event_dict = _get_last_cluster_event_multiple(
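The hunks above replace dropped per-hash lookups with a single batched get_users() call, so listing clusters issues one user query instead of one query per user hash. A minimal sketch of that batched-lookup pattern, assuming an illustrative SQLAlchemy table (not SkyPilot's actual schema):

from typing import Dict, Set

from sqlalchemy import Column, MetaData, String, Table, orm

metadata = MetaData()
# Hypothetical stand-in for SkyPilot's user table.
user_table = Table('users', metadata,
                   Column('id', String, primary_key=True),
                   Column('name', String))


def get_users_batched(session: orm.Session,
                      user_ids: Set[str]) -> Dict[str, str]:
    """Fetch every requested user in one IN query instead of N queries."""
    rows = session.query(user_table).filter(
        user_table.c.id.in_(user_ids)).all()
    return {row.id: row.name for row in rows}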
sky/jobs/server/core.py
CHANGED
@@ -19,7 +19,9 @@ from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants

@@ -44,8 +46,15 @@ from sky.utils import ux_utils
 from sky.workspaces import core as workspaces_core

 if typing.TYPE_CHECKING:
+    from google.protobuf import json_format
+
     import sky
-    from sky.
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')

 logger = sky_logging.init_logger(__name__)

@@ -701,11 +710,13 @@ def queue_v2(
     assert isinstance(backend, backends.CloudVmRayBackend)

     user_hashes: Optional[List[Optional[str]]] = None
+    show_jobs_without_user_hash = False
     if not all_users:
         user_hashes = [common_utils.get_user_hash()]
         # For backwards compatibility, we show jobs that do not have a
         # user_hash. TODO(cooperc): Remove before 0.12.0.
         user_hashes.append(None)
+        show_jobs_without_user_hash = True
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:

@@ -713,6 +724,38 @@ def queue_v2(
         user_hashes = [user.id for user in users]

     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            request = managed_jobsv1_pb2.GetJobTableRequest(
+                skip_finished=skip_finished,
+                accessible_workspaces=accessible_workspaces,
+                job_ids=managed_jobsv1_pb2.JobIds(
+                    ids=job_ids) if job_ids is not None else None,
+                workspace_match=workspace_match,
+                name_match=name_match,
+                pool_match=pool_match,
+                page=page,
+                limit=limit,
+                # Remove None from user_hashes, as the gRPC server uses the
+                # show_jobs_without_user_hash flag instead.
+                user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+                    user_hash for user_hash in user_hashes
+                    if user_hash is not None
+                ]) if user_hashes is not None else None,
+                statuses=managed_jobsv1_pb2.Statuses(
+                    statuses=statuses) if statuses is not None else None,
+                show_jobs_without_user_hash=show_jobs_without_user_hash,
+            )
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(request))
+            jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+            return jobs, response.total, dict(
+                response.status_counts), response.total_no_filter
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
         skip_finished, accessible_workspaces, job_ids, workspace_match,
         name_match, pool_match, page, limit, user_hashes, statuses)

@@ -819,33 +862,60 @@ def cancel(name: Optional[str] = None,
             'Can only specify one of JOB_IDS, name, pool, or all/'
             f'all_users. Provided {" ".join(arguments)!r}.')

+    job_ids = None if (all_users or all) else job_ids
+
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
[25 removed lines (old lines 824-848) are not rendered in the source diff]
+
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        current_workspace = skypilot_config.get_active_workspace()
+        try:
+            request = managed_jobsv1_pb2.CancelJobsRequest(
+                current_workspace=current_workspace)
+
+            if all_users or all or job_ids:
+                request.all_users = all_users
+                if all:
+                    request.user_hash = common_utils.get_user_hash()
+                if job_ids is not None:
+                    request.job_ids.CopyFrom(
+                        managed_jobsv1_pb2.JobIds(ids=job_ids))
+            elif name is not None:
+                request.job_name = name
+            else:
+                assert pool is not None, (job_ids, name, pool, all)
+                request.pool_name = pool
+
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).cancel_managed_jobs(request))
+            stdout = response.message
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        if all_users or all or job_ids:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+                job_ids, all_users=all_users)
+        elif name is not None:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+                name)
+        else:
+            assert pool is not None, (job_ids, name, pool, all)
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+                pool)
+        # The stderr is redirected to stdout
+        returncode, stdout, stderr = backend.run_on_head(
+            handle, code, require_outputs=True, stream_logs=False)
+        try:
+            subprocess_utils.handle_returncode(
+                returncode, code, 'Failed to cancel managed job',
+                stdout + stderr)
+        except exceptions.CommandError as e:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(e.error_msg) from e

     logger.info(stdout)
     if 'Multiple jobs found with name' in stdout:
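Both queue_v2() and cancel() above follow the same migration pattern: attempt the new Skylet gRPC method first, and fall back to the legacy remote-codegen path when the controller runs an older skylet without that RPC. A condensed sketch of the control flow, using the handle attribute and exception named in the diff (the helper itself is illustrative, not part of the codebase):

from typing import Any, Callable

from sky import exceptions


def call_skylet_with_fallback(handle: Any, grpc_call: Callable[[], Any],
                              legacy_call: Callable[[], Any]) -> Any:
    """Prefer the gRPC path; fall back to codegen on older controllers."""
    if handle.is_grpc_enabled_with_flag:
        try:
            return grpc_call()
        except exceptions.SkyletMethodNotImplementedError:
            # The controller's skylet predates this RPC; fall through to
            # the legacy run-code-on-head path.
            pass
    return legacy_call()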
sky/jobs/server/utils.py
CHANGED
@@ -1,13 +1,24 @@
 """Utility functions for managed jobs."""
+import typing
+
 from sky import backends
+from sky import exceptions
 from sky import sky_logging
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants as skylet_constants
 from sky.utils import controller_utils

 logger = sky_logging.init_logger(__name__)

+if typing.TYPE_CHECKING:
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+

 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.

@@ -28,42 +39,64 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
[28 removed lines (old lines 31-58) are not rendered in the source diff]
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        try:
+            version_request = managed_jobsv1_pb2.GetVersionRequest()
+            version_response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel(
+                    )).get_managed_job_controller_version(version_request))
+            controller_version = version_response.controller_version
+
+            job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
+            job_table_response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(
+                        job_table_request))
+            jobs = managed_job_utils.decode_managed_job_protos(
+                job_table_response.jobs)
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        # Get controller version and raw job table
+        code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
+
+        returncode, output, stderr = backend.run_on_head(handle,
+                                                         code,
+                                                         require_outputs=True,
+                                                         stream_logs=False,
+                                                         separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(output + stderr)
+            raise ValueError('Failed to check controller version and jobs with '
+                             f'returncode: {returncode}.\n{output + stderr}')
+
+        # Parse the output to extract controller version (split only on first
+        # newline)
+        output_parts = output.strip().split('\n', 1)
+
+        # Extract controller version from first line
+        if len(output_parts) < 2 or not output_parts[0].startswith(
+                'controller_version:'):
+            raise ValueError(
+                f'Expected controller version in first line, got: {output}')
+
+        controller_version = output_parts[0].split(':', 1)[1]
+
+        # Rest is job table payload (preserving any newlines within it)
+        job_table_payload = output_parts[1]
+
+        # Load and filter jobs locally using existing method
+        jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+            job_table_payload)

     # Process locally: check version match and filter non-terminal jobs
     version_matches = (controller_version == local_version or
                        int(controller_version) > 17)
-
-    # Load and filter jobs locally using existing method
-    jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
-        job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
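Several files in this release gate their protobuf imports behind the same TYPE_CHECKING/LazyImport split, so type checkers resolve the real generated module while the runtime import of the heavy protobuf code is deferred until first use. The idiom, as it appears in the diffs above:

import typing

from sky.adaptors import common as adaptors_common

if typing.TYPE_CHECKING:
    # Type checkers see the real generated module.
    from sky.schemas.generated import managed_jobsv1_pb2
else:
    # At runtime the import happens lazily, on first attribute access.
    managed_jobsv1_pb2 = adaptors_common.LazyImport(
        'sky.schemas.generated.managed_jobsv1_pb2')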
sky/jobs/state.py
CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.ext import declarative
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import context_utils

@@ -34,6 +35,11 @@ from sky.utils.db import migration_utils
 if typing.TYPE_CHECKING:
     from sqlalchemy.engine import row

+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+
 # Separate callback types for sync and async contexts
 SyncCallbackType = Callable[[str], None]
 AsyncCallbackType = Callable[[str], Awaitable[Any]]

@@ -448,6 +454,75 @@ class ManagedJobStatus(enum.Enum):
         cls.RECOVERING,
     ]

+    @classmethod
+    def from_protobuf(
+        cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobStatus'
+    ) -> Optional['ManagedJobStatus']:
+        """Convert protobuf ManagedJobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_UNSPECIFIED: None,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING: cls.PENDING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED:
+                cls.DEPRECATED_SUBMITTED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING: cls.STARTING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING: cls.RUNNING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED: cls.FAILED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER:
+                cls.FAILED_CONTROLLER,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP:
+                cls.FAILED_SETUP,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED: cls.CANCELLED,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING: cls.RECOVERING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING: cls.CANCELLING,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS:
+                cls.FAILED_PRECHECKS,
+            managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE:
+                cls.FAILED_NO_RESOURCE,
+        }
+
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf ManagedJobStatus value: {protobuf_value}')
+
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            ManagedJobStatus.PENDING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_PENDING,
+            ManagedJobStatus.DEPRECATED_SUBMITTED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUBMITTED,
+            ManagedJobStatus.STARTING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_STARTING,
+            ManagedJobStatus.RUNNING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_RUNNING,
+            ManagedJobStatus.SUCCEEDED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_SUCCEEDED,
+            ManagedJobStatus.FAILED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED,
+            ManagedJobStatus.FAILED_CONTROLLER:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_CONTROLLER,
+            ManagedJobStatus.FAILED_SETUP:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_SETUP,
+            ManagedJobStatus.CANCELLED:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLED,
+            ManagedJobStatus.RECOVERING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_RECOVERING,
+            ManagedJobStatus.CANCELLING:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_CANCELLING,
+            ManagedJobStatus.FAILED_PRECHECKS:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_PRECHECKS,
+            ManagedJobStatus.FAILED_NO_RESOURCE:
+                managed_jobsv1_pb2.MANAGED_JOB_STATUS_FAILED_NO_RESOURCE,
+        }
+
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown ManagedJobStatus value: {self}')
+
+        return enum_to_protobuf[self]
+

 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,

@@ -537,6 +612,60 @@ class ManagedJobScheduleState(enum.Enum):
     # The job is in a terminal state. (Not necessarily SUCCEEDED.)
     DONE = 'DONE'

+    @classmethod
+    def from_protobuf(
+        cls, protobuf_value: 'managed_jobsv1_pb2.ManagedJobScheduleState'
+    ) -> Optional['ManagedJobScheduleState']:
+        """Convert protobuf ManagedJobScheduleState enum to Python enum value.
+        """
+        protobuf_to_enum = {
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: None,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID: cls.INVALID,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE:
+                cls.INACTIVE,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING: cls.WAITING,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING:
+                cls.ALIVE_WAITING,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING:
+                cls.LAUNCHING,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF:
+                cls.ALIVE_BACKOFF,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE: cls.ALIVE,
+            managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE: cls.DONE,
+        }
+
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError('Unknown protobuf ManagedJobScheduleState value: '
+                             f'{protobuf_value}')
+
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'managed_jobsv1_pb2.ManagedJobScheduleState':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            ManagedJobScheduleState.INVALID:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INVALID,
+            ManagedJobScheduleState.INACTIVE:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE,
+            ManagedJobScheduleState.WAITING:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING,
+            ManagedJobScheduleState.ALIVE_WAITING:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING,
+            ManagedJobScheduleState.LAUNCHING:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_LAUNCHING,
+            ManagedJobScheduleState.ALIVE_BACKOFF:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF,
+            ManagedJobScheduleState.ALIVE:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_ALIVE,
+            ManagedJobScheduleState.DONE:
+                managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_DONE,
+        }
+
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown ManagedJobScheduleState value: {self}')
+
+        return enum_to_protobuf[self]
+

 # === Status transition functions ===
 @_init_db

@@ -792,8 +921,14 @@ def set_local_log_file(job_id: int, task_id: Optional[int],
 # ======== utility functions ========
 @_init_db
 def get_nonterminal_job_ids_by_name(name: Optional[str],
+                                    user_hash: Optional[str] = None,
                                     all_users: bool = False) -> List[int]:
-    """Get non-terminal job ids by name.
+    """Get non-terminal job ids by name.
+
+    If name is None:
+        1. if all_users is False, get for the given user_hash
+        2. otherwise, get for all users
+    """
     assert _SQLALCHEMY_ENGINE is not None

     with orm.Session(_SQLALCHEMY_ENGINE) as session:

@@ -810,8 +945,15 @@ def get_nonterminal_job_ids_by_name(name: Optional[str],
         ])
     ]
     if name is None and not all_users:
[2 removed lines (old lines 813-814) are not rendered in the source diff]
+        if user_hash is None:
+            # For backwards compatibility. With codegen, USER_ID_ENV_VAR
+            # was set to the correct value by the jobs controller, as
+            # part of ManagedJobCodeGen._build(). This is no longer the
+            # case for the Skylet gRPC server, which is why we need to
+            # pass it explicitly through the request body.
+            logger.debug('user_hash is None, using current user hash')
+            user_hash = common_utils.get_user_hash()
+        where_conditions.append(job_info_table.c.user_hash == user_hash)
     if name is not None:
         # We match the job name from `job_info` for the jobs submitted after
         # #1982, and from `spot` for the jobs submitted before #1982, whose