skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20251001__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (71)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +43 -14
  3. sky/backends/cloud_vm_ray_backend.py +153 -38
  4. sky/check.py +0 -29
  5. sky/client/cli/command.py +48 -26
  6. sky/client/cli/table_utils.py +91 -0
  7. sky/client/sdk.py +14 -23
  8. sky/client/sdk_async.py +5 -5
  9. sky/core.py +18 -20
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-07349868f7905d37.js → [pool]-509b2977a6373bf6.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-4f0c389a4ce5fd9c.js} +1 -1
  15. sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_buildManifest.js +1 -1
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  25. sky/dashboard/out/jobs.html +1 -1
  26. sky/dashboard/out/users.html +1 -1
  27. sky/dashboard/out/volumes.html +1 -1
  28. sky/dashboard/out/workspace/new.html +1 -1
  29. sky/dashboard/out/workspaces/[name].html +1 -1
  30. sky/dashboard/out/workspaces.html +1 -1
  31. sky/data/storage.py +11 -0
  32. sky/data/storage_utils.py +1 -45
  33. sky/execution.py +0 -1
  34. sky/global_user_state.py +3 -3
  35. sky/jobs/client/sdk.py +3 -2
  36. sky/jobs/controller.py +15 -0
  37. sky/jobs/server/core.py +120 -28
  38. sky/jobs/server/server.py +1 -1
  39. sky/jobs/server/utils.py +65 -32
  40. sky/jobs/state.py +145 -3
  41. sky/jobs/utils.py +87 -8
  42. sky/provision/kubernetes/instance.py +1 -1
  43. sky/schemas/api/responses.py +73 -0
  44. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  45. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  46. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  47. sky/serve/serve_utils.py +16 -0
  48. sky/serve/server/core.py +1 -1
  49. sky/serve/server/impl.py +6 -6
  50. sky/server/common.py +2 -1
  51. sky/server/requests/serializers/decoders.py +10 -6
  52. sky/server/requests/serializers/encoders.py +13 -8
  53. sky/skylet/constants.py +1 -1
  54. sky/skylet/job_lib.py +2 -32
  55. sky/skylet/log_lib.py +211 -0
  56. sky/skylet/log_lib.pyi +30 -1
  57. sky/skylet/services.py +208 -2
  58. sky/skylet/skylet.py +3 -0
  59. sky/task.py +4 -0
  60. sky/utils/cluster_utils.py +23 -5
  61. sky/utils/command_runner.py +21 -5
  62. sky/utils/command_runner.pyi +11 -0
  63. sky/utils/volume.py +5 -0
  64. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/METADATA +35 -35
  65. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/RECORD +70 -66
  66. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  67. /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_ssgManifest.js +0 -0
  68. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/WHEEL +0 -0
  69. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/entry_points.txt +0 -0
  70. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/licenses/LICENSE +0 -0
  71. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,262 @@
1
+ from google.protobuf.internal import containers as _containers
2
+ from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
3
+ from google.protobuf import descriptor as _descriptor
4
+ from google.protobuf import message as _message
5
+ from typing import ClassVar as _ClassVar, Iterable as _Iterable, Mapping as _Mapping, Optional as _Optional, Union as _Union
6
+
7
+ DESCRIPTOR: _descriptor.FileDescriptor
8
+
9
+ class ManagedJobStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
10
+ __slots__ = ()
11
+ MANAGED_JOB_STATUS_UNSPECIFIED: _ClassVar[ManagedJobStatus]
12
+ MANAGED_JOB_STATUS_PENDING: _ClassVar[ManagedJobStatus]
13
+ MANAGED_JOB_STATUS_SUBMITTED: _ClassVar[ManagedJobStatus]
14
+ MANAGED_JOB_STATUS_STARTING: _ClassVar[ManagedJobStatus]
15
+ MANAGED_JOB_STATUS_RUNNING: _ClassVar[ManagedJobStatus]
16
+ MANAGED_JOB_STATUS_RECOVERING: _ClassVar[ManagedJobStatus]
17
+ MANAGED_JOB_STATUS_CANCELLING: _ClassVar[ManagedJobStatus]
18
+ MANAGED_JOB_STATUS_SUCCEEDED: _ClassVar[ManagedJobStatus]
19
+ MANAGED_JOB_STATUS_CANCELLED: _ClassVar[ManagedJobStatus]
20
+ MANAGED_JOB_STATUS_FAILED: _ClassVar[ManagedJobStatus]
21
+ MANAGED_JOB_STATUS_FAILED_SETUP: _ClassVar[ManagedJobStatus]
22
+ MANAGED_JOB_STATUS_FAILED_PRECHECKS: _ClassVar[ManagedJobStatus]
23
+ MANAGED_JOB_STATUS_FAILED_NO_RESOURCE: _ClassVar[ManagedJobStatus]
24
+ MANAGED_JOB_STATUS_FAILED_CONTROLLER: _ClassVar[ManagedJobStatus]
25
+
26
+ class ManagedJobScheduleState(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
27
+ __slots__ = ()
28
+ MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: _ClassVar[ManagedJobScheduleState]
29
+ MANAGED_JOB_SCHEDULE_STATE_INVALID: _ClassVar[ManagedJobScheduleState]
30
+ MANAGED_JOB_SCHEDULE_STATE_INACTIVE: _ClassVar[ManagedJobScheduleState]
31
+ MANAGED_JOB_SCHEDULE_STATE_WAITING: _ClassVar[ManagedJobScheduleState]
32
+ MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING: _ClassVar[ManagedJobScheduleState]
33
+ MANAGED_JOB_SCHEDULE_STATE_LAUNCHING: _ClassVar[ManagedJobScheduleState]
34
+ MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF: _ClassVar[ManagedJobScheduleState]
35
+ MANAGED_JOB_SCHEDULE_STATE_ALIVE: _ClassVar[ManagedJobScheduleState]
36
+ MANAGED_JOB_SCHEDULE_STATE_DONE: _ClassVar[ManagedJobScheduleState]
37
+ MANAGED_JOB_STATUS_UNSPECIFIED: ManagedJobStatus
38
+ MANAGED_JOB_STATUS_PENDING: ManagedJobStatus
39
+ MANAGED_JOB_STATUS_SUBMITTED: ManagedJobStatus
40
+ MANAGED_JOB_STATUS_STARTING: ManagedJobStatus
41
+ MANAGED_JOB_STATUS_RUNNING: ManagedJobStatus
42
+ MANAGED_JOB_STATUS_RECOVERING: ManagedJobStatus
43
+ MANAGED_JOB_STATUS_CANCELLING: ManagedJobStatus
44
+ MANAGED_JOB_STATUS_SUCCEEDED: ManagedJobStatus
45
+ MANAGED_JOB_STATUS_CANCELLED: ManagedJobStatus
46
+ MANAGED_JOB_STATUS_FAILED: ManagedJobStatus
47
+ MANAGED_JOB_STATUS_FAILED_SETUP: ManagedJobStatus
48
+ MANAGED_JOB_STATUS_FAILED_PRECHECKS: ManagedJobStatus
49
+ MANAGED_JOB_STATUS_FAILED_NO_RESOURCE: ManagedJobStatus
50
+ MANAGED_JOB_STATUS_FAILED_CONTROLLER: ManagedJobStatus
51
+ MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: ManagedJobScheduleState
52
+ MANAGED_JOB_SCHEDULE_STATE_INVALID: ManagedJobScheduleState
53
+ MANAGED_JOB_SCHEDULE_STATE_INACTIVE: ManagedJobScheduleState
54
+ MANAGED_JOB_SCHEDULE_STATE_WAITING: ManagedJobScheduleState
55
+ MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING: ManagedJobScheduleState
56
+ MANAGED_JOB_SCHEDULE_STATE_LAUNCHING: ManagedJobScheduleState
57
+ MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF: ManagedJobScheduleState
58
+ MANAGED_JOB_SCHEDULE_STATE_ALIVE: ManagedJobScheduleState
59
+ MANAGED_JOB_SCHEDULE_STATE_DONE: ManagedJobScheduleState
60
+
61
+ class JobIds(_message.Message):
62
+ __slots__ = ("ids",)
63
+ IDS_FIELD_NUMBER: _ClassVar[int]
64
+ ids: _containers.RepeatedScalarFieldContainer[int]
65
+ def __init__(self, ids: _Optional[_Iterable[int]] = ...) -> None: ...
66
+
67
+ class UserHashes(_message.Message):
68
+ __slots__ = ("hashes",)
69
+ HASHES_FIELD_NUMBER: _ClassVar[int]
70
+ hashes: _containers.RepeatedScalarFieldContainer[str]
71
+ def __init__(self, hashes: _Optional[_Iterable[str]] = ...) -> None: ...
72
+
73
+ class Statuses(_message.Message):
74
+ __slots__ = ("statuses",)
75
+ STATUSES_FIELD_NUMBER: _ClassVar[int]
76
+ statuses: _containers.RepeatedScalarFieldContainer[str]
77
+ def __init__(self, statuses: _Optional[_Iterable[str]] = ...) -> None: ...
78
+
79
+ class GetVersionRequest(_message.Message):
80
+ __slots__ = ()
81
+ def __init__(self) -> None: ...
82
+
83
+ class GetVersionResponse(_message.Message):
84
+ __slots__ = ("controller_version",)
85
+ CONTROLLER_VERSION_FIELD_NUMBER: _ClassVar[int]
86
+ controller_version: str
87
+ def __init__(self, controller_version: _Optional[str] = ...) -> None: ...
88
+
89
+ class GetJobTableRequest(_message.Message):
90
+ __slots__ = ("skip_finished", "accessible_workspaces", "job_ids", "workspace_match", "name_match", "pool_match", "page", "limit", "user_hashes", "statuses", "show_jobs_without_user_hash")
91
+ SKIP_FINISHED_FIELD_NUMBER: _ClassVar[int]
92
+ ACCESSIBLE_WORKSPACES_FIELD_NUMBER: _ClassVar[int]
93
+ JOB_IDS_FIELD_NUMBER: _ClassVar[int]
94
+ WORKSPACE_MATCH_FIELD_NUMBER: _ClassVar[int]
95
+ NAME_MATCH_FIELD_NUMBER: _ClassVar[int]
96
+ POOL_MATCH_FIELD_NUMBER: _ClassVar[int]
97
+ PAGE_FIELD_NUMBER: _ClassVar[int]
98
+ LIMIT_FIELD_NUMBER: _ClassVar[int]
99
+ USER_HASHES_FIELD_NUMBER: _ClassVar[int]
100
+ STATUSES_FIELD_NUMBER: _ClassVar[int]
101
+ SHOW_JOBS_WITHOUT_USER_HASH_FIELD_NUMBER: _ClassVar[int]
102
+ skip_finished: bool
103
+ accessible_workspaces: _containers.RepeatedScalarFieldContainer[str]
104
+ job_ids: JobIds
105
+ workspace_match: str
106
+ name_match: str
107
+ pool_match: str
108
+ page: int
109
+ limit: int
110
+ user_hashes: UserHashes
111
+ statuses: Statuses
112
+ show_jobs_without_user_hash: bool
113
+ def __init__(self, skip_finished: bool = ..., accessible_workspaces: _Optional[_Iterable[str]] = ..., job_ids: _Optional[_Union[JobIds, _Mapping]] = ..., workspace_match: _Optional[str] = ..., name_match: _Optional[str] = ..., pool_match: _Optional[str] = ..., page: _Optional[int] = ..., limit: _Optional[int] = ..., user_hashes: _Optional[_Union[UserHashes, _Mapping]] = ..., statuses: _Optional[_Union[Statuses, _Mapping]] = ..., show_jobs_without_user_hash: bool = ...) -> None: ...
114
+
115
+ class ManagedJobInfo(_message.Message):
116
+ __slots__ = ("job_id", "task_id", "job_name", "task_name", "job_duration", "workspace", "status", "schedule_state", "resources", "cluster_resources", "cluster_resources_full", "cloud", "region", "infra", "accelerators", "recovery_count", "details", "failure_reason", "user_name", "user_hash", "submitted_at", "start_at", "end_at", "user_yaml", "entrypoint", "metadata", "pool", "pool_hash")
117
+ class AcceleratorsEntry(_message.Message):
118
+ __slots__ = ("key", "value")
119
+ KEY_FIELD_NUMBER: _ClassVar[int]
120
+ VALUE_FIELD_NUMBER: _ClassVar[int]
121
+ key: str
122
+ value: float
123
+ def __init__(self, key: _Optional[str] = ..., value: _Optional[float] = ...) -> None: ...
124
+ class MetadataEntry(_message.Message):
125
+ __slots__ = ("key", "value")
126
+ KEY_FIELD_NUMBER: _ClassVar[int]
127
+ VALUE_FIELD_NUMBER: _ClassVar[int]
128
+ key: str
129
+ value: str
130
+ def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
131
+ JOB_ID_FIELD_NUMBER: _ClassVar[int]
132
+ TASK_ID_FIELD_NUMBER: _ClassVar[int]
133
+ JOB_NAME_FIELD_NUMBER: _ClassVar[int]
134
+ TASK_NAME_FIELD_NUMBER: _ClassVar[int]
135
+ JOB_DURATION_FIELD_NUMBER: _ClassVar[int]
136
+ WORKSPACE_FIELD_NUMBER: _ClassVar[int]
137
+ STATUS_FIELD_NUMBER: _ClassVar[int]
138
+ SCHEDULE_STATE_FIELD_NUMBER: _ClassVar[int]
139
+ RESOURCES_FIELD_NUMBER: _ClassVar[int]
140
+ CLUSTER_RESOURCES_FIELD_NUMBER: _ClassVar[int]
141
+ CLUSTER_RESOURCES_FULL_FIELD_NUMBER: _ClassVar[int]
142
+ CLOUD_FIELD_NUMBER: _ClassVar[int]
143
+ REGION_FIELD_NUMBER: _ClassVar[int]
144
+ INFRA_FIELD_NUMBER: _ClassVar[int]
145
+ ACCELERATORS_FIELD_NUMBER: _ClassVar[int]
146
+ RECOVERY_COUNT_FIELD_NUMBER: _ClassVar[int]
147
+ DETAILS_FIELD_NUMBER: _ClassVar[int]
148
+ FAILURE_REASON_FIELD_NUMBER: _ClassVar[int]
149
+ USER_NAME_FIELD_NUMBER: _ClassVar[int]
150
+ USER_HASH_FIELD_NUMBER: _ClassVar[int]
151
+ SUBMITTED_AT_FIELD_NUMBER: _ClassVar[int]
152
+ START_AT_FIELD_NUMBER: _ClassVar[int]
153
+ END_AT_FIELD_NUMBER: _ClassVar[int]
154
+ USER_YAML_FIELD_NUMBER: _ClassVar[int]
155
+ ENTRYPOINT_FIELD_NUMBER: _ClassVar[int]
156
+ METADATA_FIELD_NUMBER: _ClassVar[int]
157
+ POOL_FIELD_NUMBER: _ClassVar[int]
158
+ POOL_HASH_FIELD_NUMBER: _ClassVar[int]
159
+ job_id: int
160
+ task_id: int
161
+ job_name: str
162
+ task_name: str
163
+ job_duration: float
164
+ workspace: str
165
+ status: ManagedJobStatus
166
+ schedule_state: ManagedJobScheduleState
167
+ resources: str
168
+ cluster_resources: str
169
+ cluster_resources_full: str
170
+ cloud: str
171
+ region: str
172
+ infra: str
173
+ accelerators: _containers.ScalarMap[str, float]
174
+ recovery_count: int
175
+ details: str
176
+ failure_reason: str
177
+ user_name: str
178
+ user_hash: str
179
+ submitted_at: float
180
+ start_at: float
181
+ end_at: float
182
+ user_yaml: str
183
+ entrypoint: str
184
+ metadata: _containers.ScalarMap[str, str]
185
+ pool: str
186
+ pool_hash: str
187
+ def __init__(self, job_id: _Optional[int] = ..., task_id: _Optional[int] = ..., job_name: _Optional[str] = ..., task_name: _Optional[str] = ..., job_duration: _Optional[float] = ..., workspace: _Optional[str] = ..., status: _Optional[_Union[ManagedJobStatus, str]] = ..., schedule_state: _Optional[_Union[ManagedJobScheduleState, str]] = ..., resources: _Optional[str] = ..., cluster_resources: _Optional[str] = ..., cluster_resources_full: _Optional[str] = ..., cloud: _Optional[str] = ..., region: _Optional[str] = ..., infra: _Optional[str] = ..., accelerators: _Optional[_Mapping[str, float]] = ..., recovery_count: _Optional[int] = ..., details: _Optional[str] = ..., failure_reason: _Optional[str] = ..., user_name: _Optional[str] = ..., user_hash: _Optional[str] = ..., submitted_at: _Optional[float] = ..., start_at: _Optional[float] = ..., end_at: _Optional[float] = ..., user_yaml: _Optional[str] = ..., entrypoint: _Optional[str] = ..., metadata: _Optional[_Mapping[str, str]] = ..., pool: _Optional[str] = ..., pool_hash: _Optional[str] = ...) -> None: ...
188
+
189
+ class GetJobTableResponse(_message.Message):
190
+ __slots__ = ("jobs", "total", "total_no_filter", "status_counts")
191
+ class StatusCountsEntry(_message.Message):
192
+ __slots__ = ("key", "value")
193
+ KEY_FIELD_NUMBER: _ClassVar[int]
194
+ VALUE_FIELD_NUMBER: _ClassVar[int]
195
+ key: str
196
+ value: int
197
+ def __init__(self, key: _Optional[str] = ..., value: _Optional[int] = ...) -> None: ...
198
+ JOBS_FIELD_NUMBER: _ClassVar[int]
199
+ TOTAL_FIELD_NUMBER: _ClassVar[int]
200
+ TOTAL_NO_FILTER_FIELD_NUMBER: _ClassVar[int]
201
+ STATUS_COUNTS_FIELD_NUMBER: _ClassVar[int]
202
+ jobs: _containers.RepeatedCompositeFieldContainer[ManagedJobInfo]
203
+ total: int
204
+ total_no_filter: int
205
+ status_counts: _containers.ScalarMap[str, int]
206
+ def __init__(self, jobs: _Optional[_Iterable[_Union[ManagedJobInfo, _Mapping]]] = ..., total: _Optional[int] = ..., total_no_filter: _Optional[int] = ..., status_counts: _Optional[_Mapping[str, int]] = ...) -> None: ...
207
+
208
+ class GetAllJobIdsByNameRequest(_message.Message):
209
+ __slots__ = ("job_name",)
210
+ JOB_NAME_FIELD_NUMBER: _ClassVar[int]
211
+ job_name: str
212
+ def __init__(self, job_name: _Optional[str] = ...) -> None: ...
213
+
214
+ class GetAllJobIdsByNameResponse(_message.Message):
215
+ __slots__ = ("job_ids",)
216
+ JOB_IDS_FIELD_NUMBER: _ClassVar[int]
217
+ job_ids: _containers.RepeatedScalarFieldContainer[int]
218
+ def __init__(self, job_ids: _Optional[_Iterable[int]] = ...) -> None: ...
219
+
220
+ class CancelJobsRequest(_message.Message):
221
+ __slots__ = ("current_workspace", "user_hash", "all_users", "job_ids", "job_name", "pool_name")
222
+ CURRENT_WORKSPACE_FIELD_NUMBER: _ClassVar[int]
223
+ USER_HASH_FIELD_NUMBER: _ClassVar[int]
224
+ ALL_USERS_FIELD_NUMBER: _ClassVar[int]
225
+ JOB_IDS_FIELD_NUMBER: _ClassVar[int]
226
+ JOB_NAME_FIELD_NUMBER: _ClassVar[int]
227
+ POOL_NAME_FIELD_NUMBER: _ClassVar[int]
228
+ current_workspace: str
229
+ user_hash: str
230
+ all_users: bool
231
+ job_ids: JobIds
232
+ job_name: str
233
+ pool_name: str
234
+ def __init__(self, current_workspace: _Optional[str] = ..., user_hash: _Optional[str] = ..., all_users: bool = ..., job_ids: _Optional[_Union[JobIds, _Mapping]] = ..., job_name: _Optional[str] = ..., pool_name: _Optional[str] = ...) -> None: ...
235
+
236
+ class CancelJobsResponse(_message.Message):
237
+ __slots__ = ("message",)
238
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
239
+ message: str
240
+ def __init__(self, message: _Optional[str] = ...) -> None: ...
241
+
242
+ class StreamLogsRequest(_message.Message):
243
+ __slots__ = ("job_name", "job_id", "follow", "controller", "tail")
244
+ JOB_NAME_FIELD_NUMBER: _ClassVar[int]
245
+ JOB_ID_FIELD_NUMBER: _ClassVar[int]
246
+ FOLLOW_FIELD_NUMBER: _ClassVar[int]
247
+ CONTROLLER_FIELD_NUMBER: _ClassVar[int]
248
+ TAIL_FIELD_NUMBER: _ClassVar[int]
249
+ job_name: str
250
+ job_id: int
251
+ follow: bool
252
+ controller: bool
253
+ tail: int
254
+ def __init__(self, job_name: _Optional[str] = ..., job_id: _Optional[int] = ..., follow: bool = ..., controller: bool = ..., tail: _Optional[int] = ...) -> None: ...
255
+
256
+ class StreamLogsResponse(_message.Message):
257
+ __slots__ = ("log_line", "exit_code")
258
+ LOG_LINE_FIELD_NUMBER: _ClassVar[int]
259
+ EXIT_CODE_FIELD_NUMBER: _ClassVar[int]
260
+ log_line: str
261
+ exit_code: int
262
+ def __init__(self, log_line: _Optional[str] = ..., exit_code: _Optional[int] = ...) -> None: ...
@@ -0,0 +1,278 @@
1
+ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
+ """Client and server classes corresponding to protobuf-defined services."""
3
+ import grpc
4
+ import warnings
5
+
6
+ from sky.schemas.generated import managed_jobsv1_pb2 as sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2
7
+
8
+ GRPC_GENERATED_VERSION = '1.63.0'
9
+ GRPC_VERSION = grpc.__version__
10
+ EXPECTED_ERROR_RELEASE = '1.65.0'
11
+ SCHEDULED_RELEASE_DATE = 'June 25, 2024'
12
+ _version_not_supported = False
13
+
14
+ try:
15
+ from grpc._utilities import first_version_is_lower
16
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
17
+ except ImportError:
18
+ _version_not_supported = True
19
+
20
+ if _version_not_supported:
21
+ warnings.warn(
22
+ f'The grpc package installed is at version {GRPC_VERSION},'
23
+ + f' but the generated code in sky/schemas/generated/managed_jobsv1_pb2_grpc.py depends on'
24
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
25
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
26
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
27
+ + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
28
+ + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
29
+ RuntimeWarning
30
+ )
31
+
32
+
33
+ class ManagedJobsServiceStub(object):
34
+ """Missing associated documentation comment in .proto file."""
35
+
36
+ def __init__(self, channel):
37
+ """Constructor.
38
+
39
+ Args:
40
+ channel: A grpc.Channel.
41
+ """
42
+ self.GetVersion = channel.unary_unary(
43
+ '/managed_jobs.v1.ManagedJobsService/GetVersion',
44
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.SerializeToString,
45
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.FromString,
46
+ _registered_method=True)
47
+ self.GetJobTable = channel.unary_unary(
48
+ '/managed_jobs.v1.ManagedJobsService/GetJobTable',
49
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.SerializeToString,
50
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.FromString,
51
+ _registered_method=True)
52
+ self.GetAllJobIdsByName = channel.unary_unary(
53
+ '/managed_jobs.v1.ManagedJobsService/GetAllJobIdsByName',
54
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.SerializeToString,
55
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.FromString,
56
+ _registered_method=True)
57
+ self.CancelJobs = channel.unary_unary(
58
+ '/managed_jobs.v1.ManagedJobsService/CancelJobs',
59
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.SerializeToString,
60
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.FromString,
61
+ _registered_method=True)
62
+ self.StreamLogs = channel.unary_stream(
63
+ '/managed_jobs.v1.ManagedJobsService/StreamLogs',
64
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.SerializeToString,
65
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.FromString,
66
+ _registered_method=True)
67
+
68
+
69
+ class ManagedJobsServiceServicer(object):
70
+ """Missing associated documentation comment in .proto file."""
71
+
72
+ def GetVersion(self, request, context):
73
+ """Get controller version.
74
+ """
75
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
76
+ context.set_details('Method not implemented!')
77
+ raise NotImplementedError('Method not implemented!')
78
+
79
+ def GetJobTable(self, request, context):
80
+ """Get the managed job queue with advanced filtering.
81
+ """
82
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
83
+ context.set_details('Method not implemented!')
84
+ raise NotImplementedError('Method not implemented!')
85
+
86
+ def GetAllJobIdsByName(self, request, context):
87
+ """Get all job IDs by name.
88
+ """
89
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
90
+ context.set_details('Method not implemented!')
91
+ raise NotImplementedError('Method not implemented!')
92
+
93
+ def CancelJobs(self, request, context):
94
+ """Cancel managed jobs.
95
+ """
96
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
97
+ context.set_details('Method not implemented!')
98
+ raise NotImplementedError('Method not implemented!')
99
+
100
+ def StreamLogs(self, request, context):
101
+ """Stream managed job logs.
102
+ """
103
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
104
+ context.set_details('Method not implemented!')
105
+ raise NotImplementedError('Method not implemented!')
106
+
107
+
108
+ def add_ManagedJobsServiceServicer_to_server(servicer, server):
109
+ rpc_method_handlers = {
110
+ 'GetVersion': grpc.unary_unary_rpc_method_handler(
111
+ servicer.GetVersion,
112
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.FromString,
113
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.SerializeToString,
114
+ ),
115
+ 'GetJobTable': grpc.unary_unary_rpc_method_handler(
116
+ servicer.GetJobTable,
117
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.FromString,
118
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.SerializeToString,
119
+ ),
120
+ 'GetAllJobIdsByName': grpc.unary_unary_rpc_method_handler(
121
+ servicer.GetAllJobIdsByName,
122
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.FromString,
123
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.SerializeToString,
124
+ ),
125
+ 'CancelJobs': grpc.unary_unary_rpc_method_handler(
126
+ servicer.CancelJobs,
127
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.FromString,
128
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.SerializeToString,
129
+ ),
130
+ 'StreamLogs': grpc.unary_stream_rpc_method_handler(
131
+ servicer.StreamLogs,
132
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.FromString,
133
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.SerializeToString,
134
+ ),
135
+ }
136
+ generic_handler = grpc.method_handlers_generic_handler(
137
+ 'managed_jobs.v1.ManagedJobsService', rpc_method_handlers)
138
+ server.add_generic_rpc_handlers((generic_handler,))
139
+
140
+
141
+ # This class is part of an EXPERIMENTAL API.
142
+ class ManagedJobsService(object):
143
+ """Missing associated documentation comment in .proto file."""
144
+
145
+ @staticmethod
146
+ def GetVersion(request,
147
+ target,
148
+ options=(),
149
+ channel_credentials=None,
150
+ call_credentials=None,
151
+ insecure=False,
152
+ compression=None,
153
+ wait_for_ready=None,
154
+ timeout=None,
155
+ metadata=None):
156
+ return grpc.experimental.unary_unary(
157
+ request,
158
+ target,
159
+ '/managed_jobs.v1.ManagedJobsService/GetVersion',
160
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.SerializeToString,
161
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.FromString,
162
+ options,
163
+ channel_credentials,
164
+ insecure,
165
+ call_credentials,
166
+ compression,
167
+ wait_for_ready,
168
+ timeout,
169
+ metadata,
170
+ _registered_method=True)
171
+
172
+ @staticmethod
173
+ def GetJobTable(request,
174
+ target,
175
+ options=(),
176
+ channel_credentials=None,
177
+ call_credentials=None,
178
+ insecure=False,
179
+ compression=None,
180
+ wait_for_ready=None,
181
+ timeout=None,
182
+ metadata=None):
183
+ return grpc.experimental.unary_unary(
184
+ request,
185
+ target,
186
+ '/managed_jobs.v1.ManagedJobsService/GetJobTable',
187
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.SerializeToString,
188
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.FromString,
189
+ options,
190
+ channel_credentials,
191
+ insecure,
192
+ call_credentials,
193
+ compression,
194
+ wait_for_ready,
195
+ timeout,
196
+ metadata,
197
+ _registered_method=True)
198
+
199
+ @staticmethod
200
+ def GetAllJobIdsByName(request,
201
+ target,
202
+ options=(),
203
+ channel_credentials=None,
204
+ call_credentials=None,
205
+ insecure=False,
206
+ compression=None,
207
+ wait_for_ready=None,
208
+ timeout=None,
209
+ metadata=None):
210
+ return grpc.experimental.unary_unary(
211
+ request,
212
+ target,
213
+ '/managed_jobs.v1.ManagedJobsService/GetAllJobIdsByName',
214
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.SerializeToString,
215
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.FromString,
216
+ options,
217
+ channel_credentials,
218
+ insecure,
219
+ call_credentials,
220
+ compression,
221
+ wait_for_ready,
222
+ timeout,
223
+ metadata,
224
+ _registered_method=True)
225
+
226
+ @staticmethod
227
+ def CancelJobs(request,
228
+ target,
229
+ options=(),
230
+ channel_credentials=None,
231
+ call_credentials=None,
232
+ insecure=False,
233
+ compression=None,
234
+ wait_for_ready=None,
235
+ timeout=None,
236
+ metadata=None):
237
+ return grpc.experimental.unary_unary(
238
+ request,
239
+ target,
240
+ '/managed_jobs.v1.ManagedJobsService/CancelJobs',
241
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.SerializeToString,
242
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.FromString,
243
+ options,
244
+ channel_credentials,
245
+ insecure,
246
+ call_credentials,
247
+ compression,
248
+ wait_for_ready,
249
+ timeout,
250
+ metadata,
251
+ _registered_method=True)
252
+
253
+ @staticmethod
254
+ def StreamLogs(request,
255
+ target,
256
+ options=(),
257
+ channel_credentials=None,
258
+ call_credentials=None,
259
+ insecure=False,
260
+ compression=None,
261
+ wait_for_ready=None,
262
+ timeout=None,
263
+ metadata=None):
264
+ return grpc.experimental.unary_stream(
265
+ request,
266
+ target,
267
+ '/managed_jobs.v1.ManagedJobsService/StreamLogs',
268
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.SerializeToString,
269
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.FromString,
270
+ options,
271
+ channel_credentials,
272
+ insecure,
273
+ call_credentials,
274
+ compression,
275
+ wait_for_ready,
276
+ timeout,
277
+ metadata,
278
+ _registered_method=True)
sky/serve/serve_utils.py CHANGED
@@ -408,6 +408,22 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
408
408
  f'{sys_name} will replenish preempted spot '
409
409
  f'with {policy_description} instances.')
410
410
 
411
+ if pool:
412
+ accelerators = set()
413
+ for resource in task.resources:
414
+ if resource.accelerators is not None:
415
+ if isinstance(resource.accelerators, str):
416
+ accelerators.add(resource.accelerators)
417
+ elif isinstance(resource.accelerators, dict):
418
+ accelerators.update(resource.accelerators.keys())
419
+ elif isinstance(resource.accelerators, list):
420
+ accelerators.update(resource.accelerators)
421
+ if len(accelerators) > 1:
422
+ with ux_utils.print_exception_no_traceback():
423
+ raise ValueError('Heterogeneous clusters are not supported for '
424
+ 'cluster pools please specify one accelerator '
425
+ 'for all workers.')
426
+
411
427
  # Try to create a spot placer from the task yaml. Check if the task yaml
412
428
  # is valid for spot placer.
413
429
  spot_placer.SpotPlacer.from_task(task.service, task)
sky/serve/server/core.py CHANGED
@@ -117,7 +117,7 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
117
117
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
118
118
  use_legacy = not handle.is_grpc_enabled_with_flag
119
119
 
120
- if handle.is_grpc_enabled_with_flag:
120
+ if not use_legacy:
121
121
  try:
122
122
  stdout = serve_rpc_utils.RpcRunner.terminate_replica(
123
123
  handle, service_name, replica_id, purge)
sky/serve/server/impl.py CHANGED
@@ -89,7 +89,7 @@ def _get_service_record(
89
89
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
90
90
  use_legacy = not handle.is_grpc_enabled_with_flag
91
91
 
92
- if handle.is_grpc_enabled_with_flag:
92
+ if not use_legacy:
93
93
  try:
94
94
  service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
95
95
  handle, [service_name], pool)
@@ -589,7 +589,7 @@ def update(
589
589
 
590
590
  use_legacy = not handle.is_grpc_enabled_with_flag
591
591
 
592
- if handle.is_grpc_enabled_with_flag:
592
+ if not use_legacy:
593
593
  try:
594
594
  current_version = serve_rpc_utils.RpcRunner.add_version(
595
595
  handle, service_name)
@@ -636,7 +636,7 @@ def update(
636
636
 
637
637
  use_legacy = not handle.is_grpc_enabled_with_flag
638
638
 
639
- if handle.is_grpc_enabled_with_flag:
639
+ if not use_legacy:
640
640
  try:
641
641
  serve_rpc_utils.RpcRunner.update_service(
642
642
  handle, service_name, current_version, mode, pool)
@@ -730,7 +730,7 @@ def down(
730
730
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
731
731
  use_legacy = not handle.is_grpc_enabled_with_flag
732
732
 
733
- if handle.is_grpc_enabled_with_flag:
733
+ if not use_legacy:
734
734
  try:
735
735
  stdout = serve_rpc_utils.RpcRunner.terminate_services(
736
736
  handle, service_names, purge, pool)
@@ -792,7 +792,7 @@ def status(
792
792
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
793
793
  use_legacy = not handle.is_grpc_enabled_with_flag
794
794
 
795
- if handle.is_grpc_enabled_with_flag:
795
+ if not use_legacy:
796
796
  try:
797
797
  service_records = serve_rpc_utils.RpcRunner.get_service_status(
798
798
  handle, service_names, pool)
@@ -928,7 +928,7 @@ def _get_all_replica_targets(
928
928
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
929
929
  use_legacy = not handle.is_grpc_enabled_with_flag
930
930
 
931
- if handle.is_grpc_enabled_with_flag:
931
+ if not use_legacy:
932
932
  try:
933
933
  service_records = serve_rpc_utils.RpcRunner.get_service_status(
934
934
  handle, [service_name], pool)
sky/server/common.py CHANGED
@@ -780,6 +780,7 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
780
780
  os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
781
781
  # Check again if server is already running. Other processes may
782
782
  # have started the server while we were waiting for the lock.
783
+ get_api_server_status.cache_clear() # type: ignore[attr-defined]
783
784
  api_server_info = get_api_server_status(endpoint)
784
785
  if api_server_info.status == ApiServerStatus.UNHEALTHY:
785
786
  _start_api_server(deploy, host, foreground, metrics,
@@ -841,7 +842,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
841
842
  for task_config in task_configs:
842
843
  if task_config is None:
843
844
  continue
844
- file_mounts_mapping = task_config.get('file_mounts_mapping', {})
845
+ file_mounts_mapping = task_config.pop('file_mounts_mapping', {})
845
846
  if not file_mounts_mapping:
846
847
  # We did not mount any files to new paths on the remote server
847
848
  # so no need to resolve filepaths.