skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (78)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +38 -14
  3. sky/backends/cloud_vm_ray_backend.py +151 -36
  4. sky/client/cli/command.py +18 -9
  5. sky/client/cli/table_utils.py +34 -0
  6. sky/client/common.py +4 -2
  7. sky/client/sdk.py +11 -7
  8. sky/client/sdk_async.py +5 -5
  9. sky/core.py +6 -6
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  17. sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/volumes.html +1 -1
  30. sky/dashboard/out/workspace/new.html +1 -1
  31. sky/dashboard/out/workspaces/[name].html +1 -1
  32. sky/dashboard/out/workspaces.html +1 -1
  33. sky/execution.py +0 -1
  34. sky/global_user_state.py +57 -34
  35. sky/jobs/constants.py +2 -0
  36. sky/jobs/controller.py +4 -0
  37. sky/jobs/server/core.py +98 -26
  38. sky/jobs/server/utils.py +65 -32
  39. sky/jobs/state.py +145 -3
  40. sky/jobs/utils.py +85 -7
  41. sky/provision/runpod/__init__.py +2 -0
  42. sky/schemas/api/responses.py +18 -0
  43. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  44. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  45. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  46. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  47. sky/serve/serve_utils.py +16 -0
  48. sky/serve/server/core.py +1 -1
  49. sky/serve/server/impl.py +6 -6
  50. sky/server/requests/payloads.py +2 -1
  51. sky/server/requests/serializers/decoders.py +2 -2
  52. sky/server/requests/serializers/encoders.py +7 -3
  53. sky/setup_files/dependencies.py +1 -1
  54. sky/skylet/constants.py +4 -1
  55. sky/skylet/events.py +42 -0
  56. sky/skylet/job_lib.py +2 -32
  57. sky/skylet/log_lib.py +211 -0
  58. sky/skylet/log_lib.pyi +30 -1
  59. sky/skylet/services.py +208 -2
  60. sky/skylet/skylet.py +3 -0
  61. sky/templates/jobs-controller.yaml.j2 +3 -0
  62. sky/templates/kubernetes-ray.yml.j2 +8 -3
  63. sky/utils/db/db_utils.py +5 -1
  64. sky/utils/db/migration_utils.py +1 -1
  65. sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
  66. sky/volumes/server/core.py +1 -0
  67. sky/volumes/volume.py +16 -17
  68. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
  69. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
  70. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  73. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
  74. /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
  75. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
  76. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
  77. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
  78. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,278 @@
1
+ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
+ """Client and server classes corresponding to protobuf-defined services."""
3
+ import grpc
4
+ import warnings
5
+
6
+ from sky.schemas.generated import managed_jobsv1_pb2 as sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2
7
+
8
+ GRPC_GENERATED_VERSION = '1.63.0'
9
+ GRPC_VERSION = grpc.__version__
10
+ EXPECTED_ERROR_RELEASE = '1.65.0'
11
+ SCHEDULED_RELEASE_DATE = 'June 25, 2024'
12
+ _version_not_supported = False
13
+
14
+ try:
15
+ from grpc._utilities import first_version_is_lower
16
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
17
+ except ImportError:
18
+ _version_not_supported = True
19
+
20
+ if _version_not_supported:
21
+ warnings.warn(
22
+ f'The grpc package installed is at version {GRPC_VERSION},'
23
+ + f' but the generated code in sky/schemas/generated/managed_jobsv1_pb2_grpc.py depends on'
24
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
25
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
26
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
27
+ + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
28
+ + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
29
+ RuntimeWarning
30
+ )
31
+
32
+
33
+ class ManagedJobsServiceStub(object):
34
+ """Missing associated documentation comment in .proto file."""
35
+
36
+ def __init__(self, channel):
37
+ """Constructor.
38
+
39
+ Args:
40
+ channel: A grpc.Channel.
41
+ """
42
+ self.GetVersion = channel.unary_unary(
43
+ '/managed_jobs.v1.ManagedJobsService/GetVersion',
44
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.SerializeToString,
45
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.FromString,
46
+ _registered_method=True)
47
+ self.GetJobTable = channel.unary_unary(
48
+ '/managed_jobs.v1.ManagedJobsService/GetJobTable',
49
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.SerializeToString,
50
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.FromString,
51
+ _registered_method=True)
52
+ self.GetAllJobIdsByName = channel.unary_unary(
53
+ '/managed_jobs.v1.ManagedJobsService/GetAllJobIdsByName',
54
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.SerializeToString,
55
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.FromString,
56
+ _registered_method=True)
57
+ self.CancelJobs = channel.unary_unary(
58
+ '/managed_jobs.v1.ManagedJobsService/CancelJobs',
59
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.SerializeToString,
60
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.FromString,
61
+ _registered_method=True)
62
+ self.StreamLogs = channel.unary_stream(
63
+ '/managed_jobs.v1.ManagedJobsService/StreamLogs',
64
+ request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.SerializeToString,
65
+ response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.FromString,
66
+ _registered_method=True)
67
+
68
+
69
+ class ManagedJobsServiceServicer(object):
70
+ """Missing associated documentation comment in .proto file."""
71
+
72
+ def GetVersion(self, request, context):
73
+ """Get controller version.
74
+ """
75
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
76
+ context.set_details('Method not implemented!')
77
+ raise NotImplementedError('Method not implemented!')
78
+
79
+ def GetJobTable(self, request, context):
80
+ """Get the managed job queue with advanced filtering.
81
+ """
82
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
83
+ context.set_details('Method not implemented!')
84
+ raise NotImplementedError('Method not implemented!')
85
+
86
+ def GetAllJobIdsByName(self, request, context):
87
+ """Get all job IDs by name.
88
+ """
89
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
90
+ context.set_details('Method not implemented!')
91
+ raise NotImplementedError('Method not implemented!')
92
+
93
+ def CancelJobs(self, request, context):
94
+ """Cancel managed jobs.
95
+ """
96
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
97
+ context.set_details('Method not implemented!')
98
+ raise NotImplementedError('Method not implemented!')
99
+
100
+ def StreamLogs(self, request, context):
101
+ """Stream managed job logs.
102
+ """
103
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
104
+ context.set_details('Method not implemented!')
105
+ raise NotImplementedError('Method not implemented!')
106
+
107
+
108
+ def add_ManagedJobsServiceServicer_to_server(servicer, server):
109
+ rpc_method_handlers = {
110
+ 'GetVersion': grpc.unary_unary_rpc_method_handler(
111
+ servicer.GetVersion,
112
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.FromString,
113
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.SerializeToString,
114
+ ),
115
+ 'GetJobTable': grpc.unary_unary_rpc_method_handler(
116
+ servicer.GetJobTable,
117
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.FromString,
118
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.SerializeToString,
119
+ ),
120
+ 'GetAllJobIdsByName': grpc.unary_unary_rpc_method_handler(
121
+ servicer.GetAllJobIdsByName,
122
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.FromString,
123
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.SerializeToString,
124
+ ),
125
+ 'CancelJobs': grpc.unary_unary_rpc_method_handler(
126
+ servicer.CancelJobs,
127
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.FromString,
128
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.SerializeToString,
129
+ ),
130
+ 'StreamLogs': grpc.unary_stream_rpc_method_handler(
131
+ servicer.StreamLogs,
132
+ request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.FromString,
133
+ response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.SerializeToString,
134
+ ),
135
+ }
136
+ generic_handler = grpc.method_handlers_generic_handler(
137
+ 'managed_jobs.v1.ManagedJobsService', rpc_method_handlers)
138
+ server.add_generic_rpc_handlers((generic_handler,))
139
+
140
+
141
+ # This class is part of an EXPERIMENTAL API.
142
+ class ManagedJobsService(object):
143
+ """Missing associated documentation comment in .proto file."""
144
+
145
+ @staticmethod
146
+ def GetVersion(request,
147
+ target,
148
+ options=(),
149
+ channel_credentials=None,
150
+ call_credentials=None,
151
+ insecure=False,
152
+ compression=None,
153
+ wait_for_ready=None,
154
+ timeout=None,
155
+ metadata=None):
156
+ return grpc.experimental.unary_unary(
157
+ request,
158
+ target,
159
+ '/managed_jobs.v1.ManagedJobsService/GetVersion',
160
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.SerializeToString,
161
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.FromString,
162
+ options,
163
+ channel_credentials,
164
+ insecure,
165
+ call_credentials,
166
+ compression,
167
+ wait_for_ready,
168
+ timeout,
169
+ metadata,
170
+ _registered_method=True)
171
+
172
+ @staticmethod
173
+ def GetJobTable(request,
174
+ target,
175
+ options=(),
176
+ channel_credentials=None,
177
+ call_credentials=None,
178
+ insecure=False,
179
+ compression=None,
180
+ wait_for_ready=None,
181
+ timeout=None,
182
+ metadata=None):
183
+ return grpc.experimental.unary_unary(
184
+ request,
185
+ target,
186
+ '/managed_jobs.v1.ManagedJobsService/GetJobTable',
187
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.SerializeToString,
188
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.FromString,
189
+ options,
190
+ channel_credentials,
191
+ insecure,
192
+ call_credentials,
193
+ compression,
194
+ wait_for_ready,
195
+ timeout,
196
+ metadata,
197
+ _registered_method=True)
198
+
199
+ @staticmethod
200
+ def GetAllJobIdsByName(request,
201
+ target,
202
+ options=(),
203
+ channel_credentials=None,
204
+ call_credentials=None,
205
+ insecure=False,
206
+ compression=None,
207
+ wait_for_ready=None,
208
+ timeout=None,
209
+ metadata=None):
210
+ return grpc.experimental.unary_unary(
211
+ request,
212
+ target,
213
+ '/managed_jobs.v1.ManagedJobsService/GetAllJobIdsByName',
214
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.SerializeToString,
215
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.FromString,
216
+ options,
217
+ channel_credentials,
218
+ insecure,
219
+ call_credentials,
220
+ compression,
221
+ wait_for_ready,
222
+ timeout,
223
+ metadata,
224
+ _registered_method=True)
225
+
226
+ @staticmethod
227
+ def CancelJobs(request,
228
+ target,
229
+ options=(),
230
+ channel_credentials=None,
231
+ call_credentials=None,
232
+ insecure=False,
233
+ compression=None,
234
+ wait_for_ready=None,
235
+ timeout=None,
236
+ metadata=None):
237
+ return grpc.experimental.unary_unary(
238
+ request,
239
+ target,
240
+ '/managed_jobs.v1.ManagedJobsService/CancelJobs',
241
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.SerializeToString,
242
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.FromString,
243
+ options,
244
+ channel_credentials,
245
+ insecure,
246
+ call_credentials,
247
+ compression,
248
+ wait_for_ready,
249
+ timeout,
250
+ metadata,
251
+ _registered_method=True)
252
+
253
+ @staticmethod
254
+ def StreamLogs(request,
255
+ target,
256
+ options=(),
257
+ channel_credentials=None,
258
+ call_credentials=None,
259
+ insecure=False,
260
+ compression=None,
261
+ wait_for_ready=None,
262
+ timeout=None,
263
+ metadata=None):
264
+ return grpc.experimental.unary_stream(
265
+ request,
266
+ target,
267
+ '/managed_jobs.v1.ManagedJobsService/StreamLogs',
268
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.SerializeToString,
269
+ sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.FromString,
270
+ options,
271
+ channel_credentials,
272
+ insecure,
273
+ call_credentials,
274
+ compression,
275
+ wait_for_ready,
276
+ timeout,
277
+ metadata,
278
+ _registered_method=True)
sky/serve/serve_utils.py CHANGED
@@ -408,6 +408,22 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
408
408
  f'{sys_name} will replenish preempted spot '
409
409
  f'with {policy_description} instances.')
410
410
 
411
+ if pool:
412
+ accelerators = set()
413
+ for resource in task.resources:
414
+ if resource.accelerators is not None:
415
+ if isinstance(resource.accelerators, str):
416
+ accelerators.add(resource.accelerators)
417
+ elif isinstance(resource.accelerators, dict):
418
+ accelerators.update(resource.accelerators.keys())
419
+ elif isinstance(resource.accelerators, list):
420
+ accelerators.update(resource.accelerators)
421
+ if len(accelerators) > 1:
422
+ with ux_utils.print_exception_no_traceback():
423
+ raise ValueError('Heterogeneous clusters are not supported for '
424
+ 'cluster pools please specify one accelerator '
425
+ 'for all workers.')
426
+
411
427
  # Try to create a spot placer from the task yaml. Check if the task yaml
412
428
  # is valid for spot placer.
413
429
  spot_placer.SpotPlacer.from_task(task.service, task)
sky/serve/server/core.py CHANGED
@@ -117,7 +117,7 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
117
117
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
118
118
  use_legacy = not handle.is_grpc_enabled_with_flag
119
119
 
120
- if handle.is_grpc_enabled_with_flag:
120
+ if not use_legacy:
121
121
  try:
122
122
  stdout = serve_rpc_utils.RpcRunner.terminate_replica(
123
123
  handle, service_name, replica_id, purge)
sky/serve/server/impl.py CHANGED
@@ -89,7 +89,7 @@ def _get_service_record(
89
89
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
90
90
  use_legacy = not handle.is_grpc_enabled_with_flag
91
91
 
92
- if handle.is_grpc_enabled_with_flag:
92
+ if not use_legacy:
93
93
  try:
94
94
  service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
95
95
  handle, [service_name], pool)
@@ -589,7 +589,7 @@ def update(
589
589
 
590
590
  use_legacy = not handle.is_grpc_enabled_with_flag
591
591
 
592
- if handle.is_grpc_enabled_with_flag:
592
+ if not use_legacy:
593
593
  try:
594
594
  current_version = serve_rpc_utils.RpcRunner.add_version(
595
595
  handle, service_name)
@@ -636,7 +636,7 @@ def update(
636
636
 
637
637
  use_legacy = not handle.is_grpc_enabled_with_flag
638
638
 
639
- if handle.is_grpc_enabled_with_flag:
639
+ if not use_legacy:
640
640
  try:
641
641
  serve_rpc_utils.RpcRunner.update_service(
642
642
  handle, service_name, current_version, mode, pool)
@@ -730,7 +730,7 @@ def down(
730
730
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
731
731
  use_legacy = not handle.is_grpc_enabled_with_flag
732
732
 
733
- if handle.is_grpc_enabled_with_flag:
733
+ if not use_legacy:
734
734
  try:
735
735
  stdout = serve_rpc_utils.RpcRunner.terminate_services(
736
736
  handle, service_names, purge, pool)
@@ -792,7 +792,7 @@ def status(
792
792
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
793
793
  use_legacy = not handle.is_grpc_enabled_with_flag
794
794
 
795
- if handle.is_grpc_enabled_with_flag:
795
+ if not use_legacy:
796
796
  try:
797
797
  service_records = serve_rpc_utils.RpcRunner.get_service_status(
798
798
  handle, service_names, pool)
@@ -928,7 +928,7 @@ def _get_all_replica_targets(
928
928
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
929
929
  use_legacy = not handle.is_grpc_enabled_with_flag
930
930
 
931
- if handle.is_grpc_enabled_with_flag:
931
+ if not use_legacy:
932
932
  try:
933
933
  service_records = serve_rpc_utils.RpcRunner.get_service_status(
934
934
  handle, [service_name], pool)
@@ -683,8 +683,9 @@ class LocalUpBody(RequestBody):
683
683
  ssh_key: Optional[str] = None
684
684
  cleanup: bool = False
685
685
  context_name: Optional[str] = None
686
- name: Optional[str] = None
687
686
  password: Optional[str] = None
687
+ name: Optional[str] = None
688
+ port_start: Optional[int] = None
688
689
 
689
690
 
690
691
  class LocalDownBody(RequestBody):
@@ -101,11 +101,11 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':
101
101
 
102
102
 
103
103
  @register_decoders('queue')
104
- def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
104
+ def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
105
105
  jobs = return_value
106
106
  for job in jobs:
107
107
  job['status'] = job_lib.JobStatus(job['status'])
108
- return jobs
108
+ return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
109
109
 
110
110
 
111
111
  @register_decoders('jobs.queue')
@@ -92,10 +92,14 @@ def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
92
92
 
93
93
 
94
94
  @register_encoder('queue')
95
- def encode_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
95
+ def encode_queue(
96
+ jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
97
+ response = []
96
98
  for job in jobs:
97
- job['status'] = job['status'].value
98
- return jobs
99
+ response_job = job.model_dump()
100
+ response_job['status'] = job['status'].value
101
+ response.append(response_job)
102
+ return response
99
103
 
100
104
 
101
105
  @register_encoder('status_kubernetes')
@@ -79,7 +79,7 @@ install_requires = [
79
79
  # Required for API server metrics
80
80
  'prometheus_client>=0.8.0',
81
81
  'passlib',
82
- 'bcrypt',
82
+ 'bcrypt==4.0.1',
83
83
  'pyjwt',
84
84
  'gitpython',
85
85
  'types-paramiko',
sky/skylet/constants.py CHANGED
@@ -57,6 +57,9 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
57
57
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
58
58
  # uv is used for venv and pip, much faster than python implementations.
59
59
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
60
+ # set UV_SYSTEM_PYTHON to false in case the
61
+ # user provided docker image set it to true.
62
+ # unset PYTHONPATH in case the user provided docker image set it.
60
63
  SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
61
64
  f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
62
65
  # This won't reinstall uv if it's already installed, so it's safe to re-run.
@@ -97,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
97
100
  # cluster yaml is updated.
98
101
  #
99
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
100
- SKYLET_VERSION = '18'
103
+ SKYLET_VERSION = '21'
101
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
102
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
103
106
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py CHANGED
@@ -11,6 +11,7 @@ import psutil
11
11
  from sky import clouds
12
12
  from sky import sky_logging
13
13
  from sky.backends import cloud_vm_ray_backend
14
+ from sky.jobs import constants as managed_job_constants
14
15
  from sky.jobs import scheduler
15
16
  from sky.jobs import state as managed_job_state
16
17
  from sky.jobs import utils as managed_job_utils
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
21
22
  from sky.usage import usage_lib
22
23
  from sky.utils import cluster_utils
23
24
  from sky.utils import registry
25
+ from sky.utils import subprocess_utils
24
26
  from sky.utils import ux_utils
25
27
  from sky.utils import yaml_utils
26
28
 
@@ -74,6 +76,46 @@ class ManagedJobEvent(SkyletEvent):
74
76
  EVENT_INTERVAL_SECONDS = 300
75
77
 
76
78
  def _run(self):
79
+ if not os.path.exists(
80
+ os.path.expanduser(
81
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)):
82
+ # Note: since the skylet is started before the user setup (in
83
+ # jobs-controller.yaml.j2) runs, it's possible that we hit this
84
+ # before the indicator file is written. However, since we will wait
85
+ # EVENT_INTERVAL_SECONDS before the first run, this should be very
86
+ # unlikely.
87
+ logger.info('No jobs controller indicator file found.')
88
+ all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
89
+ if not all_job_ids:
90
+ logger.info('No jobs running. Stopping controllers.')
91
+ # TODO(cooperc): Move this to a shared function also called by
92
+ # sdk.api_stop(). (#7229)
93
+ try:
94
+ with open(os.path.expanduser(
95
+ scheduler.JOB_CONTROLLER_PID_PATH),
96
+ 'r',
97
+ encoding='utf-8') as f:
98
+ pids = f.read().split('\n')[:-1]
99
+ for pid in pids:
100
+ if subprocess_utils.is_process_alive(
101
+ int(pid.strip())):
102
+ subprocess_utils.kill_children_processes(
103
+ parent_pids=[int(pid.strip())], force=True)
104
+ os.remove(
105
+ os.path.expanduser(scheduler.JOB_CONTROLLER_PID_PATH))
106
+ except FileNotFoundError:
107
+ # its fine we will create it
108
+ pass
109
+ except Exception as e: # pylint: disable=broad-except
110
+ # in case we get perm issues or something is messed up, just
111
+ # ignore it and assume the process is dead
112
+ logger.error(
113
+ f'Error looking at job controller pid file: {e}')
114
+ pass
115
+ logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
116
+ 'indicator file hasn\'t been written yet.')
117
+ return
118
+
77
119
  logger.info('=== Updating managed job status ===')
78
120
  managed_job_utils.update_managed_jobs_statuses()
79
121
  scheduler.maybe_start_controllers()
sky/skylet/job_lib.py CHANGED
@@ -24,7 +24,6 @@ from sky import sky_logging
24
24
  from sky.adaptors import common as adaptors_common
25
25
  from sky.skylet import constants
26
26
  from sky.utils import common_utils
27
- from sky.utils import log_utils
28
27
  from sky.utils import message_utils
29
28
  from sky.utils import subprocess_utils
30
29
  from sky.utils.db import db_utils
@@ -612,8 +611,8 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
612
611
  PENDING state.
613
612
 
614
613
  The normal job duration will use `start_at` instead of `submitted_at` (in
615
- `format_job_queue()`), because the job may stay in PENDING if the cluster is
616
- busy.
614
+ `table_utils.format_job_queue()`), because the job may stay in PENDING if
615
+ the cluster is busy.
617
616
  """
618
617
  return message_utils.encode_payload(
619
618
  get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
@@ -941,35 +940,6 @@ def is_cluster_idle() -> bool:
941
940
  assert False, 'Should not reach here'
942
941
 
943
942
 
944
- def format_job_queue(jobs: List[Dict[str, Any]]):
945
- """Format the job queue for display.
946
-
947
- Usage:
948
- jobs = get_job_queue()
949
- print(format_job_queue(jobs))
950
- """
951
- job_table = log_utils.create_table([
952
- 'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
953
- 'STATUS', 'LOG', 'GIT COMMIT'
954
- ])
955
- for job in jobs:
956
- job_table.add_row([
957
- job['job_id'],
958
- job['job_name'],
959
- job['username'],
960
- log_utils.readable_time_duration(job['submitted_at']),
961
- log_utils.readable_time_duration(job['start_at']),
962
- log_utils.readable_time_duration(job['start_at'],
963
- job['end_at'],
964
- absolute=True),
965
- job['resources'],
966
- job['status'].colored_str(),
967
- job['log_path'],
968
- job.get('metadata', {}).get('git_commit', '-'),
969
- ])
970
- return job_table
971
-
972
-
973
943
  def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
974
944
  """Get the job queue in encoded json format.
975
945