skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (105)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,542 @@
1
+ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
+ """Client and server classes corresponding to protobuf-defined services."""
3
+ import grpc
4
+ import warnings
5
+
6
+ from sky.schemas.generated import jobsv1_pb2 as sky_dot_schemas_dot_generated_dot_jobsv1__pb2
7
+
8
# Version of the gRPC plugin this module was generated with, the version of
# the grpc runtime that is actually installed, and the release information
# quoted in the compatibility warning emitted below.
GRPC_GENERATED_VERSION = '1.63.0'
GRPC_VERSION = grpc.__version__
EXPECTED_ERROR_RELEASE = '1.65.0'
SCHEDULED_RELEASE_DATE = 'June 25, 2024'

try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION,
                                                    GRPC_GENERATED_VERSION)
except ImportError:
    # The comparison helper is unavailable, so the installed grpc is treated
    # as unsupported.
    _version_not_supported = True

if _version_not_supported:
    # Warn (rather than fail) when the installed runtime is older than the
    # version the generated code was produced for.
    warnings.warn(
        ''.join((
            f'The grpc package installed is at version {GRPC_VERSION},',
            f' but the generated code in sky/schemas/generated/jobsv1_pb2_grpc.py depends on',
            f' grpcio>={GRPC_GENERATED_VERSION}.',
            f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}',
            f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.',
            f' This warning will become an error in {EXPECTED_ERROR_RELEASE},',
            f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
        )),
        RuntimeWarning,
    )
31
+
32
+
33
class JobsServiceStub(object):
    """Client stub exposing one callable per jobs.v1.JobsService RPC."""

    def __init__(self, channel):
        """Create the per-RPC callables on top of ``channel``.

        Args:
            channel: A grpc.Channel.
        """
        pb2 = sky_dot_schemas_dot_generated_dot_jobsv1__pb2
        # One row per RPC, in declaration order:
        # (attribute, channel factory name, method path, request, response).
        rpc_table = (
            ('AddJob', 'unary_unary', '/jobs.v1.JobsService/AddJob',
             pb2.AddJobRequest, pb2.AddJobResponse),
            ('QueueJob', 'unary_unary', '/jobs.v1.JobsService/QueueJob',
             pb2.QueueJobRequest, pb2.QueueJobResponse),
            ('UpdateStatus', 'unary_unary',
             '/jobs.v1.JobsService/UpdateStatus',
             pb2.UpdateStatusRequest, pb2.UpdateStatusResponse),
            ('GetJobQueue', 'unary_unary',
             '/jobs.v1.JobsService/GetJobQueue',
             pb2.GetJobQueueRequest, pb2.GetJobQueueResponse),
            ('CancelJobs', 'unary_unary', '/jobs.v1.JobsService/CancelJobs',
             pb2.CancelJobsRequest, pb2.CancelJobsResponse),
            ('FailAllInProgressJobs', 'unary_unary',
             '/jobs.v1.JobsService/FailAllInProgressJobs',
             pb2.FailAllInProgressJobsRequest,
             pb2.FailAllInProgressJobsResponse),
            # TailLogs is the only RPC created with the unary_stream factory.
            ('TailLogs', 'unary_stream', '/jobs.v1.JobsService/TailLogs',
             pb2.TailLogsRequest, pb2.TailLogsResponse),
            ('GetJobStatus', 'unary_unary',
             '/jobs.v1.JobsService/GetJobStatus',
             pb2.GetJobStatusRequest, pb2.GetJobStatusResponse),
            ('GetJobSubmittedTimestamp', 'unary_unary',
             '/jobs.v1.JobsService/GetJobSubmittedTimestamp',
             pb2.GetJobSubmittedTimestampRequest,
             pb2.GetJobSubmittedTimestampResponse),
            ('GetJobEndedTimestamp', 'unary_unary',
             '/jobs.v1.JobsService/GetJobEndedTimestamp',
             pb2.GetJobEndedTimestampRequest,
             pb2.GetJobEndedTimestampResponse),
            ('GetLogDirsForJobs', 'unary_unary',
             '/jobs.v1.JobsService/GetLogDirsForJobs',
             pb2.GetLogDirsForJobsRequest, pb2.GetLogDirsForJobsResponse),
        )
        for attr, arity, path, request_cls, response_cls in rpc_table:
            make_callable = getattr(channel, arity)
            setattr(
                self, attr,
                make_callable(
                    path,
                    request_serializer=request_cls.SerializeToString,
                    response_deserializer=response_cls.FromString,
                    _registered_method=True))
97
+
98
+
99
class JobsServiceServicer(object):
    """Server-side handlers for jobs.v1.JobsService.

    Every handler defined here marks the call as UNIMPLEMENTED on the
    context and raises NotImplementedError.
    """

    @staticmethod
    def _unimplemented(context):
        """Flag the call as UNIMPLEMENTED and return the error to raise."""
        message = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(message)
        return NotImplementedError(message)

    def AddJob(self, request, context):
        """Add a new job to the database."""
        raise JobsServiceServicer._unimplemented(context)

    def QueueJob(self, request, context):
        """Queue a job for execution."""
        raise JobsServiceServicer._unimplemented(context)

    def UpdateStatus(self, request, context):
        """Update status of all jobs."""
        raise JobsServiceServicer._unimplemented(context)

    def GetJobQueue(self, request, context):
        """Get the job queue."""
        raise JobsServiceServicer._unimplemented(context)

    def CancelJobs(self, request, context):
        """Cancel jobs."""
        raise JobsServiceServicer._unimplemented(context)

    def FailAllInProgressJobs(self, request, context):
        """Fail all in-progress jobs."""
        raise JobsServiceServicer._unimplemented(context)

    def TailLogs(self, request, context):
        """Tail job logs."""
        raise JobsServiceServicer._unimplemented(context)

    def GetJobStatus(self, request, context):
        """Get job status."""
        raise JobsServiceServicer._unimplemented(context)

    def GetJobSubmittedTimestamp(self, request, context):
        """Get job submitted timestamp."""
        raise JobsServiceServicer._unimplemented(context)

    def GetJobEndedTimestamp(self, request, context):
        """Get job ended timestamp."""
        raise JobsServiceServicer._unimplemented(context)

    def GetLogDirsForJobs(self, request, context):
        """Get log directories for jobs."""
        raise JobsServiceServicer._unimplemented(context)
178
+
179
+
180
def add_JobsServiceServicer_to_server(servicer, server):
    """Register ``servicer``'s handlers on ``server`` as 'jobs.v1.JobsService'.

    Args:
        servicer: Object providing one method per RPC name listed below.
        server: Object whose ``add_generic_rpc_handlers`` receives the
            generic handler built here.
    """
    pb2 = sky_dot_schemas_dot_generated_dot_jobsv1__pb2
    unary = grpc.unary_unary_rpc_method_handler
    streaming = grpc.unary_stream_rpc_method_handler
    # One row per RPC, in declaration order:
    # (method name, handler factory, request message, response message).
    rpc_table = (
        ('AddJob', unary, pb2.AddJobRequest, pb2.AddJobResponse),
        ('QueueJob', unary, pb2.QueueJobRequest, pb2.QueueJobResponse),
        ('UpdateStatus', unary, pb2.UpdateStatusRequest,
         pb2.UpdateStatusResponse),
        ('GetJobQueue', unary, pb2.GetJobQueueRequest,
         pb2.GetJobQueueResponse),
        ('CancelJobs', unary, pb2.CancelJobsRequest, pb2.CancelJobsResponse),
        ('FailAllInProgressJobs', unary, pb2.FailAllInProgressJobsRequest,
         pb2.FailAllInProgressJobsResponse),
        # TailLogs is the only RPC registered with the unary_stream handler.
        ('TailLogs', streaming, pb2.TailLogsRequest, pb2.TailLogsResponse),
        ('GetJobStatus', unary, pb2.GetJobStatusRequest,
         pb2.GetJobStatusResponse),
        ('GetJobSubmittedTimestamp', unary,
         pb2.GetJobSubmittedTimestampRequest,
         pb2.GetJobSubmittedTimestampResponse),
        ('GetJobEndedTimestamp', unary, pb2.GetJobEndedTimestampRequest,
         pb2.GetJobEndedTimestampResponse),
        ('GetLogDirsForJobs', unary, pb2.GetLogDirsForJobsRequest,
         pb2.GetLogDirsForJobsResponse),
    )
    rpc_method_handlers = {
        rpc_name: make_handler(
            getattr(servicer, rpc_name),
            request_deserializer=request_cls.FromString,
            response_serializer=response_cls.SerializeToString,
        )
        for rpc_name, make_handler, request_cls, response_cls in rpc_table
    }
    server.add_generic_rpc_handlers((grpc.method_handlers_generic_handler(
        'jobs.v1.JobsService', rpc_method_handlers),))
241
+
242
+
243
# This class is part of an EXPERIMENTAL API.
class JobsService(object):
    """Static per-RPC call helpers built on ``grpc.experimental``.

    Each public static method forwards its arguments to
    ``grpc.experimental.unary_unary`` (``unary_stream`` for TailLogs) together
    with the RPC's method path and its request/response message codecs.
    """

    @staticmethod
    def _invoke(rpc, path, request_cls, response_cls, request, target,
                options, channel_credentials, call_credentials, insecure,
                compression, wait_for_ready, timeout, metadata):
        """Call ``rpc`` with the positional layout shared by every method.

        Note that ``insecure`` is passed before ``call_credentials``, which
        differs from the order in the public method signatures.
        """
        return rpc(
            request,
            target,
            path,
            request_cls.SerializeToString,
            response_cls.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def AddJob(request, target, options=(), channel_credentials=None,
               call_credentials=None, insecure=False, compression=None,
               wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/AddJob',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def QueueJob(request, target, options=(), channel_credentials=None,
                 call_credentials=None, insecure=False, compression=None,
                 wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/QueueJob',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def UpdateStatus(request, target, options=(), channel_credentials=None,
                     call_credentials=None, insecure=False, compression=None,
                     wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/UpdateStatus',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetJobQueue(request, target, options=(), channel_credentials=None,
                    call_credentials=None, insecure=False, compression=None,
                    wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/GetJobQueue',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def CancelJobs(request, target, options=(), channel_credentials=None,
                   call_credentials=None, insecure=False, compression=None,
                   wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/CancelJobs',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def FailAllInProgressJobs(request, target, options=(),
                              channel_credentials=None, call_credentials=None,
                              insecure=False, compression=None,
                              wait_for_ready=None, timeout=None,
                              metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/FailAllInProgressJobs',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TailLogs(request, target, options=(), channel_credentials=None,
                 call_credentials=None, insecure=False, compression=None,
                 wait_for_ready=None, timeout=None, metadata=None):
        # The only method that goes through the unary_stream entry point.
        return JobsService._invoke(
            grpc.experimental.unary_stream,
            '/jobs.v1.JobsService/TailLogs',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetJobStatus(request, target, options=(), channel_credentials=None,
                     call_credentials=None, insecure=False, compression=None,
                     wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/GetJobStatus',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetJobSubmittedTimestamp(request, target, options=(),
                                 channel_credentials=None,
                                 call_credentials=None, insecure=False,
                                 compression=None, wait_for_ready=None,
                                 timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/GetJobSubmittedTimestamp',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetJobEndedTimestamp(request, target, options=(),
                             channel_credentials=None, call_credentials=None,
                             insecure=False, compression=None,
                             wait_for_ready=None, timeout=None,
                             metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/GetJobEndedTimestamp',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetLogDirsForJobs(request, target, options=(),
                          channel_credentials=None, call_credentials=None,
                          insecure=False, compression=None,
                          wait_for_ready=None, timeout=None, metadata=None):
        return JobsService._invoke(
            grpc.experimental.unary_unary,
            '/jobs.v1.JobsService/GetLogDirsForJobs',
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest,
            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse,
            request, target, options, channel_credentials, call_credentials,
            insecure, compression, wait_for_ready, timeout, metadata)
sky/server/config.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
6
6
 
7
7
  from sky import sky_logging
8
8
  from sky.server import constants as server_constants
9
+ from sky.server import daemons
9
10
  from sky.utils import common_utils
10
11
 
11
12
  # Constants based on profiling the peak memory usage while serving various
@@ -21,7 +22,7 @@ from sky.utils import common_utils
21
22
  # in the future.
22
23
  # TODO(luca): The future is now! ^^^
23
24
  LONG_WORKER_MEM_GB = 0.4
24
- SHORT_WORKER_MEM_GB = 0.25
25
+ SHORT_WORKER_MEM_GB = 0.3
25
26
  # To control the number of long workers.
26
27
  _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
27
28
  # Limit the number of long workers of local API server, since local server is
@@ -36,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
36
37
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
37
38
  # Minimal number of long workers to ensure responsiveness.
38
39
  _MIN_LONG_WORKERS = 1
39
- # Minimal number of short workers, there is a daemon task running on short
40
- # workers so at least 2 workers are needed to ensure responsiveness.
41
- _MIN_SHORT_WORKERS = 2
40
+ # Minimal number of idle short workers to ensure responsiveness.
41
+ _MIN_IDLE_SHORT_WORKERS = 1
42
42
 
43
43
  # Default number of burstable workers for local API server. A heuristic number
44
44
  # that is large enough for most local cases.
@@ -216,6 +216,15 @@ def _max_long_worker_parallism(cpu_count: int,
216
216
  return n
217
217
 
218
218
 
219
def _get_min_short_workers() -> int:
    """Return the lower bound on the number of short workers.

    The bound is ``_MIN_IDLE_SHORT_WORKERS`` plus one worker for every entry
    of ``daemons.INTERNAL_REQUEST_DAEMONS`` whose ``should_skip()`` is falsy.
    """
    active_daemons = sum(1 for daemon in daemons.INTERNAL_REQUEST_DAEMONS
                         if not daemon.should_skip())
    return _MIN_IDLE_SHORT_WORKERS + active_daemons
226
+
227
+
219
228
  def _max_short_worker_parallism(mem_size_gb: float,
220
229
  long_worker_parallism: int) -> int:
221
230
  """Max parallelism for short workers."""
@@ -227,5 +236,5 @@ def _max_short_worker_parallism(mem_size_gb: float,
227
236
  server_constants.MIN_AVAIL_MEM_GB)
228
237
  reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
229
238
  available_mem = max(0, mem_size_gb - reserved_mem)
230
- n = max(_MIN_SHORT_WORKERS, int(available_mem / SHORT_WORKER_MEM_GB))
239
+ n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
231
240
  return n
sky/server/metrics.py CHANGED
@@ -4,6 +4,7 @@ import contextlib
4
4
  import functools
5
5
  import multiprocessing
6
6
  import os
7
+ import threading
7
8
  import time
8
9
 
9
10
  import fastapi
@@ -21,6 +22,24 @@ from sky.skylet import constants
21
22
  METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
22
23
  'false').lower() == 'true'
23
24
 
25
# Byte sizes of one kibibyte and one mebibyte.
_KB = 1 << 10
_MB = 1 << 20
# Memory histogram bucket bounds in bytes: 1 KiB, 256 KiB, 512 KiB, then
# powers of two from 1 MiB to 256 MiB, followed by a catch-all +inf bucket.
_MEM_BUCKETS = (
    [_KB * kib for kib in (1, 256, 512)] +
    [_MB * mib for mib in (1, 2, 4, 8, 16, 32, 64, 128, 256)] +
    [float('inf')]
)
42
+
24
43
  logger = sky_logging.init_logger(__name__)
25
44
 
26
45
  # Total number of API server requests, grouped by path, method, and status.
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
92
111
  ['pid', 'type', 'mode'],
93
112
  )
94
113
 
114
# Histogram of per-request peak memory usage, labelled by 'name' and bucketed
# with _MEM_BUCKETS.
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
    name='sky_apiserver_request_memory_usage_bytes',
    documentation='Peak memory usage of requests',
    labelnames=['name'],
    buckets=_MEM_BUCKETS,
)

# Histogram of the RSS increment observed after requests, labelled by 'name'
# and bucketed with _MEM_BUCKETS.
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
    name='sky_apiserver_request_rss_incr_bytes',
    documentation='RSS increment after requests',
    labelnames=['name'],
    buckets=_MEM_BUCKETS,
)
123
+
95
124
  metrics_app = fastapi.FastAPI()
96
125
 
97
126
 
@@ -208,19 +237,23 @@ def time_me_async(func):
208
237
  return async_wrapper
209
238
 
210
239
 
211
- def process_monitor(process_type: str):
240
+ peak_rss_bytes = 0
241
+
242
+
243
+ def process_monitor(process_type: str, stop: threading.Event):
212
244
  pid = multiprocessing.current_process().pid
213
245
  proc = psutil.Process(pid)
214
- peak_rss = 0
215
246
  last_bucket_end = time.time()
216
- while True:
247
+ bucket_peak = 0
248
+ global peak_rss_bytes
249
+ while not stop.is_set():
217
250
  if time.time() - last_bucket_end >= 30:
218
- # Reset peak RSS every 30 seconds.
251
+ # Reset peak RSS for the next time bucket.
219
252
  last_bucket_end = time.time()
220
- peak_rss = 0
221
- peak_rss = max(peak_rss, proc.memory_info().rss)
222
- SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
223
- type=process_type).set(peak_rss)
253
+ bucket_peak = 0
254
+ peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
255
+ SKY_APISERVER_PROCESS_PEAK_RSS.labels(
256
+ pid=pid, type=process_type).set(peak_rss_bytes)
224
257
  ctimes = proc.cpu_times()
225
258
  SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
226
259
  type=process_type,