skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
|
2
|
+
"""Client and server classes corresponding to protobuf-defined services."""
|
|
3
|
+
import grpc
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
from sky.schemas.generated import jobsv1_pb2 as sky_dot_schemas_dot_generated_dot_jobsv1__pb2
|
|
7
|
+
|
|
8
|
+
GRPC_GENERATED_VERSION = '1.63.0'
|
|
9
|
+
GRPC_VERSION = grpc.__version__
|
|
10
|
+
EXPECTED_ERROR_RELEASE = '1.65.0'
|
|
11
|
+
SCHEDULED_RELEASE_DATE = 'June 25, 2024'
|
|
12
|
+
_version_not_supported = False
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from grpc._utilities import first_version_is_lower
|
|
16
|
+
_version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
|
|
17
|
+
except ImportError:
|
|
18
|
+
_version_not_supported = True
|
|
19
|
+
|
|
20
|
+
if _version_not_supported:
|
|
21
|
+
warnings.warn(
|
|
22
|
+
f'The grpc package installed is at version {GRPC_VERSION},'
|
|
23
|
+
+ f' but the generated code in sky/schemas/generated/jobsv1_pb2_grpc.py depends on'
|
|
24
|
+
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
|
|
25
|
+
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
|
|
26
|
+
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
|
|
27
|
+
+ f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
|
|
28
|
+
+ f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
|
|
29
|
+
RuntimeWarning
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class JobsServiceStub(object):
|
|
34
|
+
"""Missing associated documentation comment in .proto file."""
|
|
35
|
+
|
|
36
|
+
def __init__(self, channel):
|
|
37
|
+
"""Constructor.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
channel: A grpc.Channel.
|
|
41
|
+
"""
|
|
42
|
+
self.AddJob = channel.unary_unary(
|
|
43
|
+
'/jobs.v1.JobsService/AddJob',
|
|
44
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobRequest.SerializeToString,
|
|
45
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobResponse.FromString,
|
|
46
|
+
_registered_method=True)
|
|
47
|
+
self.QueueJob = channel.unary_unary(
|
|
48
|
+
'/jobs.v1.JobsService/QueueJob',
|
|
49
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobRequest.SerializeToString,
|
|
50
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobResponse.FromString,
|
|
51
|
+
_registered_method=True)
|
|
52
|
+
self.UpdateStatus = channel.unary_unary(
|
|
53
|
+
'/jobs.v1.JobsService/UpdateStatus',
|
|
54
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusRequest.SerializeToString,
|
|
55
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusResponse.FromString,
|
|
56
|
+
_registered_method=True)
|
|
57
|
+
self.GetJobQueue = channel.unary_unary(
|
|
58
|
+
'/jobs.v1.JobsService/GetJobQueue',
|
|
59
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueRequest.SerializeToString,
|
|
60
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueResponse.FromString,
|
|
61
|
+
_registered_method=True)
|
|
62
|
+
self.CancelJobs = channel.unary_unary(
|
|
63
|
+
'/jobs.v1.JobsService/CancelJobs',
|
|
64
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsRequest.SerializeToString,
|
|
65
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsResponse.FromString,
|
|
66
|
+
_registered_method=True)
|
|
67
|
+
self.FailAllInProgressJobs = channel.unary_unary(
|
|
68
|
+
'/jobs.v1.JobsService/FailAllInProgressJobs',
|
|
69
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsRequest.SerializeToString,
|
|
70
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsResponse.FromString,
|
|
71
|
+
_registered_method=True)
|
|
72
|
+
self.TailLogs = channel.unary_stream(
|
|
73
|
+
'/jobs.v1.JobsService/TailLogs',
|
|
74
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsRequest.SerializeToString,
|
|
75
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsResponse.FromString,
|
|
76
|
+
_registered_method=True)
|
|
77
|
+
self.GetJobStatus = channel.unary_unary(
|
|
78
|
+
'/jobs.v1.JobsService/GetJobStatus',
|
|
79
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusRequest.SerializeToString,
|
|
80
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusResponse.FromString,
|
|
81
|
+
_registered_method=True)
|
|
82
|
+
self.GetJobSubmittedTimestamp = channel.unary_unary(
|
|
83
|
+
'/jobs.v1.JobsService/GetJobSubmittedTimestamp',
|
|
84
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampRequest.SerializeToString,
|
|
85
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampResponse.FromString,
|
|
86
|
+
_registered_method=True)
|
|
87
|
+
self.GetJobEndedTimestamp = channel.unary_unary(
|
|
88
|
+
'/jobs.v1.JobsService/GetJobEndedTimestamp',
|
|
89
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampRequest.SerializeToString,
|
|
90
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampResponse.FromString,
|
|
91
|
+
_registered_method=True)
|
|
92
|
+
self.GetLogDirsForJobs = channel.unary_unary(
|
|
93
|
+
'/jobs.v1.JobsService/GetLogDirsForJobs',
|
|
94
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.SerializeToString,
|
|
95
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.FromString,
|
|
96
|
+
_registered_method=True)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class JobsServiceServicer(object):
|
|
100
|
+
"""Missing associated documentation comment in .proto file."""
|
|
101
|
+
|
|
102
|
+
def AddJob(self, request, context):
|
|
103
|
+
"""Add a new job to the database.
|
|
104
|
+
"""
|
|
105
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
106
|
+
context.set_details('Method not implemented!')
|
|
107
|
+
raise NotImplementedError('Method not implemented!')
|
|
108
|
+
|
|
109
|
+
def QueueJob(self, request, context):
|
|
110
|
+
"""Queue a job for execution.
|
|
111
|
+
"""
|
|
112
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
113
|
+
context.set_details('Method not implemented!')
|
|
114
|
+
raise NotImplementedError('Method not implemented!')
|
|
115
|
+
|
|
116
|
+
def UpdateStatus(self, request, context):
|
|
117
|
+
"""Update status of all jobs.
|
|
118
|
+
"""
|
|
119
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
120
|
+
context.set_details('Method not implemented!')
|
|
121
|
+
raise NotImplementedError('Method not implemented!')
|
|
122
|
+
|
|
123
|
+
def GetJobQueue(self, request, context):
|
|
124
|
+
"""Get the job queue.
|
|
125
|
+
"""
|
|
126
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
127
|
+
context.set_details('Method not implemented!')
|
|
128
|
+
raise NotImplementedError('Method not implemented!')
|
|
129
|
+
|
|
130
|
+
def CancelJobs(self, request, context):
|
|
131
|
+
"""Cancel jobs.
|
|
132
|
+
"""
|
|
133
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
134
|
+
context.set_details('Method not implemented!')
|
|
135
|
+
raise NotImplementedError('Method not implemented!')
|
|
136
|
+
|
|
137
|
+
def FailAllInProgressJobs(self, request, context):
|
|
138
|
+
"""Fail all in-progress jobs.
|
|
139
|
+
"""
|
|
140
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
141
|
+
context.set_details('Method not implemented!')
|
|
142
|
+
raise NotImplementedError('Method not implemented!')
|
|
143
|
+
|
|
144
|
+
def TailLogs(self, request, context):
|
|
145
|
+
"""Tail job logs.
|
|
146
|
+
"""
|
|
147
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
148
|
+
context.set_details('Method not implemented!')
|
|
149
|
+
raise NotImplementedError('Method not implemented!')
|
|
150
|
+
|
|
151
|
+
def GetJobStatus(self, request, context):
|
|
152
|
+
"""Get job status.
|
|
153
|
+
"""
|
|
154
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
155
|
+
context.set_details('Method not implemented!')
|
|
156
|
+
raise NotImplementedError('Method not implemented!')
|
|
157
|
+
|
|
158
|
+
def GetJobSubmittedTimestamp(self, request, context):
|
|
159
|
+
"""Get job submitted timestamp.
|
|
160
|
+
"""
|
|
161
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
162
|
+
context.set_details('Method not implemented!')
|
|
163
|
+
raise NotImplementedError('Method not implemented!')
|
|
164
|
+
|
|
165
|
+
def GetJobEndedTimestamp(self, request, context):
|
|
166
|
+
"""Get job ended timestamp.
|
|
167
|
+
"""
|
|
168
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
169
|
+
context.set_details('Method not implemented!')
|
|
170
|
+
raise NotImplementedError('Method not implemented!')
|
|
171
|
+
|
|
172
|
+
def GetLogDirsForJobs(self, request, context):
|
|
173
|
+
"""Get log directories for jobs.
|
|
174
|
+
"""
|
|
175
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
176
|
+
context.set_details('Method not implemented!')
|
|
177
|
+
raise NotImplementedError('Method not implemented!')
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def add_JobsServiceServicer_to_server(servicer, server):
|
|
181
|
+
rpc_method_handlers = {
|
|
182
|
+
'AddJob': grpc.unary_unary_rpc_method_handler(
|
|
183
|
+
servicer.AddJob,
|
|
184
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobRequest.FromString,
|
|
185
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobResponse.SerializeToString,
|
|
186
|
+
),
|
|
187
|
+
'QueueJob': grpc.unary_unary_rpc_method_handler(
|
|
188
|
+
servicer.QueueJob,
|
|
189
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobRequest.FromString,
|
|
190
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobResponse.SerializeToString,
|
|
191
|
+
),
|
|
192
|
+
'UpdateStatus': grpc.unary_unary_rpc_method_handler(
|
|
193
|
+
servicer.UpdateStatus,
|
|
194
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusRequest.FromString,
|
|
195
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusResponse.SerializeToString,
|
|
196
|
+
),
|
|
197
|
+
'GetJobQueue': grpc.unary_unary_rpc_method_handler(
|
|
198
|
+
servicer.GetJobQueue,
|
|
199
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueRequest.FromString,
|
|
200
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueResponse.SerializeToString,
|
|
201
|
+
),
|
|
202
|
+
'CancelJobs': grpc.unary_unary_rpc_method_handler(
|
|
203
|
+
servicer.CancelJobs,
|
|
204
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsRequest.FromString,
|
|
205
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsResponse.SerializeToString,
|
|
206
|
+
),
|
|
207
|
+
'FailAllInProgressJobs': grpc.unary_unary_rpc_method_handler(
|
|
208
|
+
servicer.FailAllInProgressJobs,
|
|
209
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsRequest.FromString,
|
|
210
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsResponse.SerializeToString,
|
|
211
|
+
),
|
|
212
|
+
'TailLogs': grpc.unary_stream_rpc_method_handler(
|
|
213
|
+
servicer.TailLogs,
|
|
214
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsRequest.FromString,
|
|
215
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsResponse.SerializeToString,
|
|
216
|
+
),
|
|
217
|
+
'GetJobStatus': grpc.unary_unary_rpc_method_handler(
|
|
218
|
+
servicer.GetJobStatus,
|
|
219
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusRequest.FromString,
|
|
220
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusResponse.SerializeToString,
|
|
221
|
+
),
|
|
222
|
+
'GetJobSubmittedTimestamp': grpc.unary_unary_rpc_method_handler(
|
|
223
|
+
servicer.GetJobSubmittedTimestamp,
|
|
224
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampRequest.FromString,
|
|
225
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampResponse.SerializeToString,
|
|
226
|
+
),
|
|
227
|
+
'GetJobEndedTimestamp': grpc.unary_unary_rpc_method_handler(
|
|
228
|
+
servicer.GetJobEndedTimestamp,
|
|
229
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampRequest.FromString,
|
|
230
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampResponse.SerializeToString,
|
|
231
|
+
),
|
|
232
|
+
'GetLogDirsForJobs': grpc.unary_unary_rpc_method_handler(
|
|
233
|
+
servicer.GetLogDirsForJobs,
|
|
234
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.FromString,
|
|
235
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.SerializeToString,
|
|
236
|
+
),
|
|
237
|
+
}
|
|
238
|
+
generic_handler = grpc.method_handlers_generic_handler(
|
|
239
|
+
'jobs.v1.JobsService', rpc_method_handlers)
|
|
240
|
+
server.add_generic_rpc_handlers((generic_handler,))
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# This class is part of an EXPERIMENTAL API.
|
|
244
|
+
class JobsService(object):
|
|
245
|
+
"""Missing associated documentation comment in .proto file."""
|
|
246
|
+
|
|
247
|
+
@staticmethod
|
|
248
|
+
def AddJob(request,
|
|
249
|
+
target,
|
|
250
|
+
options=(),
|
|
251
|
+
channel_credentials=None,
|
|
252
|
+
call_credentials=None,
|
|
253
|
+
insecure=False,
|
|
254
|
+
compression=None,
|
|
255
|
+
wait_for_ready=None,
|
|
256
|
+
timeout=None,
|
|
257
|
+
metadata=None):
|
|
258
|
+
return grpc.experimental.unary_unary(
|
|
259
|
+
request,
|
|
260
|
+
target,
|
|
261
|
+
'/jobs.v1.JobsService/AddJob',
|
|
262
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobRequest.SerializeToString,
|
|
263
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.AddJobResponse.FromString,
|
|
264
|
+
options,
|
|
265
|
+
channel_credentials,
|
|
266
|
+
insecure,
|
|
267
|
+
call_credentials,
|
|
268
|
+
compression,
|
|
269
|
+
wait_for_ready,
|
|
270
|
+
timeout,
|
|
271
|
+
metadata,
|
|
272
|
+
_registered_method=True)
|
|
273
|
+
|
|
274
|
+
@staticmethod
|
|
275
|
+
def QueueJob(request,
|
|
276
|
+
target,
|
|
277
|
+
options=(),
|
|
278
|
+
channel_credentials=None,
|
|
279
|
+
call_credentials=None,
|
|
280
|
+
insecure=False,
|
|
281
|
+
compression=None,
|
|
282
|
+
wait_for_ready=None,
|
|
283
|
+
timeout=None,
|
|
284
|
+
metadata=None):
|
|
285
|
+
return grpc.experimental.unary_unary(
|
|
286
|
+
request,
|
|
287
|
+
target,
|
|
288
|
+
'/jobs.v1.JobsService/QueueJob',
|
|
289
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobRequest.SerializeToString,
|
|
290
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.QueueJobResponse.FromString,
|
|
291
|
+
options,
|
|
292
|
+
channel_credentials,
|
|
293
|
+
insecure,
|
|
294
|
+
call_credentials,
|
|
295
|
+
compression,
|
|
296
|
+
wait_for_ready,
|
|
297
|
+
timeout,
|
|
298
|
+
metadata,
|
|
299
|
+
_registered_method=True)
|
|
300
|
+
|
|
301
|
+
@staticmethod
|
|
302
|
+
def UpdateStatus(request,
|
|
303
|
+
target,
|
|
304
|
+
options=(),
|
|
305
|
+
channel_credentials=None,
|
|
306
|
+
call_credentials=None,
|
|
307
|
+
insecure=False,
|
|
308
|
+
compression=None,
|
|
309
|
+
wait_for_ready=None,
|
|
310
|
+
timeout=None,
|
|
311
|
+
metadata=None):
|
|
312
|
+
return grpc.experimental.unary_unary(
|
|
313
|
+
request,
|
|
314
|
+
target,
|
|
315
|
+
'/jobs.v1.JobsService/UpdateStatus',
|
|
316
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusRequest.SerializeToString,
|
|
317
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.UpdateStatusResponse.FromString,
|
|
318
|
+
options,
|
|
319
|
+
channel_credentials,
|
|
320
|
+
insecure,
|
|
321
|
+
call_credentials,
|
|
322
|
+
compression,
|
|
323
|
+
wait_for_ready,
|
|
324
|
+
timeout,
|
|
325
|
+
metadata,
|
|
326
|
+
_registered_method=True)
|
|
327
|
+
|
|
328
|
+
@staticmethod
|
|
329
|
+
def GetJobQueue(request,
|
|
330
|
+
target,
|
|
331
|
+
options=(),
|
|
332
|
+
channel_credentials=None,
|
|
333
|
+
call_credentials=None,
|
|
334
|
+
insecure=False,
|
|
335
|
+
compression=None,
|
|
336
|
+
wait_for_ready=None,
|
|
337
|
+
timeout=None,
|
|
338
|
+
metadata=None):
|
|
339
|
+
return grpc.experimental.unary_unary(
|
|
340
|
+
request,
|
|
341
|
+
target,
|
|
342
|
+
'/jobs.v1.JobsService/GetJobQueue',
|
|
343
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueRequest.SerializeToString,
|
|
344
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobQueueResponse.FromString,
|
|
345
|
+
options,
|
|
346
|
+
channel_credentials,
|
|
347
|
+
insecure,
|
|
348
|
+
call_credentials,
|
|
349
|
+
compression,
|
|
350
|
+
wait_for_ready,
|
|
351
|
+
timeout,
|
|
352
|
+
metadata,
|
|
353
|
+
_registered_method=True)
|
|
354
|
+
|
|
355
|
+
@staticmethod
|
|
356
|
+
def CancelJobs(request,
|
|
357
|
+
target,
|
|
358
|
+
options=(),
|
|
359
|
+
channel_credentials=None,
|
|
360
|
+
call_credentials=None,
|
|
361
|
+
insecure=False,
|
|
362
|
+
compression=None,
|
|
363
|
+
wait_for_ready=None,
|
|
364
|
+
timeout=None,
|
|
365
|
+
metadata=None):
|
|
366
|
+
return grpc.experimental.unary_unary(
|
|
367
|
+
request,
|
|
368
|
+
target,
|
|
369
|
+
'/jobs.v1.JobsService/CancelJobs',
|
|
370
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsRequest.SerializeToString,
|
|
371
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.CancelJobsResponse.FromString,
|
|
372
|
+
options,
|
|
373
|
+
channel_credentials,
|
|
374
|
+
insecure,
|
|
375
|
+
call_credentials,
|
|
376
|
+
compression,
|
|
377
|
+
wait_for_ready,
|
|
378
|
+
timeout,
|
|
379
|
+
metadata,
|
|
380
|
+
_registered_method=True)
|
|
381
|
+
|
|
382
|
+
@staticmethod
|
|
383
|
+
def FailAllInProgressJobs(request,
|
|
384
|
+
target,
|
|
385
|
+
options=(),
|
|
386
|
+
channel_credentials=None,
|
|
387
|
+
call_credentials=None,
|
|
388
|
+
insecure=False,
|
|
389
|
+
compression=None,
|
|
390
|
+
wait_for_ready=None,
|
|
391
|
+
timeout=None,
|
|
392
|
+
metadata=None):
|
|
393
|
+
return grpc.experimental.unary_unary(
|
|
394
|
+
request,
|
|
395
|
+
target,
|
|
396
|
+
'/jobs.v1.JobsService/FailAllInProgressJobs',
|
|
397
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsRequest.SerializeToString,
|
|
398
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.FailAllInProgressJobsResponse.FromString,
|
|
399
|
+
options,
|
|
400
|
+
channel_credentials,
|
|
401
|
+
insecure,
|
|
402
|
+
call_credentials,
|
|
403
|
+
compression,
|
|
404
|
+
wait_for_ready,
|
|
405
|
+
timeout,
|
|
406
|
+
metadata,
|
|
407
|
+
_registered_method=True)
|
|
408
|
+
|
|
409
|
+
@staticmethod
|
|
410
|
+
def TailLogs(request,
|
|
411
|
+
target,
|
|
412
|
+
options=(),
|
|
413
|
+
channel_credentials=None,
|
|
414
|
+
call_credentials=None,
|
|
415
|
+
insecure=False,
|
|
416
|
+
compression=None,
|
|
417
|
+
wait_for_ready=None,
|
|
418
|
+
timeout=None,
|
|
419
|
+
metadata=None):
|
|
420
|
+
return grpc.experimental.unary_stream(
|
|
421
|
+
request,
|
|
422
|
+
target,
|
|
423
|
+
'/jobs.v1.JobsService/TailLogs',
|
|
424
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsRequest.SerializeToString,
|
|
425
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.TailLogsResponse.FromString,
|
|
426
|
+
options,
|
|
427
|
+
channel_credentials,
|
|
428
|
+
insecure,
|
|
429
|
+
call_credentials,
|
|
430
|
+
compression,
|
|
431
|
+
wait_for_ready,
|
|
432
|
+
timeout,
|
|
433
|
+
metadata,
|
|
434
|
+
_registered_method=True)
|
|
435
|
+
|
|
436
|
+
@staticmethod
|
|
437
|
+
def GetJobStatus(request,
|
|
438
|
+
target,
|
|
439
|
+
options=(),
|
|
440
|
+
channel_credentials=None,
|
|
441
|
+
call_credentials=None,
|
|
442
|
+
insecure=False,
|
|
443
|
+
compression=None,
|
|
444
|
+
wait_for_ready=None,
|
|
445
|
+
timeout=None,
|
|
446
|
+
metadata=None):
|
|
447
|
+
return grpc.experimental.unary_unary(
|
|
448
|
+
request,
|
|
449
|
+
target,
|
|
450
|
+
'/jobs.v1.JobsService/GetJobStatus',
|
|
451
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusRequest.SerializeToString,
|
|
452
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobStatusResponse.FromString,
|
|
453
|
+
options,
|
|
454
|
+
channel_credentials,
|
|
455
|
+
insecure,
|
|
456
|
+
call_credentials,
|
|
457
|
+
compression,
|
|
458
|
+
wait_for_ready,
|
|
459
|
+
timeout,
|
|
460
|
+
metadata,
|
|
461
|
+
_registered_method=True)
|
|
462
|
+
|
|
463
|
+
@staticmethod
|
|
464
|
+
def GetJobSubmittedTimestamp(request,
|
|
465
|
+
target,
|
|
466
|
+
options=(),
|
|
467
|
+
channel_credentials=None,
|
|
468
|
+
call_credentials=None,
|
|
469
|
+
insecure=False,
|
|
470
|
+
compression=None,
|
|
471
|
+
wait_for_ready=None,
|
|
472
|
+
timeout=None,
|
|
473
|
+
metadata=None):
|
|
474
|
+
return grpc.experimental.unary_unary(
|
|
475
|
+
request,
|
|
476
|
+
target,
|
|
477
|
+
'/jobs.v1.JobsService/GetJobSubmittedTimestamp',
|
|
478
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampRequest.SerializeToString,
|
|
479
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobSubmittedTimestampResponse.FromString,
|
|
480
|
+
options,
|
|
481
|
+
channel_credentials,
|
|
482
|
+
insecure,
|
|
483
|
+
call_credentials,
|
|
484
|
+
compression,
|
|
485
|
+
wait_for_ready,
|
|
486
|
+
timeout,
|
|
487
|
+
metadata,
|
|
488
|
+
_registered_method=True)
|
|
489
|
+
|
|
490
|
+
@staticmethod
|
|
491
|
+
def GetJobEndedTimestamp(request,
|
|
492
|
+
target,
|
|
493
|
+
options=(),
|
|
494
|
+
channel_credentials=None,
|
|
495
|
+
call_credentials=None,
|
|
496
|
+
insecure=False,
|
|
497
|
+
compression=None,
|
|
498
|
+
wait_for_ready=None,
|
|
499
|
+
timeout=None,
|
|
500
|
+
metadata=None):
|
|
501
|
+
return grpc.experimental.unary_unary(
|
|
502
|
+
request,
|
|
503
|
+
target,
|
|
504
|
+
'/jobs.v1.JobsService/GetJobEndedTimestamp',
|
|
505
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampRequest.SerializeToString,
|
|
506
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobEndedTimestampResponse.FromString,
|
|
507
|
+
options,
|
|
508
|
+
channel_credentials,
|
|
509
|
+
insecure,
|
|
510
|
+
call_credentials,
|
|
511
|
+
compression,
|
|
512
|
+
wait_for_ready,
|
|
513
|
+
timeout,
|
|
514
|
+
metadata,
|
|
515
|
+
_registered_method=True)
|
|
516
|
+
|
|
517
|
+
@staticmethod
|
|
518
|
+
def GetLogDirsForJobs(request,
|
|
519
|
+
target,
|
|
520
|
+
options=(),
|
|
521
|
+
channel_credentials=None,
|
|
522
|
+
call_credentials=None,
|
|
523
|
+
insecure=False,
|
|
524
|
+
compression=None,
|
|
525
|
+
wait_for_ready=None,
|
|
526
|
+
timeout=None,
|
|
527
|
+
metadata=None):
|
|
528
|
+
return grpc.experimental.unary_unary(
|
|
529
|
+
request,
|
|
530
|
+
target,
|
|
531
|
+
'/jobs.v1.JobsService/GetLogDirsForJobs',
|
|
532
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.SerializeToString,
|
|
533
|
+
sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.FromString,
|
|
534
|
+
options,
|
|
535
|
+
channel_credentials,
|
|
536
|
+
insecure,
|
|
537
|
+
call_credentials,
|
|
538
|
+
compression,
|
|
539
|
+
wait_for_ready,
|
|
540
|
+
timeout,
|
|
541
|
+
metadata,
|
|
542
|
+
_registered_method=True)
|
sky/server/config.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import Optional
|
|
|
6
6
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky.server import constants as server_constants
|
|
9
|
+
from sky.server import daemons
|
|
9
10
|
from sky.utils import common_utils
|
|
10
11
|
|
|
11
12
|
# Constants based on profiling the peak memory usage while serving various
|
|
@@ -21,7 +22,7 @@ from sky.utils import common_utils
|
|
|
21
22
|
# in the future.
|
|
22
23
|
# TODO(luca): The future is now! ^^^
|
|
23
24
|
LONG_WORKER_MEM_GB = 0.4
|
|
24
|
-
SHORT_WORKER_MEM_GB = 0.
|
|
25
|
+
SHORT_WORKER_MEM_GB = 0.3
|
|
25
26
|
# To control the number of long workers.
|
|
26
27
|
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
|
|
27
28
|
# Limit the number of long workers of local API server, since local server is
|
|
@@ -36,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
|
|
|
36
37
|
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
|
|
37
38
|
# Minimal number of long workers to ensure responsiveness.
|
|
38
39
|
_MIN_LONG_WORKERS = 1
|
|
39
|
-
# Minimal number of short workers
|
|
40
|
-
|
|
41
|
-
_MIN_SHORT_WORKERS = 2
|
|
40
|
+
# Minimal number of idle short workers to ensure responsiveness.
|
|
41
|
+
_MIN_IDLE_SHORT_WORKERS = 1
|
|
42
42
|
|
|
43
43
|
# Default number of burstable workers for local API server. A heuristic number
|
|
44
44
|
# that is large enough for most local cases.
|
|
@@ -216,6 +216,15 @@ def _max_long_worker_parallism(cpu_count: int,
|
|
|
216
216
|
return n
|
|
217
217
|
|
|
218
218
|
|
|
219
|
+
def _get_min_short_workers() -> int:
|
|
220
|
+
"""Min number of short workers."""
|
|
221
|
+
daemon_count = 0
|
|
222
|
+
for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
|
|
223
|
+
if not daemon.should_skip():
|
|
224
|
+
daemon_count += 1
|
|
225
|
+
return _MIN_IDLE_SHORT_WORKERS + daemon_count
|
|
226
|
+
|
|
227
|
+
|
|
219
228
|
def _max_short_worker_parallism(mem_size_gb: float,
|
|
220
229
|
long_worker_parallism: int) -> int:
|
|
221
230
|
"""Max parallelism for short workers."""
|
|
@@ -227,5 +236,5 @@ def _max_short_worker_parallism(mem_size_gb: float,
|
|
|
227
236
|
server_constants.MIN_AVAIL_MEM_GB)
|
|
228
237
|
reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
|
|
229
238
|
available_mem = max(0, mem_size_gb - reserved_mem)
|
|
230
|
-
n = max(
|
|
239
|
+
n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
|
|
231
240
|
return n
|
sky/server/metrics.py
CHANGED
|
@@ -4,6 +4,7 @@ import contextlib
|
|
|
4
4
|
import functools
|
|
5
5
|
import multiprocessing
|
|
6
6
|
import os
|
|
7
|
+
import threading
|
|
7
8
|
import time
|
|
8
9
|
|
|
9
10
|
import fastapi
|
|
@@ -21,6 +22,24 @@ from sky.skylet import constants
|
|
|
21
22
|
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
22
23
|
'false').lower() == 'true'
|
|
23
24
|
|
|
25
|
+
_KB = 2**10
|
|
26
|
+
_MB = 2**20
|
|
27
|
+
_MEM_BUCKETS = [
|
|
28
|
+
_KB,
|
|
29
|
+
256 * _KB,
|
|
30
|
+
512 * _KB,
|
|
31
|
+
_MB,
|
|
32
|
+
2 * _MB,
|
|
33
|
+
4 * _MB,
|
|
34
|
+
8 * _MB,
|
|
35
|
+
16 * _MB,
|
|
36
|
+
32 * _MB,
|
|
37
|
+
64 * _MB,
|
|
38
|
+
128 * _MB,
|
|
39
|
+
256 * _MB,
|
|
40
|
+
float('inf'),
|
|
41
|
+
]
|
|
42
|
+
|
|
24
43
|
logger = sky_logging.init_logger(__name__)
|
|
25
44
|
|
|
26
45
|
# Total number of API server requests, grouped by path, method, and status.
|
|
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
|
92
111
|
['pid', 'type', 'mode'],
|
|
93
112
|
)
|
|
94
113
|
|
|
114
|
+
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
115
|
+
'sky_apiserver_request_memory_usage_bytes',
|
|
116
|
+
'Peak memory usage of requests', ['name'],
|
|
117
|
+
buckets=_MEM_BUCKETS)
|
|
118
|
+
|
|
119
|
+
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
120
|
+
'sky_apiserver_request_rss_incr_bytes',
|
|
121
|
+
'RSS increment after requests', ['name'],
|
|
122
|
+
buckets=_MEM_BUCKETS)
|
|
123
|
+
|
|
95
124
|
metrics_app = fastapi.FastAPI()
|
|
96
125
|
|
|
97
126
|
|
|
@@ -208,19 +237,23 @@ def time_me_async(func):
|
|
|
208
237
|
return async_wrapper
|
|
209
238
|
|
|
210
239
|
|
|
211
|
-
|
|
240
|
+
peak_rss_bytes = 0
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def process_monitor(process_type: str, stop: threading.Event):
|
|
212
244
|
pid = multiprocessing.current_process().pid
|
|
213
245
|
proc = psutil.Process(pid)
|
|
214
|
-
peak_rss = 0
|
|
215
246
|
last_bucket_end = time.time()
|
|
216
|
-
|
|
247
|
+
bucket_peak = 0
|
|
248
|
+
global peak_rss_bytes
|
|
249
|
+
while not stop.is_set():
|
|
217
250
|
if time.time() - last_bucket_end >= 30:
|
|
218
|
-
# Reset peak RSS
|
|
251
|
+
# Reset peak RSS for the next time bucket.
|
|
219
252
|
last_bucket_end = time.time()
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
223
|
-
|
|
253
|
+
bucket_peak = 0
|
|
254
|
+
peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
|
|
255
|
+
SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
256
|
+
pid=pid, type=process_type).set(peak_rss_bytes)
|
|
224
257
|
ctimes = proc.cpu_times()
|
|
225
258
|
SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
226
259
|
type=process_type,
|