skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/skylet/services.py
CHANGED
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
"""gRPC service implementations for skylet."""
|
|
2
2
|
|
|
3
|
+
import os
|
|
4
|
+
|
|
3
5
|
import grpc
|
|
4
6
|
|
|
5
7
|
from sky import sky_logging
|
|
8
|
+
from sky.jobs import state as managed_job_state
|
|
6
9
|
from sky.schemas.generated import autostopv1_pb2
|
|
7
10
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
11
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
12
|
+
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
13
|
+
from sky.serve import serve_state
|
|
8
14
|
from sky.skylet import autostop_lib
|
|
15
|
+
from sky.skylet import constants
|
|
16
|
+
from sky.skylet import job_lib
|
|
9
17
|
|
|
10
18
|
logger = sky_logging.init_logger(__name__)
|
|
11
19
|
|
|
@@ -42,3 +50,198 @@ class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
|
|
|
42
50
|
is_autostopping=is_autostopping)
|
|
43
51
|
except Exception as e: # pylint: disable=broad-except
|
|
44
52
|
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
    """Implementation of the JobsService gRPC service.

    Every RPC delegates to ``job_lib`` (plus ``managed_job_state`` and
    ``serve_state`` for managed jobs). Unexpected failures are reported to the
    client by aborting the call with ``grpc.StatusCode.INTERNAL`` and the
    stringified exception as details.
    """

    def AddJob(  # type: ignore[return]
            self, request: jobsv1_pb2.AddJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
        """Adds a job through ``job_lib.add_job``.

        Returns:
            A response carrying the new job id and its log directory.
        """
        try:
            # '-' is the placeholder used when the client sent no job name.
            job_name = request.job_name if request.HasField('job_name') else '-'
            job_id, log_dir = job_lib.add_job(job_name, request.username,
                                              request.run_timestamp,
                                              request.resources_str,
                                              request.metadata)
            return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def QueueJob(  # type: ignore[return]
            self, request: jobsv1_pb2.QueueJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
        """Prepares log/script files for a job and hands it to the scheduler.

        When the request carries ``managed_job`` info, the managed job and its
        tasks are additionally recorded in the managed-job state.
        """
        try:
            job_id = request.job_id
            # Create log directory and file
            remote_log_dir = os.path.expanduser(request.remote_log_dir)
            os.makedirs(remote_log_dir, exist_ok=True)
            remote_log_path = os.path.join(remote_log_dir, 'run.log')
            open(remote_log_path, 'a').close()  # pylint: disable=unspecified-encoding

            script_path = os.path.expanduser(request.script_path)
            os.makedirs(os.path.dirname(script_path), exist_ok=True)

            # If `codegen` is not provided, assume script is already
            # uploaded to `script_path` via rsync.
            if request.HasField('codegen'):
                with open(script_path, 'w', encoding='utf-8') as f:
                    f.write(request.codegen)
                os.chmod(script_path, 0o755)

            cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
            job_submit_cmd = (
                # JOB_CMD_IDENTIFIER is used for identifying the process
                # retrieved with pid is the same driver process.
                f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
                f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
                # Do not use &>, which is not POSIX and may not work.
                # Note that the order of ">filename 2>&1" matters.
                f' > {remote_log_path} 2>&1')
            job_lib.scheduler.queue(job_id, job_submit_cmd)

            if request.HasField('managed_job'):
                managed_job = request.managed_job
                pool = managed_job.pool if managed_job.HasField(
                    'pool') else None
                pool_hash = None
                if pool is not None:
                    pool_hash = serve_state.get_service_hash(pool)
                # Add the managed job to job queue database.
                managed_job_state.set_job_info(job_id, managed_job.name,
                                               managed_job.workspace,
                                               managed_job.entrypoint, pool,
                                               pool_hash)
                # Set the managed job to PENDING state to make sure that
                # this managed job appears in the `sky jobs queue`, even
                # if it needs to wait to be submitted.
                # We cannot set the managed job to PENDING state in the
                # job template (jobs-controller.yaml.j2), as it may need
                # to wait for the run commands to be scheduled on the job
                # controller in high-load cases.
                for task in managed_job.tasks:
                    managed_job_state.set_pending(job_id, task.task_id,
                                                  task.name, task.resources_str,
                                                  task.metadata_json)
            return jobsv1_pb2.QueueJobResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.UpdateStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
        """Calls ``job_lib.update_status`` and returns an empty response."""
        try:
            job_lib.update_status()
            return jobsv1_pb2.UpdateStatusResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobQueue(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobQueueRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
        """Returns the jobs reported by ``job_lib.get_jobs_info``.

        ``user_hash`` is forwarded as ``None`` when the request omits it.
        """
        try:
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            all_jobs = request.all_jobs
            jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
                                              all_jobs=all_jobs)
            return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
        """Cancels jobs via ``job_lib.cancel_jobs``.

        Returns:
            A response listing the ids that ``job_lib.cancel_jobs`` returned.
        """
        try:
            # An empty repeated field simply becomes an empty list.
            job_ids = list(request.job_ids)
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
                                                    user_hash)
            return jobsv1_pb2.CancelJobsResponse(
                cancelled_job_ids=cancelled_job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def FailAllInProgressJobs(  # type: ignore[return]
        self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
        """Calls ``job_lib.fail_all_jobs_in_progress``; request is unused."""
        try:
            job_lib.fail_all_jobs_in_progress()
            return jobsv1_pb2.FailAllInProgressJobsResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TailLogs(
            self,
            request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Not implemented yet; always raises ``NotImplementedError``."""
        # TODO(kevin): implement this
        raise NotImplementedError('TailLogs is not implemented')

    def GetJobStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
        """Returns the protobuf status of the requested jobs.

        When no job ids are given, the latest job (if any) is used. A job
        whose status is ``None`` is reported as ``JOB_STATUS_UNSPECIFIED``.
        """
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            # Build a fresh mapping instead of rewriting the dict returned by
            # job_lib while iterating over it.
            job_statuses = {
                job_id: (job_lib.JobStatus(status).to_protobuf() if
                         status is not None else
                         jobsv1_pb2.JOB_STATUS_UNSPECIFIED)
                for job_id, status in job_lib.get_statuses(job_ids).items()
            }
            return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobSubmittedTimestamp(  # type: ignore[return]
        self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
        """Returns the submission timestamp of a job.

        Falls back to the latest job when the request has no ``job_id``.
        Aborts with ``NOT_FOUND`` when no timestamp is available and with
        ``INTERNAL`` on any unexpected error.
        """
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, False)
            if timestamp is not None:
                return jobsv1_pb2.GetJobSubmittedTimestampResponse(
                    timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # context.abort() raises a plain Exception to terminate the RPC, so
        # the NOT_FOUND abort must live outside the try block; otherwise the
        # broad handler above would re-abort and overwrite it with INTERNAL.
        context.abort(grpc.StatusCode.NOT_FOUND, f'Job {job_id} not found')

    def GetJobEndedTimestamp(  # type: ignore[return]
        self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
        """Returns the end timestamp of a job.

        Falls back to the latest job when the request has no ``job_id``.
        Aborts with ``NOT_FOUND`` when no timestamp is available and with
        ``INTERNAL`` on any unexpected error.
        """
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, True)
            if timestamp is not None:
                return jobsv1_pb2.GetJobEndedTimestampResponse(
                    timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # See GetJobSubmittedTimestamp: aborting with NOT_FOUND inside the try
        # block would be swallowed and replaced by an INTERNAL abort.
        context.abort(grpc.StatusCode.NOT_FOUND,
                      f'Job {job_id} not found or not ended')

    def GetLogDirsForJobs(  # type: ignore[return]
        self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
        """Returns the log directories of the requested jobs.

        When no job ids are given, the latest job (if any) is used.
        """
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            job_log_dirs = job_lib.get_job_log_dirs(job_ids)
            return jobsv1_pb2.GetLogDirsForJobsResponse(
                job_log_dirs=job_log_dirs)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
|
sky/skylet/skylet.py
CHANGED
|
@@ -9,6 +9,7 @@ import grpc
|
|
|
9
9
|
import sky
|
|
10
10
|
from sky import sky_logging
|
|
11
11
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
12
|
+
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
12
13
|
from sky.skylet import constants
|
|
13
14
|
from sky.skylet import events
|
|
14
15
|
from sky.skylet import services
|
|
@@ -50,6 +51,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
|
|
|
50
51
|
autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
|
|
51
52
|
services.AutostopServiceImpl(), server)
|
|
52
53
|
|
|
54
|
+
jobsv1_pb2_grpc.add_JobsServiceServicer_to_server(
|
|
55
|
+
services.JobsServiceImpl(), server)
|
|
56
|
+
|
|
53
57
|
listen_addr = f'127.0.0.1:{port}'
|
|
54
58
|
server.add_insecure_port(listen_addr)
|
|
55
59
|
|
sky/task.py
CHANGED
|
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
|
|
|
20
20
|
from sky.serve import service_spec
|
|
21
21
|
from sky.skylet import constants
|
|
22
22
|
from sky.utils import common_utils
|
|
23
|
+
from sky.utils import git
|
|
23
24
|
from sky.utils import registry
|
|
24
25
|
from sky.utils import schemas
|
|
25
26
|
from sky.utils import ux_utils
|
|
@@ -1596,6 +1597,67 @@ class Task:
|
|
|
1596
1597
|
d[k] = v
|
|
1597
1598
|
return d
|
|
1598
1599
|
|
|
1600
|
+
def update_workdir(self, workdir: Optional[str], git_url: Optional[str],
                   git_ref: Optional[str]) -> 'Task':
    """Overrides the task workdir with a local path or git settings.

    If the current workdir is unset or a plain string, ``workdir`` takes
    precedence; otherwise a new git workdir is created from ``git_url``
    (``git_ref`` alone is ignored). If the current workdir is already a
    mapping, only its ``url``/``ref`` entries are updated and ``workdir``
    is ignored.

    Args:
        workdir: The workdir to update.
        git_url: The git url to update.
        git_ref: The git ref to update.

    Returns:
        The task itself, to allow chaining.
    """
    is_plain_workdir = self.workdir is None or isinstance(self.workdir, str)

    if not is_plain_workdir:
        # Existing git-style workdir: patch the provided fields in place.
        if git_url is not None:
            self.workdir['url'] = git_url
        if git_ref is not None:
            self.workdir['ref'] = git_ref
        return self

    if workdir is not None:
        self.workdir = workdir
    elif git_url is not None:
        # A ref is only meaningful together with a url.
        git_workdir = {'url': git_url}
        if git_ref is not None:
            git_workdir['ref'] = git_ref
        self.workdir = git_workdir
    return self
|
|
1625
|
+
|
|
1626
|
+
def update_envs_and_secrets_from_workdir(self) -> 'Task':
    """Populates git-related envs and secrets from a git workdir.

    Does nothing unless the workdir is a mapping. Otherwise the clone URL
    and, depending on the detected ref type, the commit/branch/tag are
    stored in ``self.envs``; a token and/or SSH key from the clone info are
    stored in ``self.secrets``.

    Returns:
        The task itself, to allow chaining.

    Raises:
        ValueError: If a ``GitError`` is raised while inspecting the repo.
    """
    # An unset workdir or a plain local path carries no git metadata.
    if not isinstance(self.workdir, dict):
        return self

    repo_url = self.workdir['url']
    git_ref = self.workdir.get('ref', '')
    access_token = os.environ.get(git.GIT_TOKEN_ENV_VAR)
    key_path = os.environ.get(git.GIT_SSH_KEY_PATH_ENV_VAR)
    try:
        repo = git.GitRepo(repo_url, git_ref, access_token, key_path)
        clone_info = repo.get_repo_clone_info()
        if clone_info is None:
            return self

        self.envs[git.GIT_URL_ENV_VAR] = clone_info.url
        if git_ref:
            # Exactly one of commit/branch/tag is exported for a known type.
            kind = repo.get_ref_type()
            if kind == git.GitRefType.COMMIT:
                self.envs[git.GIT_COMMIT_HASH_ENV_VAR] = git_ref
            elif kind == git.GitRefType.BRANCH:
                self.envs[git.GIT_BRANCH_ENV_VAR] = git_ref
            elif kind == git.GitRefType.TAG:
                self.envs[git.GIT_TAG_ENV_VAR] = git_ref

        # Credentials go into secrets rather than plain envs.
        if clone_info.token is not None:
            self.secrets[git.GIT_TOKEN_ENV_VAR] = clone_info.token
        if clone_info.ssh_key is not None:
            self.secrets[git.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
    except exceptions.GitError as e:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(str(e)) from None
    return self
|
|
1660
|
+
|
|
1599
1661
|
def to_yaml_config(self,
|
|
1600
1662
|
use_user_specified_yaml: bool = False) -> Dict[str, Any]:
|
|
1601
1663
|
"""Returns a yaml-style dict representation of the task.
|
|
@@ -654,8 +654,125 @@ available_node_types:
|
|
|
654
654
|
# after v0.11.0 release.
|
|
655
655
|
touch /tmp/apt_ssh_setup_started
|
|
656
656
|
|
|
657
|
-
|
|
658
|
-
|
|
657
|
+
# Helper: run apt-get update with retries
|
|
658
|
+
apt_update_with_retries() {
|
|
659
|
+
# do not fail the whole shell; we handle return codes
|
|
660
|
+
set +e
|
|
661
|
+
local log=/tmp/apt-update.log
|
|
662
|
+
local tries=3
|
|
663
|
+
local delay=1
|
|
664
|
+
local i
|
|
665
|
+
for i in $(seq 1 $tries); do
|
|
666
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
|
|
667
|
+
echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
|
|
668
|
+
sleep $delay
|
|
669
|
+
delay=$((delay * 2))
|
|
670
|
+
done
|
|
671
|
+
set -e
|
|
672
|
+
return 1
|
|
673
|
+
}
|
|
674
|
+
apt_install_with_retries() {
|
|
675
|
+
local packages="$@"
|
|
676
|
+
[ -z "$packages" ] && return 0
|
|
677
|
+
set +e
|
|
678
|
+
local log=/tmp/apt-update.log
|
|
679
|
+
local tries=3
|
|
680
|
+
local delay=1
|
|
681
|
+
local i
|
|
682
|
+
for i in $(seq 1 $tries); do
|
|
683
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
|
|
684
|
+
echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
|
|
685
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
|
|
686
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
|
|
687
|
+
sleep $delay
|
|
688
|
+
delay=$((delay * 2))
|
|
689
|
+
done
|
|
690
|
+
set -e
|
|
691
|
+
return 1
|
|
692
|
+
}
|
|
693
|
+
apt_update_install_with_retries() {
|
|
694
|
+
apt_update_with_retries
|
|
695
|
+
apt_install_with_retries "$@"
|
|
696
|
+
}
|
|
697
|
+
backup_dir=/etc/apt/sources.list.backup_skypilot
|
|
698
|
+
backup_source() {
|
|
699
|
+
$(prefix_cmd) mkdir -p "$backup_dir"
|
|
700
|
+
if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
|
|
701
|
+
$(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
|
|
702
|
+
fi
|
|
703
|
+
}
|
|
704
|
+
restore_source() {
|
|
705
|
+
if [ -f "$backup_dir/sources.list" ]; then
|
|
706
|
+
$(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
|
|
707
|
+
fi
|
|
708
|
+
}
|
|
709
|
+
update_apt_sources() {
|
|
710
|
+
local host=$1
|
|
711
|
+
local apt_file=$2
|
|
712
|
+
$(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
|
|
713
|
+
}
|
|
714
|
+
# Helper: install packages across mirrors with retries
|
|
715
|
+
apt_install_with_mirrors() {
|
|
716
|
+
local required=$1; shift
|
|
717
|
+
local packages="$@"
|
|
718
|
+
[ -z "$packages" ] && return 0
|
|
719
|
+
set +e
|
|
720
|
+
# Install packages with default sources first
|
|
721
|
+
local log=/tmp/apt-update.log
|
|
722
|
+
echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
|
|
723
|
+
restore_source
|
|
724
|
+
apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
|
|
725
|
+
echo "Install failed with default sources: $packages" >> "$log"
|
|
726
|
+
# Detect distro (ubuntu/debian)
|
|
727
|
+
local APT_OS="unknown"
|
|
728
|
+
if [ -f /etc/os-release ]; then
|
|
729
|
+
. /etc/os-release
|
|
730
|
+
case "$ID" in
|
|
731
|
+
debian) APT_OS="debian" ;;
|
|
732
|
+
ubuntu) APT_OS="ubuntu" ;;
|
|
733
|
+
*)
|
|
734
|
+
if [ -n "$ID_LIKE" ]; then
|
|
735
|
+
case " $ID $ID_LIKE " in
|
|
736
|
+
*ubuntu*) APT_OS="ubuntu" ;;
|
|
737
|
+
*debian*) APT_OS="debian" ;;
|
|
738
|
+
esac
|
|
739
|
+
fi
|
|
740
|
+
;;
|
|
741
|
+
esac
|
|
742
|
+
fi
|
|
743
|
+
# Build mirror candidates
|
|
744
|
+
# deb.debian.org is a CDN endpoint, if one backend goes down,
|
|
745
|
+
# the CDN automatically fails over to another mirror,
|
|
746
|
+
# so we only retry for ubuntu here.
|
|
747
|
+
if [ "$APT_OS" = "ubuntu" ]; then
|
|
748
|
+
# Backup current sources once
|
|
749
|
+
backup_source
|
|
750
|
+
# Selected from https://launchpad.net/ubuntu/+archivemirrors
|
|
751
|
+
# and results from apt-select
|
|
752
|
+
local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
|
|
753
|
+
for host in $MIRROR_CANDIDATES; do
|
|
754
|
+
echo "Trying APT mirror ($APT_OS): $host" >> "$log"
|
|
755
|
+
if [ -f /etc/apt/sources.list ]; then
|
|
756
|
+
update_apt_sources $host /etc/apt/sources.list
|
|
757
|
+
else
|
|
758
|
+
echo "Error: /etc/apt/sources.list not found" >> "$log"
|
|
759
|
+
break
|
|
760
|
+
fi
|
|
761
|
+
apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
|
|
762
|
+
echo "Install failed with mirror ($APT_OS): $host" >> "$log"
|
|
763
|
+
# Restore to default sources
|
|
764
|
+
restore_source
|
|
765
|
+
done
|
|
766
|
+
fi
|
|
767
|
+
set -e
|
|
768
|
+
if [ "$required" = "1" ]; then
|
|
769
|
+
echo "Error: required package install failed across all mirrors: $packages" >> "$log"
|
|
770
|
+
return 1
|
|
771
|
+
else
|
|
772
|
+
echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
|
|
773
|
+
return 0
|
|
774
|
+
fi
|
|
775
|
+
}
|
|
659
776
|
# Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
|
|
660
777
|
# so that both fusemount and fusermount3 can be masked before enabling SSH access.
|
|
661
778
|
PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
|
|
@@ -682,7 +799,7 @@ available_node_types:
|
|
|
682
799
|
done;
|
|
683
800
|
if [ ! -z "$INSTALL_FIRST" ]; then
|
|
684
801
|
echo "Installing core packages: $INSTALL_FIRST";
|
|
685
|
-
|
|
802
|
+
apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
|
|
686
803
|
fi;
|
|
687
804
|
# SSH and other packages are not necessary, so we disable set -e
|
|
688
805
|
set +e
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
cluster_name: {{ cluster_name_on_cloud }}
|
|
2
|
+
|
|
3
|
+
max_workers: {{ num_nodes - 1 }}
|
|
4
|
+
upscaling_speed: {{ num_nodes - 1 }}
|
|
5
|
+
idle_timeout_minutes: 5
|
|
6
|
+
|
|
7
|
+
provider:
|
|
8
|
+
type: external
|
|
9
|
+
module: sky.provision.seeweb
|
|
10
|
+
region: "{{ region }}"
|
|
11
|
+
|
|
12
|
+
auth:
|
|
13
|
+
ssh_user: ecuser
|
|
14
|
+
ssh_private_key: {{ ssh_private_key }}
|
|
15
|
+
|
|
16
|
+
available_node_types:
|
|
17
|
+
ray_head_default:
|
|
18
|
+
resources: {}
|
|
19
|
+
node_config:
|
|
20
|
+
plan: {{ instance_type }}
|
|
21
|
+
image: {{ image_id }}
|
|
22
|
+
location: {{ region }}
|
|
23
|
+
{% if seeweb_gpu_config is not none %}
|
|
24
|
+
gpu: {{ seeweb_gpu_config.gpu }}
|
|
25
|
+
gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
|
|
26
|
+
{% endif %}
|
|
27
|
+
disk: {{ disk_size }}
|
|
28
|
+
|
|
29
|
+
head_node_type: ray_head_default
|
|
30
|
+
|
|
31
|
+
file_mounts: {
|
|
32
|
+
"~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
|
|
33
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
|
34
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
35
|
+
{%- for remote_path, local_path in credentials.items() %}
|
|
36
|
+
"{{remote_path}}": "{{local_path}}",
|
|
37
|
+
{%- endfor %}
|
|
38
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
rsync_exclude: []
|
|
42
|
+
|
|
43
|
+
setup_commands:
|
|
44
|
+
- |
|
|
45
|
+
touch ~/.bashrc;
|
|
46
|
+
echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
|
|
47
|
+
echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
|
|
48
|
+
sudo systemctl stop unattended-upgrades || true;
|
|
49
|
+
sudo systemctl disable unattended-upgrades || true;
|
|
50
|
+
sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
|
|
51
|
+
{{ conda_installation_commands }}
|
|
52
|
+
{{ ray_skypilot_installation_commands }}
|
|
53
|
+
|
|
54
|
+
head_start_ray_commands:
|
|
55
|
+
- |
|
|
56
|
+
# Start the Ray head node, retrying up to $max times with a 5s pause.
# Returns 0 once `ray start` succeeds; exits the script with status 1 when
# every attempt failed.
retry_ray() {
  local n=0; local max=30
  until [ $n -ge $max ]; do
    # Default to no GPUs; overridden below when nvidia-smi is available.
    export SKYPILOT_NUM_GPUS=0
    command -v nvidia-smi >/dev/null 2>&1 && \
      SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)

    ray stop || true
    RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
    ray start --disable-usage-stats --head \
      --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
      --object-manager-port=8076 \
      --autoscaling-config=~/ray_bootstrap_config.yaml \
      --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break

    echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
    sleep 5
  done
  # Use an explicit if/return: a trailing `[ ... ] && { ...; }` evaluates to
  # false on the success path and would make the function return status 1.
  if [ $n -ge $max ]; then
    echo "Ray head failed"
    exit 1
  fi
  return 0
}
|
|
76
|
+
retry_ray
|
|
77
|
+
|
|
78
|
+
worker_start_ray_commands:
|
|
79
|
+
- |
|
|
80
|
+
# Start a Ray worker and join the head node, retrying up to $max times with
# a 5s pause. Returns 0 once `ray start` succeeds; exits the script with
# status 1 when every attempt failed.
retry_ray() {
  local n=0; local max=30
  until [ $n -ge $max ]; do
    # Default to no GPUs; overridden below when nvidia-smi is available.
    SKYPILOT_NUM_GPUS=0
    command -v nvidia-smi >/dev/null 2>&1 && \
      SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)

    ray stop || true
    RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
    ray start --disable-usage-stats \
      --address=$RAY_HEAD_IP:{{ ray_port }} \
      --object-manager-port=8076 \
      --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break

    echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
    sleep 5
  done
  # Use an explicit if/return: a trailing `[ ... ] && { ...; }` evaluates to
  # false on the success path and would make the function return status 1.
  if [ $n -ge $max ]; then
    echo "Ray worker failed"
    exit 1
  fi
  return 0
}
|
|
99
|
+
retry_ray
|
|
100
|
+
|
|
101
|
+
head_node: {}
|
|
102
|
+
worker_nodes: {}
|
|
103
|
+
|
|
104
|
+
head_setup_commands: []
|
|
105
|
+
worker_setup_commands: []
|
|
106
|
+
|
|
107
|
+
cluster_synced_files: []
|
|
108
|
+
file_mounts_sync_continuously: False
|
|
@@ -107,10 +107,12 @@ def canonicalize_accelerator_name(accelerator: str,
|
|
|
107
107
|
if not names and cloud_str in ['Kubernetes', None]:
|
|
108
108
|
with rich_utils.safe_status(
|
|
109
109
|
ux_utils.spinner_message('Listing accelerators on Kubernetes')):
|
|
110
|
+
# Only search for Kubernetes to reduce the lookup cost.
|
|
111
|
+
# For other clouds, the catalog has been searched in previous steps.
|
|
110
112
|
searched = catalog.list_accelerators(
|
|
111
113
|
name_filter=accelerator,
|
|
112
114
|
case_sensitive=False,
|
|
113
|
-
clouds=
|
|
115
|
+
clouds='Kubernetes',
|
|
114
116
|
)
|
|
115
117
|
names = list(searched.keys())
|
|
116
118
|
if accelerator in names:
|