skypilot-nightly 1.0.0.dev20250912__py3-none-any.whl → 1.0.0.dev20250914__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +24 -9
- sky/backends/cloud_vm_ray_backend.py +382 -151
- sky/catalog/data_fetchers/fetch_aws.py +0 -36
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-e8a0c4c3c6f408fb.js → webpack-e2e3d2d3de7d43e5.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +41 -26
- sky/jobs/utils.py +61 -13
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +2 -1
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +3 -3
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/cluster_utils.py +6 -2
- sky/utils/controller_utils.py +11 -5
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/METADATA +39 -34
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/RECORD +65 -54
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py
CHANGED
@@ -31,8 +31,11 @@ from sky.utils.db import db_utils
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -220,6 +223,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+
 
 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
@@ -475,6 +517,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:
 
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
     assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
    # affect the caller.
@@ -482,10 +529,52 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(
+                job_id=job['job_id'],
+                job_name=job['job_name'],
+                username=job['username'],
+                submitted_at=job['submitted_at'],
+                status=job['status'].to_protobuf(),
+                run_timestamp=job['run_timestamp'],
+                start_at=job['start_at']
+                if job['start_at'] is not None else -1.0,
+                end_at=job['end_at'] if job['end_at'] is not None else 0.0,
+                resources=job['resources'] or '',
+                pid=job['pid'],
+                log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                      job['run_timestamp']),
+                metadata=json.dumps(job['metadata'])))
+    return jobs_info
 
 
 def load_statuses_payload(
@@ -527,13 +616,24 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     `format_job_queue()`), because the job may stay in PENDING if the cluster is
     busy.
     """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
+    """
     assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                               (job_id,))
     for (timestamp,) in rows:
-        return
-    return
+        return timestamp
+    return None
 
 
 def get_ray_port():
@@ -947,6 +1047,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -1010,7 +1117,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
             cancelled_ids.append(job['job_id'])
 
     scheduler.schedule_step()
-    return
+    return cancelled_ids
 
 
 @init_db
@@ -1030,6 +1137,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
 
 @init_db
 def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
     """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1156,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    job_to_dir = {}
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
         if row[JobInfoLoc.LOG_PATH.value]:
-            job_to_dir[
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
         else:
             run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-            job_to_dir[
-
-    return
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir
 
 
 class JobLibCodeGen:
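The new protobuf conversion helpers on JobStatus are pure functions of the enum, so they can be sanity-checked without a cluster. Below is a minimal sketch (not part of the wheel diff) that exercises from_protobuf/to_protobuf using only names that appear in the hunks above; it assumes the generated sky.schemas.generated.jobsv1_pb2 module is importable.

# Sketch only: round-trip the JobStatus <-> jobsv1_pb2.JobStatus mapping added above.
from sky.schemas.generated import jobsv1_pb2
from sky.skylet import job_lib

status = job_lib.JobStatus.RUNNING
pb_value = status.to_protobuf()  # jobsv1_pb2.JOB_STATUS_RUNNING
assert job_lib.JobStatus.from_protobuf(pb_value) is status

# Per the mapping above, JOB_STATUS_UNSPECIFIED converts to None instead of
# raising, so callers can treat "no status" uniformly.
assert job_lib.JobStatus.from_protobuf(
    jobsv1_pb2.JOB_STATUS_UNSPECIFIED) is None

The same pattern in this file, a raw function plus a thin *_payload wrapper that calls message_utils.encode_payload, is applied to get_statuses, get_job_submitted_or_ended_timestamp, cancel_jobs, and get_job_log_dirs, so the gRPC service can reuse the structured results while the legacy codegen path keeps returning encoded strings.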
sky/skylet/log_lib.py
CHANGED
@@ -406,9 +406,9 @@ def _follow_job_logs(file,
                 wait_last_logs = False
                 continue
             status_str = status.value if status is not None else 'None'
-
-                f'Job finished (status: {status_str}).')
-
+            finish = ux_utils.finishing_message(
+                f'Job finished (status: {status_str}).')
+            yield finish + '\n'
             return
 
         time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
sky/skylet/services.py
CHANGED
@@ -1,11 +1,19 @@
 """gRPC service implementations for skylet."""
 
+import os
+
 import grpc
 
 from sky import sky_logging
+from sky.jobs import state as managed_job_state
 from sky.schemas.generated import autostopv1_pb2
 from sky.schemas.generated import autostopv1_pb2_grpc
+from sky.schemas.generated import jobsv1_pb2
+from sky.schemas.generated import jobsv1_pb2_grpc
+from sky.serve import serve_state
 from sky.skylet import autostop_lib
+from sky.skylet import constants
+from sky.skylet import job_lib
 
 logger = sky_logging.init_logger(__name__)
 
@@ -42,3 +50,198 @@ class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
                 is_autostopping=is_autostopping)
         except Exception as e:  # pylint: disable=broad-except
             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+
+class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
+    """Implementation of the JobsService gRPC service."""
+
+    def AddJob(  # type: ignore[return]
+            self, request: jobsv1_pb2.AddJobRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
+        try:
+            job_name = request.job_name if request.HasField('job_name') else '-'
+            job_id, log_dir = job_lib.add_job(job_name, request.username,
+                                              request.run_timestamp,
+                                              request.resources_str,
+                                              request.metadata)
+            return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def QueueJob(  # type: ignore[return]
+            self, request: jobsv1_pb2.QueueJobRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
+        try:
+            job_id = request.job_id
+            # Create log directory and file
+            remote_log_dir = os.path.expanduser(request.remote_log_dir)
+            os.makedirs(remote_log_dir, exist_ok=True)
+            remote_log_path = os.path.join(remote_log_dir, 'run.log')
+            open(remote_log_path, 'a').close()  # pylint: disable=unspecified-encoding
+
+            script_path = os.path.expanduser(request.script_path)
+            os.makedirs(os.path.dirname(script_path), exist_ok=True)
+
+            # If `codegen` is not provided, assume script is already
+            # uploaded to `script_path` via rsync.
+            if request.HasField('codegen'):
+                with open(script_path, 'w', encoding='utf-8') as f:
+                    f.write(request.codegen)
+            os.chmod(script_path, 0o755)
+
+            cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
+            job_submit_cmd = (
+                # JOB_CMD_IDENTIFIER is used for identifying the process
+                # retrieved with pid is the same driver process.
+                f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
+                f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+                # Do not use &>, which is not POSIX and may not work.
+                # Note that the order of ">filename 2>&1" matters.
+                f' > {remote_log_path} 2>&1')
+            job_lib.scheduler.queue(job_id, job_submit_cmd)
+
+            if request.HasField('managed_job'):
+                managed_job = request.managed_job
+                pool = managed_job.pool if managed_job.HasField(
+                    'pool') else None
+                pool_hash = None
+                if pool is not None:
+                    pool_hash = serve_state.get_service_hash(pool)
+                # Add the managed job to job queue database.
+                managed_job_state.set_job_info(job_id, managed_job.name,
+                                               managed_job.workspace,
+                                               managed_job.entrypoint, pool,
+                                               pool_hash)
+                # Set the managed job to PENDING state to make sure that
+                # this managed job appears in the `sky jobs queue`, even
+                # if it needs to wait to be submitted.
+                # We cannot set the managed job to PENDING state in the
+                # job template (jobs-controller.yaml.j2), as it may need
+                # to wait for the run commands to be scheduled on the job
+                # controller in high-load cases.
+                for task in managed_job.tasks:
+                    managed_job_state.set_pending(job_id, task.task_id,
+                                                  task.name, task.resources_str,
+                                                  task.metadata_json)
+            return jobsv1_pb2.QueueJobResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def UpdateStatus(  # type: ignore[return]
+            self, request: jobsv1_pb2.UpdateStatusRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
+        try:
+            job_lib.update_status()
+            return jobsv1_pb2.UpdateStatusResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobQueue(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobQueueRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
+        try:
+            user_hash = request.user_hash if request.HasField(
+                'user_hash') else None
+            all_jobs = request.all_jobs
+            jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
+                                              all_jobs=all_jobs)
+            return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def CancelJobs(  # type: ignore[return]
+            self, request: jobsv1_pb2.CancelJobsRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
+        try:
+            job_ids = list(request.job_ids) if request.job_ids else []
+            user_hash = request.user_hash if request.HasField(
+                'user_hash') else None
+            cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
+                                                    user_hash)
+            return jobsv1_pb2.CancelJobsResponse(
+                cancelled_job_ids=cancelled_job_ids)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def FailAllInProgressJobs(  # type: ignore[return]
+            self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
+        try:
+            job_lib.fail_all_jobs_in_progress()
+            return jobsv1_pb2.FailAllInProgressJobsResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def TailLogs(
+            self,
+            request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
+            context: grpc.ServicerContext):
+        # TODO(kevin): implement this
+        raise NotImplementedError('TailLogs is not implemented')
+
+    def GetJobStatus(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobStatusRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
+        try:
+            if request.job_ids:
+                job_ids = list(request.job_ids)
+            else:
+                latest_job_id = job_lib.get_latest_job_id()
+                job_ids = [latest_job_id] if latest_job_id is not None else []
+            job_statuses = job_lib.get_statuses(job_ids)
+            for job_id, status in job_statuses.items():
+                job_statuses[job_id] = job_lib.JobStatus(status).to_protobuf(
+                ) if status is not None else jobsv1_pb2.JOB_STATUS_UNSPECIFIED
+            return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobSubmittedTimestamp(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
+                job_id, False)
+            if timestamp is None:
+                context.abort(grpc.StatusCode.NOT_FOUND,
+                              f'Job {job_id} not found')
+            return jobsv1_pb2.GetJobSubmittedTimestampResponse(
+                timestamp=timestamp)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobEndedTimestamp(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
+                job_id, True)
+            if timestamp is None:
+                context.abort(grpc.StatusCode.NOT_FOUND,
+                              f'Job {job_id} not found or not ended')
+            return jobsv1_pb2.GetJobEndedTimestampResponse(timestamp=timestamp)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetLogDirsForJobs(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
+        try:
+            if request.job_ids:
+                job_ids = list(request.job_ids)
+            else:
+                latest_job_id = job_lib.get_latest_job_id()
+                job_ids = [latest_job_id] if latest_job_id is not None else []
+            job_log_dirs = job_lib.get_job_log_dirs(job_ids)
+            return jobsv1_pb2.GetLogDirsForJobsResponse(
+                job_log_dirs=job_log_dirs)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
sky/skylet/skylet.py
CHANGED
@@ -9,6 +9,7 @@ import grpc
 import sky
 from sky import sky_logging
 from sky.schemas.generated import autostopv1_pb2_grpc
+from sky.schemas.generated import jobsv1_pb2_grpc
 from sky.skylet import constants
 from sky.skylet import events
 from sky.skylet import services
@@ -50,6 +51,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
     autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
         services.AutostopServiceImpl(), server)
 
+    jobsv1_pb2_grpc.add_JobsServiceServicer_to_server(
+        services.JobsServiceImpl(), server)
+
     listen_addr = f'127.0.0.1:{port}'
     server.add_insecure_port(listen_addr)
 
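Taken together, services.JobsServiceImpl and this registration expose the job table over skylet's local gRPC endpoint. The snippet below is a hedged illustration (not from the diff) of how a co-located client could query the job queue; the JobsServiceStub name follows the usual grpcio codegen convention for the JobsService registered above, and the request/response fields mirror GetJobQueue in services.py.

# Sketch only: query the new JobsService from the same node as skylet.
import grpc

from sky.schemas.generated import jobsv1_pb2
from sky.schemas.generated import jobsv1_pb2_grpc
from sky.skylet import constants

# start_grpc_server() binds to 127.0.0.1 on constants.SKYLET_GRPC_PORT.
channel = grpc.insecure_channel(f'127.0.0.1:{constants.SKYLET_GRPC_PORT}')
stub = jobsv1_pb2_grpc.JobsServiceStub(channel)

# Mirrors JobsServiceImpl.GetJobQueue: all_jobs=True lists finished jobs too.
response = stub.GetJobQueue(jobsv1_pb2.GetJobQueueRequest(all_jobs=True))
for job in response.jobs:
    print(job.job_id, job.job_name, job.status, job.log_path)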
sky/templates/seeweb-ray.yml.j2
ADDED
@@ -0,0 +1,108 @@
+cluster_name: {{ cluster_name_on_cloud }}
+
+max_workers: {{ num_nodes - 1 }}
+upscaling_speed: {{ num_nodes - 1 }}
+idle_timeout_minutes: 5
+
+provider:
+  type: external
+  module: sky.provision.seeweb
+  region: "{{ region }}"
+
+auth:
+  ssh_user: ecuser
+  ssh_private_key: {{ ssh_private_key }}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      plan: {{ instance_type }}
+      image: {{ image_id }}
+      location: {{ region }}
+      {% if seeweb_gpu_config is not none %}
+      gpu: {{ seeweb_gpu_config.gpu }}
+      gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
+      {% endif %}
+      disk: {{ disk_size }}
+
+head_node_type: ray_head_default
+
+file_mounts: {
+  "~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
+}
+
+rsync_exclude: []
+
+setup_commands:
+  - |
+    touch ~/.bashrc;
+    echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
+    echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+
+head_start_ray_commands:
+  - |
+    retry_ray() {
+      local n=0; local max=30
+      until [ $n -ge $max ]; do
+        export SKYPILOT_NUM_GPUS=0
+        command -v nvidia-smi >/dev/null 2>&1 && \
+          SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+        ray stop || true
+        RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+        ray start --disable-usage-stats --head \
+          --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
+          --object-manager-port=8076 \
+          --autoscaling-config=~/ray_bootstrap_config.yaml \
+          --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+        echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
+        sleep 5
+      done
+      [ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
+    }
+    retry_ray
+
+worker_start_ray_commands:
+  - |
+    retry_ray() {
+      local n=0; local max=30
+      until [ $n -ge $max ]; do
+        SKYPILOT_NUM_GPUS=0
+        command -v nvidia-smi >/dev/null 2>&1 && \
+          SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+        ray stop || true
+        RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+        ray start --disable-usage-stats \
+          --address=$RAY_HEAD_IP:{{ ray_port }} \
+          --object-manager-port=8076 \
+          --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+        echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
+        sleep 5
+      done
+      [ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
+    }
+    retry_ray
+
+head_node: {}
+worker_nodes: {}
+
+head_setup_commands: []
+worker_setup_commands: []
+
+cluster_synced_files: []
+file_mounts_sync_continuously: False
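The template above is ordinary Jinja2, so it can be rendered standalone to inspect the Ray cluster YAML SkyPilot would generate for Seeweb. The sketch below (not part of the diff) does exactly that; every value passed to render() is a placeholder chosen for illustration, and only the variable names are taken from the template itself.

# Sketch only: render sky/templates/seeweb-ray.yml.j2 with placeholder values.
import jinja2

with open('sky/templates/seeweb-ray.yml.j2', encoding='utf-8') as f:
    template = jinja2.Template(f.read())

rendered = template.render(
    cluster_name_on_cloud='sky-demo',
    num_nodes=2,
    region='placeholder-region',
    ssh_private_key='~/.ssh/sky-key',
    instance_type='placeholder-plan',
    image_id='placeholder-image',
    seeweb_gpu_config=None,           # GPU block is skipped when None
    disk_size=256,
    sky_ray_yaml_remote_path='~/.sky/sky_ray.yml',
    sky_ray_yaml_local_path='/tmp/sky_ray.yml',
    sky_remote_path='~/.sky/wheels',
    sky_wheel_hash='deadbeef',
    sky_local_path='/tmp/wheels',
    credentials={},
    conda_installation_commands='',
    ray_skypilot_installation_commands='',
    ray_port=6380,
    ray_dashboard_port=8266,
    ray_temp_dir='/tmp/ray_skypilot',
)
print(rendered)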
sky/utils/cluster_utils.py
CHANGED
@@ -144,6 +144,9 @@ class SSHConfigHelper(object):
             username = docker_user
 
         key_path = cls.generate_local_key_file(cluster_name, auth_config)
+        # Keep the unexpanded path for SSH config (with ~)
+        key_path_for_config = key_path
+        # Expand the path for internal operations that need absolute path
         key_path = os.path.expanduser(key_path)
         sky_autogen_comment = ('# Added by sky (use `sky stop/down '
                                f'{cluster_name}` to remove)')
@@ -208,8 +211,9 @@ class SSHConfigHelper(object):
             node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
             # TODO(romilb): Update port number when k8s supports multinode
             codegen += cls._get_generated_config(
-                sky_autogen_comment, node_name, ip, username,
-                proxy_command, port,
+                sky_autogen_comment, node_name, ip, username,
+                key_path_for_config, proxy_command, port,
+                docker_proxy_command) + '\n'
 
         cluster_config_path = os.path.expanduser(
             cls.ssh_cluster_path.format(cluster_name))
sky/utils/controller_utils.py
CHANGED
@@ -228,15 +228,21 @@ def get_controller_for_pool(pool: bool) -> Controllers:
 def high_availability_specified(cluster_name: Optional[str]) -> bool:
     """Check if the controller high availability is specified in user config.
     """
-    # pylint: disable=import-outside-toplevel
-    from sky.jobs import utils as managed_job_utils
-    if managed_job_utils.is_consolidation_mode():
-        return True
-
     controller = Controllers.from_name(cluster_name)
     if controller is None:
         return False
 
+    if controller.value.controller_type == 'jobs':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.jobs import utils as managed_job_utils
+        if managed_job_utils.is_consolidation_mode():
+            return True
+    elif controller.value.controller_type == 'serve':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.serve import serve_utils
+        if serve_utils.is_consolidation_mode():
+            return True
+
     if skypilot_config.loaded():
         return skypilot_config.get_nested((controller.value.controller_type,
                                            'controller', 'high_availability'),