skypilot-nightly 1.0.0.dev20250912__py3-none-any.whl → 1.0.0.dev20250914__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (73)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +24 -9
  5. sky/backends/cloud_vm_ray_backend.py +382 -151
  6. sky/catalog/data_fetchers/fetch_aws.py +0 -36
  7. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  8. sky/catalog/seeweb_catalog.py +184 -0
  9. sky/clouds/__init__.py +2 -0
  10. sky/clouds/kubernetes.py +2 -0
  11. sky/clouds/seeweb.py +463 -0
  12. sky/core.py +46 -12
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  20. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/{webpack-e8a0c4c3c6f408fb.js → webpack-e2e3d2d3de7d43e5.js} +1 -1
  24. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  25. sky/dashboard/out/clusters/[cluster].html +1 -1
  26. sky/dashboard/out/clusters.html +1 -1
  27. sky/dashboard/out/config.html +1 -1
  28. sky/dashboard/out/index.html +1 -1
  29. sky/dashboard/out/infra/[context].html +1 -1
  30. sky/dashboard/out/infra.html +1 -1
  31. sky/dashboard/out/jobs/[job].html +1 -1
  32. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  33. sky/dashboard/out/jobs.html +1 -1
  34. sky/dashboard/out/users.html +1 -1
  35. sky/dashboard/out/volumes.html +1 -1
  36. sky/dashboard/out/workspace/new.html +1 -1
  37. sky/dashboard/out/workspaces/[name].html +1 -1
  38. sky/dashboard/out/workspaces.html +1 -1
  39. sky/exceptions.py +5 -0
  40. sky/global_user_state.py +41 -26
  41. sky/jobs/utils.py +61 -13
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/kubernetes/utils.py +14 -3
  44. sky/provision/seeweb/__init__.py +11 -0
  45. sky/provision/seeweb/config.py +13 -0
  46. sky/provision/seeweb/instance.py +806 -0
  47. sky/schemas/generated/jobsv1_pb2.py +86 -0
  48. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  49. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  50. sky/setup_files/dependencies.py +8 -1
  51. sky/skylet/constants.py +2 -1
  52. sky/skylet/job_lib.py +128 -10
  53. sky/skylet/log_lib.py +3 -3
  54. sky/skylet/services.py +203 -0
  55. sky/skylet/skylet.py +4 -0
  56. sky/templates/seeweb-ray.yml.j2 +108 -0
  57. sky/utils/cluster_utils.py +6 -2
  58. sky/utils/controller_utils.py +11 -5
  59. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/METADATA +39 -34
  60. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/RECORD +65 -54
  61. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  62. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  63. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +0 -6
  64. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  65. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  66. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  67. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  68. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  69. /sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_ssgManifest.js +0 -0
  70. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/WHEEL +0 -0
  71. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py CHANGED
@@ -31,8 +31,11 @@ from sky.utils.db import db_utils
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -220,6 +223,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+
 
 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
@@ -475,6 +517,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:
 
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
     assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
     # affect the caller.
@@ -482,10 +529,52 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return message_utils.encode_payload(statuses)
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(
+                job_id=job['job_id'],
+                job_name=job['job_name'],
+                username=job['username'],
+                submitted_at=job['submitted_at'],
+                status=job['status'].to_protobuf(),
+                run_timestamp=job['run_timestamp'],
+                start_at=job['start_at']
+                if job['start_at'] is not None else -1.0,
+                end_at=job['end_at'] if job['end_at'] is not None else 0.0,
+                resources=job['resources'] or '',
+                pid=job['pid'],
+                log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                      job['run_timestamp']),
+                metadata=json.dumps(job['metadata'])))
+    return jobs_info
 
 
 def load_statuses_payload(
@@ -527,13 +616,24 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     `format_job_queue()`), because the job may stay in PENDING if the cluster is
     busy.
     """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
+    """
     assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                               (job_id,))
     for (timestamp,) in rows:
-        return message_utils.encode_payload(timestamp)
-    return message_utils.encode_payload(None)
+        return timestamp
+    return None
 
 
 def get_ray_port():
@@ -947,6 +1047,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -1010,7 +1117,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
             cancelled_ids.append(job['job_id'])
 
     scheduler.schedule_step()
-    return message_utils.encode_payload(cancelled_ids)
+    return cancelled_ids
 
 
 @init_db
@@ -1030,6 +1137,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
 
 @init_db
 def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
     """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1156,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    job_to_dir = {}
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
         if row[JobInfoLoc.LOG_PATH.value]:
-            job_to_dir[str(job_id)] = row[JobInfoLoc.LOG_PATH.value]
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
         else:
             run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-            job_to_dir[str(job_id)] = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                                   run_timestamp)
-    return message_utils.encode_payload(job_to_dir)
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir
 
 
 class JobLibCodeGen:
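A pattern worth noting across these hunks: each encoded `*_payload` helper becomes a thin wrapper over a new function that returns plain Python values, which the gRPC servicer in services.py (below) calls directly, skipping the encode/decode round-trip. A minimal sketch of the two layers plus the new protobuf enum round-trip, assuming the usual import paths, an initialized jobs DB, and hypothetical job IDs:

    from sky.skylet import job_lib
    from sky.utils import message_utils

    # Raw variant: plain dict keyed by int job id, consumed in-process by
    # the gRPC servicer (job IDs 1 and 2 are hypothetical).
    statuses = job_lib.get_statuses([1, 2])

    # Payload variant: same data, now just an encoding shim for callers
    # that still expect the encoded string.
    decoded = message_utils.decode_payload(job_lib.get_statuses_payload([1, 2]))

    # JobStatus <-> protobuf enum round-trip added in this diff.
    pb = job_lib.JobStatus.RUNNING.to_protobuf()  # jobsv1_pb2.JOB_STATUS_RUNNING
    assert job_lib.JobStatus.from_protobuf(pb) is job_lib.JobStatus.RUNNING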
sky/skylet/log_lib.py CHANGED
@@ -406,9 +406,9 @@ def _follow_job_logs(file,
                     wait_last_logs = False
                     continue
                 status_str = status.value if status is not None else 'None'
-                print(ux_utils.finishing_message(
-                    f'Job finished (status: {status_str}).'),
-                      flush=True)
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
                 return

             time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
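Since `_follow_job_logs` is a generator, the fix routes the finishing message through the iterator instead of printing it on the server process's stdout, so any consumer of the stream receives it as a normal line. A hedged sketch of the consuming side; only the `file` parameter is visible in this hunk, so the second argument is an assumption:

    from sky.skylet import log_lib

    # Hypothetical job log file; job id 1 is made up.
    with open('/tmp/run.log', encoding='utf-8') as f:
        for line in log_lib._follow_job_logs(f, 1):  # pylint: disable=protected-access
            # The finishing message is now yielded here rather than printed.
            print(line, end='', flush=True)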
sky/skylet/services.py CHANGED
@@ -1,11 +1,19 @@
 """gRPC service implementations for skylet."""
 
+import os
+
 import grpc
 
 from sky import sky_logging
+from sky.jobs import state as managed_job_state
 from sky.schemas.generated import autostopv1_pb2
 from sky.schemas.generated import autostopv1_pb2_grpc
+from sky.schemas.generated import jobsv1_pb2
+from sky.schemas.generated import jobsv1_pb2_grpc
+from sky.serve import serve_state
 from sky.skylet import autostop_lib
+from sky.skylet import constants
+from sky.skylet import job_lib
 
 logger = sky_logging.init_logger(__name__)
 
@@ -42,3 +50,198 @@ class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
                 is_autostopping=is_autostopping)
         except Exception as e:  # pylint: disable=broad-except
             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+
+class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
+    """Implementation of the JobsService gRPC service."""
+
+    def AddJob(  # type: ignore[return]
+            self, request: jobsv1_pb2.AddJobRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
+        try:
+            job_name = request.job_name if request.HasField('job_name') else '-'
+            job_id, log_dir = job_lib.add_job(job_name, request.username,
+                                              request.run_timestamp,
+                                              request.resources_str,
+                                              request.metadata)
+            return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def QueueJob(  # type: ignore[return]
+            self, request: jobsv1_pb2.QueueJobRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
+        try:
+            job_id = request.job_id
+            # Create log directory and file
+            remote_log_dir = os.path.expanduser(request.remote_log_dir)
+            os.makedirs(remote_log_dir, exist_ok=True)
+            remote_log_path = os.path.join(remote_log_dir, 'run.log')
+            open(remote_log_path, 'a').close()  # pylint: disable=unspecified-encoding
+
+            script_path = os.path.expanduser(request.script_path)
+            os.makedirs(os.path.dirname(script_path), exist_ok=True)
+
+            # If `codegen` is not provided, assume script is already
+            # uploaded to `script_path` via rsync.
+            if request.HasField('codegen'):
+                with open(script_path, 'w', encoding='utf-8') as f:
+                    f.write(request.codegen)
+                os.chmod(script_path, 0o755)
+
+            cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
+            job_submit_cmd = (
+                # JOB_CMD_IDENTIFIER is used for identifying the process
+                # retrieved with pid is the same driver process.
+                f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
+                f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+                # Do not use &>, which is not POSIX and may not work.
+                # Note that the order of ">filename 2>&1" matters.
+                f' > {remote_log_path} 2>&1')
+            job_lib.scheduler.queue(job_id, job_submit_cmd)
+
+            if request.HasField('managed_job'):
+                managed_job = request.managed_job
+                pool = managed_job.pool if managed_job.HasField(
+                    'pool') else None
+                pool_hash = None
+                if pool is not None:
+                    pool_hash = serve_state.get_service_hash(pool)
+                # Add the managed job to job queue database.
+                managed_job_state.set_job_info(job_id, managed_job.name,
+                                               managed_job.workspace,
+                                               managed_job.entrypoint, pool,
+                                               pool_hash)
+                # Set the managed job to PENDING state to make sure that
+                # this managed job appears in the `sky jobs queue`, even
+                # if it needs to wait to be submitted.
+                # We cannot set the managed job to PENDING state in the
+                # job template (jobs-controller.yaml.j2), as it may need
+                # to wait for the run commands to be scheduled on the job
+                # controller in high-load cases.
+                for task in managed_job.tasks:
+                    managed_job_state.set_pending(job_id, task.task_id,
+                                                  task.name, task.resources_str,
+                                                  task.metadata_json)
+            return jobsv1_pb2.QueueJobResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def UpdateStatus(  # type: ignore[return]
+            self, request: jobsv1_pb2.UpdateStatusRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
+        try:
+            job_lib.update_status()
+            return jobsv1_pb2.UpdateStatusResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobQueue(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobQueueRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
+        try:
+            user_hash = request.user_hash if request.HasField(
+                'user_hash') else None
+            all_jobs = request.all_jobs
+            jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
+                                              all_jobs=all_jobs)
+            return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def CancelJobs(  # type: ignore[return]
+            self, request: jobsv1_pb2.CancelJobsRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
+        try:
+            job_ids = list(request.job_ids) if request.job_ids else []
+            user_hash = request.user_hash if request.HasField(
+                'user_hash') else None
+            cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
+                                                    user_hash)
+            return jobsv1_pb2.CancelJobsResponse(
+                cancelled_job_ids=cancelled_job_ids)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def FailAllInProgressJobs(  # type: ignore[return]
+            self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
+        try:
+            job_lib.fail_all_jobs_in_progress()
+            return jobsv1_pb2.FailAllInProgressJobsResponse()
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def TailLogs(
+            self,
+            request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
+            context: grpc.ServicerContext):
+        # TODO(kevin): implement this
+        raise NotImplementedError('TailLogs is not implemented')
+
+    def GetJobStatus(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobStatusRequest,
+            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
+        try:
+            if request.job_ids:
+                job_ids = list(request.job_ids)
+            else:
+                latest_job_id = job_lib.get_latest_job_id()
+                job_ids = [latest_job_id] if latest_job_id is not None else []
+            job_statuses = job_lib.get_statuses(job_ids)
+            for job_id, status in job_statuses.items():
+                job_statuses[job_id] = job_lib.JobStatus(status).to_protobuf(
+                ) if status is not None else jobsv1_pb2.JOB_STATUS_UNSPECIFIED
+            return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobSubmittedTimestamp(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
+                job_id, False)
+            if timestamp is None:
+                context.abort(grpc.StatusCode.NOT_FOUND,
+                              f'Job {job_id} not found')
+            return jobsv1_pb2.GetJobSubmittedTimestampResponse(
+                timestamp=timestamp)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobEndedTimestamp(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
+                job_id, True)
+            if timestamp is None:
+                context.abort(grpc.StatusCode.NOT_FOUND,
+                              f'Job {job_id} not found or not ended')
+            return jobsv1_pb2.GetJobEndedTimestampResponse(timestamp=timestamp)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetLogDirsForJobs(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
+        try:
+            if request.job_ids:
+                job_ids = list(request.job_ids)
+            else:
+                latest_job_id = job_lib.get_latest_job_id()
+                job_ids = [latest_job_id] if latest_job_id is not None else []
+            job_log_dirs = job_lib.get_job_log_dirs(job_ids)
+            return jobsv1_pb2.GetLogDirsForJobsResponse(
+                job_log_dirs=job_log_dirs)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
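One convention used throughout the servicer: optional request fields are read via `HasField`, which proto3 only generates for fields with explicit presence (`optional` or message-typed), so an unset field is distinguishable from its default value. A small sketch with a field name taken from the requests above (the HasField calls in the servicer imply these fields are declared optional):

    from sky.schemas.generated import jobsv1_pb2

    req = jobsv1_pb2.AddJobRequest(username='alice')  # job_name left unset
    assert not req.HasField('job_name')  # servicer falls back to '-'

    req.job_name = ''  # explicitly set, even to the default value
    assert req.HasField('job_name')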
sky/skylet/skylet.py CHANGED
@@ -9,6 +9,7 @@ import grpc
 import sky
 from sky import sky_logging
 from sky.schemas.generated import autostopv1_pb2_grpc
+from sky.schemas.generated import jobsv1_pb2_grpc
 from sky.skylet import constants
 from sky.skylet import events
 from sky.skylet import services
@@ -50,6 +51,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
     autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
         services.AutostopServiceImpl(), server)
 
+    jobsv1_pb2_grpc.add_JobsServiceServicer_to_server(
+        services.JobsServiceImpl(), server)
+
     listen_addr = f'127.0.0.1:{port}'
     server.add_insecure_port(listen_addr)
 
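With the servicer registered, the service is reachable only on loopback (see `listen_addr` above). A minimal client sketch against the generated stub; it assumes skylet's gRPC server is running on the node, and mirrors the server-side fallback where an empty `job_ids` means the latest job:

    import grpc

    from sky.schemas.generated import jobsv1_pb2
    from sky.schemas.generated import jobsv1_pb2_grpc
    from sky.skylet import constants

    channel = grpc.insecure_channel(f'127.0.0.1:{constants.SKYLET_GRPC_PORT}')
    stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
    try:
        # Empty job_ids: the servicer falls back to the latest job.
        resp = stub.GetJobStatus(jobsv1_pb2.GetJobStatusRequest(job_ids=[]))
        print(dict(resp.job_statuses))
    except grpc.RpcError as e:
        # Servicer-side failures surface as INTERNAL (see services.py above).
        print(e.code(), e.details())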
sky/templates/seeweb-ray.yml.j2 ADDED
@@ -0,0 +1,108 @@
+cluster_name: {{ cluster_name_on_cloud }}
+
+max_workers: {{ num_nodes - 1 }}
+upscaling_speed: {{ num_nodes - 1 }}
+idle_timeout_minutes: 5
+
+provider:
+  type: external
+  module: sky.provision.seeweb
+  region: "{{ region }}"
+
+auth:
+  ssh_user: ecuser
+  ssh_private_key: {{ ssh_private_key }}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      plan: {{ instance_type }}
+      image: {{ image_id }}
+      location: {{ region }}
+      {% if seeweb_gpu_config is not none %}
+      gpu: {{ seeweb_gpu_config.gpu }}
+      gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
+      {% endif %}
+      disk: {{ disk_size }}
+
+head_node_type: ray_head_default
+
+file_mounts: {
+  "~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
+}
+
+rsync_exclude: []
+
+setup_commands:
+  - |
+    touch ~/.bashrc;
+    echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
+    echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+
+head_start_ray_commands:
+  - |
+    retry_ray() {
+      local n=0; local max=30
+      until [ $n -ge $max ]; do
+        export SKYPILOT_NUM_GPUS=0
+        command -v nvidia-smi >/dev/null 2>&1 && \
+          SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+        ray stop || true
+        RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+          ray start --disable-usage-stats --head \
+            --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
+            --object-manager-port=8076 \
+            --autoscaling-config=~/ray_bootstrap_config.yaml \
+            --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+        echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
+        sleep 5
+      done
+      [ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
+    }
+    retry_ray
+
+worker_start_ray_commands:
+  - |
+    retry_ray() {
+      local n=0; local max=30
+      until [ $n -ge $max ]; do
+        SKYPILOT_NUM_GPUS=0
+        command -v nvidia-smi >/dev/null 2>&1 && \
+          SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+        ray stop || true
+        RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+          ray start --disable-usage-stats \
+            --address=$RAY_HEAD_IP:{{ ray_port }} \
+            --object-manager-port=8076 \
+            --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+        echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
+        sleep 5
+      done
+      [ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
+    }
+    retry_ray
+
+head_node: {}
+worker_nodes: {}
+
+head_setup_commands: []
+worker_setup_commands: []
+
+cluster_synced_files: []
+file_mounts_sync_continuously: False
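SkyPilot's provisioner normally fills this template; as a standalone illustration of the variables it consumes, it can be rendered with jinja2. Every value below is made up purely to exercise the template:

    import jinja2

    env = jinja2.Environment(loader=jinja2.FileSystemLoader('sky/templates'))
    template = env.get_template('seeweb-ray.yml.j2')

    # All values are hypothetical placeholders, not real Seeweb plans/regions.
    print(template.render(
        cluster_name_on_cloud='sky-demo-1a2b',
        num_nodes=2,
        region='it-fr2',
        ssh_private_key='~/.ssh/sky-key',
        instance_type='ECS1',
        image_id='ubuntu-2204',
        seeweb_gpu_config=None,  # None skips the gpu/gpu_label block
        disk_size=100,
        sky_ray_yaml_remote_path='~/.sky/sky_ray.yml',
        sky_ray_yaml_local_path='/tmp/sky_ray.yml',
        sky_remote_path='~/.sky/wheels',
        sky_wheel_hash='abc123',
        sky_local_path='/tmp/wheel',
        credentials={},  # extra file_mounts entries, empty here
        ray_port=6380,
        ray_dashboard_port=8266,
        ray_temp_dir='/tmp/ray',
        conda_installation_commands='',
        ray_skypilot_installation_commands='',
    ))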
sky/utils/cluster_utils.py CHANGED
@@ -144,6 +144,9 @@ class SSHConfigHelper(object):
             username = docker_user
 
         key_path = cls.generate_local_key_file(cluster_name, auth_config)
+        # Keep the unexpanded path for SSH config (with ~)
+        key_path_for_config = key_path
+        # Expand the path for internal operations that need absolute path
         key_path = os.path.expanduser(key_path)
         sky_autogen_comment = ('# Added by sky (use `sky stop/down '
                                f'{cluster_name}` to remove)')
@@ -208,8 +211,9 @@ class SSHConfigHelper(object):
             node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
             # TODO(romilb): Update port number when k8s supports multinode
             codegen += cls._get_generated_config(
-                sky_autogen_comment, node_name, ip, username, key_path,
-                proxy_command, port, docker_proxy_command) + '\n'
+                sky_autogen_comment, node_name, ip, username,
+                key_path_for_config, proxy_command, port,
+                docker_proxy_command) + '\n'
 
         cluster_config_path = os.path.expanduser(
             cls.ssh_cluster_path.format(cluster_name))
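The change writes the unexpanded `~` form into the generated SSH config while internal file operations keep using the expanded absolute path; presumably this keeps the config portable when it is read under a different home directory. The distinction in miniature (path is hypothetical):

    import os

    key_path_for_config = '~/.ssh/sky-key'  # written into the SSH config as-is
    key_path = os.path.expanduser(key_path_for_config)  # used for local file checks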
sky/utils/controller_utils.py CHANGED
@@ -228,15 +228,21 @@ def get_controller_for_pool(pool: bool) -> Controllers:
 def high_availability_specified(cluster_name: Optional[str]) -> bool:
     """Check if the controller high availability is specified in user config.
     """
-    # pylint: disable=import-outside-toplevel
-    from sky.jobs import utils as managed_job_utils
-    if managed_job_utils.is_consolidation_mode():
-        return True
-
     controller = Controllers.from_name(cluster_name)
     if controller is None:
         return False
 
+    if controller.value.controller_type == 'jobs':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.jobs import utils as managed_job_utils
+        if managed_job_utils.is_consolidation_mode():
+            return True
+    elif controller.value.controller_type == 'serve':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.serve import serve_utils
+        if serve_utils.is_consolidation_mode():
+            return True
+
     if skypilot_config.loaded():
         return skypilot_config.get_nested((controller.value.controller_type,
                                            'controller', 'high_availability'),