skypilot-nightly 1.0.0.dev20250303__py3-none-any.whl → 1.0.0.dev20250305__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/jobs/client/sdk.py CHANGED
@@ -184,7 +184,7 @@ def tail_logs(name: Optional[str] = None,
               follow: bool = True,
               controller: bool = False,
               refresh: bool = False,
-              output_stream: Optional['io.TextIOBase'] = None) -> None:
+              output_stream: Optional['io.TextIOBase'] = None) -> int:
     """Tails logs of managed jobs.
 
     You can provide either a job name or a job ID to tail logs. If both are not
@@ -199,6 +199,11 @@ def tail_logs(name: Optional[str] = None,
         output_stream: The stream to write the logs to. If None, print to the
             console.
 
+    Returns:
+        Exit code based on success or failure of the job. 0 if success,
+        100 if the job failed. See exceptions.JobExitCode for possible exit
+        codes.
+
     Request Raises:
         ValueError: invalid arguments.
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
@@ -217,7 +222,7 @@ def tail_logs(name: Optional[str] = None,
         timeout=(5, None),
     )
     request_id = server_common.get_request_id(response)
-    sdk.stream_response(request_id, response, output_stream)
+    return sdk.stream_response(request_id, response, output_stream)
 
 
 @usage_lib.entrypoint
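
Because `tail_logs` in the client SDK now returns the job's exit code instead of `None`, callers can propagate the outcome of a managed job to the shell. A minimal sketch of such a caller, assuming the module shown above is importable as `sky.jobs.client.sdk` and using a placeholder job name (0 means success, 100 means the job failed, per `exceptions.JobExitCode`):

    import sys

    from sky.jobs.client import sdk as jobs_sdk


    def main() -> None:
        # Stream the logs of a managed job and exit with the code it reports.
        # 'my-training-job' is a placeholder name.
        exit_code = jobs_sdk.tail_logs(name='my-training-job', follow=True)
        sys.exit(exit_code)


    if __name__ == '__main__':
        main()
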
sky/jobs/constants.py CHANGED
@@ -40,7 +40,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION = 2
+MANAGED_JOBS_VERSION = 3
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
@@ -62,6 +62,7 @@ class JobTableColumns(enum.IntEnum):
     - FAILOVER (13): Job failover history
     - DETAILS (14): Job details
     - ACTIONS (15): Available actions column
+    - LOG_CONTENT (16): Log content column
     """
     DROPDOWN = 0
     ID = 1
@@ -79,13 +80,14 @@ class JobTableColumns(enum.IntEnum):
     DETAILS = 13
     FAILOVER = 14
     ACTIONS = 15
+    LOG_CONTENT = 16
 
 
 # Column headers matching the indices above
 JOB_TABLE_COLUMNS = [
     '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
     'Job Duration', 'Status', 'Started', 'Cluster', 'Region', 'Failover',
-    'Recoveries', 'Details', 'Actions'
+    'Recoveries', 'Details', 'Actions', 'Log Content'
 ]
 
 # This column is given by format_job_table but should be ignored.
@@ -153,14 +155,14 @@ def home():
             status_counts[task['status'].value] += 1
 
     # Add an empty column for the dropdown button and actions column
-    # Exclude SCHED. STATE column
+    # Exclude SCHED. STATE and LOG_CONTENT columns
     rows = [
         [''] + row[:SCHED_STATE_COLUMN] + row[SCHED_STATE_COLUMN + 1:] +
         # Add empty cell for failover and actions column
-        [''] + [''] for row in rows
+        [''] + [''] + [''] for row in rows
     ]
 
-    # Add log content as failover history for each job
+    # Add log content as a regular column for each job
     for row in rows:
         job_id = str(row[JobTableColumns.ID]).strip().replace(' ⤳', '')
         if job_id and job_id != '-':
@@ -174,17 +176,21 @@ def home():
                         log_content = f.read()
                     row[JobTableColumns.FAILOVER] = _extract_launch_history(
                         log_content)
+                    row[JobTableColumns.LOG_CONTENT] = log_content
                 else:
                     row[JobTableColumns.FAILOVER] = 'Log file not found'
+                    row[JobTableColumns.LOG_CONTENT] = 'Log file not found'
             except (IOError, OSError) as e:
-                row[JobTableColumns.FAILOVER] = f'Error reading log: {str(e)}'
-                app.logger.error('All managed jobs:')
+                error_msg = f'Error reading log: {str(e)}'
+                row[JobTableColumns.FAILOVER] = error_msg
+                row[JobTableColumns.LOG_CONTENT] = error_msg
+        else:
+            row[JobTableColumns.LOG_CONTENT] = ''
 
-    # Validate column count
     if rows and len(rows[0]) != len(JOB_TABLE_COLUMNS):
         raise RuntimeError(
             f'Dashboard code and managed job queue code are out of sync. '
-            f'Expected {(JOB_TABLE_COLUMNS)} columns, got {(rows[0])}')
+            f'Expected {JOB_TABLE_COLUMNS} columns, got {rows[0]}')
 
     # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'
     for row in rows:
@@ -208,26 +214,10 @@ def home():
         last_updated_timestamp=timestamp,
         status_values=status_values,
         status_counts=status_counts,
+        request=flask.request,
     )
     return rendered_html
 
 
-@app.route('/download_log/<job_id>')
-def download_log(job_id):
-    try:
-        log_path = os.path.join(
-            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
-            f'{job_id}.log')
-        if not os.path.exists(log_path):
-            flask.abort(404)
-        return flask.send_file(log_path,
-                               mimetype='text/plain',
-                               as_attachment=True,
-                               download_name=f'job_{job_id}.log')
-    except (IOError, OSError) as e:
-        app.logger.error(f'Error downloading log for job {job_id}: {str(e)}')
-        flask.abort(500)
-
-
 if __name__ == '__main__':
     app.run()
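
A quick consistency check of the column arithmetic the dashboard now enforces: the header list gains 'Log Content', matching the new `LOG_CONTENT = 16` enum member, so every rendered row must carry 17 cells (the invariant behind the `RuntimeError` above). A sketch reproducing the header list from the hunk:

    # Header list as it reads after this diff (copied from the hunk above).
    JOB_TABLE_COLUMNS = [
        '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
        'Job Duration', 'Status', 'Started', 'Cluster', 'Region', 'Failover',
        'Recoveries', 'Details', 'Actions', 'Log Content'
    ]

    # JobTableColumns indices run from DROPDOWN = 0 to LOG_CONTENT = 16,
    # i.e. 17 columns in total, which is what home() checks rows against.
    assert len(JOB_TABLE_COLUMNS) == 17
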
@@ -498,10 +498,10 @@
                     <td data-full-text="{{ row[13] }}">{{ row[13] }}</td> {# Details #}
                     <td>
                         {% if row[1]|string|replace(' \u21B3', '') and row[1]|string|replace(' \u21B3', '') != '-' %}
-                        <a href="{{ url_for('download_log', job_id=row[1]|string|replace(' \u21B3', '')) }}"
-                           class="btn btn-sm btn-outline-secondary">
+                        <button class="btn btn-sm btn-outline-secondary log-btn"
+                                data-job-id="{{ row[1]|string|replace(' \u21B3', '') }}">
                             controller log
-                        </a>
+                        </button>
                         {% endif %}
                     </td>
                 </tr>
@@ -509,6 +509,41 @@
         </tbody>
     </table>
 </div>
+
+<!-- Hidden container for log content -->
+<div style="display: none;">
+    {% for row in rows %}
+    {% if row[1]|string|replace(' \u21B3', '') and row[1]|string|replace(' \u21B3', '') != '-' %}
+    <pre id="log-content-{{ row[1]|string|replace(' \u21B3', '') }}">{{ row[-1]|e }}</pre>
+    {% endif %}
+    {% endfor %}
+</div>
+
+<!-- Log Modal -->
+<div class="modal fade" id="logModal" tabindex="-1" aria-labelledby="logModalLabel" aria-hidden="true">
+    <div class="modal-dialog modal-dialog-centered modal-lg">
+        <div class="modal-content">
+            <div class="modal-header">
+                <h5 class="modal-title" id="logModalLabel">Controller Log</h5>
+                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
+            </div>
+            <div class="modal-body">
+                <div id="logContent" style="white-space: pre-wrap; font-family: monospace; max-height: 70vh; overflow-y: auto; font-size: 0.85rem;"></div>
+                <div id="logError" class="alert alert-danger d-none">
+                    Error loading log content. Please try again.
+                </div>
+            </div>
+            <div class="modal-footer">
+                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
+                <a id="downloadLogBtn" href="#" class="btn btn-primary" download>Download</a>
+            </div>
+        </div>
+    </div>
+</div>
+
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/js/bootstrap.bundle.min.js"
+        integrity="sha384-kenU1KFdBIe4zVF0s0G1M5b4hcpxyD9F7jL+jjXkk+Q2h455rYXK/7HAuoJl+0I4"
+        crossorigin="anonymous"></script>
 <script>
     // Folder toggle for pipelines, this will fold/unfold the rows for
     // a pipeline and its tasks.
@@ -729,6 +764,68 @@
         });
     });
 </script>
+<script>
+    // Function to show log modal and display pre-loaded log content
+    function showLogModal(jobId, logContent) {
+        try {
+            // Initialize modal
+            const logModal = new bootstrap.Modal(document.getElementById('logModal'));
+            const logContentEl = document.getElementById('logContent');
+            const logError = document.getElementById('logError');
+            const downloadBtn = document.getElementById('downloadLogBtn');
+
+            // Create a Blob for download functionality
+            const blob = new Blob([logContent], { type: 'text/plain' });
+            const url = URL.createObjectURL(blob);
+
+            // Set download button href
+            downloadBtn.href = url;
+            downloadBtn.setAttribute('download', `job_${jobId}.log`);
+
+            // Clear previous content and show new content directly
+            logContentEl.textContent = logContent || 'No log content available';
+            logError.classList.add('d-none');

+            // Set modal title
+            document.getElementById('logModalLabel').textContent = `Controller Log - Job ${jobId}`;
+
+            // Show modal
+            logModal.show();
+
+            // Cleanup the URL object when the modal is hidden
+            document.getElementById('logModal').addEventListener('hidden.bs.modal', function() {
+                URL.revokeObjectURL(url);
+            }, { once: true });
+        } catch (error) {
+            console.error('Error showing log modal:', error);
+            document.getElementById('logError').classList.remove('d-none');
+            document.getElementById('logError').textContent = `Error showing log: ${error.message}`;
+        }
+    }
+
+    // Add event listeners for log buttons
+    document.addEventListener('DOMContentLoaded', function() {
+        document.addEventListener('click', function(event) {
+            if (event.target.closest('.log-btn')) {
+                try {
+                    const button = event.target.closest('.log-btn');
+                    const jobId = button.dataset.jobId;
+                    const logContentEl = document.getElementById(`log-content-${jobId}`);
+
+                    if (!logContentEl) {
+                        throw new Error(`Log content element not found for job ${jobId}`);
+                    }
+
+                    const logContent = logContentEl.textContent;
+                    showLogModal(jobId, logContent);
+                } catch (error) {
+                    console.error('Error getting log content:', error);
+                    showLogModal(jobId, `Error loading log: ${error.message}`);
+                }
+            }
+        });
+    });
+</script>
 </body>
 
 </html>
sky/jobs/server/core.py CHANGED
@@ -460,12 +460,17 @@ def cancel(name: Optional[str] = None,
 
 @usage_lib.entrypoint
 def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool, refresh: bool) -> None:
+              controller: bool, refresh: bool) -> int:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.
 
     Please refer to sky.cli.job_logs for documentation.
 
+    Returns:
+        Exit code based on success or failure of the job. 0 if success,
+        100 if the job failed. See exceptions.JobExitCode for possible exit
+        codes.
+
     Raises:
         ValueError: invalid arguments.
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
@@ -494,11 +499,11 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend
 
-    backend.tail_managed_job_logs(handle,
-                                  job_id=job_id,
-                                  job_name=name,
-                                  follow=follow,
-                                  controller=controller)
+    return backend.tail_managed_job_logs(handle,
+                                         job_id=job_id,
+                                         job_name=name,
+                                         follow=follow,
+                                         controller=controller)
 
 
 def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:
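
The `int` values returned above all come from `sky.exceptions.JobExitCode`, which this diff references but never shows. A hedged sketch of its shape, based only on what the docstrings state (0 for success, 100 for failure) and the member and method names used elsewhere in the diff; the concrete value of `NOT_FOUND` and the mapping logic are assumptions:

    import enum


    class JobExitCode(enum.IntEnum):
        """Sketch only: the real class lives in sky/exceptions.py."""
        SUCCEEDED = 0    # documented: the job finished successfully
        FAILED = 100     # documented: the job failed
        NOT_FOUND = 101  # assumed value; only the member name appears in the diff

        @classmethod
        def from_managed_job_status(cls, status) -> 'JobExitCode':
            # Assumed mapping: a missing status maps to NOT_FOUND, a status
            # whose name mentions FAILED maps to FAILED, everything else to
            # SUCCEEDED.
            if status is None:
                return cls.NOT_FOUND
            return cls.FAILED if 'FAILED' in status.name else cls.SUCCEEDED

        @classmethod
        def from_job_status(cls, status) -> 'JobExitCode':
            return cls.from_managed_job_status(status)
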
sky/jobs/utils.py CHANGED
@@ -511,8 +511,14 @@ def cancel_job_by_name(job_name: str) -> str:
     return f'Job {job_name!r} is scheduled to be cancelled.'
 
 
-def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
-    """Stream logs by job id."""
+def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
+    """Stream logs by job id.
+
+    Returns:
+        A tuple containing the log message and an exit code based on success or
+        failure of the job. 0 if success, 100 if the job failed.
+        See exceptions.JobExitCode for possible exit codes.
+    """
 
     def should_keep_logging(status: managed_job_state.ManagedJobStatus) -> bool:
         # If we see CANCELLING, just exit - we could miss some job logs but the
@@ -547,13 +553,16 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                         start_streaming = True
                     if start_streaming:
                         print(line, end='', flush=True)
-            return ''
+            return '', exceptions.JobExitCode.from_managed_job_status(
+                managed_job_status)
         return (f'{colorama.Fore.YELLOW}'
                 f'Job {job_id} is already in terminal state '
                 f'{managed_job_status.value}. For more details, run: '
                 f'sky jobs logs --controller {job_id}'
                 f'{colorama.Style.RESET_ALL}'
-                f'{job_msg}')
+                f'{job_msg}',
+                exceptions.JobExitCode.from_managed_job_status(
+                    managed_job_status))
     backend = backends.CloudVmRayBackend()
     task_id, managed_job_status = (
         managed_job_state.get_latest_task_id_status(job_id))
@@ -604,11 +613,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 job_id=None,
                 managed_job_id=job_id,
                 follow=follow)
-            if returncode == 0:
-                # If the log tailing exit successfully (the real job can be
-                # SUCCEEDED or FAILED), we can safely break the loop. We use the
-                # status in job queue to show the information, as the
-                # ManagedJobStatus is not updated yet.
+            if returncode in [rc.value for rc in exceptions.JobExitCode]:
+                # If the log tailing exits with a known exit code we can safely
+                # break the loop because it indicates the tailing process
+                # succeeded (even though the real job can be SUCCEEDED or
+                # FAILED). We use the status in job queue to show the
+                # information, as the ManagedJobStatus is not updated yet.
                 job_statuses = backend.get_job_status(handle, stream_logs=False)
                 job_status = list(job_statuses.values())[0]
                 assert job_status is not None, 'No job found.'
@@ -728,18 +738,25 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     logger.info(
         ux_utils.finishing_message(f'Managed job finished: {job_id} '
                                    f'(status: {managed_job_status.value}).'))
-    return ''
+    return '', exceptions.JobExitCode.from_managed_job_status(
+        managed_job_status)
 
 
 def stream_logs(job_id: Optional[int],
                 job_name: Optional[str],
                 controller: bool = False,
-                follow: bool = True) -> str:
-    """Stream logs by job id or job name."""
+                follow: bool = True) -> Tuple[str, int]:
+    """Stream logs by job id or job name.
+
+    Returns:
+        A tuple containing the log message and the exit code based on success
+        or failure of the job. 0 if success, 100 if the job failed.
+        See exceptions.JobExitCode for possible exit codes.
+    """
     if job_id is None and job_name is None:
         job_id = managed_job_state.get_latest_job_id()
         if job_id is None:
-            return 'No managed job found.'
+            return 'No managed job found.', exceptions.JobExitCode.NOT_FOUND
 
     if controller:
         if job_id is None:
@@ -754,7 +771,8 @@ def stream_logs(job_id: Optional[int],
                 if job['job_name'] == job_name
             }
             if not managed_job_ids:
-                return f'No managed job found with name {job_name!r}.'
+                return (f'No managed job found with name {job_name!r}.',
+                        exceptions.JobExitCode.NOT_FOUND)
             if len(managed_job_ids) > 1:
                 job_ids_str = ', '.join(
                     str(job_id) for job_id in managed_job_ids)
@@ -776,7 +794,7 @@ def stream_logs(job_id: Optional[int],
                 if not follow:
                     # Assume that the log file hasn't been written yet. Since we
                     # aren't following, just return.
-                    return ''
+                    return '', exceptions.JobExitCode.SUCCEEDED
 
                 job_status = managed_job_state.get_status(job_id)
                 if job_status is None:
@@ -787,7 +805,8 @@ def stream_logs(job_id: Optional[int],
                     # point, it never will be. This job may have been submitted
                     # using an old version that did not create the log file, so this
                     # is not considered an exceptional case.
-                    return ''
+                    return '', exceptions.JobExitCode.from_managed_job_status(
+                        job_status)
 
                 time.sleep(log_lib.SKY_LOG_WAITING_GAP_SECONDS)
 
@@ -833,15 +852,17 @@ def stream_logs(job_id: Optional[int],
 
         if follow:
             return ux_utils.finishing_message(
-                f'Job finished (status: {job_status}).')
+                f'Job finished (status: {job_status}).'
+            ), exceptions.JobExitCode.from_managed_job_status(job_status)
 
-        return ''
+        return '', exceptions.JobExitCode.SUCCEEDED
 
     if job_id is None:
         assert job_name is not None
         job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
         if not job_ids:
-            return f'No running managed job found with name {job_name!r}.'
+            return (f'No running managed job found with name {job_name!r}.',
+                    exceptions.JobExitCode.NOT_FOUND)
         if len(job_ids) > 1:
             raise ValueError(
                 f'Multiple running jobs found with name {job_name!r}.')
@@ -1167,6 +1188,7 @@ class ManagedJobCodeGen:
       >> codegen = ManagedJobCodeGen.show_jobs(...)
     """
     _PREFIX = textwrap.dedent("""\
+        import sys
         from sky.jobs import utils
        from sky.jobs import state as managed_job_state
        from sky.jobs import constants as managed_job_constants
@@ -1222,9 +1244,17 @@ class ManagedJobCodeGen:
                     follow: bool = True,
                     controller: bool = False) -> str:
         code = textwrap.dedent(f"""\
-        msg = utils.stream_logs({job_id!r}, {job_name!r},
+        result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
                                    follow={follow}, controller={controller})
-        print(msg, flush=True)
+        if managed_job_version < 3:
+            # Versions 2 and older did not return a retcode, so we just print
+            # the result.
+            # TODO: Remove compatibility before 0.12.0
+            print(result, flush=True)
+        else:
+            msg, retcode = result
+            print(msg, flush=True)
+            sys.exit(retcode)
         """)
         return cls._build(code)
 
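
The `stream_logs` codegen just above is assembled as an f-string, which makes its control flow easy to misread. Rendered as the plain Python that runs on the jobs controller, it does roughly the following (a sketch: the literal arguments are interpolated by `ManagedJobCodeGen`, and the way `managed_job_version` is derived in the generated prefix is an assumption):

    import sys

    from sky.jobs import constants as managed_job_constants
    from sky.jobs import utils

    # Assumption: the generated prefix exposes the controller's version as
    # managed_job_version, derived from MANAGED_JOBS_VERSION.
    managed_job_version = managed_job_constants.MANAGED_JOBS_VERSION

    # job_id/job_name/follow/controller are interpolated by the codegen;
    # placeholders are used here.
    result = utils.stream_logs(job_id=None, job_name='my-job',
                               follow=True, controller=False)
    if managed_job_version < 3:
        # Controllers at version 2 or older return only the log message.
        print(result, flush=True)
    else:
        msg, retcode = result
        print(msg, flush=True)
        sys.exit(retcode)
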
sky/server/constants.py CHANGED
@@ -3,7 +3,7 @@
 # API server version, whenever there is a change in API server that requires a
 # restart of the local API server or error out when the client does not match
 # the server version.
-API_VERSION = '2'
+API_VERSION = '3'
 
 # Prefix for API request names.
 REQUEST_NAME_PREFIX = 'sky.'
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pydantic
 
+from sky import admin_policy
 from sky import serve
 from sky import sky_logging
 from sky import skypilot_config
@@ -113,15 +114,9 @@ class CheckBody(RequestBody):
     verbose: bool
 
 
-class ValidateBody(RequestBody):
-    """The request body for the validate endpoint."""
-    dag: str
-
-
-class OptimizeBody(RequestBody):
-    """The request body for the optimize endpoint."""
+class DagRequestBody(RequestBody):
+    """Request body base class for endpoints with a dag."""
     dag: str
-    minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
 
     def to_kwargs(self) -> Dict[str, Any]:
         # Import here to avoid requirement of the whole SkyPilot dependency on
@@ -139,6 +134,19 @@ class OptimizeBody(RequestBody):
         return kwargs
 
 
+class ValidateBody(DagRequestBody):
+    """The request body for the validate endpoint."""
+    dag: str
+    request_options: Optional[admin_policy.RequestOptions]
+
+
+class OptimizeBody(DagRequestBody):
+    """The request body for the optimize endpoint."""
+    dag: str
+    minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
+    request_options: Optional[admin_policy.RequestOptions]
+
+
 class LaunchBody(RequestBody):
     """The request body for the launch endpoint."""
     task: str
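
The hunk above folds the duplicated `dag` field and its `to_kwargs` decoding into a shared `DagRequestBody` base, and both endpoints now carry `request_options` for the admin policy. A self-contained sketch of the inheritance pattern, with simplified stand-ins for `admin_policy.RequestOptions` and `common_lib.OptimizeTarget`:

    from typing import Any, Dict, Optional

    import pydantic


    class RequestBody(pydantic.BaseModel):
        """Stand-in for the real base class in the payloads module."""


    class DagRequestBody(RequestBody):
        """Request body base class for endpoints with a dag."""
        dag: str

        def to_kwargs(self) -> Dict[str, Any]:
            # The real implementation deserializes the YAML dag here; the
            # sketch simply forwards the raw fields.
            return dict(self)


    class ValidateBody(DagRequestBody):
        request_options: Optional[str] = None  # stand-in for RequestOptions


    class OptimizeBody(DagRequestBody):
        minimize: str = 'COST'                 # stand-in for OptimizeTarget.COST
        request_options: Optional[str] = None


    print(ValidateBody(dag='name: demo').to_kwargs())
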
sky/server/server.py CHANGED
@@ -27,7 +27,6 @@ from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
-from sky import optimizer
 from sky import sky_logging
 from sky.clouds import service_catalog
 from sky.data import storage_utils
@@ -42,6 +41,7 @@ from sky.server.requests import payloads
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
 from sky.usage import usage_lib
+from sky.utils import admin_policy_utils
 from sky.utils import common as common_lib
 from sky.utils import common_utils
 from sky.utils import dag_utils
@@ -258,9 +258,22 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     """Validates the user's DAG."""
     # TODO(SKY-1035): validate if existing cluster satisfies the requested
     # resources, e.g. sky exec --gpus V100:8 existing-cluster-with-no-gpus
+
+    # TODO: Our current launch process is split into three calls:
+    # validate, optimize, and launch. This requires us to apply the admin policy
+    # in each step, which may be an expensive operation. We should consolidate
+    # these into a single call or have a TTL cache for (task, admin_policy)
+    # pairs.
     logger.debug(f'Validating tasks: {validate_body.dag}')
     try:
         dag = dag_utils.load_chain_dag_from_yaml_str(validate_body.dag)
+        # TODO: Admin policy may contain arbitrary code, which may be expensive
+        # to run and may block the server thread. However, moving it into the
+        # executor adds a ~150ms penalty on the local API server because of
+        # added RTTs. For now, we stick to doing the validation inline in the
+        # server thread.
+        dag, _ = admin_policy_utils.apply(
+            dag, request_options=validate_body.request_options)
         for task in dag.tasks:
             # Will validate workdir and file_mounts in the backend, as those
             # need to be validated after the files are uploaded to the SkyPilot
@@ -283,7 +296,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
         request_name='optimize',
         request_body=optimize_body,
         ignore_return_value=True,
-        func=optimizer.Optimizer.optimize,
+        func=core.optimize,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
 
sky/skylet/constants.py CHANGED
@@ -93,7 +93,7 @@ SKYLET_VERSION = '12'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
-SKYLET_LIB_VERSION = 2
+SKYLET_LIB_VERSION = 3
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
 
 # `sky jobs dashboard`-related
sky/skylet/job_lib.py CHANGED
@@ -938,7 +938,9 @@ class JobLibCodeGen:
     _PREFIX = [
         'import os',
         'import getpass',
-        'from sky.skylet import job_lib, log_lib, constants',
+        'import sys',
+        'from sky import exceptions',
+        'from sky.skylet import log_lib, job_lib, constants',
     ]
 
     @classmethod
@@ -1033,6 +1035,13 @@ class JobLibCodeGen:
             f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
             f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
             f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
+            # After tailing, check the job status and exit with appropriate code
+            'job_status = job_lib.get_status(job_id)',
+            # Backward compatibility for returning exit code: Skylet versions 2
+            # and older did not have JobExitCode, so we use 0 for those versions
+            # TODO: Remove this special handling after 0.10.0.
+            'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
+            'sys.exit(exit_code)',
         ]
         return cls._build(code)
 
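
As with the managed-jobs codegen, the tail-logs commands above are easier to follow as the plain Python they execute on the remote cluster. A sketch (the real snippet interpolates `job_id`, `log_dir`, `managed_job_id`, `follow`, and `tail` as literals; the wrapper function is purely illustrative):

    import sys

    from sky import exceptions
    from sky.skylet import constants, job_lib, log_lib


    def tail_and_exit(job_id: int, log_dir: str, follow: bool = True,
                      tail: int = 0) -> None:
        tail_log_kwargs = {'job_id': job_id, 'log_dir': log_dir,
                           'managed_job_id': None, 'follow': follow}
        if getattr(constants, 'SKYLET_LIB_VERSION', 1) > 1:
            tail_log_kwargs['tail'] = tail
        log_lib.tail_logs(**tail_log_kwargs)
        # New in this diff: look up the job's final status and exit with the
        # matching JobExitCode. Skylets at lib version 2 or older predate
        # JobExitCode, so 0 is used there.
        job_status = job_lib.get_status(job_id)
        if getattr(constants, 'SKYLET_LIB_VERSION', 1) > 2:
            sys.exit(exceptions.JobExitCode.from_job_status(job_status))
        sys.exit(0)
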