skypilot-nightly 1.0.0.dev20250303__py3-none-any.whl → 1.0.0.dev20250305__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +16 -8
- sky/cli.py +36 -18
- sky/client/cli.py +36 -18
- sky/client/sdk.py +32 -9
- sky/core.py +55 -6
- sky/exceptions.py +80 -1
- sky/jobs/client/sdk.py +7 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/dashboard/dashboard.py +15 -25
- sky/jobs/dashboard/templates/index.html +100 -3
- sky/jobs/server/core.py +11 -6
- sky/jobs/utils.py +51 -21
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +16 -8
- sky/server/server.py +15 -2
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +10 -1
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/METADATA +28 -41
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/RECORD +24 -24
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py
CHANGED
@@ -184,7 +184,7 @@ def tail_logs(name: Optional[str] = None,
|
|
184
184
|
follow: bool = True,
|
185
185
|
controller: bool = False,
|
186
186
|
refresh: bool = False,
|
187
|
-
output_stream: Optional['io.TextIOBase'] = None) -> None:
|
187
|
+
output_stream: Optional['io.TextIOBase'] = None) -> int:
|
188
188
|
"""Tails logs of managed jobs.
|
189
189
|
|
190
190
|
You can provide either a job name or a job ID to tail logs. If both are not
|
@@ -199,6 +199,11 @@ def tail_logs(name: Optional[str] = None,
|
|
199
199
|
output_stream: The stream to write the logs to. If None, print to the
|
200
200
|
console.
|
201
201
|
|
202
|
+
Returns:
|
203
|
+
Exit code based on success or failure of the job. 0 if success,
|
204
|
+
100 if the job failed. See exceptions.JobExitCode for possible exit
|
205
|
+
codes.
|
206
|
+
|
202
207
|
Request Raises:
|
203
208
|
ValueError: invalid arguments.
|
204
209
|
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
|
@@ -217,7 +222,7 @@ def tail_logs(name: Optional[str] = None,
|
|
217
222
|
timeout=(5, None),
|
218
223
|
)
|
219
224
|
request_id = server_common.get_request_id(response)
|
220
|
-
sdk.stream_response(request_id, response, output_stream)
|
225
|
+
return sdk.stream_response(request_id, response, output_stream)
|
221
226
|
|
222
227
|
|
223
228
|
@usage_lib.entrypoint
|
sky/jobs/constants.py
CHANGED
@@ -40,7 +40,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
|
40
40
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
41
41
|
# change for the jobs/utils, we need to bump this version and update
|
42
42
|
# job.utils.ManagedJobCodeGen to handle the version update.
|
43
|
-
MANAGED_JOBS_VERSION = 2
|
43
|
+
MANAGED_JOBS_VERSION = 3
|
44
44
|
|
45
45
|
# The command for setting up the jobs dashboard on the controller. It firstly
|
46
46
|
# checks if the systemd services are available, and if not (e.g., Kubernetes
|
sky/jobs/dashboard/dashboard.py
CHANGED
@@ -62,6 +62,7 @@ class JobTableColumns(enum.IntEnum):
|
|
62
62
|
- FAILOVER (14): Job failover history
|
63
63
|
- DETAILS (13): Job details
|
64
64
|
- ACTIONS (15): Available actions column
|
65
|
+
- LOG_CONTENT (16): Log content column
|
65
66
|
"""
|
66
67
|
DROPDOWN = 0
|
67
68
|
ID = 1
|
@@ -79,13 +80,14 @@ class JobTableColumns(enum.IntEnum):
|
|
79
80
|
DETAILS = 13
|
80
81
|
FAILOVER = 14
|
81
82
|
ACTIONS = 15
|
83
|
+
LOG_CONTENT = 16
|
82
84
|
|
83
85
|
|
84
86
|
# Column headers matching the indices above
|
85
87
|
JOB_TABLE_COLUMNS = [
|
86
88
|
'', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
|
87
89
|
'Job Duration', 'Status', 'Started', 'Cluster', 'Region', 'Failover',
|
88
|
-
'Recoveries', 'Details', 'Actions'
|
90
|
+
'Recoveries', 'Details', 'Actions', 'Log Content'
|
89
91
|
]
|
90
92
|
|
91
93
|
# This column is given by format_job_table but should be ignored.
|
@@ -153,14 +155,14 @@ def home():
|
|
153
155
|
status_counts[task['status'].value] += 1
|
154
156
|
|
155
157
|
# Add an empty column for the dropdown button and actions column
|
156
|
-
# Exclude SCHED. STATE
|
158
|
+
# Exclude SCHED. STATE and LOG_CONTENT columns
|
157
159
|
rows = [
|
158
160
|
[''] + row[:SCHED_STATE_COLUMN] + row[SCHED_STATE_COLUMN + 1:] +
|
159
161
|
# Add empty cells for the failover, actions and log content columns
|
160
|
-
[''] + [''] for row in rows
|
162
|
+
[''] + [''] + [''] for row in rows
|
161
163
|
]
|
162
164
|
|
163
|
-
# Add log content as
|
165
|
+
# Add log content as a regular column for each job
|
164
166
|
for row in rows:
|
165
167
|
job_id = str(row[JobTableColumns.ID]).strip().replace(' ⤳', '')
|
166
168
|
if job_id and job_id != '-':
|
@@ -174,17 +176,21 @@ def home():
|
|
174
176
|
log_content = f.read()
|
175
177
|
row[JobTableColumns.FAILOVER] = _extract_launch_history(
|
176
178
|
log_content)
|
179
|
+
row[JobTableColumns.LOG_CONTENT] = log_content
|
177
180
|
else:
|
178
181
|
row[JobTableColumns.FAILOVER] = 'Log file not found'
|
182
|
+
row[JobTableColumns.LOG_CONTENT] = 'Log file not found'
|
179
183
|
except (IOError, OSError) as e:
|
180
|
-
|
181
|
-
|
184
|
+
error_msg = f'Error reading log: {str(e)}'
|
185
|
+
row[JobTableColumns.FAILOVER] = error_msg
|
186
|
+
row[JobTableColumns.LOG_CONTENT] = error_msg
|
187
|
+
else:
|
188
|
+
row[JobTableColumns.LOG_CONTENT] = ''
|
182
189
|
|
183
|
-
# Validate column count
|
184
190
|
if rows and len(rows[0]) != len(JOB_TABLE_COLUMNS):
|
185
191
|
raise RuntimeError(
|
186
192
|
f'Dashboard code and managed job queue code are out of sync. '
|
187
|
-
f'Expected {
|
193
|
+
f'Expected {JOB_TABLE_COLUMNS} columns, got {rows[0]}')
|
188
194
|
|
189
195
|
# Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'
|
190
196
|
for row in rows:
|
@@ -208,26 +214,10 @@ def home():
|
|
208
214
|
last_updated_timestamp=timestamp,
|
209
215
|
status_values=status_values,
|
210
216
|
status_counts=status_counts,
|
217
|
+
request=flask.request,
|
211
218
|
)
|
212
219
|
return rendered_html
|
213
220
|
|
214
221
|
|
215
|
-
@app.route('/download_log/<job_id>')
|
216
|
-
def download_log(job_id):
|
217
|
-
try:
|
218
|
-
log_path = os.path.join(
|
219
|
-
os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
|
220
|
-
f'{job_id}.log')
|
221
|
-
if not os.path.exists(log_path):
|
222
|
-
flask.abort(404)
|
223
|
-
return flask.send_file(log_path,
|
224
|
-
mimetype='text/plain',
|
225
|
-
as_attachment=True,
|
226
|
-
download_name=f'job_{job_id}.log')
|
227
|
-
except (IOError, OSError) as e:
|
228
|
-
app.logger.error(f'Error downloading log for job {job_id}: {str(e)}')
|
229
|
-
flask.abort(500)
|
230
|
-
|
231
|
-
|
232
222
|
if __name__ == '__main__':
|
233
223
|
app.run()
|
@@ -498,10 +498,10 @@
|
|
498
498
|
<td data-full-text="{{ row[13] }}">{{ row[13] }}</td> {# Details #}
|
499
499
|
<td>
|
500
500
|
{% if row[1]|string|replace(' \u21B3', '') and row[1]|string|replace(' \u21B3', '') != '-' %}
|
501
|
-
<
|
502
|
-
|
501
|
+
<button class="btn btn-sm btn-outline-secondary log-btn"
|
502
|
+
data-job-id="{{ row[1]|string|replace(' \u21B3', '') }}">
|
503
503
|
controller log
|
504
|
-
</
|
504
|
+
</button>
|
505
505
|
{% endif %}
|
506
506
|
</td>
|
507
507
|
</tr>
|
@@ -509,6 +509,41 @@
|
|
509
509
|
</tbody>
|
510
510
|
</table>
|
511
511
|
</div>
|
512
|
+
|
513
|
+
<!-- Hidden container for log content -->
|
514
|
+
<div style="display: none;">
|
515
|
+
{% for row in rows %}
|
516
|
+
{% if row[1]|string|replace(' \u21B3', '') and row[1]|string|replace(' \u21B3', '') != '-' %}
|
517
|
+
<pre id="log-content-{{ row[1]|string|replace(' \u21B3', '') }}">{{ row[-1]|e }}</pre>
|
518
|
+
{% endif %}
|
519
|
+
{% endfor %}
|
520
|
+
</div>
|
521
|
+
|
522
|
+
<!-- Log Modal -->
|
523
|
+
<div class="modal fade" id="logModal" tabindex="-1" aria-labelledby="logModalLabel" aria-hidden="true">
|
524
|
+
<div class="modal-dialog modal-dialog-centered modal-lg">
|
525
|
+
<div class="modal-content">
|
526
|
+
<div class="modal-header">
|
527
|
+
<h5 class="modal-title" id="logModalLabel">Controller Log</h5>
|
528
|
+
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
529
|
+
</div>
|
530
|
+
<div class="modal-body">
|
531
|
+
<div id="logContent" style="white-space: pre-wrap; font-family: monospace; max-height: 70vh; overflow-y: auto; font-size: 0.85rem;"></div>
|
532
|
+
<div id="logError" class="alert alert-danger d-none">
|
533
|
+
Error loading log content. Please try again.
|
534
|
+
</div>
|
535
|
+
</div>
|
536
|
+
<div class="modal-footer">
|
537
|
+
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
538
|
+
<a id="downloadLogBtn" href="#" class="btn btn-primary" download>Download</a>
|
539
|
+
</div>
|
540
|
+
</div>
|
541
|
+
</div>
|
542
|
+
</div>
|
543
|
+
|
544
|
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/js/bootstrap.bundle.min.js"
|
545
|
+
integrity="sha384-kenU1KFdBIe4zVF0s0G1M5b4hcpxyD9F7jL+jjXkk+Q2h455rYXK/7HAuoJl+0I4"
|
546
|
+
crossorigin="anonymous"></script>
|
512
547
|
<script>
|
513
548
|
// Folder toggle for pipelines, this will fold/unfold the rows for
|
514
549
|
// a pipeline and its tasks.
|
@@ -729,6 +764,68 @@
|
|
729
764
|
});
|
730
765
|
});
|
731
766
|
</script>
|
767
|
+
<script>
|
768
|
+
// Function to show log modal and display pre-loaded log content
|
769
|
+
function showLogModal(jobId, logContent) {
|
770
|
+
try {
|
771
|
+
// Initialize modal
|
772
|
+
const logModal = new bootstrap.Modal(document.getElementById('logModal'));
|
773
|
+
const logContentEl = document.getElementById('logContent');
|
774
|
+
const logError = document.getElementById('logError');
|
775
|
+
const downloadBtn = document.getElementById('downloadLogBtn');
|
776
|
+
|
777
|
+
// Create a Blob for download functionality
|
778
|
+
const blob = new Blob([logContent], { type: 'text/plain' });
|
779
|
+
const url = URL.createObjectURL(blob);
|
780
|
+
|
781
|
+
// Set download button href
|
782
|
+
downloadBtn.href = url;
|
783
|
+
downloadBtn.setAttribute('download', `job_${jobId}.log`);
|
784
|
+
|
785
|
+
// Clear previous content and show new content directly
|
786
|
+
logContentEl.textContent = logContent || 'No log content available';
|
787
|
+
logError.classList.add('d-none');
|
788
|
+
|
789
|
+
// Set modal title
|
790
|
+
document.getElementById('logModalLabel').textContent = `Controller Log - Job ${jobId}`;
|
791
|
+
|
792
|
+
// Show modal
|
793
|
+
logModal.show();
|
794
|
+
|
795
|
+
// Cleanup the URL object when the modal is hidden
|
796
|
+
document.getElementById('logModal').addEventListener('hidden.bs.modal', function() {
|
797
|
+
URL.revokeObjectURL(url);
|
798
|
+
}, { once: true });
|
799
|
+
} catch (error) {
|
800
|
+
console.error('Error showing log modal:', error);
|
801
|
+
document.getElementById('logError').classList.remove('d-none');
|
802
|
+
document.getElementById('logError').textContent = `Error showing log: ${error.message}`;
|
803
|
+
}
|
804
|
+
}
|
805
|
+
|
806
|
+
// Add event listeners for log buttons
|
807
|
+
document.addEventListener('DOMContentLoaded', function() {
|
808
|
+
document.addEventListener('click', function(event) {
|
809
|
+
if (event.target.closest('.log-btn')) {
|
810
|
+
try {
|
811
|
+
const button = event.target.closest('.log-btn');
|
812
|
+
const jobId = button.dataset.jobId;
|
813
|
+
const logContentEl = document.getElementById(`log-content-${jobId}`);
|
814
|
+
|
815
|
+
if (!logContentEl) {
|
816
|
+
throw new Error(`Log content element not found for job ${jobId}`);
|
817
|
+
}
|
818
|
+
|
819
|
+
const logContent = logContentEl.textContent;
|
820
|
+
showLogModal(jobId, logContent);
|
821
|
+
} catch (error) {
|
822
|
+
console.error('Error getting log content:', error);
|
823
|
+
showLogModal(jobId, `Error loading log: ${error.message}`);
|
824
|
+
}
|
825
|
+
}
|
826
|
+
});
|
827
|
+
});
|
828
|
+
</script>
|
732
829
|
</body>
|
733
830
|
|
734
831
|
</html>
|
sky/jobs/server/core.py
CHANGED
@@ -460,12 +460,17 @@ def cancel(name: Optional[str] = None,
|
|
460
460
|
|
461
461
|
@usage_lib.entrypoint
|
462
462
|
def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
463
|
-
controller: bool, refresh: bool) -> None:
|
463
|
+
controller: bool, refresh: bool) -> int:
|
464
464
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
465
465
|
"""Tail logs of managed jobs.
|
466
466
|
|
467
467
|
Please refer to sky.cli.job_logs for documentation.
|
468
468
|
|
469
|
+
Returns:
|
470
|
+
Exit code based on success or failure of the job. 0 if success,
|
471
|
+
100 if the job failed. See exceptions.JobExitCode for possible exit
|
472
|
+
codes.
|
473
|
+
|
469
474
|
Raises:
|
470
475
|
ValueError: invalid arguments.
|
471
476
|
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
|
@@ -494,11 +499,11 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
|
494
499
|
backend = backend_utils.get_backend_from_handle(handle)
|
495
500
|
assert isinstance(backend, backends.CloudVmRayBackend), backend
|
496
501
|
|
497
|
-
backend.tail_managed_job_logs(handle,
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
+
return backend.tail_managed_job_logs(handle,
|
503
|
+
job_id=job_id,
|
504
|
+
job_name=name,
|
505
|
+
follow=follow,
|
506
|
+
controller=controller)
|
502
507
|
|
503
508
|
|
504
509
|
def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:
|
sky/jobs/utils.py
CHANGED
@@ -511,8 +511,14 @@ def cancel_job_by_name(job_name: str) -> str:
|
|
511
511
|
return f'Job {job_name!r} is scheduled to be cancelled.'
|
512
512
|
|
513
513
|
|
514
|
-
def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
515
|
-
"""Stream logs by job id.
|
514
|
+
def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
|
515
|
+
"""Stream logs by job id.
|
516
|
+
|
517
|
+
Returns:
|
518
|
+
A tuple containing the log message and an exit code based on success or
|
519
|
+
failure of the job. 0 if success, 100 if the job failed.
|
520
|
+
See exceptions.JobExitCode for possible exit codes.
|
521
|
+
"""
|
516
522
|
|
517
523
|
def should_keep_logging(status: managed_job_state.ManagedJobStatus) -> bool:
|
518
524
|
# If we see CANCELLING, just exit - we could miss some job logs but the
|
@@ -547,13 +553,16 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
547
553
|
start_streaming = True
|
548
554
|
if start_streaming:
|
549
555
|
print(line, end='', flush=True)
|
550
|
-
return ''
|
556
|
+
return '', exceptions.JobExitCode.from_managed_job_status(
|
557
|
+
managed_job_status)
|
551
558
|
return (f'{colorama.Fore.YELLOW}'
|
552
559
|
f'Job {job_id} is already in terminal state '
|
553
560
|
f'{managed_job_status.value}. For more details, run: '
|
554
561
|
f'sky jobs logs --controller {job_id}'
|
555
562
|
f'{colorama.Style.RESET_ALL}'
|
556
|
-
f'{job_msg}'
|
563
|
+
f'{job_msg}',
|
564
|
+
exceptions.JobExitCode.from_managed_job_status(
|
565
|
+
managed_job_status))
|
557
566
|
backend = backends.CloudVmRayBackend()
|
558
567
|
task_id, managed_job_status = (
|
559
568
|
managed_job_state.get_latest_task_id_status(job_id))
|
@@ -604,11 +613,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
604
613
|
job_id=None,
|
605
614
|
managed_job_id=job_id,
|
606
615
|
follow=follow)
|
607
|
-
if returncode
|
608
|
-
# If the log tailing
|
609
|
-
#
|
610
|
-
#
|
611
|
-
#
|
616
|
+
if returncode in [rc.value for rc in exceptions.JobExitCode]:
|
617
|
+
# If the log tailing exits with a known exit code we can safely
|
618
|
+
# break the loop because it indicates the tailing process
|
619
|
+
# succeeded (even though the real job can be SUCCEEDED or
|
620
|
+
# FAILED). We use the status in job queue to show the
|
621
|
+
# information, as the ManagedJobStatus is not updated yet.
|
612
622
|
job_statuses = backend.get_job_status(handle, stream_logs=False)
|
613
623
|
job_status = list(job_statuses.values())[0]
|
614
624
|
assert job_status is not None, 'No job found.'
|
@@ -728,18 +738,25 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
728
738
|
logger.info(
|
729
739
|
ux_utils.finishing_message(f'Managed job finished: {job_id} '
|
730
740
|
f'(status: {managed_job_status.value}).'))
|
731
|
-
return ''
|
741
|
+
return '', exceptions.JobExitCode.from_managed_job_status(
|
742
|
+
managed_job_status)
|
732
743
|
|
733
744
|
|
734
745
|
def stream_logs(job_id: Optional[int],
|
735
746
|
job_name: Optional[str],
|
736
747
|
controller: bool = False,
|
737
|
-
follow: bool = True) -> str:
|
738
|
-
"""Stream logs by job id or job name.
|
748
|
+
follow: bool = True) -> Tuple[str, int]:
|
749
|
+
"""Stream logs by job id or job name.
|
750
|
+
|
751
|
+
Returns:
|
752
|
+
A tuple containing the log message and the exit code based on success
|
753
|
+
or failure of the job. 0 if success, 100 if the job failed.
|
754
|
+
See exceptions.JobExitCode for possible exit codes.
|
755
|
+
"""
|
739
756
|
if job_id is None and job_name is None:
|
740
757
|
job_id = managed_job_state.get_latest_job_id()
|
741
758
|
if job_id is None:
|
742
|
-
return 'No managed job found.'
|
759
|
+
return 'No managed job found.', exceptions.JobExitCode.NOT_FOUND
|
743
760
|
|
744
761
|
if controller:
|
745
762
|
if job_id is None:
|
@@ -754,7 +771,8 @@ def stream_logs(job_id: Optional[int],
|
|
754
771
|
if job['job_name'] == job_name
|
755
772
|
}
|
756
773
|
if not managed_job_ids:
|
757
|
-
return f'No managed job found with name {job_name!r}.'
|
774
|
+
return (f'No managed job found with name {job_name!r}.',
|
775
|
+
exceptions.JobExitCode.NOT_FOUND)
|
758
776
|
if len(managed_job_ids) > 1:
|
759
777
|
job_ids_str = ', '.join(
|
760
778
|
str(job_id) for job_id in managed_job_ids)
|
@@ -776,7 +794,7 @@ def stream_logs(job_id: Optional[int],
|
|
776
794
|
if not follow:
|
777
795
|
# Assume that the log file hasn't been written yet. Since we
|
778
796
|
# aren't following, just return.
|
779
|
-
return ''
|
797
|
+
return '', exceptions.JobExitCode.SUCCEEDED
|
780
798
|
|
781
799
|
job_status = managed_job_state.get_status(job_id)
|
782
800
|
if job_status is None:
|
@@ -787,7 +805,8 @@ def stream_logs(job_id: Optional[int],
|
|
787
805
|
# point, it never will be. This job may have been submitted
|
788
806
|
# using an old version that did not create the log file, so this
|
789
807
|
# is not considered an exceptional case.
|
790
|
-
return ''
|
808
|
+
return '', exceptions.JobExitCode.from_managed_job_status(
|
809
|
+
job_status)
|
791
810
|
|
792
811
|
time.sleep(log_lib.SKY_LOG_WAITING_GAP_SECONDS)
|
793
812
|
|
@@ -833,15 +852,17 @@ def stream_logs(job_id: Optional[int],
|
|
833
852
|
|
834
853
|
if follow:
|
835
854
|
return ux_utils.finishing_message(
|
836
|
-
f'Job finished (status: {job_status}).'
|
855
|
+
f'Job finished (status: {job_status}).'
|
856
|
+
), exceptions.JobExitCode.from_managed_job_status(job_status)
|
837
857
|
|
838
|
-
return ''
|
858
|
+
return '', exceptions.JobExitCode.SUCCEEDED
|
839
859
|
|
840
860
|
if job_id is None:
|
841
861
|
assert job_name is not None
|
842
862
|
job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
|
843
863
|
if not job_ids:
|
844
|
-
return f'No running managed job found with name {job_name!r}.'
|
864
|
+
return (f'No running managed job found with name {job_name!r}.',
|
865
|
+
exceptions.JobExitCode.NOT_FOUND)
|
845
866
|
if len(job_ids) > 1:
|
846
867
|
raise ValueError(
|
847
868
|
f'Multiple running jobs found with name {job_name!r}.')
|
@@ -1167,6 +1188,7 @@ class ManagedJobCodeGen:
|
|
1167
1188
|
>> codegen = ManagedJobCodeGen.show_jobs(...)
|
1168
1189
|
"""
|
1169
1190
|
_PREFIX = textwrap.dedent("""\
|
1191
|
+
import sys
|
1170
1192
|
from sky.jobs import utils
|
1171
1193
|
from sky.jobs import state as managed_job_state
|
1172
1194
|
from sky.jobs import constants as managed_job_constants
|
@@ -1222,9 +1244,17 @@ class ManagedJobCodeGen:
|
|
1222
1244
|
follow: bool = True,
|
1223
1245
|
controller: bool = False) -> str:
|
1224
1246
|
code = textwrap.dedent(f"""\
|
1225
|
-
|
1247
|
+
result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
|
1226
1248
|
follow={follow}, controller={controller})
|
1227
|
-
|
1249
|
+
if managed_job_version < 3:
|
1250
|
+
# Versions 2 and older did not return a retcode, so we just print
|
1251
|
+
# the result.
|
1252
|
+
# TODO: Remove compatibility before 0.12.0
|
1253
|
+
print(result, flush=True)
|
1254
|
+
else:
|
1255
|
+
msg, retcode = result
|
1256
|
+
print(msg, flush=True)
|
1257
|
+
sys.exit(retcode)
|
1228
1258
|
""")
|
1229
1259
|
return cls._build(code)
|
1230
1260
|
|
sky/server/constants.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
# API server version, whenever there is a change in API server that requires a
|
4
4
|
# restart of the local API server or error out when the client does not match
|
5
5
|
# the server version.
|
6
|
-
API_VERSION = '
|
6
|
+
API_VERSION = '3'
|
7
7
|
|
8
8
|
# Prefix for API request names.
|
9
9
|
REQUEST_NAME_PREFIX = 'sky.'
|
sky/server/requests/payloads.py
CHANGED
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
12
12
|
|
13
13
|
import pydantic
|
14
14
|
|
15
|
+
from sky import admin_policy
|
15
16
|
from sky import serve
|
16
17
|
from sky import sky_logging
|
17
18
|
from sky import skypilot_config
|
@@ -113,15 +114,9 @@ class CheckBody(RequestBody):
|
|
113
114
|
verbose: bool
|
114
115
|
|
115
116
|
|
116
|
-
class
|
117
|
-
"""
|
118
|
-
dag: str
|
119
|
-
|
120
|
-
|
121
|
-
class OptimizeBody(RequestBody):
|
122
|
-
"""The request body for the optimize endpoint."""
|
117
|
+
class DagRequestBody(RequestBody):
|
118
|
+
"""Request body base class for endpoints with a dag."""
|
123
119
|
dag: str
|
124
|
-
minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
|
125
120
|
|
126
121
|
def to_kwargs(self) -> Dict[str, Any]:
|
127
122
|
# Import here to avoid requirement of the whole SkyPilot dependency on
|
@@ -139,6 +134,19 @@ class OptimizeBody(RequestBody):
|
|
139
134
|
return kwargs
|
140
135
|
|
141
136
|
|
137
|
+
class ValidateBody(DagRequestBody):
|
138
|
+
"""The request body for the validate endpoint."""
|
139
|
+
dag: str
|
140
|
+
request_options: Optional[admin_policy.RequestOptions]
|
141
|
+
|
142
|
+
|
143
|
+
class OptimizeBody(DagRequestBody):
|
144
|
+
"""The request body for the optimize endpoint."""
|
145
|
+
dag: str
|
146
|
+
minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
|
147
|
+
request_options: Optional[admin_policy.RequestOptions]
|
148
|
+
|
149
|
+
|
142
150
|
class LaunchBody(RequestBody):
|
143
151
|
"""The request body for the launch endpoint."""
|
144
152
|
task: str
|
sky/server/server.py
CHANGED
@@ -27,7 +27,6 @@ from sky import core
|
|
27
27
|
from sky import exceptions
|
28
28
|
from sky import execution
|
29
29
|
from sky import global_user_state
|
30
|
-
from sky import optimizer
|
31
30
|
from sky import sky_logging
|
32
31
|
from sky.clouds import service_catalog
|
33
32
|
from sky.data import storage_utils
|
@@ -42,6 +41,7 @@ from sky.server.requests import payloads
|
|
42
41
|
from sky.server.requests import requests as requests_lib
|
43
42
|
from sky.skylet import constants
|
44
43
|
from sky.usage import usage_lib
|
44
|
+
from sky.utils import admin_policy_utils
|
45
45
|
from sky.utils import common as common_lib
|
46
46
|
from sky.utils import common_utils
|
47
47
|
from sky.utils import dag_utils
|
@@ -258,9 +258,22 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
258
258
|
"""Validates the user's DAG."""
|
259
259
|
# TODO(SKY-1035): validate if existing cluster satisfies the requested
|
260
260
|
# resources, e.g. sky exec --gpus V100:8 existing-cluster-with-no-gpus
|
261
|
+
|
262
|
+
# TODO: Our current launch process is split into three calls:
|
263
|
+
# validate, optimize, and launch. This requires us to apply the admin policy
|
264
|
+
# in each step, which may be an expensive operation. We should consolidate
|
265
|
+
# these into a single call or have a TTL cache for (task, admin_policy)
|
266
|
+
# pairs.
|
261
267
|
logger.debug(f'Validating tasks: {validate_body.dag}')
|
262
268
|
try:
|
263
269
|
dag = dag_utils.load_chain_dag_from_yaml_str(validate_body.dag)
|
270
|
+
# TODO: Admin policy may contain arbitrary code, which may be expensive
|
271
|
+
# to run and may block the server thread. However, moving it into the
|
272
|
+
# executor adds a ~150ms penalty on the local API server because of
|
273
|
+
# added RTTs. For now, we stick to doing the validation inline in the
|
274
|
+
# server thread.
|
275
|
+
dag, _ = admin_policy_utils.apply(
|
276
|
+
dag, request_options=validate_body.request_options)
|
264
277
|
for task in dag.tasks:
|
265
278
|
# Will validate workdir and file_mounts in the backend, as those
|
266
279
|
# need to be validated after the files are uploaded to the SkyPilot
|
@@ -283,7 +296,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
|
|
283
296
|
request_name='optimize',
|
284
297
|
request_body=optimize_body,
|
285
298
|
ignore_return_value=True,
|
286
|
-
func=
|
299
|
+
func=core.optimize,
|
287
300
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
288
301
|
)
|
289
302
|
|
sky/skylet/constants.py
CHANGED
@@ -93,7 +93,7 @@ SKYLET_VERSION = '12'
|
|
93
93
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
94
94
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
95
95
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
96
|
-
SKYLET_LIB_VERSION = 2
|
96
|
+
SKYLET_LIB_VERSION = 3
|
97
97
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
98
98
|
|
99
99
|
# `sky jobs dashboard`-related
|
sky/skylet/job_lib.py
CHANGED
@@ -938,7 +938,9 @@ class JobLibCodeGen:
|
|
938
938
|
_PREFIX = [
|
939
939
|
'import os',
|
940
940
|
'import getpass',
|
941
|
-
'
|
941
|
+
'import sys',
|
942
|
+
'from sky import exceptions',
|
943
|
+
'from sky.skylet import log_lib, job_lib, constants',
|
942
944
|
]
|
943
945
|
|
944
946
|
@classmethod
|
@@ -1033,6 +1035,13 @@ class JobLibCodeGen:
|
|
1033
1035
|
f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
|
1034
1036
|
f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
|
1035
1037
|
f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
|
1038
|
+
# After tailing, check the job status and exit with appropriate code
|
1039
|
+
'job_status = job_lib.get_status(job_id)',
|
1040
|
+
# Backward compatibility for returning exit code: Skylet versions 2
|
1041
|
+
# and older did not have JobExitCode, so we use 0 for those versions
|
1042
|
+
# TODO: Remove this special handling after 0.10.0.
|
1043
|
+
'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
|
1044
|
+
'sys.exit(exit_code)',
|
1036
1045
|
]
|
1037
1046
|
return cls._build(code)
|
1038
1047
|
|