skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20251001__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +43 -14
- sky/backends/cloud_vm_ray_backend.py +153 -38
- sky/check.py +0 -29
- sky/client/cli/command.py +48 -26
- sky/client/cli/table_utils.py +91 -0
- sky/client/sdk.py +14 -23
- sky/client/sdk_async.py +5 -5
- sky/core.py +18 -20
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-07349868f7905d37.js → [pool]-509b2977a6373bf6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-4f0c389a4ce5fd9c.js} +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -0
- sky/data/storage_utils.py +1 -45
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/client/sdk.py +3 -2
- sky/jobs/controller.py +15 -0
- sky/jobs/server/core.py +120 -28
- sky/jobs/server/server.py +1 -1
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +87 -8
- sky/provision/kubernetes/instance.py +1 -1
- sky/schemas/api/responses.py +73 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/common.py +2 -1
- sky/server/requests/serializers/decoders.py +10 -6
- sky/server/requests/serializers/encoders.py +13 -8
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/task.py +4 -0
- sky/utils/cluster_utils.py +23 -5
- sky/utils/command_runner.py +21 -5
- sky/utils/command_runner.pyi +11 -0
- sky/utils/volume.py +5 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/RECORD +70 -66
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/top_level.txt +0 -0
sky/server/requests/serializers/decoders.py
CHANGED

@@ -72,7 +72,7 @@ def decode_status_kubernetes(
                         List[Dict[str, Any]], Optional[str]]
 ) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
            List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
-           List[
+           List[responses.ManagedJobRecord], Optional[str]]:
     (encoded_all_clusters, encoded_unmanaged_clusters, all_jobs,
      context) = return_value
     all_clusters = []
@@ -85,6 +85,7 @@ def decode_status_kubernetes(
         cluster['status'] = status_lib.ClusterStatus(cluster['status'])
         unmanaged_clusters.append(
             kubernetes_utils.KubernetesSkyPilotClusterInfoPayload(**cluster))
+    all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
     return all_clusters, unmanaged_clusters, all_jobs, context


@@ -101,11 +102,11 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':


 @register_decoders('queue')
-def decode_queue(return_value: List[dict],) -> List[
+def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
     jobs = return_value
     for job in jobs:
         job['status'] = job_lib.JobStatus(job['status'])
-    return jobs
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]


 @register_decoders('jobs.queue')
@@ -115,7 +116,7 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:


 @register_decoders('jobs.queue_v2')
-def decode_jobs_queue_v2(return_value) -> List[
+def decode_jobs_queue_v2(return_value) -> List[responses.ManagedJobRecord]:
     """Decode jobs queue response.

     Supports legacy list, or a dict {jobs, total}.
@@ -129,6 +130,7 @@ def decode_jobs_queue_v2(return_value) -> List[Dict[str, Any]]:
         jobs = return_value
     for job in jobs:
         job['status'] = managed_jobs.ManagedJobStatus(job['status'])
+    jobs = [responses.ManagedJobRecord(**job) for job in jobs]
     return jobs


@@ -181,14 +183,16 @@ def decode_list_accelerators(

 @register_decoders('storage_ls')
 def decode_storage_ls(
-        return_value: List[Dict[str, Any]]) -> List[
+        return_value: List[Dict[str, Any]]) -> List[responses.StorageRecord]:
     for storage_info in return_value:
         storage_info['status'] = status_lib.StorageStatus(
             storage_info['status'])
         storage_info['store'] = [
             storage.StoreType(store) for store in storage_info['store']
         ]
-    return
+    return [
+        responses.StorageRecord(**storage_info) for storage_info in return_value
+    ]


 @register_decoders('job_status')
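The decoder changes above all follow one pattern: the raw dicts returned by the API server are still coerced to their enum types, but are then validated into typed pydantic response models instead of being passed through as plain dicts. A minimal sketch of that pattern, using a hypothetical `JobRecord` model rather than the real classes in `sky/schemas/api/responses.py` (whose field sets are not shown in this diff):

```python
import enum
from typing import Any, Dict, List

import pydantic


class JobStatus(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'


class JobRecord(pydantic.BaseModel):
    # Hypothetical stand-in for responses.ClusterJobRecord /
    # responses.ManagedJobRecord; the real fields live in
    # sky/schemas/api/responses.py.
    job_id: int
    status: JobStatus


def decode_queue(return_value: List[Dict[str, Any]]) -> List[JobRecord]:
    # Coerce the wire value into the enum, then validate into the model.
    for job in return_value:
        job['status'] = JobStatus(job['status'])
    return [JobRecord.model_validate(job) for job in return_value]


# Example: decode_queue([{'job_id': 1, 'status': 'RUNNING'}])
# -> [JobRecord(job_id=1, status=JobStatus.RUNNING)]
```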
sky/server/requests/serializers/encoders.py
CHANGED

@@ -92,10 +92,14 @@ def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:


 @register_encoder('queue')
-def encode_queue(
+def encode_queue(
+        jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
+    response = []
     for job in jobs:
-
-
+        response_job = job.model_dump()
+        response_job['status'] = job['status'].value
+        response.append(response_job)
+    return response


 @register_encoder('status_kubernetes')
@@ -103,7 +107,7 @@ def encode_status_kubernetes(
     return_value: Tuple[
         List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
         List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
-        List[
+        List[responses.ManagedJobRecord], Optional[str]]
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]],
            Optional[str]]:
     all_clusters, unmanaged_clusters, all_jobs, context = return_value
@@ -117,6 +121,7 @@ def encode_status_kubernetes(
         encoded_cluster = dataclasses.asdict(cluster)
         encoded_cluster['status'] = encoded_cluster['status'].value
         encoded_unmanaged_clusters.append(encoded_cluster)
+    all_jobs = [job.model_dump() for job in all_jobs]
     return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context


@@ -146,9 +151,9 @@ def encode_jobs_queue_v2(
     for job in jobs:
         job['status'] = job['status'].value
     if total is None:
-        return jobs
+        return [job.model_dump() for job in jobs]
     return {
-        'jobs': jobs,
+        'jobs': [job.model_dump() for job in jobs],
         'total': total,
         'total_no_filter': total_no_filter,
         'status_counts': status_counts
@@ -199,11 +204,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:

 @register_encoder('storage_ls')
 def encode_storage_ls(
-        return_value: List[
+        return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
     for storage_info in return_value:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return return_value
+    return [storage_info.model_dump() for storage_info in return_value]


 @register_encoder('job_status')
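On the encoding side, the same models are turned back into JSON-serializable dicts before crossing the wire: `model_dump()` produces the dict, and enum members are replaced with their `.value`. A small sketch of that inverse direction, continuing the hypothetical `JobRecord` model from the decoder example above:

```python
from typing import Any, Dict, List


def encode_queue(jobs: List['JobRecord']) -> List[Dict[str, Any]]:
    response = []
    for job in jobs:
        # model_dump() keeps the enum member by default; replace it with its
        # string value so the payload is JSON-serializable.
        response_job = job.model_dump()
        response_job['status'] = job.status.value
        response.append(response_job)
    return response
```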
sky/skylet/constants.py
CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '21'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/job_lib.py
CHANGED
@@ -24,7 +24,6 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
 from sky.utils.db import db_utils
@@ -612,8 +611,8 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.

     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
     """
     return message_utils.encode_payload(
         get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
@@ -941,35 +940,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'


-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-        jobs = get_job_queue()
-        print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG', 'GIT COMMIT'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-            job.get('metadata', {}).get('git_commit', '-'),
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.

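The removed `format_job_queue` is display-only logic; per the file listing above, a new `sky/client/cli/table_utils.py` is added in this release and the docstring now points at `table_utils.format_job_queue()`, so the formatting presumably moved client-side. A rough sketch of the same table-building pattern, using prettytable directly (the column set and exact behavior here are illustrative, not the actual table_utils code):

```python
import prettytable


def format_job_queue(jobs):
    """Render a job queue as a text table (illustrative columns only)."""
    table = prettytable.PrettyTable(['ID', 'NAME', 'STATUS', 'RESOURCES'])
    for job in jobs:
        table.add_row([
            job['job_id'],
            job['job_name'],
            job['status'],
            job['resources'],
        ])
    return table
```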
sky/skylet/log_lib.py
CHANGED
@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)

 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '

+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+

 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -563,3 +570,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                flushing. The actual amount of bytes (UTF-8 encoding)
+                could be more than this, depending on the characters,
+                i.e. ASCII characters take 1 byte, while others
+                may take 2-4 bytes. But this is fine as our default
+                chunk size is well below the default value of
+                grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out
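Taken together, the new `tail_logs_iter`, `LogBuffer`, and `buffered_iter_with_timeout` let skylet stream job logs as bounded chunks (e.g. into a gRPC streaming response) instead of writing to stdout. A minimal usage sketch, assuming a cluster job with id 1, a hypothetical `log_dir`, and a stand-in `send_chunk` transport callback:

```python
from sky.skylet import log_lib


def send_chunk(chunk: str) -> None:
    # Stand-in for the real transport, e.g. yielding into a gRPC response.
    print(chunk, end='')


buffer = log_lib.LogBuffer()  # flushes at DEFAULT_LOG_CHUNK_SIZE characters
lines = log_lib.tail_logs_iter(job_id=1,                      # assumes job 1 exists
                               log_dir='~/sky_logs/sky-cmd',  # hypothetical dir
                               follow=False)
try:
    # Chunks are emitted when the buffer fills, or when no new line arrives
    # within the timeout, so slow logs still reach the client periodically.
    for chunk in log_lib.buffered_iter_with_timeout(buffer, lines, timeout=1.0):
        send_chunk(chunk)
finally:
    buffer.close()
```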
sky/skylet/log_lib.pyi
CHANGED
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
 the return type based on the value of require_outputs.
 """
 import typing
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union

 from typing_extensions import Literal

@@ -143,3 +143,32 @@ def tail_logs(job_id: int,
               managed_job_id: Optional[int] = ...,
               follow: bool = ...) -> None:
     ...
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = ...,
+                   follow: bool = ...,
+                   tail: int = ...) -> Iterator[str]:
+    ...
+
+
+class LogBuffer:
+    max_chars: int
+
+    def __init__(self, max_chars: int = ...):
+        ...
+
+    def flush(self) -> str:
+        ...
+
+    def write(self, line: str) -> bool:
+        ...
+
+    def close(self):
+        ...
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    ...