skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251012__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +3 -2
- sky/client/cli/command.py +53 -4
- sky/client/sdk.py +11 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-4f7079dcab6ed653.js → [job]-e5c9ce6a24fc0de4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6a5ddd0184bfa22c.js → webpack-66f23594d38c7f16.js} +1 -1
- sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → yOfMelBaFp8uL5F9atyAK}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +54 -15
- sky/jobs/server/server.py +2 -2
- sky/provision/kubernetes/instance.py +2 -27
- sky/provision/kubernetes/utils.py +47 -6
- sky/serve/server/server.py +1 -1
- sky/server/constants.py +4 -0
- sky/server/requests/executor.py +36 -36
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +119 -2
- sky/server/server.py +19 -5
- sky/server/stream_utils.py +61 -26
- sky/utils/common_utils.py +6 -3
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/RECORD +43 -43
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → yOfMelBaFp8uL5F9atyAK}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py
CHANGED
@@ -292,6 +292,100 @@ class Request:
             raise


+def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
+    """Serialize the SkyPilot API request for display purposes.
+
+    This function should be called on the server side to serialize the
+    request body into human readable format, e.g., the entrypoint should
+    be a string, and the pid, error, or return value are not needed.
+
+    The returned value will then be displayed on the client side in request
+    table.
+
+    We do not use `encode` for display to avoid a large amount of data being
+    sent to the client side, especially for the request table could include
+    all the requests.
+    """
+    encoded_requests = []
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
+    for request in requests:
+        if request.request_body is not None:
+            assert isinstance(request.request_body,
+                              payloads.RequestBody), (request.name,
+                                                      request.request_body)
+        user_name = all_users_map.get(request.user_id)
+        payload = payloads.RequestPayload(
+            request_id=request.request_id,
+            name=request.name,
+            entrypoint=request.entrypoint.__name__
+            if request.entrypoint is not None else '',
+            request_body=request.request_body.model_dump_json()
+            if request.request_body is not None else json.dumps(None),
+            status=request.status.value,
+            return_value=json.dumps(None),
+            error=json.dumps(None),
+            pid=None,
+            created_at=request.created_at,
+            schedule_type=request.schedule_type.value,
+            user_id=request.user_id,
+            user_name=user_name,
+            cluster_name=request.cluster_name,
+            status_msg=request.status_msg,
+            should_retry=request.should_retry,
+            finished_at=request.finished_at,
+        )
+        encoded_requests.append(payload)
+    return encoded_requests
+
+
+def _update_request_row_fields(
+        row: Tuple[Any, ...],
+        fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
+    """Update the request row fields."""
+    if not fields:
+        return row
+
+    # Convert tuple to dictionary for easier manipulation
+    content = dict(zip(fields, row))
+
+    # Required fields in RequestPayload
+    if 'request_id' not in fields:
+        content['request_id'] = ''
+    if 'name' not in fields:
+        content['name'] = ''
+    if 'entrypoint' not in fields:
+        content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
+    if 'request_body' not in fields:
+        content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
+    if 'status' not in fields:
+        content['status'] = RequestStatus.PENDING.value
+    if 'created_at' not in fields:
+        content['created_at'] = 0
+    if 'user_id' not in fields:
+        content['user_id'] = ''
+    if 'return_value' not in fields:
+        content['return_value'] = json.dumps(None)
+    if 'error' not in fields:
+        content['error'] = json.dumps(None)
+    if 'schedule_type' not in fields:
+        content['schedule_type'] = ScheduleType.SHORT.value
+    # Optional fields in RequestPayload
+    if 'pid' not in fields:
+        content['pid'] = None
+    if 'cluster_name' not in fields:
+        content['cluster_name'] = None
+    if 'status_msg' not in fields:
+        content['status_msg'] = None
+    if 'should_retry' not in fields:
+        content['should_retry'] = False
+    if 'finished_at' not in fields:
+        content['finished_at'] = None
+
+    # Convert back to tuple in the same order as REQUEST_COLUMNS
+    return tuple(content[col] for col in REQUEST_COLUMNS)
+
+
 def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
     """Kill all pending and running requests for a cluster.

@@ -634,6 +728,7 @@ class RequestTaskFilter:
             Mutually exclusive with exclude_request_names.
         finished_before: if provided, only include requests finished before this
             timestamp.
+        limit: the number of requests to show. If None, show all requests.

    Raises:
        ValueError: If both exclude_request_names and include_request_names are
@@ -645,6 +740,8 @@ class RequestTaskFilter:
    exclude_request_names: Optional[List[str]] = None
    include_request_names: Optional[List[str]] = None
    finished_before: Optional[float] = None
+    limit: Optional[int] = None
+    fields: Optional[List[str]] = None

    def __post_init__(self):
        if (self.exclude_request_names is not None and
@@ -687,8 +784,13 @@ class RequestTaskFilter:
        if filter_str:
            filter_str = f' WHERE {filter_str}'
        columns_str = ', '.join(REQUEST_COLUMNS)
-        return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
-                'ORDER BY created_at DESC', filter_params)
+        if self.fields:
+            columns_str = ', '.join(self.fields)
+        query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
+                     'ORDER BY created_at DESC')
+        if self.limit is not None:
+            query_str += f' LIMIT {self.limit}'
+        return query_str, filter_params


 @init_db
@@ -722,6 +824,21 @@ async def get_request_tasks_async(
        return [Request.from_row(row) for row in rows]


+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_tasks_with_fields_async(
+    req_filter: RequestTaskFilter,
+    fields: Optional[List[str]] = None,
+) -> List[Request]:
+    """Async version of get_request_tasks."""
+    assert _DB is not None
+    async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
+        if not rows:
+            return []
+        rows = [_update_request_row_fields(row, fields) for row in rows]
+        return [Request.from_row(row) for row in rows]
+
+
 @init_db_async
 @metrics_lib.time_me_async
 async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
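
Net effect: request listings can now be served with a column projection and a row cap instead of a full-table read. A minimal usage sketch of the new server-side path (the names `RequestTaskFilter`, `get_request_tasks_with_fields_async`, and `encode_requests` come from the diff above; the wrapper function and the chosen field list are illustrative assumptions):

```python
# Illustrative sketch of the new path added above; not SkyPilot's own code.
from sky.server.requests import requests as requests_lib

async def list_active_requests_for_display():
    # Hypothetical field list: project only what the request table renders.
    fields = ['request_id', 'name', 'status', 'created_at', 'user_id']
    req_filter = requests_lib.RequestTaskFilter(
        status=[requests_lib.RequestStatus.PENDING,
                requests_lib.RequestStatus.RUNNING],
        limit=50,       # build_query() appends 'LIMIT 50'
        fields=fields)  # build_query() SELECTs only these columns
    request_tasks = await requests_lib.get_request_tasks_with_fields_async(
        req_filter=req_filter, fields=fields)
    # Columns not in `fields` were back-filled with defaults by
    # _update_request_row_fields, so encoding still works.
    return requests_lib.encode_requests(request_tasks)
```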
sky/server/server.py
CHANGED
@@ -1243,7 +1243,7 @@ async def logs(
     background_tasks.add_task(task.cancel)
     # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
     # the same approach as /stream.
-    return stream_utils.stream_response(
+    return stream_utils.stream_response_for_long_request(
         request_id=request.state.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
@@ -1539,6 +1539,7 @@ async def stream(
             'X-Accel-Buffering': 'no'
         })

+    polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
     # Original plain text streaming logic
     if request_id is not None:
         request_task = await requests_lib.get_request_async(request_id)
@@ -1553,6 +1554,8 @@ async def stream(
             raise fastapi.HTTPException(
                 status_code=404,
                 detail=f'Log of request {request_id!r} has been deleted')
+        if request_task.schedule_type == requests_lib.ScheduleType.LONG:
+            polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
     else:
         assert log_path is not None, (request_id, log_path)
         if log_path == constants.API_SERVER_LOGS:
@@ -1600,7 +1603,8 @@ async def stream(
             log_path_to_stream,
             plain_logs=format == 'plain',
             tail=tail,
-            follow=follow),
+            follow=follow,
+            polling_interval=polling_interval),
         media_type='text/plain',
         headers=headers,
     )
@@ -1625,6 +1629,10 @@ async def api_status(
         None, description='Request IDs to get status for.'),
     all_status: bool = fastapi.Query(
         False, description='Get finished requests as well.'),
+    limit: Optional[int] = fastapi.Query(
+        None, description='Number of requests to show.'),
+    fields: Optional[List[str]] = fastapi.Query(
+        None, description='Fields to get. If None, get all fields.'),
 ) -> List[payloads.RequestPayload]:
     """Gets the list of requests."""
     if request_ids is None:
@@ -1634,9 +1642,15 @@ async def api_status(
             requests_lib.RequestStatus.PENDING,
             requests_lib.RequestStatus.RUNNING,
         ]
-        request_tasks = await requests_lib.get_request_tasks_async(
-            req_filter=requests_lib.RequestTaskFilter(…
-        …
+        request_tasks = await requests_lib.get_request_tasks_with_fields_async(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=statuses,
+                limit=limit,
+                fields=fields,
+            ),
+            fields=fields,
+        )
+        return requests_lib.encode_requests(request_tasks)
     else:
         encoded_request_tasks = []
         for request_id in request_ids:
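
On the wire, the two new knobs surface as optional query parameters on the `api_status` handler. A hypothetical client call (the mount path `/api/status`, host, and port are assumptions about the deployment; only the `limit` and `fields` parameter names come from the diff):

```python
import requests

# Assumed local API server address and mount path; adjust to your deployment.
resp = requests.get('http://localhost:46580/api/status',
                    params={
                        'limit': 20,
                        # FastAPI parses repeated query keys into List[str].
                        'fields': ['request_id', 'name', 'status'],
                    })
resp.raise_for_status()
for payload in resp.json():
    print(payload['request_id'], payload['status'])
```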
sky/server/stream_utils.py
CHANGED
@@ -11,6 +11,7 @@ import fastapi
 from sky import global_user_state
 from sky import sky_logging
 from sky.server.requests import requests as requests_lib
+from sky.utils import common_utils
 from sky.utils import message_utils
 from sky.utils import rich_utils
 from sky.utils import status_lib
@@ -24,7 +25,9 @@ logger = sky_logging.init_logger(__name__)
 _BUFFER_SIZE = 8 * 1024  # 8KB
 _BUFFER_TIMEOUT = 0.02  # 20ms
 _HEARTBEAT_INTERVAL = 30
-…
+
+LONG_REQUEST_POLL_INTERVAL = 1
+DEFAULT_POLL_INTERVAL = 0.1


 async def _yield_log_file_with_payloads_skipped(
@@ -41,12 +44,14 @@ async def _yield_log_file_with_payloads_skipped(


 async def log_streamer(
-        request_id: Optional[str],
-        log_path: pathlib.Path,
-        plain_logs: bool = False,
-        tail: Optional[int] = None,
-        follow: bool = True,
-        cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
+        request_id: Optional[str],
+        log_path: pathlib.Path,
+        plain_logs: bool = False,
+        tail: Optional[int] = None,
+        follow: bool = True,
+        cluster_name: Optional[str] = None,
+        polling_interval: float = DEFAULT_POLL_INTERVAL
+) -> AsyncGenerator[str, None]:
     """Streams the logs of a request.

     Args:
@@ -84,6 +89,11 @@ async def log_streamer(
                            f'scheduled: {request_id}')
         req_status = request_task.status
         req_msg = request_task.status_msg
+        # Slowly back off the database polling up to every 1 second, to avoid
+        # overloading the CPU and DB.
+        backoff = common_utils.Backoff(initial_backoff=polling_interval,
+                                       max_backoff_factor=10,
+                                       multiplier=1.2)
         while req_status < requests_lib.RequestStatus.RUNNING:
             if req_msg is not None:
                 waiting_msg = request_task.status_msg
@@ -99,7 +109,7 @@ async def log_streamer(
             # TODO(aylei): we should use a better mechanism to avoid busy
             # polling the DB, which can be a bottleneck for high-concurrency
             # requests.
-            await asyncio.sleep(…
+            await asyncio.sleep(backoff.current_backoff())
             status_with_msg = await requests_lib.get_request_status_async(
                 request_id, include_msg=True)
             req_status = status_with_msg.status
@@ -111,17 +121,20 @@ async def log_streamer(

     async with aiofiles.open(log_path, 'rb') as f:
         async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
-                                          follow, cluster_name):
+                                          follow, cluster_name,
+                                          polling_interval):
             yield chunk


 async def _tail_log_file(
-        f: aiofiles.threadpool.binary.AsyncBufferedReader,
-        request_id: Optional[str] = None,
-        plain_logs: bool = False,
-        tail: Optional[int] = None,
-        follow: bool = True,
-        cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
+        f: aiofiles.threadpool.binary.AsyncBufferedReader,
+        request_id: Optional[str] = None,
+        plain_logs: bool = False,
+        tail: Optional[int] = None,
+        follow: bool = True,
+        cluster_name: Optional[str] = None,
+        polling_interval: float = DEFAULT_POLL_INTERVAL
+) -> AsyncGenerator[str, None]:
     """Tail the opened log file, buffer the lines and flush in chunks."""

     if tail is not None:
@@ -137,7 +150,7 @@ async def _tail_log_file(
             yield line_str

     last_heartbeat_time = asyncio.get_event_loop().time()
-    last_cluster_status_check_time = asyncio.get_event_loop().time()
+    last_status_check_time = asyncio.get_event_loop().time()

     # Buffer the lines in memory and flush them in chunks to improve log
     # tailing throughput.
@@ -167,7 +180,17 @@ async def _tail_log_file(

         line: Optional[bytes] = await f.readline()
         if not line:
-            if request_id is not None:
+            # Avoid checking the status too frequently to avoid overloading the
+            # DB.
+            should_check_status = (current_time -
+                                   last_status_check_time) >= polling_interval
+            if not follow:
+                # We will only hit this path once, but we should make sure to
+                # check the status so that we display the final request status
+                # if the request is complete.
+                should_check_status = True
+            if request_id is not None and should_check_status:
+                last_status_check_time = current_time
                 req_status = await requests_lib.get_request_status_async(
                     request_id)
                 if req_status.status > requests_lib.RequestStatus.RUNNING:
@@ -185,20 +208,19 @@ async def _tail_log_file(
                             ' cancelled\n')
                     break
             if not follow:
+                # The below checks (cluster status, heartbeat) are not needed
+                # for non-follow logs.
                 break
             # Provision logs pass in cluster_name, check cluster status
-            # periodically to see if provisioning is done.
-            check_status = (current_time -
-                            last_cluster_status_check_time
-                            ) >= _CLUSTER_STATUS_INTERVAL
-            if cluster_name is not None and check_status:
+            # periodically to see if provisioning is done.
+            if cluster_name is not None and should_check_status:
+                last_status_check_time = current_time
                 cluster_record = await (
                     global_user_state.get_status_from_cluster_name_async(
                         cluster_name))
                 if (cluster_record is None or
                         cluster_record != status_lib.ClusterStatus.INIT):
                     break
-                last_cluster_status_check_time = current_time
             if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
                 # Currently just used to keep the connection busy, refer to
                 # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -234,9 +256,22 @@ async def _tail_log_file(
             yield chunk


+def stream_response_for_long_request(
+    request_id: str,
+    logs_path: pathlib.Path,
+    background_tasks: fastapi.BackgroundTasks,
+) -> fastapi.responses.StreamingResponse:
+    return stream_response(request_id,
+                           logs_path,
+                           background_tasks,
+                           polling_interval=LONG_REQUEST_POLL_INTERVAL)
+
+
 def stream_response(
-        request_id: str,
-        …
+        request_id: str,
+        logs_path: pathlib.Path,
+        background_tasks: fastapi.BackgroundTasks,
+        polling_interval: float = DEFAULT_POLL_INTERVAL
 ) -> fastapi.responses.StreamingResponse:

     async def on_disconnect():
@@ -249,7 +284,7 @@ def stream_response(
     background_tasks.add_task(on_disconnect)

     return fastapi.responses.StreamingResponse(
-        log_streamer(request_id, logs_path),
+        log_streamer(request_id, logs_path, polling_interval=polling_interval),
         media_type='text/plain',
         headers={
             'Cache-Control': 'no-cache, no-transform',
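
The streaming changes boil down to one pattern: replace a fixed-interval database poll with a jittered, capped exponential backoff. A self-contained sketch of that pattern, independent of SkyPilot's internals (`is_running` is a stand-in predicate):

```python
import asyncio
import random

async def wait_until(is_running, initial=0.1, multiplier=1.2, cap=1.0):
    """Poll an async predicate, backing off from `initial` toward `cap`.

    Mirrors the log_streamer wait loop above: start at a fast,
    DEFAULT_POLL_INTERVAL-style rate, then grow ~1.2x per miss so idle
    streams stop hammering the database.
    """
    delay = initial
    while not await is_running():
        # Jitter de-synchronizes many concurrent pollers.
        jitter = random.uniform(-0.4 * delay, 0.4 * delay)
        await asyncio.sleep(max(0.0, delay + jitter))
        delay = min(delay * multiplier, cap)
```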
sky/utils/common_utils.py
CHANGED
@@ -265,13 +265,16 @@ def get_global_job_id(job_timestamp: str,

 class Backoff:
     """Exponential backoff with jittering."""
-    MULTIPLIER = 1.6
     JITTER = 0.4

-    def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
+    def __init__(self,
+                 initial_backoff: float = 5,
+                 max_backoff_factor: int = 5,
+                 multiplier: float = 1.6):
         self._initial = True
         self._backoff = 0.0
         self._initial_backoff = initial_backoff
+        self._multiplier = multiplier
         self._max_backoff = max_backoff_factor * self._initial_backoff

         # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -283,7 +286,7 @@ class Backoff:
             self._initial = False
             self._backoff = min(self._initial_backoff, self._max_backoff)
         else:
-            self._backoff = min(self._backoff * self.MULTIPLIER,
+            self._backoff = min(self._backoff * self._multiplier,
                                 self._max_backoff)
         self._backoff += random.uniform(-self.JITTER * self._backoff,
                                         self.JITTER * self._backoff)
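
With `multiplier` promoted from a class constant to a constructor argument, callers can tune the growth rate per call site; the stream code above uses `Backoff(initial_backoff=polling_interval, max_backoff_factor=10, multiplier=1.2)`. A small usage sketch based on the signature in this diff (printed values vary with jitter):

```python
from sky.utils import common_utils

# 100ms initial delay, growing ~1.2x per call, capped at
# max_backoff_factor * initial_backoff = 10 * 0.1s = 1s, plus jitter.
backoff = common_utils.Backoff(initial_backoff=0.1,
                               max_backoff_factor=10,
                               multiplier=1.2)
for _ in range(5):
    print(round(backoff.current_backoff(), 3))  # ~0.1, ~0.12, ~0.144, ...
```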
{skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20251009
+Version: 1.0.0.dev20251012
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -155,51 +155,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
 Requires-Dist: aiosqlite; extra == "server"
 Requires-Dist: greenlet; extra == "server"
 Provides-Extra: all
+Requires-Dist: anyio; extra == "all"
 Requires-Dist: nebius>=0.2.47; extra == "all"
-Requires-Dist: …
+Requires-Dist: ecsapi>=0.2.0; extra == "all"
+Requires-Dist: ibm-cos-sdk; extra == "all"
+Requires-Dist: python-dateutil; extra == "all"
 Requires-Dist: azure-core>=1.31.0; extra == "all"
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
+Requires-Dist: aiosqlite; extra == "all"
+Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
+Requires-Dist: cudo-compute>=0.1.10; extra == "all"
+Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: casbin; extra == "all"
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: awscli>=1.27.10; extra == "all"
+Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
+Requires-Dist: boto3>=1.26.1; extra == "all"
 Requires-Dist: sqlalchemy_adapter; extra == "all"
-Requires-Dist: …
-Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
-Requires-Dist: ibm-vpc; extra == "all"
-Requires-Dist: ray[default]>=2.6.1; extra == "all"
-Requires-Dist: anyio; extra == "all"
-Requires-Dist: runpod>=1.6.1; extra == "all"
-Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
+Requires-Dist: passlib; extra == "all"
 Requires-Dist: greenlet; extra == "all"
-Requires-Dist: …
-Requires-Dist: …
+Requires-Dist: msrestazure; extra == "all"
+Requires-Dist: colorama<0.4.5; extra == "all"
 Requires-Dist: azure-common; extra == "all"
-Requires-Dist: …
+Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
+Requires-Dist: websockets; extra == "all"
 Requires-Dist: tomli; python_version < "3.11" and extra == "all"
-Requires-Dist: …
-Requires-Dist: botocore>=1.29.10; extra == "all"
-Requires-Dist: azure-core>=1.24.0; extra == "all"
-Requires-Dist: colorama<0.4.5; extra == "all"
-Requires-Dist: passlib; extra == "all"
+Requires-Dist: ray[default]>=2.6.1; extra == "all"
 Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
-Requires-Dist: …
+Requires-Dist: google-cloud-storage; extra == "all"
+Requires-Dist: docker; extra == "all"
+Requires-Dist: grpcio>=1.63.0; extra == "all"
+Requires-Dist: msgraph-sdk; extra == "all"
+Requires-Dist: ibm-vpc; extra == "all"
 Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
-Requires-Dist: …
-Requires-Dist: aiohttp; extra == "all"
-Requires-Dist: cudo-compute>=0.1.10; extra == "all"
+Requires-Dist: pyjwt; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
-Requires-Dist: …
-Requires-Dist: pydo>=0.3.0; extra == "all"
+Requires-Dist: botocore>=1.29.10; extra == "all"
 Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
+Requires-Dist: oci; extra == "all"
+Requires-Dist: awscli>=1.27.10; extra == "all"
+Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
+Requires-Dist: azure-core>=1.24.0; extra == "all"
 Requires-Dist: azure-identity>=1.19.0; extra == "all"
+Requires-Dist: runpod>=1.6.1; extra == "all"
+Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
+Requires-Dist: ibm-cloud-sdk-core; extra == "all"
 Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
-Requires-Dist: …
-Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
-Requires-Dist: grpcio>=1.63.0; extra == "all"
-Requires-Dist: oci; extra == "all"
+Requires-Dist: aiohttp; extra == "all"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -249,10 +249,11 @@ Dynamic: summary
 ----

 :fire: *News* :fire:
+- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./llm/torchtitan/)
+- [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)
+- [Sep 2025] Network and Storage Benchmarks for LLM training on the cloud: [**blog**](https://maknee.github.io/blog/2025/Network-And-Storage-Training-Skypilot/)
 - [Aug 2025] Serve and finetune **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**serve**](./llm/gpt-oss/) + [**LoRA and full finetuning**](./llm/gpt-oss-finetuning/)
-- [Jul 2025] Run large-scale **LLM training with TorchTitan** on any cloud: [**example**](./llm/torchtitan/)
 - [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
-- [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
 - [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
 - [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
 - [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)