skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +1 -1
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +11 -13
- sky/backends/cloud_vm_ray_backend.py +11 -22
- sky/backends/local_docker_backend.py +3 -8
- sky/client/cli/command.py +41 -9
- sky/client/sdk.py +23 -8
- sky/client/sdk_async.py +6 -2
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +24 -12
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/instance.py +55 -11
- sky/provision/kubernetes/utils.py +2 -2
- sky/provision/nebius/utils.py +36 -2
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +2 -2
- sky/serve/server/impl.py +3 -2
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +3 -2
- sky/server/daemons.py +10 -5
- sky/server/requests/executor.py +2 -1
- sky/server/requests/requests.py +21 -0
- sky/server/server.py +16 -0
- sky/skylet/events.py +2 -3
- sky/skypilot_config.py +10 -10
- sky/task.py +1 -1
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/utils/common_utils.py +0 -72
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/schemas.py +3 -0
- sky/utils/yaml_utils.py +77 -10
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +66 -66
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Kubernetes instance provisioning."""
|
|
2
2
|
import copy
|
|
3
|
+
import datetime
|
|
3
4
|
import json
|
|
4
5
|
import re
|
|
5
6
|
import time
|
|
@@ -1254,9 +1255,11 @@ def get_cluster_info(
|
|
|
1254
1255
|
provider_config=provider_config)
|
|
1255
1256
|
|
|
1256
1257
|
|
|
1257
|
-
def _get_pod_termination_reason(pod: Any) -> str:
|
|
1258
|
+
def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
|
|
1259
|
+
"""Get pod termination reason and write to cluster events."""
|
|
1258
1260
|
reasons = []
|
|
1259
|
-
|
|
1261
|
+
latest_timestamp = pod.status.start_time or datetime.datetime.min
|
|
1262
|
+
if pod.status and pod.status.container_statuses:
|
|
1260
1263
|
for container_status in pod.status.container_statuses:
|
|
1261
1264
|
terminated = container_status.state.terminated
|
|
1262
1265
|
if terminated:
|
|
@@ -1264,20 +1267,38 @@ def _get_pod_termination_reason(pod: Any) -> str:
|
|
|
1264
1267
|
reason = terminated.reason
|
|
1265
1268
|
if exit_code == 0:
|
|
1266
1269
|
# skip exit 0 (non-failed) just for sanity
|
|
1270
|
+
logger.debug(f'{pod.metadata.name}/{container_status.name} '
|
|
1271
|
+
'had exit code 0. Skipping.')
|
|
1267
1272
|
continue
|
|
1268
1273
|
if reason is None:
|
|
1269
1274
|
# just in-case reason is None, have default for debugging
|
|
1270
1275
|
reason = f'exit({exit_code})'
|
|
1271
1276
|
reasons.append(reason)
|
|
1277
|
+
if terminated.finished_at > latest_timestamp:
|
|
1278
|
+
latest_timestamp = terminated.finished_at
|
|
1279
|
+
|
|
1272
1280
|
# TODO (kyuds): later, if needed, query `last_state` too.
|
|
1273
1281
|
|
|
1282
|
+
if not reasons:
|
|
1283
|
+
return ''
|
|
1284
|
+
|
|
1274
1285
|
# Normally we will have a single container per pod for skypilot
|
|
1275
1286
|
# but doing this just in-case there are multiple containers.
|
|
1276
|
-
|
|
1287
|
+
pod_reason = ' | '.join(reasons)
|
|
1288
|
+
|
|
1289
|
+
global_user_state.add_cluster_event(
|
|
1290
|
+
cluster_name,
|
|
1291
|
+
None,
|
|
1292
|
+
f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
|
|
1293
|
+
global_user_state.ClusterEventType.DEBUG,
|
|
1294
|
+
transitioned_at=int(latest_timestamp.timestamp()),
|
|
1295
|
+
)
|
|
1296
|
+
return pod_reason
|
|
1277
1297
|
|
|
1278
1298
|
|
|
1279
1299
|
def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
1280
1300
|
cluster_name: str, pod_name: str) -> Optional[str]:
|
|
1301
|
+
"""Get events for missing pod and write to cluster events."""
|
|
1281
1302
|
logger.debug(f'Analyzing events for pod {pod_name}')
|
|
1282
1303
|
pod_field_selector = (
|
|
1283
1304
|
f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
|
|
@@ -1293,6 +1314,8 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
|
1293
1314
|
last_scheduled_node = None
|
|
1294
1315
|
insert_new_pod_event = True
|
|
1295
1316
|
new_event_inserted = False
|
|
1317
|
+
inserted_pod_events = 0
|
|
1318
|
+
|
|
1296
1319
|
for event in pod_events:
|
|
1297
1320
|
if event.reason == 'Scheduled':
|
|
1298
1321
|
pattern = r'Successfully assigned (\S+) to (\S+)'
|
|
@@ -1313,10 +1336,18 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
|
1313
1336
|
transitioned_at=int(
|
|
1314
1337
|
event.metadata.creation_timestamp.timestamp()),
|
|
1315
1338
|
expose_duplicate_error=True)
|
|
1339
|
+
logger.debug(f'[pod {pod_name}] encountered new pod event: '
|
|
1340
|
+
f'{event.metadata.creation_timestamp} '
|
|
1341
|
+
f'{event.reason} {event.message}')
|
|
1316
1342
|
except db_utils.UniqueConstraintViolationError:
|
|
1317
1343
|
insert_new_pod_event = False
|
|
1318
1344
|
else:
|
|
1319
1345
|
new_event_inserted = True
|
|
1346
|
+
inserted_pod_events += 1
|
|
1347
|
+
|
|
1348
|
+
logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
|
|
1349
|
+
f'inserted {inserted_pod_events} new pod events '
|
|
1350
|
+
'previously unseen')
|
|
1320
1351
|
|
|
1321
1352
|
if last_scheduled_node is not None:
|
|
1322
1353
|
node_field_selector = ('involvedObject.kind=Node,'
|
|
@@ -1331,6 +1362,7 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
|
1331
1362
|
# latest event appears first
|
|
1332
1363
|
reverse=True)
|
|
1333
1364
|
insert_new_node_event = True
|
|
1365
|
+
inserted_node_events = 0
|
|
1334
1366
|
for event in node_events:
|
|
1335
1367
|
if insert_new_node_event:
|
|
1336
1368
|
# Try inserting the latest events first. If the event is a
|
|
@@ -1345,10 +1377,23 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
|
1345
1377
|
transitioned_at=int(
|
|
1346
1378
|
event.metadata.creation_timestamp.timestamp()),
|
|
1347
1379
|
expose_duplicate_error=True)
|
|
1380
|
+
logger.debug(
|
|
1381
|
+
f'[pod {pod_name}] encountered new node event: '
|
|
1382
|
+
f'{event.metadata.creation_timestamp} '
|
|
1383
|
+
f'{event.reason} {event.message}')
|
|
1348
1384
|
except db_utils.UniqueConstraintViolationError:
|
|
1349
1385
|
insert_new_node_event = False
|
|
1350
1386
|
else:
|
|
1351
1387
|
new_event_inserted = True
|
|
1388
|
+
inserted_node_events += 1
|
|
1389
|
+
|
|
1390
|
+
logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
|
|
1391
|
+
f'processed {len(node_events)} node events and '
|
|
1392
|
+
f'inserted {inserted_node_events} new node events '
|
|
1393
|
+
'previously unseen')
|
|
1394
|
+
else:
|
|
1395
|
+
logger.debug(f'[pod {pod_name}] could not determine the node '
|
|
1396
|
+
'the pod was scheduled to')
|
|
1352
1397
|
|
|
1353
1398
|
if not new_event_inserted:
|
|
1354
1399
|
# If new event is not inserted, there is no useful information to
|
|
@@ -1390,13 +1435,15 @@ def query_instances(
|
|
|
1390
1435
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
1391
1436
|
non_terminated_only: bool = True
|
|
1392
1437
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
1438
|
+
# Mapping from pod phase to skypilot status. These are the only valid pod
|
|
1439
|
+
# phases.
|
|
1440
|
+
# https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
|
|
1393
1441
|
status_map = {
|
|
1394
1442
|
'Pending': status_lib.ClusterStatus.INIT,
|
|
1395
1443
|
'Running': status_lib.ClusterStatus.UP,
|
|
1396
1444
|
'Failed': status_lib.ClusterStatus.INIT,
|
|
1397
1445
|
'Unknown': None,
|
|
1398
1446
|
'Succeeded': None,
|
|
1399
|
-
'Terminating': None,
|
|
1400
1447
|
}
|
|
1401
1448
|
|
|
1402
1449
|
assert provider_config is not None
|
|
@@ -1440,18 +1487,15 @@ def query_instances(
|
|
|
1440
1487
|
for pod in pods:
|
|
1441
1488
|
phase = pod.status.phase
|
|
1442
1489
|
pod_status = status_map[phase]
|
|
1490
|
+
reason = None
|
|
1491
|
+
if phase in ('Failed', 'Unknown'):
|
|
1492
|
+
reason = _get_pod_termination_reason(pod, cluster_name)
|
|
1493
|
+
logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
|
|
1443
1494
|
if non_terminated_only and pod_status is None:
|
|
1444
1495
|
logger.debug(f'Pod {pod.metadata.name} is terminated, but '
|
|
1445
1496
|
'query_instances is called with '
|
|
1446
1497
|
f'non_terminated_only=True. Phase: {phase}')
|
|
1447
|
-
if phase == 'Failed':
|
|
1448
|
-
reason_for_debug = _get_pod_termination_reason(pod)
|
|
1449
|
-
logger.debug(f'Termination reason: {reason_for_debug}')
|
|
1450
1498
|
continue
|
|
1451
|
-
reason = None
|
|
1452
|
-
if phase == 'Failed':
|
|
1453
|
-
reason = _get_pod_termination_reason(pod)
|
|
1454
|
-
logger.debug(f'Pod Status Reason(s): {reason}')
|
|
1455
1499
|
pod_name = pod.metadata.name
|
|
1456
1500
|
reason = f'{pod_name}: {reason}' if reason is not None else None
|
|
1457
1501
|
cluster_status[pod_name] = (pod_status, reason)
|
|
@@ -2782,7 +2782,7 @@ def combine_pod_config_fields(
|
|
|
2782
2782
|
kubernetes_config)
|
|
2783
2783
|
|
|
2784
2784
|
# Write the updated YAML back to the file
|
|
2785
|
-
|
|
2785
|
+
yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2786
2786
|
|
|
2787
2787
|
|
|
2788
2788
|
def combine_metadata_fields(cluster_yaml_path: str,
|
|
@@ -2834,7 +2834,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2834
2834
|
config_utils.merge_k8s_configs(destination, custom_metadata)
|
|
2835
2835
|
|
|
2836
2836
|
# Write the updated YAML back to the file
|
|
2837
|
-
|
|
2837
|
+
yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2838
2838
|
|
|
2839
2839
|
|
|
2840
2840
|
def merge_custom_metadata(
|
sky/provision/nebius/utils.py
CHANGED
|
@@ -14,6 +14,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
14
14
|
|
|
15
15
|
POLL_INTERVAL = 5
|
|
16
16
|
|
|
17
|
+
_MAX_OPERATIONS_TO_FETCH = 1000
|
|
18
|
+
|
|
17
19
|
|
|
18
20
|
def retry(func):
|
|
19
21
|
"""Decorator to retry a function."""
|
|
@@ -321,11 +323,43 @@ def launch(cluster_name_on_cloud: str,
|
|
|
321
323
|
parent_id=project_id,
|
|
322
324
|
name=instance_name,
|
|
323
325
|
)))
|
|
326
|
+
instance_id = instance.metadata.id
|
|
324
327
|
if instance.status.state.name == 'STARTING':
|
|
325
|
-
instance_id = instance.metadata.id
|
|
326
328
|
break
|
|
329
|
+
|
|
330
|
+
# All Instances initially have state=STOPPED and reconciling=True,
|
|
331
|
+
# so we need to wait until reconciling is False.
|
|
332
|
+
if instance.status.state.name == 'STOPPED' and \
|
|
333
|
+
not instance.status.reconciling:
|
|
334
|
+
next_token = ''
|
|
335
|
+
total_operations = 0
|
|
336
|
+
while True:
|
|
337
|
+
operations_response = nebius.sync_call(
|
|
338
|
+
service.list_operations_by_parent(
|
|
339
|
+
nebius.compute().ListOperationsByParentRequest(
|
|
340
|
+
parent_id=project_id,
|
|
341
|
+
page_size=100,
|
|
342
|
+
page_token=next_token,
|
|
343
|
+
)))
|
|
344
|
+
total_operations += len(operations_response.operations)
|
|
345
|
+
for operation in operations_response.operations:
|
|
346
|
+
# Find the most recent operation for the instance.
|
|
347
|
+
if operation.resource_id == instance_id:
|
|
348
|
+
error_msg = operation.description
|
|
349
|
+
if operation.status:
|
|
350
|
+
error_msg += f' {operation.status.message}'
|
|
351
|
+
raise RuntimeError(error_msg)
|
|
352
|
+
# If we've fetched too many operations, or there are no more
|
|
353
|
+
# operations to fetch, just raise a generic error.
|
|
354
|
+
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
|
|
355
|
+
not operations_response.next_page_token:
|
|
356
|
+
raise RuntimeError(
|
|
357
|
+
f'Instance {instance_name} failed to start.')
|
|
358
|
+
next_token = operations_response.next_page_token
|
|
327
359
|
time.sleep(POLL_INTERVAL)
|
|
328
|
-
logger.debug(f'Waiting for instance {instance_name} start running.'
|
|
360
|
+
logger.debug(f'Waiting for instance {instance_name} to start running. '
|
|
361
|
+
f'State: {instance.status.state.name}, '
|
|
362
|
+
f'Reconciling: {instance.status.reconciling}')
|
|
329
363
|
retry_count += 1
|
|
330
364
|
|
|
331
365
|
if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
sky/serve/client/impl.py
CHANGED
|
@@ -224,10 +224,11 @@ def tail_logs(service_name: str,
|
|
|
224
224
|
stream=True)
|
|
225
225
|
request_id: server_common.RequestId[None] = server_common.get_request_id(
|
|
226
226
|
response)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
227
|
+
sdk.stream_response(request_id=request_id,
|
|
228
|
+
response=response,
|
|
229
|
+
output_stream=output_stream,
|
|
230
|
+
resumable=True,
|
|
231
|
+
get_result=follow)
|
|
231
232
|
|
|
232
233
|
|
|
233
234
|
def sync_down_logs(service_name: str,
|
sky/serve/replica_managers.py
CHANGED
|
@@ -37,6 +37,7 @@ from sky.utils import env_options
|
|
|
37
37
|
from sky.utils import resources_utils
|
|
38
38
|
from sky.utils import status_lib
|
|
39
39
|
from sky.utils import ux_utils
|
|
40
|
+
from sky.utils import yaml_utils
|
|
40
41
|
|
|
41
42
|
if typing.TYPE_CHECKING:
|
|
42
43
|
from sky.serve import service_spec
|
|
@@ -79,7 +80,7 @@ def launch_cluster(replica_id: int,
|
|
|
79
80
|
f'{cluster_name} with resources override: '
|
|
80
81
|
f'{resources_override}')
|
|
81
82
|
try:
|
|
82
|
-
config =
|
|
83
|
+
config = yaml_utils.read_yaml(
|
|
83
84
|
os.path.expanduser(service_task_yaml_path))
|
|
84
85
|
task = task_lib.Task.from_yaml_config(config)
|
|
85
86
|
if resources_override is not None:
|
|
@@ -1397,7 +1398,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1397
1398
|
# the latest version. This can significantly improve the speed
|
|
1398
1399
|
# for updating an existing service with only config changes to the
|
|
1399
1400
|
# service specs, e.g. scale down the service.
|
|
1400
|
-
new_config =
|
|
1401
|
+
new_config = yaml_utils.read_yaml(
|
|
1401
1402
|
os.path.expanduser(service_task_yaml_path))
|
|
1402
1403
|
# Always create new replicas and scale down old ones when file_mounts
|
|
1403
1404
|
# are not empty.
|
|
@@ -1414,7 +1415,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1414
1415
|
old_service_task_yaml_path = (
|
|
1415
1416
|
serve_utils.generate_task_yaml_file_name(
|
|
1416
1417
|
self._service_name, info.version))
|
|
1417
|
-
old_config =
|
|
1418
|
+
old_config = yaml_utils.read_yaml(
|
|
1418
1419
|
os.path.expanduser(old_service_task_yaml_path))
|
|
1419
1420
|
for key in ['service', 'pool', '_user_specified_yaml']:
|
|
1420
1421
|
old_config.pop(key, None)
|
sky/serve/serve_utils.py
CHANGED
|
@@ -699,7 +699,7 @@ def _get_service_status(
|
|
|
699
699
|
if record['pool']:
|
|
700
700
|
latest_yaml_path = generate_task_yaml_file_name(service_name,
|
|
701
701
|
record['version'])
|
|
702
|
-
raw_yaml_config =
|
|
702
|
+
raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
|
|
703
703
|
original_config = raw_yaml_config.get('_user_specified_yaml')
|
|
704
704
|
if original_config is None:
|
|
705
705
|
# Fall back to old display format.
|
|
@@ -711,7 +711,7 @@ def _get_service_status(
|
|
|
711
711
|
original_config['pool'] = svc # Add pool to root config
|
|
712
712
|
else:
|
|
713
713
|
original_config = yaml_utils.safe_load(original_config)
|
|
714
|
-
record['pool_yaml'] =
|
|
714
|
+
record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
|
|
715
715
|
|
|
716
716
|
record['target_num_replicas'] = 0
|
|
717
717
|
try:
|
sky/serve/server/impl.py
CHANGED
|
@@ -34,6 +34,7 @@ from sky.utils import dag_utils
|
|
|
34
34
|
from sky.utils import rich_utils
|
|
35
35
|
from sky.utils import subprocess_utils
|
|
36
36
|
from sky.utils import ux_utils
|
|
37
|
+
from sky.utils import yaml_utils
|
|
37
38
|
|
|
38
39
|
logger = sky_logging.init_logger(__name__)
|
|
39
40
|
|
|
@@ -179,7 +180,7 @@ def up(
|
|
|
179
180
|
controller = controller_utils.get_controller_for_pool(pool)
|
|
180
181
|
controller_name = controller.value.cluster_name
|
|
181
182
|
task_config = task.to_yaml_config()
|
|
182
|
-
|
|
183
|
+
yaml_utils.dump_yaml(service_file.name, task_config)
|
|
183
184
|
remote_tmp_task_yaml_path = (
|
|
184
185
|
serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
|
|
185
186
|
remote_config_yaml_path = (
|
|
@@ -531,7 +532,7 @@ def update(
|
|
|
531
532
|
prefix=f'{service_name}-v{current_version}',
|
|
532
533
|
mode='w') as service_file:
|
|
533
534
|
task_config = task.to_yaml_config()
|
|
534
|
-
|
|
535
|
+
yaml_utils.dump_yaml(service_file.name, task_config)
|
|
535
536
|
remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
|
|
536
537
|
service_name, current_version, expand_user=False)
|
|
537
538
|
|
sky/server/auth/oauth2_proxy.py
CHANGED
|
@@ -4,6 +4,7 @@ import asyncio
|
|
|
4
4
|
import hashlib
|
|
5
5
|
import http
|
|
6
6
|
import os
|
|
7
|
+
import traceback
|
|
7
8
|
from typing import Optional
|
|
8
9
|
import urllib
|
|
9
10
|
|
|
@@ -109,8 +110,8 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
109
110
|
try:
|
|
110
111
|
return await self._authenticate(request, call_next, session)
|
|
111
112
|
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
|
112
|
-
logger.error(f'Error communicating with OAuth2 proxy: {e}'
|
|
113
|
-
|
|
113
|
+
logger.error(f'Error communicating with OAuth2 proxy: {e}'
|
|
114
|
+
f'{traceback.format_exc()}')
|
|
114
115
|
return fastapi.responses.JSONResponse(
|
|
115
116
|
status_code=http.HTTPStatus.BAD_GATEWAY,
|
|
116
117
|
content={'detail': 'oauth2-proxy service unavailable'})
|
|
@@ -120,10 +121,15 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
120
121
|
forwarded_headers = dict(request.headers)
|
|
121
122
|
auth_url = f'{self.proxy_base}/oauth2/auth'
|
|
122
123
|
forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
|
|
123
|
-
|
|
124
|
+
# Remove content-length and content-type headers and drop request body
|
|
125
|
+
# to reduce the auth overhead.
|
|
126
|
+
forwarded_headers.pop('content-length', None)
|
|
127
|
+
forwarded_headers.pop('content-type', None)
|
|
128
|
+
logger.debug(f'authenticate request: {auth_url}, '
|
|
129
|
+
f'headers: {forwarded_headers}')
|
|
124
130
|
|
|
125
131
|
async with session.request(
|
|
126
|
-
method=
|
|
132
|
+
method='GET',
|
|
127
133
|
url=auth_url,
|
|
128
134
|
headers=forwarded_headers,
|
|
129
135
|
cookies=request.cookies,
|
sky/server/common.py
CHANGED
|
@@ -41,6 +41,7 @@ from sky.utils import annotations
|
|
|
41
41
|
from sky.utils import common_utils
|
|
42
42
|
from sky.utils import rich_utils
|
|
43
43
|
from sky.utils import ux_utils
|
|
44
|
+
from sky.utils import yaml_utils
|
|
44
45
|
|
|
45
46
|
if typing.TYPE_CHECKING:
|
|
46
47
|
import aiohttp
|
|
@@ -816,7 +817,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
|
816
817
|
return str(client_file_mounts_dir /
|
|
817
818
|
file_mounts_mapping[original_path].lstrip('/'))
|
|
818
819
|
|
|
819
|
-
task_configs =
|
|
820
|
+
task_configs = yaml_utils.read_yaml_all(str(client_task_path))
|
|
820
821
|
for task_config in task_configs:
|
|
821
822
|
if task_config is None:
|
|
822
823
|
continue
|
|
@@ -869,7 +870,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
|
869
870
|
# We can switch to using string, but this is to make it easier to debug, by
|
|
870
871
|
# persisting the translated task yaml file.
|
|
871
872
|
translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
|
|
872
|
-
|
|
873
|
+
yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
|
|
873
874
|
|
|
874
875
|
dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
|
|
875
876
|
return dag
|
sky/server/daemons.py
CHANGED
|
@@ -191,23 +191,28 @@ INTERNAL_REQUEST_DAEMONS = [
|
|
|
191
191
|
# set to updated status automatically, without showing users the hint of
|
|
192
192
|
# cluster being stopped or down when `sky status -r` is called.
|
|
193
193
|
InternalRequestDaemon(id='skypilot-status-refresh-daemon',
|
|
194
|
-
name='status',
|
|
194
|
+
name='status-refresh',
|
|
195
195
|
event_fn=refresh_cluster_status_event,
|
|
196
196
|
default_log_level='DEBUG'),
|
|
197
197
|
# Volume status refresh daemon to update the volume status periodically.
|
|
198
198
|
InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
|
|
199
|
-
name='volume',
|
|
199
|
+
name='volume-refresh',
|
|
200
200
|
event_fn=refresh_volume_status_event),
|
|
201
201
|
InternalRequestDaemon(id='managed-job-status-refresh-daemon',
|
|
202
|
-
name='managed-job-status',
|
|
202
|
+
name='managed-job-status-refresh',
|
|
203
203
|
event_fn=managed_job_status_refresh_event,
|
|
204
204
|
should_skip=should_skip_managed_job_status_refresh),
|
|
205
205
|
InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
|
|
206
|
-
name='sky-serve-status',
|
|
206
|
+
name='sky-serve-status-refresh',
|
|
207
207
|
event_fn=sky_serve_status_refresh_event,
|
|
208
208
|
should_skip=should_skip_sky_serve_status_refresh),
|
|
209
209
|
InternalRequestDaemon(id='pool-status-refresh-daemon',
|
|
210
|
-
name='pool-status',
|
|
210
|
+
name='pool-status-refresh',
|
|
211
211
|
event_fn=pool_status_refresh_event,
|
|
212
212
|
should_skip=should_skip_pool_status_refresh),
|
|
213
213
|
]
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def is_daemon_request_id(request_id: str) -> bool:
|
|
217
|
+
"""Returns whether a specific request_id is an internal daemon."""
|
|
218
|
+
return any([d.id == request_id for d in INTERNAL_REQUEST_DAEMONS])
|
sky/server/requests/executor.py
CHANGED
|
@@ -55,6 +55,7 @@ from sky.utils import context_utils
|
|
|
55
55
|
from sky.utils import subprocess_utils
|
|
56
56
|
from sky.utils import tempstore
|
|
57
57
|
from sky.utils import timeline
|
|
58
|
+
from sky.utils import yaml_utils
|
|
58
59
|
from sky.workspaces import core as workspaces_core
|
|
59
60
|
|
|
60
61
|
if typing.TYPE_CHECKING:
|
|
@@ -387,7 +388,7 @@ def _request_execution_wrapper(request_id: str,
|
|
|
387
388
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
388
389
|
config = skypilot_config.to_dict()
|
|
389
390
|
logger.debug(f'request config: \n'
|
|
390
|
-
f'{
|
|
391
|
+
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
391
392
|
return_value = func(**request_body.to_kwargs())
|
|
392
393
|
f.flush()
|
|
393
394
|
except KeyboardInterrupt:
|
sky/server/requests/requests.py
CHANGED
|
@@ -565,6 +565,27 @@ def get_request_tasks(
|
|
|
565
565
|
return requests
|
|
566
566
|
|
|
567
567
|
|
|
568
|
+
@init_db
|
|
569
|
+
def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
570
|
+
"""Get a list of API request ids for shell completion."""
|
|
571
|
+
assert _DB is not None
|
|
572
|
+
with _DB.conn:
|
|
573
|
+
cursor = _DB.conn.cursor()
|
|
574
|
+
# Prioritize alive requests (PENDING, RUNNING) over finished ones,
|
|
575
|
+
# then order by creation time (newest first) within each category.
|
|
576
|
+
cursor.execute(
|
|
577
|
+
f"""SELECT request_id FROM {REQUEST_TABLE}
|
|
578
|
+
WHERE request_id LIKE ?
|
|
579
|
+
ORDER BY
|
|
580
|
+
CASE
|
|
581
|
+
WHEN status IN ('PENDING', 'RUNNING') THEN 0
|
|
582
|
+
ELSE 1
|
|
583
|
+
END,
|
|
584
|
+
created_at DESC
|
|
585
|
+
LIMIT 1000""", (f'{incomplete}%',))
|
|
586
|
+
return [row[0] for row in cursor.fetchall()]
|
|
587
|
+
|
|
588
|
+
|
|
568
589
|
def _add_or_update_request_no_lock(request: Request):
|
|
569
590
|
"""Add or update a REST request into the database."""
|
|
570
591
|
row = request.to_row()
|
sky/server/server.py
CHANGED
|
@@ -1403,6 +1403,9 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
|
1403
1403
|
raise fastapi.HTTPException(
|
|
1404
1404
|
status_code=500, detail=request_task.encode().model_dump())
|
|
1405
1405
|
return request_task.encode()
|
|
1406
|
+
elif (request_task.status == requests_lib.RequestStatus.RUNNING and
|
|
1407
|
+
daemons.is_daemon_request_id(request_id)):
|
|
1408
|
+
return request_task.encode()
|
|
1406
1409
|
# yield control to allow other coroutines to run, sleep shortly
|
|
1407
1410
|
# to avoid storming the DB and CPU in the meantime
|
|
1408
1411
|
await asyncio.sleep(0.1)
|
|
@@ -1491,6 +1494,14 @@ async def stream(
|
|
|
1491
1494
|
if log_path == constants.API_SERVER_LOGS:
|
|
1492
1495
|
resolved_log_path = pathlib.Path(
|
|
1493
1496
|
constants.API_SERVER_LOGS).expanduser()
|
|
1497
|
+
if not resolved_log_path.exists():
|
|
1498
|
+
raise fastapi.HTTPException(
|
|
1499
|
+
status_code=404,
|
|
1500
|
+
detail='Server log file does not exist. The API server may '
|
|
1501
|
+
'have been started with `--foreground` - check the '
|
|
1502
|
+
'stdout of API server process, such as: '
|
|
1503
|
+
'`kubectl logs -n api-server-namespace '
|
|
1504
|
+
'api-server-pod-name`')
|
|
1494
1505
|
else:
|
|
1495
1506
|
# This should be a log path under ~/sky_logs.
|
|
1496
1507
|
resolved_logs_directory = pathlib.Path(
|
|
@@ -1769,6 +1780,11 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
|
|
|
1769
1780
|
return global_user_state.get_volume_names_start_with(incomplete)
|
|
1770
1781
|
|
|
1771
1782
|
|
|
1783
|
+
@app.get('/api/completion/api_request')
|
|
1784
|
+
async def complete_api_request(incomplete: str,) -> List[str]:
|
|
1785
|
+
return requests_lib.get_api_request_ids_start_with(incomplete)
|
|
1786
|
+
|
|
1787
|
+
|
|
1772
1788
|
@app.get('/dashboard/{full_path:path}')
|
|
1773
1789
|
async def serve_dashboard(full_path: str):
|
|
1774
1790
|
"""Serves the Next.js dashboard application.
|
sky/skylet/events.py
CHANGED
|
@@ -20,7 +20,6 @@ from sky.skylet import constants
|
|
|
20
20
|
from sky.skylet import job_lib
|
|
21
21
|
from sky.usage import usage_lib
|
|
22
22
|
from sky.utils import cluster_utils
|
|
23
|
-
from sky.utils import common_utils
|
|
24
23
|
from sky.utils import registry
|
|
25
24
|
from sky.utils import ux_utils
|
|
26
25
|
from sky.utils import yaml_utils
|
|
@@ -181,7 +180,7 @@ class AutostopEvent(SkyletEvent):
|
|
|
181
180
|
|
|
182
181
|
config_path = os.path.abspath(
|
|
183
182
|
os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
|
|
184
|
-
config =
|
|
183
|
+
config = yaml_utils.read_yaml(config_path)
|
|
185
184
|
provider_name = cluster_utils.get_provider_name(config)
|
|
186
185
|
cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
|
|
187
186
|
assert cloud is not None, f'Unknown cloud: {provider_name}'
|
|
@@ -326,5 +325,5 @@ class AutostopEvent(SkyletEvent):
|
|
|
326
325
|
config['auth'].pop('ssh_proxy_command', None)
|
|
327
326
|
# Empty the file_mounts.
|
|
328
327
|
config['file_mounts'] = {}
|
|
329
|
-
|
|
328
|
+
yaml_utils.dump_yaml(yaml_path, config)
|
|
330
329
|
logger.debug('Replaced upscaling speed to 0.')
|
sky/skypilot_config.py
CHANGED
|
@@ -494,7 +494,7 @@ def reload_config() -> None:
|
|
|
494
494
|
def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
|
|
495
495
|
config = config_utils.Config()
|
|
496
496
|
try:
|
|
497
|
-
config_dict =
|
|
497
|
+
config_dict = yaml_utils.read_yaml(config_path)
|
|
498
498
|
config = config_utils.Config.from_dict(config_dict)
|
|
499
499
|
# pop the db url from the config, and set it to the env var.
|
|
500
500
|
# this is to avoid db url (considered a sensitive value)
|
|
@@ -504,7 +504,7 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
|
|
|
504
504
|
os.environ[constants.ENV_VAR_DB_CONNECTION_URI] = db_url
|
|
505
505
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
506
506
|
logger.debug(f'Config loaded from {config_path}:\n'
|
|
507
|
-
f'{
|
|
507
|
+
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
508
508
|
except yaml.YAMLError as e:
|
|
509
509
|
logger.error(f'Error in loading config file ({config_path}):', e)
|
|
510
510
|
if config:
|
|
@@ -600,7 +600,7 @@ def _reload_config_as_server() -> None:
|
|
|
600
600
|
sqlalchemy_engine.dispose()
|
|
601
601
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
602
602
|
logger.debug(f'server config: \n'
|
|
603
|
-
f'{
|
|
603
|
+
f'{yaml_utils.dump_yaml_str(dict(server_config))}')
|
|
604
604
|
_set_loaded_config(server_config)
|
|
605
605
|
_set_loaded_config_path(server_config_path)
|
|
606
606
|
|
|
@@ -628,7 +628,7 @@ def _reload_config_as_client() -> None:
|
|
|
628
628
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
629
629
|
logger.debug(
|
|
630
630
|
f'client config (before task and CLI overrides): \n'
|
|
631
|
-
f'{
|
|
631
|
+
f'{yaml_utils.dump_yaml_str(dict(overlaid_client_config))}')
|
|
632
632
|
_set_loaded_config(overlaid_client_config)
|
|
633
633
|
_set_loaded_config_path([user_config_path, project_config_path])
|
|
634
634
|
|
|
@@ -738,9 +738,9 @@ def override_skypilot_config(
|
|
|
738
738
|
'Failed to override the SkyPilot config on API '
|
|
739
739
|
'server with your local SkyPilot config:\n'
|
|
740
740
|
'=== SkyPilot config on API server ===\n'
|
|
741
|
-
f'{
|
|
741
|
+
f'{yaml_utils.dump_yaml_str(dict(original_config))}\n'
|
|
742
742
|
'=== Your local SkyPilot config ===\n'
|
|
743
|
-
f'{
|
|
743
|
+
f'{yaml_utils.dump_yaml_str(dict(override_configs))}\n'
|
|
744
744
|
f'Details: {e}') from e
|
|
745
745
|
finally:
|
|
746
746
|
_set_loaded_config(original_config)
|
|
@@ -767,7 +767,7 @@ def replace_skypilot_config(new_configs: config_utils.Config) -> Iterator[None]:
|
|
|
767
767
|
mode='w',
|
|
768
768
|
prefix='mutated-skypilot-config-',
|
|
769
769
|
suffix='.yaml') as temp_file:
|
|
770
|
-
|
|
770
|
+
yaml_utils.dump_yaml(temp_file.name, dict(**new_configs))
|
|
771
771
|
# Modify the env var of current process or context so that the
|
|
772
772
|
# new config will be used by spawned sub-processes.
|
|
773
773
|
# Note that this code modifies os.environ directly because it
|
|
@@ -831,7 +831,7 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
|
|
|
831
831
|
parsed_config = _compose_cli_config(cli_config)
|
|
832
832
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
833
833
|
logger.debug(f'applying following CLI overrides: \n'
|
|
834
|
-
f'{
|
|
834
|
+
f'{yaml_utils.dump_yaml_str(dict(parsed_config))}')
|
|
835
835
|
_set_loaded_config(
|
|
836
836
|
overlay_skypilot_config(original_config=_get_loaded_config(),
|
|
837
837
|
override_configs=parsed_config))
|
|
@@ -875,7 +875,7 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
875
875
|
def _set_config_yaml_to_db(key: str,
|
|
876
876
|
config: config_utils.Config):
|
|
877
877
|
assert sqlalchemy_engine is not None
|
|
878
|
-
config_str =
|
|
878
|
+
config_str = yaml_utils.dump_yaml_str(dict(config))
|
|
879
879
|
with orm.Session(sqlalchemy_engine) as session:
|
|
880
880
|
if (sqlalchemy_engine.dialect.name ==
|
|
881
881
|
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
@@ -901,7 +901,7 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
901
901
|
|
|
902
902
|
if not db_updated:
|
|
903
903
|
# save to the local file (PVC in Kubernetes, local file otherwise)
|
|
904
|
-
|
|
904
|
+
yaml_utils.dump_yaml(global_config_path, dict(config))
|
|
905
905
|
|
|
906
906
|
if config_map_utils.is_running_in_kubernetes():
|
|
907
907
|
# In Kubernetes, sync the PVC config to ConfigMap for user
|
sky/task.py
CHANGED
|
@@ -564,7 +564,7 @@ class Task:
|
|
|
564
564
|
secrets_overrides: Optional[List[Tuple[str, str]]] = None,
|
|
565
565
|
) -> 'Task':
|
|
566
566
|
user_specified_yaml = config.pop('_user_specified_yaml',
|
|
567
|
-
|
|
567
|
+
yaml_utils.dump_yaml_str(config))
|
|
568
568
|
# More robust handling for 'envs': explicitly convert keys and values to
|
|
569
569
|
# str, since users may pass '123' as keys/values which will get parsed
|
|
570
570
|
# as int causing validate_schema() to fail.
|
sky/templates/nebius-ray.yml.j2
CHANGED
|
@@ -56,15 +56,11 @@ available_node_types:
|
|
|
56
56
|
filesystem_mount_path: {{ fs.filesystem_mount_path }}
|
|
57
57
|
{%- endfor %}
|
|
58
58
|
UserData: |
|
|
59
|
-
runcmd:
|
|
60
|
-
- sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
|
|
61
|
-
- systemctl restart sshd
|
|
62
|
-
|
|
63
59
|
{# Two available OS images:
|
|
64
|
-
1.
|
|
65
|
-
2.
|
|
66
|
-
To optimize deployment speed, Docker is only installed when using
|
|
67
|
-
{%- if docker_image is not none and image_id
|
|
60
|
+
1. ubuntu24.04-driverless - requires Docker installation
|
|
61
|
+
2. ubuntu24.04-cuda12 - comes with Docker pre-installed
|
|
62
|
+
To optimize deployment speed, Docker is only installed when using ubuntu24.04-driverless #}
|
|
63
|
+
{%- if docker_image is not none and image_id.endswith('-driverless') %}
|
|
68
64
|
apt:
|
|
69
65
|
sources:
|
|
70
66
|
docker.list:
|