skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +207 -79
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +112 -53
- sky/client/common.py +4 -2
- sky/client/sdk.py +17 -7
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +9 -54
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +271 -67
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +11 -7
- sky/jobs/server/core.py +5 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +32 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +5 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +24 -18
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/core.py +1 -0
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +35 -28
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import kubernetes_enums
+from sky.utils import rich_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline

@@ -302,8 +303,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
                         f'code {rc}: {command!r}\nOutput: {stdout}.')


+def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+                                          reason) -> bool:
+
+    def _convert_to_utc(timestamp):
+        if timestamp.tzinfo is None:
+            return timestamp.replace(tzinfo=datetime.timezone.utc)
+        return timestamp.astimezone(datetime.timezone.utc)
+
+    def _get_event_timestamp(event):
+        if event.last_timestamp:
+            return event.last_timestamp
+        elif event.metadata.creation_timestamp:
+            return event.metadata.creation_timestamp
+        return None
+
+    events = kubernetes.core_api(context).list_namespaced_event(
+        namespace=namespace, field_selector=f'reason={reason}')
+    for event in events.items:
+        ts = _get_event_timestamp(event)
+        if ts and _convert_to_utc(ts) > search_start:
+            return True
+    return False
+
+
+def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+    """Detects whether the cluster had a autoscaling event after a
+    specified datetime. This only works when using cluster-autoscaler.
+
+    Args:
+        namespace: kubernetes namespace
+        context: kubernetes context
+        search_start (datetime.datetime): filter for events that occurred
+            after search_start
+
+    Returns:
+        A boolean whether the cluster has an autoscaling event or not.
+    """
+    assert namespace is not None
+
+    try:
+        return _detect_cluster_event_reason_occurred(namespace, context,
+                                                     search_start,
+                                                     'TriggeredScaleUp')
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+        return False
+
+
+def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+    """Detects whether a kubernetes cluster may have an autoscaling event.
+
+    This is not a definitive detection. FailedScheduling, which is an
+    event that can occur when not enough resources are present in the cluster,
+    which is a trigger for cluster autoscaling. However, FailedScheduling may
+    have occurred due to other reasons (cluster itself is abnormal).
+
+    Hence, this should only be used for autoscalers that don't emit the
+    TriggeredScaleUp event, e.g.: Karpenter.
+
+    Args:
+        namespace: kubernetes namespace
+        context: kubernetes context
+        search_start (datetime.datetime): filter for events that occurred
+            after search_start
+
+    Returns:
+        A boolean whether the cluster has an autoscaling event or not.
+    """
+    assert namespace is not None
+
+    try:
+        return _detect_cluster_event_reason_occurred(namespace, context,
+                                                     search_start,
+                                                     'FailedScheduling')
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+        return False
+
+
 @timeline.event
-def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+                               cluster_name: str,
+                               create_pods_start: datetime.datetime):
     """Wait for all pods to be scheduled.

     Wait for all pods including jump pod to be scheduled, and if it

@@ -312,6 +394,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     allocated and we can exit.

     If timeout is set to a negative value, this method will wait indefinitely.
+
+    Will update the spinner message to indicate autoscaling if autoscaling
+    is happening.
     """
     # Create a set of pod names we're waiting for
     if not new_nodes:

@@ -319,6 +404,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     expected_pod_names = {node.metadata.name for node in new_nodes}
     start_time = time.time()

+    # Variables for autoscaler detection
+    autoscaler_type = skypilot_config.get_effective_region_config(
+        cloud='kubernetes',
+        region=context,
+        keys=('autoscaler',),
+        default_value=None)
+    autoscaler_is_set = autoscaler_type is not None
+    use_heuristic_detection = (autoscaler_is_set and
+                               not kubernetes_enums.KubernetesAutoscalerType(
+                                   autoscaler_type).emits_autoscale_event())
+    is_autoscaling = False
+
     def _evaluate_timeout() -> bool:
         # If timeout is negative, retry indefinitely.
         if timeout < 0:

@@ -328,12 +425,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     while _evaluate_timeout():
         # Get all pods in a single API call using the cluster name label
         # which all pods in new_nodes should share
-
+        cluster_name_on_cloud = new_nodes[0].metadata.labels[
             k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in pods}

@@ -357,6 +455,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):

         if all_scheduled:
             return
+
+        # Check if cluster is autoscaling and update spinner message.
+        # Minor optimization to not query k8s api after autoscaling
+        # event was detected. This is useful because there isn't any
+        # autoscaling complete event.
+        if autoscaler_is_set and not is_autoscaling:
+            if use_heuristic_detection:
+                is_autoscaling = _cluster_maybe_autoscaling(
+                    namespace, context, create_pods_start)
+                msg = 'Kubernetes cluster may be scaling up'
+            else:
+                is_autoscaling = _cluster_had_autoscale_event(
+                    namespace, context, create_pods_start)
+                msg = 'Kubernetes cluster is autoscaling'
+
+            if is_autoscaling:
+                rich_utils.force_update_status(
+                    ux_utils.spinner_message(f'Launching ({msg})',
+                                             cluster_name=cluster_name))
+
         time.sleep(1)

     # Handle pod scheduling errors

@@ -761,13 +879,14 @@ def _wait_for_deployment_pod(context,


 @timeline.event
-def _create_pods(region: str, cluster_name_on_cloud: str,
+def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Create pods based on the config."""
     provider_config = config.provider_config
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
     pod_spec = copy.deepcopy(config.node_config)
+    create_pods_start = datetime.datetime.now(datetime.timezone.utc)

     to_create_deployment = 'deployment_spec' in pod_spec
     if to_create_deployment:

@@ -1047,7 +1166,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+                               cluster_name, create_pods_start)
+    # Reset spinner message here because it might have hinted autoscaling
+    # while waiting for pods to schedule.
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching', cluster_name=cluster_name))
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '

@@ -1068,11 +1192,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     )


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
     try:
-        return _create_pods(region, cluster_name_on_cloud, config)
+        return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
         e_msg = common_utils.format_exception(e).replace('\n', ' ')
         logger.warning('run_instances: Error occurred when creating pods: '

@@ -1238,6 +1362,7 @@ def get_cluster_info(

     running_pods = kubernetes_utils.filter_pods(
         namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+    logger.debug(f'Running pods: {list(running_pods.keys())}')

     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None

@@ -1276,7 +1401,8 @@ def get_cluster_info(
             assert head_spec is not None, pod
             cpu_request = head_spec.containers[0].resources.requests['cpu']

-    assert cpu_request is not None, 'cpu_request should not be None'
+    assert cpu_request is not None, ('cpu_request should not be None, check '
+                                     'the Pod status')

     ssh_user = 'sky'
     # Use pattern matching to extract SSH user, handling MOTD contamination.
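Taken together, the kubernetes hunks above make the launch spinner surface cluster autoscaling: _create_pods records a UTC timestamp before creating pods, and while pods are pending the waiter looks for Kubernetes events newer than that timestamp. A rough sketch of the decision flow, reusing the helper names added above (illustrative only, not part of the diff; the wrapper function name is hypothetical):

    def _detect_autoscaling(namespace, context, autoscaler_type, since):
        # No autoscaler configured: nothing to report.
        if autoscaler_type is None:
            return False, ''
        if kubernetes_enums.KubernetesAutoscalerType(
                autoscaler_type).emits_autoscale_event():
            # cluster-autoscaler style: look for TriggeredScaleUp events.
            return (_cluster_had_autoscale_event(namespace, context, since),
                    'Kubernetes cluster is autoscaling')
        # Autoscalers such as Karpenter do not emit TriggeredScaleUp,
        # so fall back to the FailedScheduling heuristic.
        return (_cluster_maybe_autoscaling(namespace, context, since),
                'Kubernetes cluster may be scaling up')

The spinner is updated via rich_utils.force_update_status only when a matching event is found, and reset to the plain 'Launching' message once all pods are scheduled.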
sky/provision/lambda_cloud/instance.py
CHANGED

@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
     return private_ip


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster"""
+    del cluster_name  # unused
     lambda_client = _get_lambda_client()
     pending_status = ['booting']
     while True:
sky/provision/nebius/instance.py
CHANGED
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
                     f' to be ready.')


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     _wait_until_no_pending(region, cluster_name_on_cloud)
     running_instances = _filter_instances(region, cluster_name_on_cloud,
                                           ['RUNNING'])
sky/provision/oci/instance.py
CHANGED
@@ -65,9 +65,10 @@ def query_instances(


 @query_utils.debug_enabled(logger)
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Start instances with bootstrapped configuration."""
+    del cluster_name  # unused
     tags = dict(sorted(copy.deepcopy(config.tags).items()))

     start_time = round(time.time() * 1000)
sky/provision/paperspace/instance.py
CHANGED

@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
     ]
sky/provision/primeintellect/instance.py
CHANGED

@@ -65,10 +65,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     # Helper is available as utils.parse_ssh_connection.


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'PROVISIONING',
         'PENDING',
sky/provision/provisioner.py
CHANGED
sky/provision/runpod/__init__.py
CHANGED
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
 from sky.provision.runpod.volume import apply_volume
 from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_all_volumes_usedby
 from sky.provision.runpod.volume import get_volume_usedby
+from sky.provision.runpod.volume import map_all_volumes_usedby
sky/provision/runpod/instance.py
CHANGED
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']

     while True:
sky/provision/scp/instance.py
CHANGED
@@ -13,9 +13,9 @@ from sky.utils import status_lib
 logger = logging.getLogger(__name__)


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
-
+    del cluster_name  # unused
     zone_id = config.node_config['zone_id']
     running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
     head_instance_id = _get_head_instance_id(running_instances)
sky/provision/seeweb/instance.py
CHANGED
@@ -502,9 +502,10 @@ class SeewebNodeProvider:
 # =============================================================================


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: ProvisionConfig) -> ProvisionRecord:
     """Run instances for Seeweb cluster."""
+    del cluster_name  # unused
     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
     provider.run_instances(config.node_config, config.count)

sky/provision/vast/instance.py
CHANGED
@@ -44,9 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return None


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']

     created_instance_ids = []
sky/provision/vsphere/instance.py
CHANGED

@@ -30,9 +30,10 @@ HEAD_NODE_VALUE = '1'
 WORKER_NODE_VALUE = '0'


-def run_instances(region: str, cluster_name: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: run_instances().')

     resumed_instance_ids: List[str] = []

@@ -40,7 +41,7 @@ def run_instances(region: str, cluster_name: str,
     vc_object = _get_vc_object(region)
     vc_object.connect()

-    exist_instances = _get_filtered_instance(vc_object,
+    exist_instances = _get_filtered_instance(vc_object, cluster_name_on_cloud,
                                              config.provider_config)
     head_instance_id = _get_head_instance_id(exist_instances)


@@ -89,8 +90,8 @@ def run_instances(region: str, cluster_name: str,
         config, region, vc_object)
     # TODO: update logic for multi-node creation
     for _ in range(to_start_num):
-        created_instance_uuid = _create_instances(
-            region, vc_object,
+        created_instance_uuid = _create_instances(cluster_name_on_cloud,
+                                                  config, region, vc_object,
                                                   vsphere_cluster_name)
         created_instance_ids.append(created_instance_uuid)
         if head_instance_id is None:

@@ -104,7 +105,7 @@ def run_instances(region: str, cluster_name: str,
         provider_name='vsphere',
         region=region,
         zone=vsphere_cluster_name,
-        cluster_name=
+        cluster_name=cluster_name_on_cloud,
         head_instance_id=head_instance_id,
         resumed_instance_ids=resumed_instance_ids,
         created_instance_ids=created_instance_ids,
sky/schemas/api/responses.py
CHANGED
@@ -86,7 +86,7 @@ class StatusResponse(ResponseBaseModel):
     # backends.ResourceHandle, so we use Any here.
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
-    handle: Any
+    handle: Optional[Any] = None
     last_use: str
     status: status_lib.ClusterStatus
     autostop: int

@@ -118,6 +118,7 @@ class StatusResponse(ResponseBaseModel):
     cpus: Optional[str] = None
     memory: Optional[str] = None
     accelerators: Optional[str] = None
+    cluster_name_on_cloud: Optional[str] = None


 class UploadStatus(enum.Enum):
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py
ADDED

@@ -0,0 +1,89 @@
+"""Add last_activity_time and launched_at to cluster history.
+
+Revision ID: 009
+Revises: 008
+Create Date: 2025-09-24
+
+"""
+# pylint: disable=invalid-name
+import pickle
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '009'
+down_revision: Union[str, Sequence[str], None] = '008'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add last_activity_time and launched_at columns to cluster history."""
+    with op.get_context().autocommit_block():
+        # Add the columns with indices
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'last_activity_time',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'launched_at',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+        # Populate the columns for existing rows
+        _populate_cluster_history_columns()
+
+
+def _populate_cluster_history_columns():
+    """Populate last_activity_time and launched_at for existing rows using
+    usage_intervals logic."""
+    connection = op.get_bind()
+
+    # Get all existing rows with usage_intervals
+    result = connection.execute(
+        sa.text('SELECT cluster_hash, usage_intervals FROM cluster_history '
+                'WHERE usage_intervals IS NOT NULL'))
+
+    for row in result:
+        cluster_hash = row[0]
+        usage_intervals_blob = row[1]
+
+        try:
+            # Deserialize the usage_intervals
+            usage_intervals = pickle.loads(usage_intervals_blob)
+
+            if usage_intervals:
+                # Calculate last_activity_time: end time of last interval
+                # or start time if still running
+                last_interval = usage_intervals[-1]
+                last_activity_time = (last_interval[1] if last_interval[1]
+                                      is not None else last_interval[0])
+
+                # Calculate launched_at: start time of first interval
+                launched_at = usage_intervals[0][0]
+
+                # Update the row with both calculated values
+                connection.execute(
+                    sa.text('UPDATE cluster_history '
+                            'SET last_activity_time = :last_activity_time, '
+                            'launched_at = :launched_at '
+                            'WHERE cluster_hash = :cluster_hash'), {
+                                'last_activity_time': last_activity_time,
+                                'launched_at': launched_at,
+                                'cluster_hash': cluster_hash
+                            })
+        except (pickle.PickleError, AttributeError, IndexError):
+            # Skip rows with corrupted or invalid usage_intervals
+            continue
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
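For clarity, the migration above derives both new columns from the pickled usage_intervals list of (start, end) pairs, where end is None while a cluster is still up. A worked example of that interval logic (illustrative only, not part of the diff):

    usage_intervals = [(1700000000, 1700003600), (1700010000, None)]

    launched_at = usage_intervals[0][0]            # 1700000000: start of first interval
    last_interval = usage_intervals[-1]
    last_activity_time = (last_interval[1] if last_interval[1] is not None
                          else last_interval[0])   # 1700010000: still running

Rows whose usage_intervals cannot be unpickled are skipped rather than failing the migration.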
sky/serve/autoscalers.py
CHANGED
@@ -411,6 +411,8 @@ class _AutoscalerWithHysteresis(Autoscaler):
         # `_set_target_num_replicas_with_hysteresis` to have the replicas
         # quickly scale after each update.
         self.target_num_replicas = self._calculate_target_num_replicas()
+        logger.debug(f'Target number of replicas: {self.target_num_replicas}'
+                     'after update_version.')
         # Cleanup hysteresis counters.
         self.upscale_counter = 0
         self.downscale_counter = 0
sky/serve/client/impl.py
CHANGED
@@ -105,7 +105,8 @@ def update(


 def apply(
-    task: Union['sky.Task', 'sky.Dag'],
+    task: Optional[Union['sky.Task', 'sky.Dag']],
+    workers: Optional[int],
     service_name: str,
     mode: 'serve_utils.UpdateMode',
     pool: bool = False,

@@ -117,35 +118,60 @@ def apply(
     # Avoid circular import.
     from sky.client import sdk  # pylint: disable=import-outside-toplevel

-
-
-
-
-
-
-
-
-
-
-
-
-            abort=True,
-            show_default=True)
-
-    dag = client_common.upload_mounts_to_api_server(dag)
-    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+    noun = 'pool' if pool else 'service'
+    # There are two cases here. If task is None, we should be trying to
+    # update the number of workers in the pool. If task is not None, we should
+    # be trying to apply a new config to the pool. The two code paths
+    # are slightly different with us needing to craft the dag and validate
+    # it if we have a task. In the future we could move this logic to the
+    # server side and simplify this code, for the time being we keep it here.
+    if task is None:
+        if workers is None:
+            raise ValueError(f'Cannot create a new {noun} without specifying '
+                             f'task or workers. Please provide either a task '
+                             f'or specify the number of workers.')

         body = payloads.JobsPoolApplyBody(
-
+            workers=workers,
             pool_name=service_name,
             mode=mode,
         )
+
         response = server_common.make_authenticated_request(
             'POST',
             '/jobs/pool_apply',
             json=json.loads(body.model_dump_json()),
             timeout=(5, None))
         return server_common.get_request_id(response)
+    else:
+        dag = dag_utils.convert_entrypoint_to_dag(task)
+        with admin_policy_utils.apply_and_use_config_in_current_request(
+                dag, at_client_side=True) as dag:
+            sdk.validate(dag)
+        request_id = sdk.optimize(dag)
+        sdk.stream_and_get(request_id)
+        if _need_confirmation:
+            prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
+            if prompt is not None:
+                click.confirm(prompt,
+                              default=True,
+                              abort=True,
+                              show_default=True)
+
+        dag = client_common.upload_mounts_to_api_server(dag)
+        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+        body = payloads.JobsPoolApplyBody(
+            task=dag_str,
+            pool_name=service_name,
+            mode=mode,
+        )
+        response = server_common.make_authenticated_request(
+            'POST',
+            '/jobs/pool_apply',
+            json=json.loads(body.model_dump_json()),
+            timeout=(5, None))
+        return server_common.get_request_id(response)


 def down(
sky/serve/replica_managers.py
CHANGED
@@ -422,11 +422,12 @@ class ReplicaInfo:
         based on the cluster name.
         """
         if cluster_record is None:
-
+            handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
-
+        else:
+            handle = cluster_record['handle']
+        if handle is None:
             return None
-        handle = cluster_record['handle']
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         return handle


@@ -443,6 +444,12 @@ class ReplicaInfo:
         handle = self.handle()
         if handle is None:
             return None
+        if self.replica_port == '-':
+            # This is a pool replica so there is no endpoint and it's filled
+            # with this dummy value. We return None here so that we can
+            # get the active ready replicas and perform autoscaling. Otherwise,
+            # would error out when trying to get the endpoint.
+            return None
         replica_port_int = int(self.replica_port)
         try:
             endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,

@@ -470,7 +477,7 @@ class ReplicaInfo:
                 with_handle: bool,
                 with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
-            self.cluster_name)
+            self.cluster_name, include_user_info=False, summary_response=True)
         info_dict = {
             'replica_id': self.replica_id,
             'name': self.cluster_name,

@@ -956,7 +963,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         # provision) or the cluster is preempted and cleaned up by the status
         # refresh. In this case, we skip spawning a new down process to save
         # controller resources.
-        if global_user_state.
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
             self._handle_sky_down_finish(info, exitcode=0)
             return
