skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py CHANGED
@@ -24,6 +24,7 @@ from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import kubernetes_enums
+from sky.utils import rich_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
@@ -302,8 +303,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
                        f'code {rc}: {command!r}\nOutput: {stdout}.')


+def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+                                          reason) -> bool:
+
+    def _convert_to_utc(timestamp):
+        if timestamp.tzinfo is None:
+            return timestamp.replace(tzinfo=datetime.timezone.utc)
+        return timestamp.astimezone(datetime.timezone.utc)
+
+    def _get_event_timestamp(event):
+        if event.last_timestamp:
+            return event.last_timestamp
+        elif event.metadata.creation_timestamp:
+            return event.metadata.creation_timestamp
+        return None
+
+    events = kubernetes.core_api(context).list_namespaced_event(
+        namespace=namespace, field_selector=f'reason={reason}')
+    for event in events.items:
+        ts = _get_event_timestamp(event)
+        if ts and _convert_to_utc(ts) > search_start:
+            return True
+    return False
+
+
+def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+    """Detects whether the cluster had a autoscaling event after a
+    specified datetime. This only works when using cluster-autoscaler.
+
+    Args:
+        namespace: kubernetes namespace
+        context: kubernetes context
+        search_start (datetime.datetime): filter for events that occurred
+            after search_start
+
+    Returns:
+        A boolean whether the cluster has an autoscaling event or not.
+    """
+    assert namespace is not None
+
+    try:
+        return _detect_cluster_event_reason_occurred(namespace, context,
+                                                     search_start,
+                                                     'TriggeredScaleUp')
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+        return False
+
+
+def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+    """Detects whether a kubernetes cluster may have an autoscaling event.
+
+    This is not a definitive detection. FailedScheduling, which is an
+    event that can occur when not enough resources are present in the cluster,
+    which is a trigger for cluster autoscaling. However, FailedScheduling may
+    have occurred due to other reasons (cluster itself is abnormal).
+
+    Hence, this should only be used for autoscalers that don't emit the
+    TriggeredScaleUp event, e.g.: Karpenter.
+
+    Args:
+        namespace: kubernetes namespace
+        context: kubernetes context
+        search_start (datetime.datetime): filter for events that occurred
+            after search_start
+
+    Returns:
+        A boolean whether the cluster has an autoscaling event or not.
+    """
+    assert namespace is not None
+
+    try:
+        return _detect_cluster_event_reason_occurred(namespace, context,
+                                                     search_start,
+                                                     'FailedScheduling')
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+        return False
+
+
 @timeline.event
-def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+                               cluster_name: str,
+                               create_pods_start: datetime.datetime):
     """Wait for all pods to be scheduled.

     Wait for all pods including jump pod to be scheduled, and if it
@@ -312,6 +394,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     allocated and we can exit.

     If timeout is set to a negative value, this method will wait indefinitely.
+
+    Will update the spinner message to indicate autoscaling if autoscaling
+    is happening.
     """
     # Create a set of pod names we're waiting for
     if not new_nodes:
@@ -319,6 +404,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     expected_pod_names = {node.metadata.name for node in new_nodes}
     start_time = time.time()

+    # Variables for autoscaler detection
+    autoscaler_type = skypilot_config.get_effective_region_config(
+        cloud='kubernetes',
+        region=context,
+        keys=('autoscaler',),
+        default_value=None)
+    autoscaler_is_set = autoscaler_type is not None
+    use_heuristic_detection = (autoscaler_is_set and
+                               not kubernetes_enums.KubernetesAutoscalerType(
+                                   autoscaler_type).emits_autoscale_event())
+    is_autoscaling = False
+
     def _evaluate_timeout() -> bool:
         # If timeout is negative, retry indefinitely.
         if timeout < 0:
@@ -328,12 +425,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     while _evaluate_timeout():
         # Get all pods in a single API call using the cluster name label
         # which all pods in new_nodes should share
-        cluster_name = new_nodes[0].metadata.labels[
+        cluster_name_on_cloud = new_nodes[0].metadata.labels[
             k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in pods}
@@ -357,6 +455,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):

         if all_scheduled:
             return
+
+        # Check if cluster is autoscaling and update spinner message.
+        # Minor optimization to not query k8s api after autoscaling
+        # event was detected. This is useful because there isn't any
+        # autoscaling complete event.
+        if autoscaler_is_set and not is_autoscaling:
+            if use_heuristic_detection:
+                is_autoscaling = _cluster_maybe_autoscaling(
+                    namespace, context, create_pods_start)
+                msg = 'Kubernetes cluster may be scaling up'
+            else:
+                is_autoscaling = _cluster_had_autoscale_event(
+                    namespace, context, create_pods_start)
+                msg = 'Kubernetes cluster is autoscaling'
+
+            if is_autoscaling:
+                rich_utils.force_update_status(
+                    ux_utils.spinner_message(f'Launching ({msg})',
+                                             cluster_name=cluster_name))
+
         time.sleep(1)

     # Handle pod scheduling errors
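
A note on what the event check above amounts to: the helpers list namespaced events filtered by reason and compare their timestamps against the time provisioning started. A minimal standalone sketch of the same idea using the official kubernetes Python client (SkyPilot itself goes through its kubernetes.core_api() adaptor; the function name below is illustrative):

# Illustrative sketch only; mirrors the detection helpers above using the
# official `kubernetes` client directly instead of SkyPilot's adaptor.
import datetime

from kubernetes import client, config


def had_event_since(namespace: str, reason: str,
                    since: datetime.datetime) -> bool:
    """True if an event with `reason` occurred in `namespace` after `since`."""
    config.load_kube_config()  # or config.load_incluster_config() in-cluster
    events = client.CoreV1Api().list_namespaced_event(
        namespace=namespace, field_selector=f'reason={reason}')
    for event in events.items:
        # Prefer last_timestamp, falling back to the creation timestamp.
        ts = event.last_timestamp or event.metadata.creation_timestamp
        if ts is None:
            continue
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=datetime.timezone.utc)
        if ts.astimezone(datetime.timezone.utc) > since:
            return True
    return False


# Example: did the cluster autoscaler trigger a scale-up in the last 5 minutes?
# had_event_since('default', 'TriggeredScaleUp',
#                 datetime.datetime.now(datetime.timezone.utc)
#                 - datetime.timedelta(minutes=5))
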
@@ -761,13 +879,14 @@ def _wait_for_deployment_pod(context,


 @timeline.event
-def _create_pods(region: str, cluster_name_on_cloud: str,
+def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Create pods based on the config."""
     provider_config = config.provider_config
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
     pod_spec = copy.deepcopy(config.node_config)
+    create_pods_start = datetime.datetime.now(datetime.timezone.utc)

     to_create_deployment = 'deployment_spec' in pod_spec
     if to_create_deployment:
@@ -1047,7 +1166,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+                               cluster_name, create_pods_start)
+    # Reset spinner message here because it might have hinted autoscaling
+    # while waiting for pods to schedule.
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching', cluster_name=cluster_name))
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
@@ -1068,11 +1192,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     )


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
     try:
-        return _create_pods(region, cluster_name_on_cloud, config)
+        return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
         e_msg = common_utils.format_exception(e).replace('\n', ' ')
         logger.warning('run_instances: Error occurred when creating pods: '
@@ -1238,6 +1362,7 @@ def get_cluster_info(

     running_pods = kubernetes_utils.filter_pods(
         namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+    logger.debug(f'Running pods: {list(running_pods.keys())}')

     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -1276,7 +1401,8 @@ def get_cluster_info(
         assert head_spec is not None, pod
         cpu_request = head_spec.containers[0].resources.requests['cpu']

-    assert cpu_request is not None, 'cpu_request should not be None'
+    assert cpu_request is not None, ('cpu_request should not be None, check '
+                                     'the Pod status')

     ssh_user = 'sky'
     # Use pattern matching to extract SSH user, handling MOTD contamination.
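
The use_heuristic_detection switch in _wait_for_pods_to_schedule assumes the autoscaler-type enum can report whether it emits TriggeredScaleUp events (the sky/utils/kubernetes_enums.py change in this release). A rough, hypothetical sketch of such a capability check; the member names here are illustrative and not necessarily the real enum values:

# Hypothetical sketch of an autoscaler-type enum with an
# emits_autoscale_event() capability check; not copied from SkyPilot.
import enum


class AutoscalerType(enum.Enum):
    GKE = 'gke'
    KARPENTER = 'karpenter'
    GENERIC = 'generic'

    def emits_autoscale_event(self) -> bool:
        # cluster-autoscaler-based setups emit TriggeredScaleUp events;
        # Karpenter does not, so callers fall back to the FailedScheduling
        # heuristic for it.
        return self is not AutoscalerType.KARPENTER
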
sky/provision/lambda_cloud/instance.py CHANGED
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
     return private_ip


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster"""
+    del cluster_name  # unused
     lambda_client = _get_lambda_client()
     pending_status = ['booting']
     while True:
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
                     f' to be ready.')


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     _wait_until_no_pending(region, cluster_name_on_cloud)
     running_instances = _filter_instances(region, cluster_name_on_cloud,
                                           ['RUNNING'])
@@ -65,9 +65,10 @@ def query_instances(


 @query_utils.debug_enabled(logger)
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Start instances with bootstrapped configuration."""
+    del cluster_name  # unused
     tags = dict(sorted(copy.deepcopy(config.tags).items()))

     start_time = round(time.time() * 1000)
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
     ]
@@ -65,10 +65,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     # Helper is available as utils.parse_ssh_connection.


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'PROVISIONING',
         'PENDING',
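
The hunks above (and several more below) apply the same mechanical change to every provider backend: run_instances gains the user-facing cluster name next to the on-cloud name, and providers that do not consume it discard it explicitly. A minimal sketch of the assumed shared shape, not copied from any single provider:

# Assumed shape of the updated per-cloud provision entrypoint; only
# Kubernetes actually uses `cluster_name` (for spinner messages).
from sky.provision import common


def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster."""
    del cluster_name  # unused by most providers
    # ... provider-specific provisioning keyed on cluster_name_on_cloud ...
    raise NotImplementedError
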
sky/provision/provisioner.py CHANGED
@@ -69,6 +69,7 @@ def _bulk_provision(

         provision_record = provision.run_instances(provider_name,
                                                    region_name,
+                                                   str(cluster_name),
                                                    cluster_name.name_on_cloud,
                                                    config=config)

sky/provision/runpod/__init__.py CHANGED
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
 from sky.provision.runpod.volume import apply_volume
 from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_all_volumes_usedby
 from sky.provision.runpod.volume import get_volume_usedby
+from sky.provision.runpod.volume import map_all_volumes_usedby
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']

     while True:
@@ -13,9 +13,9 @@ from sky.utils import status_lib
 logger = logging.getLogger(__name__)


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
-
+    del cluster_name  # unused
     zone_id = config.node_config['zone_id']
     running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
     head_instance_id = _get_head_instance_id(running_instances)
sky/provision/seeweb/instance.py CHANGED
@@ -502,9 +502,10 @@ class SeewebNodeProvider:
 # =============================================================================


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: ProvisionConfig) -> ProvisionRecord:
     """Run instances for Seeweb cluster."""
+    del cluster_name  # unused
     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
     provider.run_instances(config.node_config, config.count)

@@ -44,9 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return None


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']

     created_instance_ids = []
sky/provision/vsphere/instance.py CHANGED
@@ -30,9 +30,10 @@ HEAD_NODE_VALUE = '1'
 WORKER_NODE_VALUE = '0'


-def run_instances(region: str, cluster_name: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: run_instances().')

     resumed_instance_ids: List[str] = []
@@ -40,7 +41,7 @@ def run_instances(region: str, cluster_name: str,
     vc_object = _get_vc_object(region)
     vc_object.connect()

-    exist_instances = _get_filtered_instance(vc_object, cluster_name,
+    exist_instances = _get_filtered_instance(vc_object, cluster_name_on_cloud,
                                              config.provider_config)
     head_instance_id = _get_head_instance_id(exist_instances)

@@ -89,8 +90,8 @@ def run_instances(region: str, cluster_name: str,
                                                   config, region, vc_object)
     # TODO: update logic for multi-node creation
     for _ in range(to_start_num):
-        created_instance_uuid = _create_instances(cluster_name, config,
-                                                  region, vc_object,
+        created_instance_uuid = _create_instances(cluster_name_on_cloud,
+                                                  config, region, vc_object,
                                                   vsphere_cluster_name)
         created_instance_ids.append(created_instance_uuid)
         if head_instance_id is None:
@@ -104,7 +105,7 @@ def run_instances(region: str, cluster_name: str,
         provider_name='vsphere',
         region=region,
         zone=vsphere_cluster_name,
-        cluster_name=cluster_name,
+        cluster_name=cluster_name_on_cloud,
         head_instance_id=head_instance_id,
         resumed_instance_ids=resumed_instance_ids,
         created_instance_ids=created_instance_ids,
sky/schemas/api/responses.py CHANGED
@@ -86,7 +86,7 @@ class StatusResponse(ResponseBaseModel):
     # backends.ResourceHandle, so we use Any here.
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
-    handle: Any
+    handle: Optional[Any] = None
     last_use: str
     status: status_lib.ClusterStatus
     autostop: int
@@ -118,6 +118,7 @@ class StatusResponse(ResponseBaseModel):
     cpus: Optional[str] = None
     memory: Optional[str] = None
     accelerators: Optional[str] = None
+    cluster_name_on_cloud: Optional[str] = None


 class UploadStatus(enum.Enum):
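
Making handle optional (and adding cluster_name_on_cloud with a None default) keeps StatusResponse parseable when a response omits these fields. A toy illustration of the effect with a hypothetical stand-in model, not the real schema:

# Toy example of Optional fields with None defaults in pydantic v2.
from typing import Any, Optional

import pydantic


class StatusRow(pydantic.BaseModel):
    name: str
    handle: Optional[Any] = None
    cluster_name_on_cloud: Optional[str] = None


# A payload that omits both optional fields still validates:
row = StatusRow.model_validate({'name': 'my-cluster'})
assert row.handle is None and row.cluster_name_on_cloud is None
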
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py ADDED
@@ -0,0 +1,89 @@
+"""Add last_activity_time and launched_at to cluster history.
+
+Revision ID: 009
+Revises: 008
+Create Date: 2025-09-24
+
+"""
+# pylint: disable=invalid-name
+import pickle
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '009'
+down_revision: Union[str, Sequence[str], None] = '008'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add last_activity_time and launched_at columns to cluster history."""
+    with op.get_context().autocommit_block():
+        # Add the columns with indices
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'last_activity_time',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'launched_at',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+    # Populate the columns for existing rows
+    _populate_cluster_history_columns()
+
+
+def _populate_cluster_history_columns():
+    """Populate last_activity_time and launched_at for existing rows using
+    usage_intervals logic."""
+    connection = op.get_bind()
+
+    # Get all existing rows with usage_intervals
+    result = connection.execute(
+        sa.text('SELECT cluster_hash, usage_intervals FROM cluster_history '
+                'WHERE usage_intervals IS NOT NULL'))
+
+    for row in result:
+        cluster_hash = row[0]
+        usage_intervals_blob = row[1]
+
+        try:
+            # Deserialize the usage_intervals
+            usage_intervals = pickle.loads(usage_intervals_blob)
+
+            if usage_intervals:
+                # Calculate last_activity_time: end time of last interval
+                # or start time if still running
+                last_interval = usage_intervals[-1]
+                last_activity_time = (last_interval[1] if last_interval[1]
+                                      is not None else last_interval[0])
+
+                # Calculate launched_at: start time of first interval
+                launched_at = usage_intervals[0][0]
+
+                # Update the row with both calculated values
+                connection.execute(
+                    sa.text('UPDATE cluster_history '
+                            'SET last_activity_time = :last_activity_time, '
+                            'launched_at = :launched_at '
+                            'WHERE cluster_hash = :cluster_hash'), {
+                                'last_activity_time': last_activity_time,
+                                'launched_at': launched_at,
+                                'cluster_hash': cluster_hash
+                            })
+        except (pickle.PickleError, AttributeError, IndexError):
+            # Skip rows with corrupted or invalid usage_intervals
+            continue
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
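
To make the backfill concrete: usage_intervals is a pickled list of (start, end) pairs, with end set to None while the cluster is still up. A worked example (timestamps shown as illustrative epoch seconds) of how the two derived columns come out:

# Worked example of the backfill computation in the migration above.
usage_intervals = [(1700000000, 1700003600), (1700010000, None)]

launched_at = usage_intervals[0][0]  # start of the first interval
last = usage_intervals[-1]
# End of the last interval, or its start if the cluster is still running.
last_activity_time = last[1] if last[1] is not None else last[0]

print(launched_at, last_activity_time)  # 1700000000 1700010000
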
sky/serve/autoscalers.py CHANGED
@@ -411,6 +411,8 @@ class _AutoscalerWithHysteresis(Autoscaler):
         # `_set_target_num_replicas_with_hysteresis` to have the replicas
         # quickly scale after each update.
         self.target_num_replicas = self._calculate_target_num_replicas()
+        logger.debug(f'Target number of replicas: {self.target_num_replicas}'
+                     'after update_version.')
         # Cleanup hysteresis counters.
         self.upscale_counter = 0
         self.downscale_counter = 0
sky/serve/client/impl.py CHANGED
@@ -105,7 +105,8 @@ def update(


 def apply(
-    task: Union['sky.Task', 'sky.Dag'],
+    task: Optional[Union['sky.Task', 'sky.Dag']],
+    workers: Optional[int],
     service_name: str,
     mode: 'serve_utils.UpdateMode',
     pool: bool = False,
@@ -117,35 +118,60 @@ def apply(
     # Avoid circular import.
     from sky.client import sdk  # pylint: disable=import-outside-toplevel

-    dag = dag_utils.convert_entrypoint_to_dag(task)
-    with admin_policy_utils.apply_and_use_config_in_current_request(
-            dag, at_client_side=True) as dag:
-        sdk.validate(dag)
-        request_id = sdk.optimize(dag)
-        sdk.stream_and_get(request_id)
-        if _need_confirmation:
-            noun = 'pool' if pool else 'service'
-            prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
-            if prompt is not None:
-                click.confirm(prompt,
-                              default=True,
-                              abort=True,
-                              show_default=True)
-
-        dag = client_common.upload_mounts_to_api_server(dag)
-        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+    noun = 'pool' if pool else 'service'
+    # There are two cases here. If task is None, we should be trying to
+    # update the number of workers in the pool. If task is not None, we should
+    # be trying to apply a new config to the pool. The two code paths
+    # are slightly different with us needing to craft the dag and validate
+    # it if we have a task. In the future we could move this logic to the
+    # server side and simplify this code, for the time being we keep it here.
+    if task is None:
+        if workers is None:
+            raise ValueError(f'Cannot create a new {noun} without specifying '
+                             f'task or workers. Please provide either a task '
+                             f'or specify the number of workers.')

         body = payloads.JobsPoolApplyBody(
-            task=dag_str,
+            workers=workers,
             pool_name=service_name,
             mode=mode,
         )
+
         response = server_common.make_authenticated_request(
             'POST',
             '/jobs/pool_apply',
             json=json.loads(body.model_dump_json()),
             timeout=(5, None))
         return server_common.get_request_id(response)
+    else:
+        dag = dag_utils.convert_entrypoint_to_dag(task)
+        with admin_policy_utils.apply_and_use_config_in_current_request(
+                dag, at_client_side=True) as dag:
+            sdk.validate(dag)
+            request_id = sdk.optimize(dag)
+            sdk.stream_and_get(request_id)
+            if _need_confirmation:
+                prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
+                if prompt is not None:
+                    click.confirm(prompt,
+                                  default=True,
+                                  abort=True,
+                                  show_default=True)
+
+            dag = client_common.upload_mounts_to_api_server(dag)
+            dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+            body = payloads.JobsPoolApplyBody(
+                task=dag_str,
+                pool_name=service_name,
+                mode=mode,
+            )
+            response = server_common.make_authenticated_request(
+                'POST',
+                '/jobs/pool_apply',
+                json=json.loads(body.model_dump_json()),
+                timeout=(5, None))
+            return server_common.get_request_id(response)


 def down(
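
Illustrative call sites for the two branches of apply() after this change (a sketch only; it assumes a reachable API server, the sky.serve.client.impl module path shown in the header above, and that serve_utils.UpdateMode.ROLLING is a valid mode):

# Sketch of the two ways to call apply(); not a test.
import sky
from sky.serve import serve_utils
from sky.serve.client import impl

# 1) Scale an existing pool to 4 workers without re-uploading a task.
request_id = impl.apply(task=None, workers=4, service_name='my-pool',
                        mode=serve_utils.UpdateMode.ROLLING, pool=True)

# 2) Apply a full task/config update to the same pool.
task = sky.Task(run='echo hi')
request_id = impl.apply(task=task, workers=None, service_name='my-pool',
                        mode=serve_utils.UpdateMode.ROLLING, pool=True)
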
sky/serve/replica_managers.py CHANGED
@@ -422,11 +422,12 @@ class ReplicaInfo:
         based on the cluster name.
         """
         if cluster_record is None:
-            cluster_record = global_user_state.get_cluster_from_name(
+            handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
-            if cluster_record is None:
+        else:
+            handle = cluster_record['handle']
+        if handle is None:
             return None
-        handle = cluster_record['handle']
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         return handle

@@ -443,6 +444,12 @@ class ReplicaInfo:
         handle = self.handle()
         if handle is None:
             return None
+        if self.replica_port == '-':
+            # This is a pool replica so there is no endpoint and it's filled
+            # with this dummy value. We return None here so that we can
+            # get the active ready replicas and perform autoscaling. Otherwise,
+            # would error out when trying to get the endpoint.
+            return None
         replica_port_int = int(self.replica_port)
         try:
             endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
@@ -470,7 +477,7 @@ class ReplicaInfo:
                 with_handle: bool,
                 with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
-            self.cluster_name)
+            self.cluster_name, include_user_info=False, summary_response=True)
         info_dict = {
             'replica_id': self.replica_id,
             'name': self.cluster_name,
@@ -956,7 +963,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         # provision) or the cluster is preempted and cleaned up by the status
         # refresh. In this case, we skip spawning a new down process to save
         # controller resources.
-        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
             self._handle_sky_down_finish(info, exitcode=0)
             return
