skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -262,7 +262,7 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
262
262
  controller = controller_utils.get_controller_for_pool(pool).value
263
263
  if current_is_consolidation_mode:
264
264
  controller_cn = controller.cluster_name
265
- if global_user_state.get_cluster_from_name(controller_cn) is not None:
265
+ if global_user_state.cluster_with_name_exists(controller_cn):
266
266
  with ux_utils.print_exception_no_traceback():
267
267
  raise exceptions.InconsistentConsolidationModeError(
268
268
  f'{colorama.Fore.RED}Consolidation mode for '
@@ -896,8 +896,8 @@ def _terminate_failed_services(
896
896
  # replicas, so we don't need to try again here.
897
897
  for replica_info in serve_state.get_replica_infos(service_name):
898
898
  # TODO(tian): Refresh latest status of the cluster.
899
- if global_user_state.get_cluster_from_name(
900
- replica_info.cluster_name) is not None:
899
+ if global_user_state.cluster_with_name_exists(
900
+ replica_info.cluster_name):
901
901
  remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
902
902
  serve_state.remove_replica(service_name, replica_info.replica_id)
903
903
 
@@ -1133,10 +1133,8 @@ def _process_line(line: str,
1133
1133
  # `✓ Cluster launched: new-http. View logs at: *.log`
1134
1134
  # We should tail the detailed logs for user.
1135
1135
  def cluster_is_up() -> bool:
1136
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
1137
- if cluster_record is None:
1138
- return False
1139
- return cluster_record['status'] == status_lib.ClusterStatus.UP
1136
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
1137
+ return status == status_lib.ClusterStatus.UP
1140
1138
 
1141
1139
  provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
1142
1140
  line)
sky/serve/server/core.py CHANGED
@@ -46,20 +46,23 @@ def up(
46
46
 
47
47
 
48
48
  @usage_lib.entrypoint
49
- def update(
50
- task: 'sky.Task',
51
- service_name: str,
52
- mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE) -> None:
49
+ def update(task: Optional['sky.Task'],
50
+ service_name: str,
51
+ mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
52
+ workers: Optional[int] = None) -> None:
53
53
  """Updates an existing service.
54
54
 
55
55
  Please refer to the sky.cli.serve_update for the document.
56
56
 
57
57
  Args:
58
- task: sky.Task to update.
58
+ task: sky.Task to update, or None if updating
59
+ the number of workers/replicas.
59
60
  service_name: Name of the service.
60
61
  mode: Update mode.
62
+ workers: Number of workers/replicas to set for the service when
63
+ task is None.
61
64
  """
62
- return impl.update(task, service_name, mode, pool=False)
65
+ return impl.update(task, service_name, mode, pool=False, workers=workers)
63
66
 
64
67
 
65
68
  @usage_lib.entrypoint
sky/serve/server/impl.py CHANGED
@@ -411,6 +411,9 @@ def up(
411
411
  f'\n{ux_utils.INDENT_LAST_SYMBOL}To terminate the pool:\t'
412
412
  f'{ux_utils.BOLD}sky jobs pool down {service_name}'
413
413
  f'{ux_utils.RESET_BOLD}'
414
+ f'\n{ux_utils.INDENT_SYMBOL}To update the number of workers:\t'
415
+ f'{ux_utils.BOLD}sky jobs pool apply --pool {service_name} '
416
+ f'--workers 5{ux_utils.RESET_BOLD}'
414
417
  '\n\n' + ux_utils.finishing_message('Successfully created pool '
415
418
  f'{service_name!r}.'))
416
419
  else:
@@ -448,37 +451,15 @@ def up(
448
451
 
449
452
 
450
453
  def update(
451
- task: 'task_lib.Task',
454
+ task: Optional['task_lib.Task'],
452
455
  service_name: str,
453
456
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
454
457
  pool: bool = False,
458
+ workers: Optional[int] = None,
455
459
  ) -> None:
456
460
  """Updates an existing service or pool."""
457
461
  noun = 'pool' if pool else 'service'
458
462
  capnoun = noun.capitalize()
459
- task.validate()
460
- serve_utils.validate_service_task(task, pool=pool)
461
-
462
- # Always apply the policy again here, even though it might have been applied
463
- # in the CLI. This is to ensure that we apply the policy to the final DAG
464
- # and get the mutated config.
465
- # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
466
- # will not apply the config.
467
- dag, _ = admin_policy_utils.apply(task)
468
- task = dag.tasks[0]
469
- if pool:
470
- if task.run is not None:
471
- logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
472
- f'ignored for pool.{colorama.Style.RESET_ALL}')
473
- # Use dummy run script for cluster pool.
474
- task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
475
-
476
- assert task.service is not None
477
- if not pool and task.service.tls_credential is not None:
478
- logger.warning('Updating TLS keyfile and certfile is not supported. '
479
- 'Any updates to the keyfile and certfile will not take '
480
- 'effect. To update TLS keyfile and certfile, please '
481
- 'tear down the service and spin up a new one.')
482
463
 
483
464
  controller_type = controller_utils.get_controller_for_pool(pool)
484
465
  handle = backend_utils.is_controller_accessible(
@@ -505,6 +486,77 @@ def update(
505
486
  f'To spin up a {noun}, use {ux_utils.BOLD}'
506
487
  f'{cmd}{ux_utils.RESET_BOLD}')
507
488
 
489
+ # If task is None and workers is specified, load existing configuration
490
+ # and update replica count.
491
+ if task is None:
492
+ if workers is None:
493
+ with ux_utils.print_exception_no_traceback():
494
+ raise ValueError(
495
+ f'Cannot update {noun} without specifying '
496
+ f'task or workers. Please provide either a task '
497
+ f'or specify the number of workers.')
498
+
499
+ if not pool:
500
+ with ux_utils.print_exception_no_traceback():
501
+ raise ValueError(
502
+ 'Non-pool service, trying to update replicas to '
503
+ f'{workers} is not supported. Ignoring the update.')
504
+
505
+ # Load the existing task configuration from the service's YAML file
506
+ latest_yaml_path = serve_utils.generate_task_yaml_file_name(
507
+ service_name, service_record['version'], expand_user=False)
508
+
509
+ logger.debug('Loading existing task configuration from '
510
+ f'{latest_yaml_path} to create a new modified task.')
511
+
512
+ # Get the path locally.
513
+ with tempfile.NamedTemporaryFile(
514
+ prefix=f'service-task-{service_name}-',
515
+ mode='w',
516
+ ) as service_file:
517
+ try:
518
+ backend.download_file(handle, latest_yaml_path,
519
+ service_file.name)
520
+ except exceptions.CommandError as e:
521
+ raise RuntimeError(
522
+ f'Failed to download the old task configuration from '
523
+ f'{latest_yaml_path}: {e.error_msg}') from e
524
+
525
+ # Load the existing task configuration
526
+ existing_config = yaml_utils.read_yaml(service_file.name)
527
+ task = task_lib.Task.from_yaml_config(existing_config)
528
+
529
+ if task.service is None:
530
+ with ux_utils.print_exception_no_traceback():
531
+ raise RuntimeError('No service configuration found in '
532
+ f'existing {noun} {service_name!r}')
533
+ task.set_service(task.service.copy(min_replicas=workers))
534
+
535
+ task.validate()
536
+ serve_utils.validate_service_task(task, pool=pool)
537
+
538
+ # Now apply the policy and handle task-specific logic
539
+ # Always apply the policy again here, even though it might have been applied
540
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
541
+ # and get the mutated config.
542
+ # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
543
+ # will not apply the config.
544
+ dag, _ = admin_policy_utils.apply(task)
545
+ task = dag.tasks[0]
546
+ if pool:
547
+ if task.run is not None:
548
+ logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
549
+ f'ignored for pool.{colorama.Style.RESET_ALL}')
550
+ # Use dummy run script for cluster pool.
551
+ task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
552
+
553
+ assert task.service is not None
554
+ if not pool and task.service.tls_credential is not None:
555
+ logger.warning('Updating TLS keyfile and certfile is not supported. '
556
+ 'Any updates to the keyfile and certfile will not take '
557
+ 'effect. To update TLS keyfile and certfile, please '
558
+ 'tear down the service and spin up a new one.')
559
+
508
560
  prompt = None
509
561
  if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
510
562
  ):
@@ -625,6 +677,7 @@ def update(
625
677
 
626
678
  def apply(
627
679
  task: 'task_lib.Task',
680
+ workers: Optional[int],
628
681
  service_name: str,
629
682
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
630
683
  pool: bool = False,
@@ -640,7 +693,7 @@ def apply(
640
693
  service_record = _get_service_record(service_name, pool, handle,
641
694
  backend)
642
695
  if service_record is not None:
643
- return update(task, service_name, mode, pool)
696
+ return update(task, service_name, mode, pool, workers)
644
697
  except exceptions.ClusterNotUpError:
645
698
  pass
646
699
  up(task, service_name, pool)
@@ -98,7 +98,7 @@ async def tail_logs(
98
98
  request: fastapi.Request, log_body: payloads.ServeLogsBody,
99
99
  background_tasks: fastapi.BackgroundTasks
100
100
  ) -> fastapi.responses.StreamingResponse:
101
- executor.schedule_request(
101
+ request_task = executor.prepare_request(
102
102
  request_id=request.state.request_id,
103
103
  request_name='serve.logs',
104
104
  request_body=log_body,
@@ -106,10 +106,9 @@ async def tail_logs(
106
106
  schedule_type=api_requests.ScheduleType.SHORT,
107
107
  request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
108
108
  )
109
-
110
- request_task = await api_requests.get_request_async(request.state.request_id
111
- )
112
-
109
+ task = executor.execute_request_in_coroutine(request_task)
110
+ # Cancel the coroutine after the request is done or client disconnects
111
+ background_tasks.add_task(task.cancel)
113
112
  return stream_utils.stream_response(
114
113
  request_id=request_task.request_id,
115
114
  logs_path=request_task.log_path,
sky/serve/service_spec.py CHANGED
@@ -506,3 +506,36 @@ class SkyServiceSpec:
506
506
  if not hasattr(self, '_pool'):
507
507
  return False
508
508
  return bool(self._pool)
509
+
510
+ def copy(self, **override) -> 'SkyServiceSpec':
511
+ return SkyServiceSpec(
512
+ readiness_path=override.pop('readiness_path', self._readiness_path),
513
+ initial_delay_seconds=override.pop('initial_delay_seconds',
514
+ self._initial_delay_seconds),
515
+ readiness_timeout_seconds=override.pop(
516
+ 'readiness_timeout_seconds', self._readiness_timeout_seconds),
517
+ min_replicas=override.pop('min_replicas', self._min_replicas),
518
+ max_replicas=override.pop('max_replicas', self._max_replicas),
519
+ num_overprovision=override.pop('num_overprovision',
520
+ self._num_overprovision),
521
+ ports=override.pop('ports', self._ports),
522
+ target_qps_per_replica=override.pop('target_qps_per_replica',
523
+ self._target_qps_per_replica),
524
+ post_data=override.pop('post_data', self._post_data),
525
+ tls_credential=override.pop('tls_credential', self._tls_credential),
526
+ readiness_headers=override.pop('readiness_headers',
527
+ self._readiness_headers),
528
+ dynamic_ondemand_fallback=override.pop(
529
+ 'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
530
+ base_ondemand_fallback_replicas=override.pop(
531
+ 'base_ondemand_fallback_replicas',
532
+ self._base_ondemand_fallback_replicas),
533
+ spot_placer=override.pop('spot_placer', self._spot_placer),
534
+ upscale_delay_seconds=override.pop('upscale_delay_seconds',
535
+ self._upscale_delay_seconds),
536
+ downscale_delay_seconds=override.pop('downscale_delay_seconds',
537
+ self._downscale_delay_seconds),
538
+ load_balancing_policy=override.pop('load_balancing_policy',
539
+ self._load_balancing_policy),
540
+ pool=override.pop('pool', self._pool),
541
+ )
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
10
10
  # based on version info is needed.
11
11
  # For more details and code guidelines, refer to:
12
12
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
- API_VERSION = 18
13
+ API_VERSION = 20
14
14
 
15
15
  # The minimum peer API version that the code should still work with.
16
16
  # Notes (dev):
sky/server/daemons.py CHANGED
@@ -8,7 +8,6 @@ from sky import sky_logging
8
8
  from sky import skypilot_config
9
9
  from sky.server import constants as server_constants
10
10
  from sky.utils import annotations
11
- from sky.utils import common
12
11
  from sky.utils import common_utils
13
12
  from sky.utils import env_options
14
13
  from sky.utils import subprocess_utils
@@ -94,13 +93,13 @@ class InternalRequestDaemon:
94
93
  def refresh_cluster_status_event():
95
94
  """Periodically refresh the cluster status."""
96
95
  # pylint: disable=import-outside-toplevel
97
- from sky import core
96
+ from sky.backends import backend_utils
98
97
 
99
98
  logger.info('=== Refreshing cluster status ===')
100
99
  # This periodically refresh will hold the lock for the cluster being
101
100
  # refreshed, but it is OK because other operations will just wait for
102
101
  # the lock and get the just refreshed status without refreshing again.
103
- core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
102
+ backend_utils.refresh_cluster_records()
104
103
  logger.info('Status refreshed. Sleeping '
105
104
  f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
106
105
  ' seconds for the next refresh...\n')
@@ -502,7 +502,35 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
502
502
  name=request_name).observe(max(peak_rss - rss_begin, 0))
503
503
 
504
504
 
505
- async def execute_request_coroutine(request: api_requests.Request):
505
+ class CoroutineTask:
506
+ """Wrapper of a background task runs in coroutine"""
507
+
508
+ def __init__(self, task: asyncio.Task):
509
+ self.task = task
510
+
511
+ async def cancel(self):
512
+ try:
513
+ self.task.cancel()
514
+ await self.task
515
+ except asyncio.CancelledError:
516
+ pass
517
+
518
+
519
+ def execute_request_in_coroutine(
520
+ request: api_requests.Request) -> CoroutineTask:
521
+ """Execute a request in current event loop.
522
+
523
+ Args:
524
+ request: The request to execute.
525
+
526
+ Returns:
527
+ A CoroutineTask handle to operate the background task.
528
+ """
529
+ task = asyncio.create_task(_execute_request_coroutine(request))
530
+ return CoroutineTask(task)
531
+
532
+
533
+ async def _execute_request_coroutine(request: api_requests.Request):
506
534
  """Execute a request in current event loop.
507
535
 
508
536
  Similar to _request_execution_wrapper, but executed as coroutine in current
@@ -640,13 +668,35 @@ def schedule_request(request_id: str,
640
668
  The precondition is waited asynchronously and does not block the
641
669
  caller.
642
670
  """
643
- prepare_request(request_id, request_name, request_body, func,
644
- request_cluster_name, schedule_type, is_skypilot_system)
671
+ request_task = prepare_request(request_id, request_name, request_body, func,
672
+ request_cluster_name, schedule_type,
673
+ is_skypilot_system)
674
+ schedule_prepared_request(request_task, ignore_return_value, precondition,
675
+ retryable)
676
+
677
+
678
+ def schedule_prepared_request(request_task: api_requests.Request,
679
+ ignore_return_value: bool = False,
680
+ precondition: Optional[
681
+ preconditions.Precondition] = None,
682
+ retryable: bool = False) -> None:
683
+ """Enqueue a request to the request queue
684
+
685
+ Args:
686
+ request_task: The prepared request task to schedule.
687
+ ignore_return_value: If True, the return value of the function will be
688
+ ignored.
689
+ precondition: If a precondition is provided, the request will only be
690
+ scheduled for execution when the precondition is met (returns True).
691
+ The precondition is waited asynchronously and does not block the
692
+ caller.
693
+ retryable: Whether the request should be retried if it fails.
694
+ """
645
695
 
646
696
  def enqueue():
647
- input_tuple = (request_id, ignore_return_value, retryable)
648
- logger.info(f'Queuing request: {request_id}')
649
- _get_queue(schedule_type).put(input_tuple)
697
+ input_tuple = (request_task.request_id, ignore_return_value, retryable)
698
+ logger.info(f'Queuing request: {request_task.request_id}')
699
+ _get_queue(request_task.schedule_type).put(input_tuple)
650
700
 
651
701
  if precondition is not None:
652
702
  # Wait async to avoid blocking caller.
@@ -316,6 +316,9 @@ class StatusBody(RequestBody):
316
316
  all_users: bool = True
317
317
  # TODO (kyuds): default to False post 0.10.5
318
318
  include_credentials: bool = True
319
+ # Only return fields that are needed for the
320
+ # dashboard / CLI summary response
321
+ summary_response: bool = False
319
322
 
320
323
 
321
324
  class StartBody(RequestBody):
@@ -475,6 +478,17 @@ class VolumeListBody(RequestBody):
475
478
  pass
476
479
 
477
480
 
481
+ class VolumeValidateBody(RequestBody):
482
+ """The request body for the volume validate endpoint."""
483
+ name: Optional[str] = None
484
+ volume_type: Optional[str] = None
485
+ infra: Optional[str] = None
486
+ size: Optional[str] = None
487
+ labels: Optional[Dict[str, str]] = None
488
+ resource_name: Optional[str] = None
489
+ config: Optional[Dict[str, Any]] = None
490
+
491
+
478
492
  class EndpointsBody(RequestBody):
479
493
  """The request body for the endpoint."""
480
494
  cluster: str
@@ -670,6 +684,13 @@ class LocalUpBody(RequestBody):
670
684
  cleanup: bool = False
671
685
  context_name: Optional[str] = None
672
686
  password: Optional[str] = None
687
+ name: Optional[str] = None
688
+ port_start: Optional[int] = None
689
+
690
+
691
+ class LocalDownBody(RequestBody):
692
+ """The request body for the local down endpoint."""
693
+ name: Optional[str] = None
673
694
 
674
695
 
675
696
  class SSHUpBody(RequestBody):
@@ -709,19 +730,22 @@ class JobsDownloadLogsBody(RequestBody):
709
730
 
710
731
  class JobsPoolApplyBody(RequestBody):
711
732
  """The request body for the jobs pool apply endpoint."""
712
- task: str
733
+ task: Optional[str] = None
734
+ workers: Optional[int] = None
713
735
  pool_name: str
714
736
  mode: serve.UpdateMode
715
737
 
716
738
  def to_kwargs(self) -> Dict[str, Any]:
717
739
  kwargs = super().to_kwargs()
718
- dag = common.process_mounts_in_task_on_api_server(self.task,
719
- self.env_vars,
720
- workdir_only=False)
721
- assert len(
722
- dag.tasks) == 1, ('Must only specify one task in the DAG for '
723
- 'a pool.', dag)
724
- kwargs['task'] = dag.tasks[0]
740
+ if self.task is not None:
741
+ dag = common.process_mounts_in_task_on_api_server(
742
+ self.task, self.env_vars, workdir_only=False)
743
+ assert len(
744
+ dag.tasks) == 1, ('Must only specify one task in the DAG for '
745
+ 'a pool.', dag)
746
+ kwargs['task'] = dag.tasks[0]
747
+ else:
748
+ kwargs['task'] = None
725
749
  return kwargs
726
750
 
727
751
 
@@ -146,10 +146,9 @@ class ClusterStartCompletePrecondition(Precondition):
146
146
  self.cluster_name = cluster_name
147
147
 
148
148
  async def check(self) -> Tuple[bool, Optional[str]]:
149
- cluster_record = global_user_state.get_cluster_from_name(
149
+ cluster_status = global_user_state.get_status_from_cluster_name(
150
150
  self.cluster_name)
151
- if (cluster_record and
152
- cluster_record['status'] is status_lib.ClusterStatus.UP):
151
+ if cluster_status is status_lib.ClusterStatus.UP:
153
152
  # Shortcut for started clusters, ignore cluster not found
154
153
  # since the cluster record might not yet be created by the
155
154
  # launch task.
sky/server/rest.py CHANGED
@@ -9,6 +9,7 @@ import typing
9
9
  from typing import Any, Callable, cast, Optional, TypeVar
10
10
 
11
11
  import colorama
12
+ import urllib3.exceptions
12
13
 
13
14
  from sky import exceptions
14
15
  from sky import sky_logging
@@ -53,6 +54,7 @@ _session.headers[constants.VERSION_HEADER] = (
53
54
  _transient_errors = [
54
55
  requests.exceptions.RequestException,
55
56
  ConnectionError,
57
+ urllib3.exceptions.HTTPError,
56
58
  ]
57
59
 
58
60
 
sky/server/server.py CHANGED
@@ -445,6 +445,22 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
445
445
  loop.call_at(target, tick)
446
446
 
447
447
 
448
+ def schedule_on_boot_check():
449
+ try:
450
+ executor.schedule_request(
451
+ request_id='skypilot-server-on-boot-check',
452
+ request_name='check',
453
+ request_body=payloads.CheckBody(),
454
+ func=sky_check.check,
455
+ schedule_type=requests_lib.ScheduleType.SHORT,
456
+ is_skypilot_system=True,
457
+ )
458
+ except exceptions.RequestAlreadyExistsError:
459
+ # Lifespan will be executed in each uvicorn worker process, we
460
+ # can safely ignore the error if the task is already scheduled.
461
+ logger.debug('Request skypilot-server-on-boot-check already exists.')
462
+
463
+
448
464
  @contextlib.asynccontextmanager
449
465
  async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
450
466
  """FastAPI lifespan context manager."""
@@ -469,6 +485,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
469
485
  # Lifespan will be executed in each uvicorn worker process, we
470
486
  # can safely ignore the error if the task is already scheduled.
471
487
  logger.debug(f'Request {event.id} already exists.')
488
+ schedule_on_boot_check()
472
489
  asyncio.create_task(cleanup_upload_ids())
473
490
  if metrics_utils.METRICS_ENABLED:
474
491
  # Start monitoring the event loop lag in each server worker
@@ -1216,19 +1233,8 @@ async def logs(
1216
1233
  schedule_type=requests_lib.ScheduleType.SHORT,
1217
1234
  request_cluster_name=cluster_job_body.cluster_name,
1218
1235
  )
1219
- task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1220
-
1221
- async def cancel_task():
1222
- try:
1223
- logger.info('Client disconnected for request: '
1224
- f'{request.state.request_id}')
1225
- task.cancel()
1226
- await task
1227
- except asyncio.CancelledError:
1228
- pass
1229
-
1230
- # Cancel the task after the request is done or client disconnects
1231
- background_tasks.add_task(cancel_task)
1236
+ task = executor.execute_request_in_coroutine(request_task)
1237
+ background_tasks.add_task(task.cancel)
1232
1238
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
1233
1239
  # the same approach as /stream.
1234
1240
  return stream_utils.stream_response(
@@ -1354,10 +1360,12 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
1354
1360
  effective_tail = None if tail is None or tail <= 0 else tail
1355
1361
 
1356
1362
  return fastapi.responses.StreamingResponse(
1357
- content=stream_utils.log_streamer(None,
1358
- log_path,
1359
- tail=effective_tail,
1360
- follow=follow),
1363
+ content=stream_utils.log_streamer(
1364
+ None,
1365
+ log_path,
1366
+ tail=effective_tail,
1367
+ follow=follow,
1368
+ cluster_name=cluster_body.cluster_name),
1361
1369
  media_type='text/plain',
1362
1370
  headers={
1363
1371
  'Cache-Control': 'no-cache, no-transform',
@@ -1419,12 +1427,13 @@ async def local_up(request: fastapi.Request,
1419
1427
 
1420
1428
 
1421
1429
  @app.post('/local_down')
1422
- async def local_down(request: fastapi.Request) -> None:
1430
+ async def local_down(request: fastapi.Request,
1431
+ local_down_body: payloads.LocalDownBody) -> None:
1423
1432
  """Tears down the Kubernetes cluster started by local_up."""
1424
1433
  executor.schedule_request(
1425
1434
  request_id=request.state.request_id,
1426
1435
  request_name='local_down',
1427
- request_body=payloads.RequestBody(),
1436
+ request_body=local_down_body,
1428
1437
  func=core.local_down,
1429
1438
  schedule_type=requests_lib.ScheduleType.LONG,
1430
1439
  )
@@ -8,10 +8,12 @@ from typing import AsyncGenerator, Deque, List, Optional
8
8
  import aiofiles
9
9
  import fastapi
10
10
 
11
+ from sky import global_user_state
11
12
  from sky import sky_logging
12
13
  from sky.server.requests import requests as requests_lib
13
14
  from sky.utils import message_utils
14
15
  from sky.utils import rich_utils
16
+ from sky.utils import status_lib
15
17
 
16
18
  logger = sky_logging.init_logger(__name__)
17
19
 
@@ -22,6 +24,7 @@ logger = sky_logging.init_logger(__name__)
22
24
  _BUFFER_SIZE = 8 * 1024 # 8KB
23
25
  _BUFFER_TIMEOUT = 0.02 # 20ms
24
26
  _HEARTBEAT_INTERVAL = 30
27
+ _CLUSTER_STATUS_INTERVAL = 1
25
28
 
26
29
 
27
30
  async def _yield_log_file_with_payloads_skipped(
@@ -37,11 +40,13 @@ async def _yield_log_file_with_payloads_skipped(
37
40
  yield line_str
38
41
 
39
42
 
40
- async def log_streamer(request_id: Optional[str],
41
- log_path: pathlib.Path,
42
- plain_logs: bool = False,
43
- tail: Optional[int] = None,
44
- follow: bool = True) -> AsyncGenerator[str, None]:
43
+ async def log_streamer(
44
+ request_id: Optional[str],
45
+ log_path: pathlib.Path,
46
+ plain_logs: bool = False,
47
+ tail: Optional[int] = None,
48
+ follow: bool = True,
49
+ cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
45
50
  """Streams the logs of a request.
46
51
 
47
52
  Args:
@@ -51,6 +56,8 @@ async def log_streamer(request_id: Optional[str],
51
56
  plain_logs: Whether to show plain logs.
52
57
  tail: The number of lines to tail. If None, tail the whole file.
53
58
  follow: Whether to follow the log file.
59
+ cluster_name: The cluster name to check status for provision logs.
60
+ If provided and cluster status is UP, streaming will terminate.
54
61
  """
55
62
 
56
63
  if request_id is not None:
@@ -104,15 +111,17 @@ async def log_streamer(request_id: Optional[str],
104
111
 
105
112
  async with aiofiles.open(log_path, 'rb') as f:
106
113
  async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
107
- follow):
114
+ follow, cluster_name):
108
115
  yield chunk
109
116
 
110
117
 
111
- async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
112
- request_id: Optional[str] = None,
113
- plain_logs: bool = False,
114
- tail: Optional[int] = None,
115
- follow: bool = True) -> AsyncGenerator[str, None]:
118
+ async def _tail_log_file(
119
+ f: aiofiles.threadpool.binary.AsyncBufferedReader,
120
+ request_id: Optional[str] = None,
121
+ plain_logs: bool = False,
122
+ tail: Optional[int] = None,
123
+ follow: bool = True,
124
+ cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
116
125
  """Tail the opened log file, buffer the lines and flush in chunks."""
117
126
 
118
127
  if tail is not None:
@@ -128,6 +137,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
128
137
  yield line_str
129
138
 
130
139
  last_heartbeat_time = asyncio.get_event_loop().time()
140
+ last_cluster_status_check_time = asyncio.get_event_loop().time()
131
141
 
132
142
  # Buffer the lines in memory and flush them in chunks to improve log
133
143
  # tailing throughput.
@@ -176,7 +186,19 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
176
186
  break
177
187
  if not follow:
178
188
  break
179
-
189
+ # Provision logs pass in cluster_name, check cluster status
190
+ # periodically to see if provisioning is done. We only
191
+ # check once a second to avoid overloading the DB.
192
+ check_status = (current_time - last_cluster_status_check_time
193
+ ) >= _CLUSTER_STATUS_INTERVAL
194
+ if cluster_name is not None and check_status:
195
+ cluster_record = await (
196
+ global_user_state.get_status_from_cluster_name_async(
197
+ cluster_name))
198
+ if (cluster_record is None or
199
+ cluster_record != status_lib.ClusterStatus.INIT):
200
+ break
201
+ last_cluster_status_check_time = current_time
180
202
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
181
203
  # Currently just used to keep the connection busy, refer to
182
204
  # https://github.com/skypilot-org/skypilot/issues/5750 for