skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +207 -79
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +112 -53
- sky/client/common.py +4 -2
- sky/client/sdk.py +17 -7
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +9 -54
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +271 -67
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +11 -7
- sky/jobs/server/core.py +5 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +32 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +5 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +24 -18
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/core.py +1 -0
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +35 -28
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
|
@@ -262,7 +262,7 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
|
|
|
262
262
|
controller = controller_utils.get_controller_for_pool(pool).value
|
|
263
263
|
if current_is_consolidation_mode:
|
|
264
264
|
controller_cn = controller.cluster_name
|
|
265
|
-
if global_user_state.
|
|
265
|
+
if global_user_state.cluster_with_name_exists(controller_cn):
|
|
266
266
|
with ux_utils.print_exception_no_traceback():
|
|
267
267
|
raise exceptions.InconsistentConsolidationModeError(
|
|
268
268
|
f'{colorama.Fore.RED}Consolidation mode for '
|
|
@@ -896,8 +896,8 @@ def _terminate_failed_services(
|
|
|
896
896
|
# replicas, so we don't need to try again here.
|
|
897
897
|
for replica_info in serve_state.get_replica_infos(service_name):
|
|
898
898
|
# TODO(tian): Refresh latest status of the cluster.
|
|
899
|
-
if global_user_state.
|
|
900
|
-
replica_info.cluster_name)
|
|
899
|
+
if global_user_state.cluster_with_name_exists(
|
|
900
|
+
replica_info.cluster_name):
|
|
901
901
|
remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
|
|
902
902
|
serve_state.remove_replica(service_name, replica_info.replica_id)
|
|
903
903
|
|
|
@@ -1133,10 +1133,8 @@ def _process_line(line: str,
|
|
|
1133
1133
|
# `✓ Cluster launched: new-http. View logs at: *.log`
|
|
1134
1134
|
# We should tail the detailed logs for user.
|
|
1135
1135
|
def cluster_is_up() -> bool:
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
return False
|
|
1139
|
-
return cluster_record['status'] == status_lib.ClusterStatus.UP
|
|
1136
|
+
status = global_user_state.get_status_from_cluster_name(cluster_name)
|
|
1137
|
+
return status == status_lib.ClusterStatus.UP
|
|
1140
1138
|
|
|
1141
1139
|
provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
|
|
1142
1140
|
line)
|
sky/serve/server/core.py
CHANGED
|
@@ -46,20 +46,23 @@ def up(
|
|
|
46
46
|
|
|
47
47
|
|
|
48
48
|
@usage_lib.entrypoint
|
|
49
|
-
def update(
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
49
|
+
def update(task: Optional['sky.Task'],
|
|
50
|
+
service_name: str,
|
|
51
|
+
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
52
|
+
workers: Optional[int] = None) -> None:
|
|
53
53
|
"""Updates an existing service.
|
|
54
54
|
|
|
55
55
|
Please refer to the sky.cli.serve_update for the document.
|
|
56
56
|
|
|
57
57
|
Args:
|
|
58
|
-
task: sky.Task to update
|
|
58
|
+
task: sky.Task to update, or None if updating
|
|
59
|
+
the number of workers/replicas.
|
|
59
60
|
service_name: Name of the service.
|
|
60
61
|
mode: Update mode.
|
|
62
|
+
workers: Number of workers/replicas to set for the service when
|
|
63
|
+
task is None.
|
|
61
64
|
"""
|
|
62
|
-
return impl.update(task, service_name, mode, pool=False)
|
|
65
|
+
return impl.update(task, service_name, mode, pool=False, workers=workers)
|
|
63
66
|
|
|
64
67
|
|
|
65
68
|
@usage_lib.entrypoint
|
sky/serve/server/impl.py
CHANGED
|
@@ -411,6 +411,9 @@ def up(
|
|
|
411
411
|
f'\n{ux_utils.INDENT_LAST_SYMBOL}To terminate the pool:\t'
|
|
412
412
|
f'{ux_utils.BOLD}sky jobs pool down {service_name}'
|
|
413
413
|
f'{ux_utils.RESET_BOLD}'
|
|
414
|
+
f'\n{ux_utils.INDENT_SYMBOL}To update the number of workers:\t'
|
|
415
|
+
f'{ux_utils.BOLD}sky jobs pool apply --pool {service_name} '
|
|
416
|
+
f'--workers 5{ux_utils.RESET_BOLD}'
|
|
414
417
|
'\n\n' + ux_utils.finishing_message('Successfully created pool '
|
|
415
418
|
f'{service_name!r}.'))
|
|
416
419
|
else:
|
|
@@ -448,37 +451,15 @@ def up(
|
|
|
448
451
|
|
|
449
452
|
|
|
450
453
|
def update(
|
|
451
|
-
task: 'task_lib.Task',
|
|
454
|
+
task: Optional['task_lib.Task'],
|
|
452
455
|
service_name: str,
|
|
453
456
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
454
457
|
pool: bool = False,
|
|
458
|
+
workers: Optional[int] = None,
|
|
455
459
|
) -> None:
|
|
456
460
|
"""Updates an existing service or pool."""
|
|
457
461
|
noun = 'pool' if pool else 'service'
|
|
458
462
|
capnoun = noun.capitalize()
|
|
459
|
-
task.validate()
|
|
460
|
-
serve_utils.validate_service_task(task, pool=pool)
|
|
461
|
-
|
|
462
|
-
# Always apply the policy again here, even though it might have been applied
|
|
463
|
-
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
|
464
|
-
# and get the mutated config.
|
|
465
|
-
# TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
|
|
466
|
-
# will not apply the config.
|
|
467
|
-
dag, _ = admin_policy_utils.apply(task)
|
|
468
|
-
task = dag.tasks[0]
|
|
469
|
-
if pool:
|
|
470
|
-
if task.run is not None:
|
|
471
|
-
logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
|
|
472
|
-
f'ignored for pool.{colorama.Style.RESET_ALL}')
|
|
473
|
-
# Use dummy run script for cluster pool.
|
|
474
|
-
task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
|
|
475
|
-
|
|
476
|
-
assert task.service is not None
|
|
477
|
-
if not pool and task.service.tls_credential is not None:
|
|
478
|
-
logger.warning('Updating TLS keyfile and certfile is not supported. '
|
|
479
|
-
'Any updates to the keyfile and certfile will not take '
|
|
480
|
-
'effect. To update TLS keyfile and certfile, please '
|
|
481
|
-
'tear down the service and spin up a new one.')
|
|
482
463
|
|
|
483
464
|
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
484
465
|
handle = backend_utils.is_controller_accessible(
|
|
@@ -505,6 +486,77 @@ def update(
|
|
|
505
486
|
f'To spin up a {noun}, use {ux_utils.BOLD}'
|
|
506
487
|
f'{cmd}{ux_utils.RESET_BOLD}')
|
|
507
488
|
|
|
489
|
+
# If task is None and workers is specified, load existing configuration
|
|
490
|
+
# and update replica count.
|
|
491
|
+
if task is None:
|
|
492
|
+
if workers is None:
|
|
493
|
+
with ux_utils.print_exception_no_traceback():
|
|
494
|
+
raise ValueError(
|
|
495
|
+
f'Cannot update {noun} without specifying '
|
|
496
|
+
f'task or workers. Please provide either a task '
|
|
497
|
+
f'or specify the number of workers.')
|
|
498
|
+
|
|
499
|
+
if not pool:
|
|
500
|
+
with ux_utils.print_exception_no_traceback():
|
|
501
|
+
raise ValueError(
|
|
502
|
+
'Non-pool service, trying to update replicas to '
|
|
503
|
+
f'{workers} is not supported. Ignoring the update.')
|
|
504
|
+
|
|
505
|
+
# Load the existing task configuration from the service's YAML file
|
|
506
|
+
latest_yaml_path = serve_utils.generate_task_yaml_file_name(
|
|
507
|
+
service_name, service_record['version'], expand_user=False)
|
|
508
|
+
|
|
509
|
+
logger.debug('Loading existing task configuration from '
|
|
510
|
+
f'{latest_yaml_path} to create a new modified task.')
|
|
511
|
+
|
|
512
|
+
# Get the path locally.
|
|
513
|
+
with tempfile.NamedTemporaryFile(
|
|
514
|
+
prefix=f'service-task-{service_name}-',
|
|
515
|
+
mode='w',
|
|
516
|
+
) as service_file:
|
|
517
|
+
try:
|
|
518
|
+
backend.download_file(handle, latest_yaml_path,
|
|
519
|
+
service_file.name)
|
|
520
|
+
except exceptions.CommandError as e:
|
|
521
|
+
raise RuntimeError(
|
|
522
|
+
f'Failed to download the old task configuration from '
|
|
523
|
+
f'{latest_yaml_path}: {e.error_msg}') from e
|
|
524
|
+
|
|
525
|
+
# Load the existing task configuration
|
|
526
|
+
existing_config = yaml_utils.read_yaml(service_file.name)
|
|
527
|
+
task = task_lib.Task.from_yaml_config(existing_config)
|
|
528
|
+
|
|
529
|
+
if task.service is None:
|
|
530
|
+
with ux_utils.print_exception_no_traceback():
|
|
531
|
+
raise RuntimeError('No service configuration found in '
|
|
532
|
+
f'existing {noun} {service_name!r}')
|
|
533
|
+
task.set_service(task.service.copy(min_replicas=workers))
|
|
534
|
+
|
|
535
|
+
task.validate()
|
|
536
|
+
serve_utils.validate_service_task(task, pool=pool)
|
|
537
|
+
|
|
538
|
+
# Now apply the policy and handle task-specific logic
|
|
539
|
+
# Always apply the policy again here, even though it might have been applied
|
|
540
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
|
541
|
+
# and get the mutated config.
|
|
542
|
+
# TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
|
|
543
|
+
# will not apply the config.
|
|
544
|
+
dag, _ = admin_policy_utils.apply(task)
|
|
545
|
+
task = dag.tasks[0]
|
|
546
|
+
if pool:
|
|
547
|
+
if task.run is not None:
|
|
548
|
+
logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
|
|
549
|
+
f'ignored for pool.{colorama.Style.RESET_ALL}')
|
|
550
|
+
# Use dummy run script for cluster pool.
|
|
551
|
+
task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
|
|
552
|
+
|
|
553
|
+
assert task.service is not None
|
|
554
|
+
if not pool and task.service.tls_credential is not None:
|
|
555
|
+
logger.warning('Updating TLS keyfile and certfile is not supported. '
|
|
556
|
+
'Any updates to the keyfile and certfile will not take '
|
|
557
|
+
'effect. To update TLS keyfile and certfile, please '
|
|
558
|
+
'tear down the service and spin up a new one.')
|
|
559
|
+
|
|
508
560
|
prompt = None
|
|
509
561
|
if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
|
|
510
562
|
):
|
|
@@ -625,6 +677,7 @@ def update(
|
|
|
625
677
|
|
|
626
678
|
def apply(
|
|
627
679
|
task: 'task_lib.Task',
|
|
680
|
+
workers: Optional[int],
|
|
628
681
|
service_name: str,
|
|
629
682
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
630
683
|
pool: bool = False,
|
|
@@ -640,7 +693,7 @@ def apply(
|
|
|
640
693
|
service_record = _get_service_record(service_name, pool, handle,
|
|
641
694
|
backend)
|
|
642
695
|
if service_record is not None:
|
|
643
|
-
return update(task, service_name, mode, pool)
|
|
696
|
+
return update(task, service_name, mode, pool, workers)
|
|
644
697
|
except exceptions.ClusterNotUpError:
|
|
645
698
|
pass
|
|
646
699
|
up(task, service_name, pool)
|
sky/serve/server/server.py
CHANGED
|
@@ -98,7 +98,7 @@ async def tail_logs(
|
|
|
98
98
|
request: fastapi.Request, log_body: payloads.ServeLogsBody,
|
|
99
99
|
background_tasks: fastapi.BackgroundTasks
|
|
100
100
|
) -> fastapi.responses.StreamingResponse:
|
|
101
|
-
executor.
|
|
101
|
+
request_task = executor.prepare_request(
|
|
102
102
|
request_id=request.state.request_id,
|
|
103
103
|
request_name='serve.logs',
|
|
104
104
|
request_body=log_body,
|
|
@@ -106,10 +106,9 @@ async def tail_logs(
|
|
|
106
106
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
107
107
|
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
|
108
108
|
)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
109
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
110
|
+
# Cancel the coroutine after the request is done or client disconnects
|
|
111
|
+
background_tasks.add_task(task.cancel)
|
|
113
112
|
return stream_utils.stream_response(
|
|
114
113
|
request_id=request_task.request_id,
|
|
115
114
|
logs_path=request_task.log_path,
|
sky/serve/service_spec.py
CHANGED
|
@@ -506,3 +506,36 @@ class SkyServiceSpec:
|
|
|
506
506
|
if not hasattr(self, '_pool'):
|
|
507
507
|
return False
|
|
508
508
|
return bool(self._pool)
|
|
509
|
+
|
|
510
|
+
def copy(self, **override) -> 'SkyServiceSpec':
|
|
511
|
+
return SkyServiceSpec(
|
|
512
|
+
readiness_path=override.pop('readiness_path', self._readiness_path),
|
|
513
|
+
initial_delay_seconds=override.pop('initial_delay_seconds',
|
|
514
|
+
self._initial_delay_seconds),
|
|
515
|
+
readiness_timeout_seconds=override.pop(
|
|
516
|
+
'readiness_timeout_seconds', self._readiness_timeout_seconds),
|
|
517
|
+
min_replicas=override.pop('min_replicas', self._min_replicas),
|
|
518
|
+
max_replicas=override.pop('max_replicas', self._max_replicas),
|
|
519
|
+
num_overprovision=override.pop('num_overprovision',
|
|
520
|
+
self._num_overprovision),
|
|
521
|
+
ports=override.pop('ports', self._ports),
|
|
522
|
+
target_qps_per_replica=override.pop('target_qps_per_replica',
|
|
523
|
+
self._target_qps_per_replica),
|
|
524
|
+
post_data=override.pop('post_data', self._post_data),
|
|
525
|
+
tls_credential=override.pop('tls_credential', self._tls_credential),
|
|
526
|
+
readiness_headers=override.pop('readiness_headers',
|
|
527
|
+
self._readiness_headers),
|
|
528
|
+
dynamic_ondemand_fallback=override.pop(
|
|
529
|
+
'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
|
|
530
|
+
base_ondemand_fallback_replicas=override.pop(
|
|
531
|
+
'base_ondemand_fallback_replicas',
|
|
532
|
+
self._base_ondemand_fallback_replicas),
|
|
533
|
+
spot_placer=override.pop('spot_placer', self._spot_placer),
|
|
534
|
+
upscale_delay_seconds=override.pop('upscale_delay_seconds',
|
|
535
|
+
self._upscale_delay_seconds),
|
|
536
|
+
downscale_delay_seconds=override.pop('downscale_delay_seconds',
|
|
537
|
+
self._downscale_delay_seconds),
|
|
538
|
+
load_balancing_policy=override.pop('load_balancing_policy',
|
|
539
|
+
self._load_balancing_policy),
|
|
540
|
+
pool=override.pop('pool', self._pool),
|
|
541
|
+
)
|
sky/server/constants.py
CHANGED
|
@@ -10,7 +10,7 @@ from sky.skylet import constants
|
|
|
10
10
|
# based on version info is needed.
|
|
11
11
|
# For more details and code guidelines, refer to:
|
|
12
12
|
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
|
13
|
-
API_VERSION =
|
|
13
|
+
API_VERSION = 20
|
|
14
14
|
|
|
15
15
|
# The minimum peer API version that the code should still work with.
|
|
16
16
|
# Notes (dev):
|
sky/server/daemons.py
CHANGED
|
@@ -8,7 +8,6 @@ from sky import sky_logging
|
|
|
8
8
|
from sky import skypilot_config
|
|
9
9
|
from sky.server import constants as server_constants
|
|
10
10
|
from sky.utils import annotations
|
|
11
|
-
from sky.utils import common
|
|
12
11
|
from sky.utils import common_utils
|
|
13
12
|
from sky.utils import env_options
|
|
14
13
|
from sky.utils import subprocess_utils
|
|
@@ -94,13 +93,13 @@ class InternalRequestDaemon:
|
|
|
94
93
|
def refresh_cluster_status_event():
|
|
95
94
|
"""Periodically refresh the cluster status."""
|
|
96
95
|
# pylint: disable=import-outside-toplevel
|
|
97
|
-
from sky import
|
|
96
|
+
from sky.backends import backend_utils
|
|
98
97
|
|
|
99
98
|
logger.info('=== Refreshing cluster status ===')
|
|
100
99
|
# This periodically refresh will hold the lock for the cluster being
|
|
101
100
|
# refreshed, but it is OK because other operations will just wait for
|
|
102
101
|
# the lock and get the just refreshed status without refreshing again.
|
|
103
|
-
|
|
102
|
+
backend_utils.refresh_cluster_records()
|
|
104
103
|
logger.info('Status refreshed. Sleeping '
|
|
105
104
|
f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
|
106
105
|
' seconds for the next refresh...\n')
|
sky/server/requests/executor.py
CHANGED
|
@@ -502,7 +502,35 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
|
|
|
502
502
|
name=request_name).observe(max(peak_rss - rss_begin, 0))
|
|
503
503
|
|
|
504
504
|
|
|
505
|
-
|
|
505
|
+
class CoroutineTask:
|
|
506
|
+
"""Wrapper of a background task runs in coroutine"""
|
|
507
|
+
|
|
508
|
+
def __init__(self, task: asyncio.Task):
|
|
509
|
+
self.task = task
|
|
510
|
+
|
|
511
|
+
async def cancel(self):
|
|
512
|
+
try:
|
|
513
|
+
self.task.cancel()
|
|
514
|
+
await self.task
|
|
515
|
+
except asyncio.CancelledError:
|
|
516
|
+
pass
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def execute_request_in_coroutine(
|
|
520
|
+
request: api_requests.Request) -> CoroutineTask:
|
|
521
|
+
"""Execute a request in current event loop.
|
|
522
|
+
|
|
523
|
+
Args:
|
|
524
|
+
request: The request to execute.
|
|
525
|
+
|
|
526
|
+
Returns:
|
|
527
|
+
A CoroutineTask handle to operate the background task.
|
|
528
|
+
"""
|
|
529
|
+
task = asyncio.create_task(_execute_request_coroutine(request))
|
|
530
|
+
return CoroutineTask(task)
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
async def _execute_request_coroutine(request: api_requests.Request):
|
|
506
534
|
"""Execute a request in current event loop.
|
|
507
535
|
|
|
508
536
|
Similar to _request_execution_wrapper, but executed as coroutine in current
|
|
@@ -640,13 +668,35 @@ def schedule_request(request_id: str,
|
|
|
640
668
|
The precondition is waited asynchronously and does not block the
|
|
641
669
|
caller.
|
|
642
670
|
"""
|
|
643
|
-
prepare_request(request_id, request_name, request_body, func,
|
|
644
|
-
|
|
671
|
+
request_task = prepare_request(request_id, request_name, request_body, func,
|
|
672
|
+
request_cluster_name, schedule_type,
|
|
673
|
+
is_skypilot_system)
|
|
674
|
+
schedule_prepared_request(request_task, ignore_return_value, precondition,
|
|
675
|
+
retryable)
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def schedule_prepared_request(request_task: api_requests.Request,
|
|
679
|
+
ignore_return_value: bool = False,
|
|
680
|
+
precondition: Optional[
|
|
681
|
+
preconditions.Precondition] = None,
|
|
682
|
+
retryable: bool = False) -> None:
|
|
683
|
+
"""Enqueue a request to the request queue
|
|
684
|
+
|
|
685
|
+
Args:
|
|
686
|
+
request_task: The prepared request task to schedule.
|
|
687
|
+
ignore_return_value: If True, the return value of the function will be
|
|
688
|
+
ignored.
|
|
689
|
+
precondition: If a precondition is provided, the request will only be
|
|
690
|
+
scheduled for execution when the precondition is met (returns True).
|
|
691
|
+
The precondition is waited asynchronously and does not block the
|
|
692
|
+
caller.
|
|
693
|
+
retryable: Whether the request should be retried if it fails.
|
|
694
|
+
"""
|
|
645
695
|
|
|
646
696
|
def enqueue():
|
|
647
|
-
input_tuple = (request_id, ignore_return_value, retryable)
|
|
648
|
-
logger.info(f'Queuing request: {request_id}')
|
|
649
|
-
_get_queue(schedule_type).put(input_tuple)
|
|
697
|
+
input_tuple = (request_task.request_id, ignore_return_value, retryable)
|
|
698
|
+
logger.info(f'Queuing request: {request_task.request_id}')
|
|
699
|
+
_get_queue(request_task.schedule_type).put(input_tuple)
|
|
650
700
|
|
|
651
701
|
if precondition is not None:
|
|
652
702
|
# Wait async to avoid blocking caller.
|
sky/server/requests/payloads.py
CHANGED
|
@@ -316,6 +316,9 @@ class StatusBody(RequestBody):
|
|
|
316
316
|
all_users: bool = True
|
|
317
317
|
# TODO (kyuds): default to False post 0.10.5
|
|
318
318
|
include_credentials: bool = True
|
|
319
|
+
# Only return fields that are needed for the
|
|
320
|
+
# dashboard / CLI summary response
|
|
321
|
+
summary_response: bool = False
|
|
319
322
|
|
|
320
323
|
|
|
321
324
|
class StartBody(RequestBody):
|
|
@@ -475,6 +478,17 @@ class VolumeListBody(RequestBody):
|
|
|
475
478
|
pass
|
|
476
479
|
|
|
477
480
|
|
|
481
|
+
class VolumeValidateBody(RequestBody):
|
|
482
|
+
"""The request body for the volume validate endpoint."""
|
|
483
|
+
name: Optional[str] = None
|
|
484
|
+
volume_type: Optional[str] = None
|
|
485
|
+
infra: Optional[str] = None
|
|
486
|
+
size: Optional[str] = None
|
|
487
|
+
labels: Optional[Dict[str, str]] = None
|
|
488
|
+
resource_name: Optional[str] = None
|
|
489
|
+
config: Optional[Dict[str, Any]] = None
|
|
490
|
+
|
|
491
|
+
|
|
478
492
|
class EndpointsBody(RequestBody):
|
|
479
493
|
"""The request body for the endpoint."""
|
|
480
494
|
cluster: str
|
|
@@ -670,6 +684,13 @@ class LocalUpBody(RequestBody):
|
|
|
670
684
|
cleanup: bool = False
|
|
671
685
|
context_name: Optional[str] = None
|
|
672
686
|
password: Optional[str] = None
|
|
687
|
+
name: Optional[str] = None
|
|
688
|
+
port_start: Optional[int] = None
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
class LocalDownBody(RequestBody):
|
|
692
|
+
"""The request body for the local down endpoint."""
|
|
693
|
+
name: Optional[str] = None
|
|
673
694
|
|
|
674
695
|
|
|
675
696
|
class SSHUpBody(RequestBody):
|
|
@@ -709,19 +730,22 @@ class JobsDownloadLogsBody(RequestBody):
|
|
|
709
730
|
|
|
710
731
|
class JobsPoolApplyBody(RequestBody):
|
|
711
732
|
"""The request body for the jobs pool apply endpoint."""
|
|
712
|
-
task: str
|
|
733
|
+
task: Optional[str] = None
|
|
734
|
+
workers: Optional[int] = None
|
|
713
735
|
pool_name: str
|
|
714
736
|
mode: serve.UpdateMode
|
|
715
737
|
|
|
716
738
|
def to_kwargs(self) -> Dict[str, Any]:
|
|
717
739
|
kwargs = super().to_kwargs()
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
740
|
+
if self.task is not None:
|
|
741
|
+
dag = common.process_mounts_in_task_on_api_server(
|
|
742
|
+
self.task, self.env_vars, workdir_only=False)
|
|
743
|
+
assert len(
|
|
744
|
+
dag.tasks) == 1, ('Must only specify one task in the DAG for '
|
|
745
|
+
'a pool.', dag)
|
|
746
|
+
kwargs['task'] = dag.tasks[0]
|
|
747
|
+
else:
|
|
748
|
+
kwargs['task'] = None
|
|
725
749
|
return kwargs
|
|
726
750
|
|
|
727
751
|
|
sky/server/requests/preconditions.py
CHANGED
|
@@ -146,10 +146,9 @@ class ClusterStartCompletePrecondition(Precondition):
|
|
|
146
146
|
self.cluster_name = cluster_name
|
|
147
147
|
|
|
148
148
|
async def check(self) -> Tuple[bool, Optional[str]]:
|
|
149
|
-
|
|
149
|
+
cluster_status = global_user_state.get_status_from_cluster_name(
|
|
150
150
|
self.cluster_name)
|
|
151
|
-
if
|
|
152
|
-
cluster_record['status'] is status_lib.ClusterStatus.UP):
|
|
151
|
+
if cluster_status is status_lib.ClusterStatus.UP:
|
|
153
152
|
# Shortcut for started clusters, ignore cluster not found
|
|
154
153
|
# since the cluster record might not yet be created by the
|
|
155
154
|
# launch task.
|
sky/server/rest.py
CHANGED
|
@@ -9,6 +9,7 @@ import typing
|
|
|
9
9
|
from typing import Any, Callable, cast, Optional, TypeVar
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
12
|
+
import urllib3.exceptions
|
|
12
13
|
|
|
13
14
|
from sky import exceptions
|
|
14
15
|
from sky import sky_logging
|
|
@@ -53,6 +54,7 @@ _session.headers[constants.VERSION_HEADER] = (
|
|
|
53
54
|
_transient_errors = [
|
|
54
55
|
requests.exceptions.RequestException,
|
|
55
56
|
ConnectionError,
|
|
57
|
+
urllib3.exceptions.HTTPError,
|
|
56
58
|
]
|
|
57
59
|
|
|
58
60
|
|
sky/server/server.py
CHANGED
|
@@ -445,6 +445,22 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
445
445
|
loop.call_at(target, tick)
|
|
446
446
|
|
|
447
447
|
|
|
448
|
+
def schedule_on_boot_check():
|
|
449
|
+
try:
|
|
450
|
+
executor.schedule_request(
|
|
451
|
+
request_id='skypilot-server-on-boot-check',
|
|
452
|
+
request_name='check',
|
|
453
|
+
request_body=payloads.CheckBody(),
|
|
454
|
+
func=sky_check.check,
|
|
455
|
+
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
456
|
+
is_skypilot_system=True,
|
|
457
|
+
)
|
|
458
|
+
except exceptions.RequestAlreadyExistsError:
|
|
459
|
+
# Lifespan will be executed in each uvicorn worker process, we
|
|
460
|
+
# can safely ignore the error if the task is already scheduled.
|
|
461
|
+
logger.debug('Request skypilot-server-on-boot-check already exists.')
|
|
462
|
+
|
|
463
|
+
|
|
448
464
|
@contextlib.asynccontextmanager
|
|
449
465
|
async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
|
|
450
466
|
"""FastAPI lifespan context manager."""
|
|
@@ -469,6 +485,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
469
485
|
# Lifespan will be executed in each uvicorn worker process, we
|
|
470
486
|
# can safely ignore the error if the task is already scheduled.
|
|
471
487
|
logger.debug(f'Request {event.id} already exists.')
|
|
488
|
+
schedule_on_boot_check()
|
|
472
489
|
asyncio.create_task(cleanup_upload_ids())
|
|
473
490
|
if metrics_utils.METRICS_ENABLED:
|
|
474
491
|
# Start monitoring the event loop lag in each server worker
|
|
@@ -1216,19 +1233,8 @@ async def logs(
|
|
|
1216
1233
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1217
1234
|
request_cluster_name=cluster_job_body.cluster_name,
|
|
1218
1235
|
)
|
|
1219
|
-
task =
|
|
1220
|
-
|
|
1221
|
-
async def cancel_task():
|
|
1222
|
-
try:
|
|
1223
|
-
logger.info('Client disconnected for request: '
|
|
1224
|
-
f'{request.state.request_id}')
|
|
1225
|
-
task.cancel()
|
|
1226
|
-
await task
|
|
1227
|
-
except asyncio.CancelledError:
|
|
1228
|
-
pass
|
|
1229
|
-
|
|
1230
|
-
# Cancel the task after the request is done or client disconnects
|
|
1231
|
-
background_tasks.add_task(cancel_task)
|
|
1236
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
1237
|
+
background_tasks.add_task(task.cancel)
|
|
1232
1238
|
# TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
|
|
1233
1239
|
# the same approach as /stream.
|
|
1234
1240
|
return stream_utils.stream_response(
|
|
@@ -1354,10 +1360,12 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1354
1360
|
effective_tail = None if tail is None or tail <= 0 else tail
|
|
1355
1361
|
|
|
1356
1362
|
return fastapi.responses.StreamingResponse(
|
|
1357
|
-
content=stream_utils.log_streamer(
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1363
|
+
content=stream_utils.log_streamer(
|
|
1364
|
+
None,
|
|
1365
|
+
log_path,
|
|
1366
|
+
tail=effective_tail,
|
|
1367
|
+
follow=follow,
|
|
1368
|
+
cluster_name=cluster_body.cluster_name),
|
|
1361
1369
|
media_type='text/plain',
|
|
1362
1370
|
headers={
|
|
1363
1371
|
'Cache-Control': 'no-cache, no-transform',
|
|
@@ -1419,12 +1427,13 @@ async def local_up(request: fastapi.Request,
|
|
|
1419
1427
|
|
|
1420
1428
|
|
|
1421
1429
|
@app.post('/local_down')
|
|
1422
|
-
async def local_down(request: fastapi.Request
|
|
1430
|
+
async def local_down(request: fastapi.Request,
|
|
1431
|
+
local_down_body: payloads.LocalDownBody) -> None:
|
|
1423
1432
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1424
1433
|
executor.schedule_request(
|
|
1425
1434
|
request_id=request.state.request_id,
|
|
1426
1435
|
request_name='local_down',
|
|
1427
|
-
request_body=
|
|
1436
|
+
request_body=local_down_body,
|
|
1428
1437
|
func=core.local_down,
|
|
1429
1438
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
1430
1439
|
)
|
sky/server/stream_utils.py
CHANGED
|
@@ -8,10 +8,12 @@ from typing import AsyncGenerator, Deque, List, Optional
|
|
|
8
8
|
import aiofiles
|
|
9
9
|
import fastapi
|
|
10
10
|
|
|
11
|
+
from sky import global_user_state
|
|
11
12
|
from sky import sky_logging
|
|
12
13
|
from sky.server.requests import requests as requests_lib
|
|
13
14
|
from sky.utils import message_utils
|
|
14
15
|
from sky.utils import rich_utils
|
|
16
|
+
from sky.utils import status_lib
|
|
15
17
|
|
|
16
18
|
logger = sky_logging.init_logger(__name__)
|
|
17
19
|
|
|
@@ -22,6 +24,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
22
24
|
_BUFFER_SIZE = 8 * 1024 # 8KB
|
|
23
25
|
_BUFFER_TIMEOUT = 0.02 # 20ms
|
|
24
26
|
_HEARTBEAT_INTERVAL = 30
|
|
27
|
+
_CLUSTER_STATUS_INTERVAL = 1
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
async def _yield_log_file_with_payloads_skipped(
|
|
@@ -37,11 +40,13 @@ async def _yield_log_file_with_payloads_skipped(
|
|
|
37
40
|
yield line_str
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
async def log_streamer(
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
43
|
+
async def log_streamer(
|
|
44
|
+
request_id: Optional[str],
|
|
45
|
+
log_path: pathlib.Path,
|
|
46
|
+
plain_logs: bool = False,
|
|
47
|
+
tail: Optional[int] = None,
|
|
48
|
+
follow: bool = True,
|
|
49
|
+
cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
|
|
45
50
|
"""Streams the logs of a request.
|
|
46
51
|
|
|
47
52
|
Args:
|
|
@@ -51,6 +56,8 @@ async def log_streamer(request_id: Optional[str],
|
|
|
51
56
|
plain_logs: Whether to show plain logs.
|
|
52
57
|
tail: The number of lines to tail. If None, tail the whole file.
|
|
53
58
|
follow: Whether to follow the log file.
|
|
59
|
+
cluster_name: The cluster name to check status for provision logs.
|
|
60
|
+
If provided, streaming stops once the cluster is no longer INIT (or is not found).
|
|
54
61
|
"""
|
|
55
62
|
|
|
56
63
|
if request_id is not None:
|
|
@@ -104,15 +111,17 @@ async def log_streamer(request_id: Optional[str],
|
|
|
104
111
|
|
|
105
112
|
async with aiofiles.open(log_path, 'rb') as f:
|
|
106
113
|
async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
|
|
107
|
-
follow):
|
|
114
|
+
follow, cluster_name):
|
|
108
115
|
yield chunk
|
|
109
116
|
|
|
110
117
|
|
|
111
|
-
async def _tail_log_file(
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
118
|
+
async def _tail_log_file(
|
|
119
|
+
f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
120
|
+
request_id: Optional[str] = None,
|
|
121
|
+
plain_logs: bool = False,
|
|
122
|
+
tail: Optional[int] = None,
|
|
123
|
+
follow: bool = True,
|
|
124
|
+
cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
|
|
116
125
|
"""Tail the opened log file, buffer the lines and flush in chunks."""
|
|
117
126
|
|
|
118
127
|
if tail is not None:
|
|
@@ -128,6 +137,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
128
137
|
yield line_str
|
|
129
138
|
|
|
130
139
|
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
140
|
+
last_cluster_status_check_time = asyncio.get_event_loop().time()
|
|
131
141
|
|
|
132
142
|
# Buffer the lines in memory and flush them in chunks to improve log
|
|
133
143
|
# tailing throughput.
|
|
@@ -176,7 +186,19 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
176
186
|
break
|
|
177
187
|
if not follow:
|
|
178
188
|
break
|
|
179
|
-
|
|
189
|
+
# Provision logs pass in cluster_name, check cluster status
|
|
190
|
+
# periodically to see if provisioning is done. We only
|
|
191
|
+
# check once a second to avoid overloading the DB.
|
|
192
|
+
check_status = (current_time - last_cluster_status_check_time
|
|
193
|
+
) >= _CLUSTER_STATUS_INTERVAL
|
|
194
|
+
if cluster_name is not None and check_status:
|
|
195
|
+
cluster_record = await (
|
|
196
|
+
global_user_state.get_status_from_cluster_name_async(
|
|
197
|
+
cluster_name))
|
|
198
|
+
if (cluster_record is None or
|
|
199
|
+
cluster_record != status_lib.ClusterStatus.INIT):
|
|
200
|
+
break
|
|
201
|
+
last_cluster_status_check_time = current_time
|
|
180
202
|
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
|
181
203
|
# Currently just used to keep the connection busy, refer to
|
|
182
204
|
# https://github.com/skypilot-org/skypilot/issues/5750 for
|