skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py (new file)
@@ -0,0 +1,472 @@
"""Executor for the requests.

We start a limited number of workers for long-running requests, and
significantly more workers for short-running requests. This is to optimize the
resource usage and the latency of the requests.

* Long-running requests are those requests that can take a long time to finish
  and need more resources, such as cluster launching, starting, job
  submission, managed job submission, etc.

* Short-running requests are those requests that can be done quickly, and
  require a quick response, such as status check, job status check, etc.

With more short-running workers, we can serve more short-running requests in
parallel, and reduce the latency.

The number of workers is determined by the system resources.

See the [README.md](../README.md) for the detailed architecture of the executor.
"""
import concurrent.futures
import contextlib
import dataclasses
import enum
import multiprocessing
import os
import queue as queue_lib
import signal
import sys
import time
import traceback
import typing
from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple

import setproctitle

from sky import global_user_state
from sky import models
from sky import sky_logging
from sky import skypilot_config
from sky.server import common as server_common
from sky.server import constants as server_constants
from sky.server.requests import payloads
from sky.server.requests import requests as api_requests
from sky.server.requests.queues import mp_queue
from sky.skylet import constants
from sky.utils import annotations
from sky.utils import common_utils
from sky.utils import timeline
from sky.utils import ux_utils

if typing.TYPE_CHECKING:
    import types

# pylint: disable=ungrouped-imports
if sys.version_info >= (3, 10):
    from typing import ParamSpec
else:
    from typing_extensions import ParamSpec

P = ParamSpec('P')

logger = sky_logging.init_logger(__name__)

# On macOS, the default start method for multiprocessing is 'fork', which
# can cause issues with certain types of resources, including those used in
# the QueueManager in mp_queue.py.
# The 'spawn' start method is generally more compatible across different
# platforms, including macOS.
multiprocessing.set_start_method('spawn', force=True)

# Constants based on profiling the peak memory usage while serving various
# sky commands. These estimations are highly dependent on usage patterns
# (clouds enabled, types of requests, etc.; see `tests/load_tests` for
# details); the profiling covers major clouds and common usage patterns.
# Users with a deviating usage pattern can override the default estimation
# via environment variables.
# NOTE(dev): update these constants for each release according to the load
# test results.
# TODO(aylei): maintaining these constants is error-prone; we may need to
# automatically tune parallelism at runtime according to system usage stats
# in the future.
_LONG_WORKER_MEM_GB = 0.4
_SHORT_WORKER_MEM_GB = 0.25
# To control the number of long workers.
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
# Limit the number of long workers of a local API server, since a local server
# is typically:
# 1. launched automatically in an environment with high resource contention
#    (e.g. a laptop);
# 2. used by a single user.
_MAX_LONG_WORKERS_LOCAL = 4
# Percentage of memory for long requests from the memory reserved for
# SkyPilot. This is to reserve some memory for short requests.
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
# Minimal number of long workers to ensure responsiveness.
_MIN_LONG_WORKERS = 1
# Minimal number of short workers. There is a daemon task running on short
# workers, so at least 2 workers are needed to ensure responsiveness.
_MIN_SHORT_WORKERS = 2


class QueueBackend(enum.Enum):
    MULTIPROCESSING = 'multiprocessing'
    # TODO(zhwu): we can add a redis backend in the future.


@dataclasses.dataclass
class RequestWorker:
    id: int
    # The type of queue this worker works on.
    schedule_type: api_requests.ScheduleType

    def __str__(self) -> str:
        return f'Worker(id={self.id}, schedule_type={self.schedule_type.value})'


class RequestQueue:
    """The queue for the requests, either redis or multiprocessing.

    The elements in the queue are tuples of (request_id, ignore_return_value).
    """

    def __init__(self,
                 schedule_type: api_requests.ScheduleType,
                 backend: Optional[QueueBackend] = None) -> None:
        self.name = schedule_type.value
        self.backend = backend
        assert (backend is None or
                backend == QueueBackend.MULTIPROCESSING), backend
        self.queue = mp_queue.get_queue(self.name)

    def put(self, request: Tuple[str, bool]) -> None:
        """Put a request into the queue.

        Args:
            request: A tuple of request_id and ignore_return_value.
        """
        self.queue.put(request)  # type: ignore

    def get(self) -> Optional[Tuple[str, bool]]:
        """Get a request from the queue.

        It is non-blocking if the queue is empty, and returns None.

        Returns:
            A tuple of request_id and ignore_return_value.
        """
        try:
            return self.queue.get(block=False)
        except queue_lib.Empty:
            return None

    def __len__(self) -> int:
        """Get the length of the queue."""
        return self.queue.qsize()


queue_backend = QueueBackend.MULTIPROCESSING


@annotations.lru_cache(scope='global', maxsize=None)
def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
    return RequestQueue(schedule_type, backend=queue_backend)


@contextlib.contextmanager
def override_request_env_and_config(
        request_body: payloads.RequestBody) -> Generator[None, None, None]:
    """Override the environment and SkyPilot config for a request."""
    original_env = os.environ.copy()
    os.environ.update(request_body.env_vars)
    user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                       name=request_body.env_vars[constants.USER_ENV_VAR])
    global_user_state.add_or_update_user(user)
    # Force color to be enabled.
    os.environ['CLICOLOR_FORCE'] = '1'
    server_common.reload_for_new_request(
        client_entrypoint=request_body.entrypoint,
        client_command=request_body.entrypoint_command,
        using_remote_api_server=request_body.using_remote_api_server)
    try:
        with skypilot_config.override_skypilot_config(
                request_body.override_skypilot_config):
            yield
    finally:
        # We need to call save_timeline() since atexit will not be triggered,
        # as multiple requests can share the same process.
        timeline.save_timeline()
        # Restore the original environment variables, so that a new request
        # won't be affected by the previous request, e.g. SKYPILOT_DEBUG
        # setting, etc. This is necessary as our executor is reusing the
        # same process for multiple requests.
        os.environ.clear()
        os.environ.update(original_env)


def _redirect_output(file: TextIO) -> Tuple[int, int]:
    """Redirect stdout and stderr to the log file."""
    fd = file.fileno()  # Get the file descriptor from the file object
    # Store copies of the original stdout and stderr file descriptors
    original_stdout = os.dup(sys.stdout.fileno())
    original_stderr = os.dup(sys.stderr.fileno())

    # Copy this fd to stdout and stderr
    os.dup2(fd, sys.stdout.fileno())
    os.dup2(fd, sys.stderr.fileno())
    return original_stdout, original_stderr


def _restore_output(original_stdout: int, original_stderr: int) -> None:
    """Restore stdout and stderr to their original file descriptors."""
    os.dup2(original_stdout, sys.stdout.fileno())
    os.dup2(original_stderr, sys.stderr.fileno())

    # Close the duplicate file descriptors
    os.close(original_stdout)
    os.close(original_stderr)


def _request_execution_wrapper(request_id: str,
                               ignore_return_value: bool) -> None:
    """Wrapper for a request execution.

    It wraps the execution of a request to:
    1. Deserialize the request from the request database and serialize the
       return value/exception in the request database;
    2. Update the request status based on the execution result;
    3. Redirect the stdout and stderr of the execution to the log file;
    4. Handle the SIGTERM signal to abort the request gracefully.
    """

    def sigterm_handler(signum: int,
                        frame: Optional['types.FrameType']) -> None:
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, sigterm_handler)

    pid = multiprocessing.current_process().pid
    logger.info(f'Running request {request_id} with pid {pid}')
    with api_requests.update_request(request_id) as request_task:
        assert request_task is not None, request_id
        log_path = request_task.log_path
        request_task.pid = pid
        request_task.status = api_requests.RequestStatus.RUNNING
        func = request_task.entrypoint
        request_body = request_task.request_body

    with log_path.open('w', encoding='utf-8') as f:
        # Store copies of the original stdout and stderr file descriptors
        original_stdout, original_stderr = _redirect_output(f)
        # Redirect the stdout/stderr before overriding the environment and
        # config, as there can be some logs during the override that need to
        # be captured in the log file.
        try:
            with override_request_env_and_config(request_body):
                return_value = func(**request_body.to_kwargs())
                f.flush()
        except KeyboardInterrupt:
            logger.info(f'Request {request_id} cancelled by user')
            _restore_output(original_stdout, original_stderr)
            return
        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
            with ux_utils.enable_traceback():
                stacktrace = traceback.format_exc()
            setattr(e, 'stacktrace', stacktrace)
            with api_requests.update_request(request_id) as request_task:
                assert request_task is not None, request_id
                request_task.status = api_requests.RequestStatus.FAILED
                request_task.set_error(e)
            _restore_output(original_stdout, original_stderr)
            logger.info(f'Request {request_id} failed due to '
                        f'{common_utils.format_exception(e)}')
            return
        else:
            with api_requests.update_request(request_id) as request_task:
                assert request_task is not None, request_id
                request_task.status = api_requests.RequestStatus.SUCCEEDED
                if not ignore_return_value:
                    request_task.set_return_value(return_value)
            _restore_output(original_stdout, original_stderr)
            logger.info(f'Request {request_id} finished')


def schedule_request(request_id: str,
                     request_name: str,
                     request_body: payloads.RequestBody,
                     func: Callable[P, Any],
                     request_cluster_name: Optional[str] = None,
                     ignore_return_value: bool = False,
                     schedule_type: api_requests.ScheduleType = api_requests.
                     ScheduleType.LONG,
                     is_skypilot_system: bool = False) -> None:
    """Enqueue a request to the request queue."""
    user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
    if is_skypilot_system:
        user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
        global_user_state.add_or_update_user(
            models.User(id=user_id, name=user_id))
    request = api_requests.Request(request_id=request_id,
                                   name=server_constants.REQUEST_NAME_PREFIX +
                                   request_name,
                                   entrypoint=func,
                                   request_body=request_body,
                                   status=api_requests.RequestStatus.PENDING,
                                   created_at=time.time(),
                                   schedule_type=schedule_type,
                                   user_id=user_id,
                                   cluster_name=request_cluster_name)

    if not api_requests.create_if_not_exists(request):
        logger.debug(f'Request {request_id} already exists.')
        return

    request.log_path.touch()
    input_tuple = (request_id, ignore_return_value)

    logger.info(f'Queuing request: {request_id}')
    _get_queue(schedule_type).put(input_tuple)


def executor_initializer(proc_group: str):
    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
                              f'{multiprocessing.current_process().pid}')


def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
    """Worker for the requests.

    Args:
        max_parallel_size: Maximum number of parallel jobs this worker can run.
    """
    proc_group = f'{worker.schedule_type.value}-{worker.id}'
    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
    queue = _get_queue(worker.schedule_type)

    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
        try:
            request_element = queue.get()
            if request_element is None:
                time.sleep(0.1)
                return
            request_id, ignore_return_value = request_element
            request = api_requests.get_request(request_id)
            assert request is not None, f'Request with ID {request_id} is None'
            if request.status == api_requests.RequestStatus.CANCELLED:
                return
            logger.info(f'[{worker}] Submitting request: {request_id}')
            # Start an additional process to run the request, so that it can
            # be cancelled when requested by a user.
            # TODO(zhwu): since the executor is reusing the request process,
            # multiple requests can share the same process pid, which may cause
            # issues with SkyPilot core functions if they rely on the exit of
            # the process, such as subprocess_daemon.py.
            future = executor.submit(_request_execution_wrapper, request_id,
                                     ignore_return_value)

            if worker.schedule_type == api_requests.ScheduleType.LONG:
                try:
                    future.result(timeout=None)
                except Exception as e:  # pylint: disable=broad-except
                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
                logger.info(f'[{worker}] Finished request: {request_id}')
            else:
                logger.info(f'[{worker}] Submitted request: {request_id}')
        except KeyboardInterrupt:
            # Interrupting the worker process will stop request execution, but
            # the SIGTERM request should be respected anyway since it might
            # be explicitly sent by the user.
            # TODO(aylei): crash the API server or recreate the worker process
            # to avoid broken state.
            logger.error(f'[{worker}] Worker process interrupted')
            with ux_utils.print_exception_no_traceback():
                raise
        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
            # Catch any other exceptions to avoid crashing the worker process.
            logger.error(
                f'[{worker}] Error processing request {request_id}: '
                f'{common_utils.format_exception(e, use_bracket=True)}')

    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
    # because the former is more efficient, with support for lazy creation of
    # worker processes.
    # We use an executor instead of individual multiprocessing.Process instances
    # to avoid the overhead of forking a new process for each request, which can
    # add about a 1s delay.
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=max_parallel_size,
            initializer=executor_initializer,
            initargs=(proc_group,)) as executor:
        while True:
            process_request(executor)


def start(deploy: bool) -> List[multiprocessing.Process]:
    """Start the request workers."""
    # Determine the job capacity of the workers based on the system resources.
    cpu_count = common_utils.get_cpu_count()
    mem_size_gb = common_utils.get_mem_size_gb()
    mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                       mem_size_gb,
                                                       local=not deploy)
    max_parallel_for_short = _max_short_worker_parallism(
        mem_size_gb, max_parallel_for_long)
    logger.info(
        f'SkyPilot API server will start {max_parallel_for_long} workers for '
        f'long requests and will allow at max '
        f'{max_parallel_for_short} short requests in parallel.')

    sub_procs = []
    # Setup the queues.
    if queue_backend == QueueBackend.MULTIPROCESSING:
        logger.info('Creating shared request queues')
        queue_names = [
            schedule_type.value for schedule_type in api_requests.ScheduleType
        ]
        # TODO(aylei): make the queue manager port configurable or pick an
        # available port automatically.
        port = mp_queue.DEFAULT_QUEUE_MANAGER_PORT
        if not common_utils.is_port_available(port):
            raise RuntimeError(
                f'SkyPilot API server fails to start as port {port!r} is '
                'already in use by another process.')
        queue_server = multiprocessing.Process(
            target=mp_queue.start_queue_manager, args=(queue_names, port))
        queue_server.start()
        sub_procs.append(queue_server)
        mp_queue.wait_for_queues_to_be_ready(queue_names, port=port)

    logger.info('Request queues created')

    for worker_id in range(max_parallel_for_long):
        worker = RequestWorker(id=worker_id,
                               schedule_type=api_requests.ScheduleType.LONG)
        worker_proc = multiprocessing.Process(target=request_worker,
                                              args=(worker, 1))
        worker_proc.start()
        sub_procs.append(worker_proc)

    # Start a worker for short requests.
    worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
    worker_proc = multiprocessing.Process(target=request_worker,
                                          args=(worker, max_parallel_for_short))
    worker_proc.start()
    sub_procs.append(worker_proc)
    return sub_procs


@annotations.lru_cache(scope='global', maxsize=1)
def _max_long_worker_parallism(cpu_count: int,
                               mem_size_gb: float,
                               local=False) -> int:
    """Max parallelism for long workers."""
    cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
    mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
                                 _LONG_WORKER_MEM_GB)
    n = max(_MIN_LONG_WORKERS,
            min(cpu_based_max_parallel, mem_based_max_parallel))
    if local:
        return min(n, _MAX_LONG_WORKERS_LOCAL)
    return n


@annotations.lru_cache(scope='global', maxsize=1)
def _max_short_worker_parallism(mem_size_gb: float,
                                long_worker_parallism: int) -> int:
    """Max parallelism for short workers."""
    available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
    return n
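To make the sizing logic above concrete, here is a small, self-contained sketch that recomputes the arithmetic of `_max_long_worker_parallism` and `_max_short_worker_parallism` for an assumed machine with 8 CPUs and 16 GB of memory remaining after the reserved headroom; the machine specs, the standalone function names, and the `__main__` wrapper are illustrative assumptions, not part of the package.

# Illustrative sketch only: mirrors the worker-sizing arithmetic from
# executor.py above for an assumed machine (8 CPUs, 16 GB usable memory).
_LONG_WORKER_MEM_GB = 0.4
_SHORT_WORKER_MEM_GB = 0.25
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
_MAX_LONG_WORKERS_LOCAL = 4
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
_MIN_LONG_WORKERS = 1
_MIN_SHORT_WORKERS = 2


def max_long_workers(cpu_count: int, mem_size_gb: float, local: bool) -> int:
    # Long workers are bounded by both the CPU count and the share of memory
    # reserved for blocking (long) requests.
    cpu_based = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
    mem_based = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
                    _LONG_WORKER_MEM_GB)
    n = max(_MIN_LONG_WORKERS, min(cpu_based, mem_based))
    return min(n, _MAX_LONG_WORKERS_LOCAL) if local else n


def max_short_workers(mem_size_gb: float, long_workers: int) -> int:
    # Short requests use whatever memory is left after the long workers.
    available = mem_size_gb - long_workers * _LONG_WORKER_MEM_GB
    return max(_MIN_SHORT_WORKERS, int(available / _SHORT_WORKER_MEM_GB))


if __name__ == '__main__':
    cpus, mem_gb = 8, 16.0  # assumed machine, not a value from the package
    long_n = max_long_workers(cpus, mem_gb, local=False)  # min(16, 24) -> 16
    short_n = max_short_workers(mem_gb, long_n)  # int((16 - 6.4) / 0.25) -> 38
    print(f'long workers: {long_n}, short request slots: {short_n}')

In `start()`, these two numbers map to one single-slot worker process per long request and one worker process whose pool admits up to the short-request count in parallel; with `deploy=False` (a local API server), the long-worker count would instead be capped at `_MAX_LONG_WORKERS_LOCAL = 4`.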