skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/common_utils.py
CHANGED
@@ -5,7 +5,7 @@ import functools
|
|
5
5
|
import getpass
|
6
6
|
import hashlib
|
7
7
|
import inspect
|
8
|
-
import json
|
8
|
+
import io
|
9
9
|
import os
|
10
10
|
import platform
|
11
11
|
import random
|
@@ -16,20 +16,21 @@ import time
|
|
16
16
|
from typing import Any, Callable, Dict, List, Optional, Union
|
17
17
|
import uuid
|
18
18
|
|
19
|
-
import colorama
|
20
19
|
import jinja2
|
21
20
|
import jsonschema
|
21
|
+
import psutil
|
22
22
|
import yaml
|
23
23
|
|
24
24
|
from sky import exceptions
|
25
25
|
from sky import sky_logging
|
26
26
|
from sky.skylet import constants
|
27
|
+
from sky.usage import constants as usage_constants
|
28
|
+
from sky.utils import annotations
|
27
29
|
from sky.utils import ux_utils
|
28
30
|
from sky.utils import validator
|
29
31
|
|
30
32
|
_USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
|
31
33
|
USER_HASH_LENGTH = 8
|
32
|
-
USER_HASH_LENGTH_IN_CLUSTER_NAME = 4
|
33
34
|
|
34
35
|
# We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
|
35
36
|
# possibilities. considering the final cluster name contains the prefix as well,
|
@@ -38,16 +39,12 @@ CLUSTER_NAME_HASH_LENGTH = 2
|
|
38
39
|
|
39
40
|
_COLOR_PATTERN = re.compile(r'\x1b[^m]*m')
|
40
41
|
|
41
|
-
_PAYLOAD_PATTERN = re.compile(r'<sky-payload>(.*)</sky-payload>')
|
42
|
-
_PAYLOAD_STR = '<sky-payload>{}</sky-payload>'
|
43
|
-
|
44
42
|
_VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
|
45
43
|
|
46
44
|
logger = sky_logging.init_logger(__name__)
|
47
45
|
|
48
|
-
_usage_run_id = None
|
49
|
-
|
50
46
|
|
47
|
+
@annotations.lru_cache(scope='request')
|
51
48
|
def get_usage_run_id() -> str:
|
52
49
|
"""Returns a unique run id for each 'run'.
|
53
50
|
|
@@ -55,42 +52,44 @@ def get_usage_run_id() -> str:
|
|
55
52
|
and has called its CLI or programmatic APIs. For example, two successive
|
56
53
|
`sky launch` are two runs.
|
57
54
|
"""
|
58
|
-
|
59
|
-
if
|
60
|
-
|
61
|
-
return
|
55
|
+
usage_run_id = os.getenv(usage_constants.USAGE_RUN_ID_ENV_VAR)
|
56
|
+
if usage_run_id is not None:
|
57
|
+
return usage_run_id
|
58
|
+
return str(uuid.uuid4())
|
59
|
+
|
60
|
+
|
61
|
+
def _is_valid_user_hash(user_hash: Optional[str]) -> bool:
|
62
|
+
if user_hash is None:
|
63
|
+
return False
|
64
|
+
try:
|
65
|
+
int(user_hash, 16)
|
66
|
+
except (TypeError, ValueError):
|
67
|
+
return False
|
68
|
+
return len(user_hash) == USER_HASH_LENGTH
|
69
|
+
|
70
|
+
|
71
|
+
def generate_user_hash() -> str:
|
72
|
+
"""Generates a unique user-machine specific hash."""
|
73
|
+
hash_str = user_and_hostname_hash()
|
74
|
+
user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
|
75
|
+
if not _is_valid_user_hash(user_hash):
|
76
|
+
# A fallback in case the hash is invalid.
|
77
|
+
user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
|
78
|
+
return user_hash
|
62
79
|
|
63
80
|
|
64
|
-
def get_user_hash(force_fresh_hash: bool = False) -> str:
|
81
|
+
def get_user_hash() -> str:
|
65
82
|
"""Returns a unique user-machine specific hash as a user id.
|
66
83
|
|
67
84
|
We cache the user hash in a file to avoid potential user_name or
|
68
85
|
hostname changes causing a new user hash to be generated.
|
69
|
-
|
70
|
-
Args:
|
71
|
-
force_fresh_hash: Bypasses the cached hash in USER_HASH_FILE and the
|
72
|
-
hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash
|
73
|
-
to be generated. Used by `kubernetes.ssh_key_secret_field_name` to
|
74
|
-
avoid controllers sharing the same ssh key field name as the
|
75
|
-
local client.
|
76
86
|
"""
|
87
|
+
user_hash = os.getenv(constants.USER_ID_ENV_VAR)
|
88
|
+
if _is_valid_user_hash(user_hash):
|
89
|
+
assert user_hash is not None
|
90
|
+
return user_hash
|
77
91
|
|
78
|
-
|
79
|
-
if user_hash is None:
|
80
|
-
return False
|
81
|
-
try:
|
82
|
-
int(user_hash, 16)
|
83
|
-
except (TypeError, ValueError):
|
84
|
-
return False
|
85
|
-
return len(user_hash) == USER_HASH_LENGTH
|
86
|
-
|
87
|
-
if not force_fresh_hash:
|
88
|
-
user_hash = os.getenv(constants.USER_ID_ENV_VAR)
|
89
|
-
if _is_valid_user_hash(user_hash):
|
90
|
-
assert user_hash is not None
|
91
|
-
return user_hash
|
92
|
-
|
93
|
-
if not force_fresh_hash and os.path.exists(_USER_HASH_FILE):
|
92
|
+
if os.path.exists(_USER_HASH_FILE):
|
94
93
|
# Read from cached user hash file.
|
95
94
|
with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
|
96
95
|
# Remove invalid characters.
|
@@ -98,19 +97,10 @@ def get_user_hash(force_fresh_hash: bool = False) -> str:
|
|
98
97
|
if _is_valid_user_hash(user_hash):
|
99
98
|
return user_hash
|
100
99
|
|
101
|
-
|
102
|
-
user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
|
103
|
-
if not _is_valid_user_hash(user_hash):
|
104
|
-
# A fallback in case the hash is invalid.
|
105
|
-
user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
|
100
|
+
user_hash = generate_user_hash()
|
106
101
|
os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
|
107
|
-
|
108
|
-
|
109
|
-
# be intentionally using a different hash, e.g. we want to keep the
|
110
|
-
# user_hash for usage collection the same on the jobs/serve controller
|
111
|
-
# as users' local client.
|
112
|
-
with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
|
113
|
-
f.write(user_hash)
|
102
|
+
with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
|
103
|
+
f.write(user_hash)
|
114
104
|
return user_hash
|
115
105
|
|
116
106
|
|
@@ -183,7 +173,7 @@ def make_cluster_name_on_cloud(display_name: str,
|
|
183
173
|
f'on the cloud, we convert it to {cluster_name_on_cloud}.')
|
184
174
|
user_hash = ''
|
185
175
|
if add_user_hash:
|
186
|
-
user_hash = get_user_hash()
|
176
|
+
user_hash = get_user_hash()
|
187
177
|
user_hash = f'-{user_hash}'
|
188
178
|
user_hash_length = len(user_hash)
|
189
179
|
|
@@ -233,7 +223,7 @@ class Backoff:
|
|
233
223
|
MULTIPLIER = 1.6
|
234
224
|
JITTER = 0.4
|
235
225
|
|
236
|
-
def __init__(self, initial_backoff:
|
226
|
+
def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
|
237
227
|
self._initial = True
|
238
228
|
self._backoff = 0.0
|
239
229
|
self._initial_backoff = initial_backoff
|
@@ -255,7 +245,62 @@ class Backoff:
|
|
255
245
|
return self._backoff
|
256
246
|
|
257
247
|
|
258
|
-
|
248
|
+
_current_command: Optional[str] = None
|
249
|
+
_current_client_entrypoint: Optional[str] = None
|
250
|
+
_using_remote_api_server: Optional[bool] = None
|
251
|
+
|
252
|
+
|
253
|
+
def set_client_status(client_entrypoint: Optional[str],
|
254
|
+
client_command: Optional[str],
|
255
|
+
using_remote_api_server: bool):
|
256
|
+
"""Override the current client entrypoint and command.
|
257
|
+
|
258
|
+
This is useful when we are on the SkyPilot API server side and we have a
|
259
|
+
client entrypoint and command from the client.
|
260
|
+
"""
|
261
|
+
global _current_command
|
262
|
+
global _current_client_entrypoint
|
263
|
+
global _using_remote_api_server
|
264
|
+
_current_command = client_command
|
265
|
+
_current_client_entrypoint = client_entrypoint
|
266
|
+
_using_remote_api_server = using_remote_api_server
|
267
|
+
|
268
|
+
|
269
|
+
def get_current_command() -> str:
|
270
|
+
"""Returns the command related to this operation.
|
271
|
+
|
272
|
+
Normally uses get_pretty_entry_point(), but will use the client command on
|
273
|
+
the server side.
|
274
|
+
"""
|
275
|
+
if _current_command is not None:
|
276
|
+
return _current_command
|
277
|
+
|
278
|
+
return get_pretty_entrypoint_cmd()
|
279
|
+
|
280
|
+
|
281
|
+
def get_current_client_entrypoint(server_entrypoint: str) -> str:
|
282
|
+
"""Returns the current client entrypoint.
|
283
|
+
|
284
|
+
Gets the client entrypoint from the context, if it is not set, returns the
|
285
|
+
server entrypoint.
|
286
|
+
"""
|
287
|
+
if _current_client_entrypoint is not None:
|
288
|
+
return _current_client_entrypoint
|
289
|
+
return server_entrypoint
|
290
|
+
|
291
|
+
|
292
|
+
def get_using_remote_api_server() -> bool:
|
293
|
+
"""Returns whether the API server is remote."""
|
294
|
+
if _using_remote_api_server is not None:
|
295
|
+
return _using_remote_api_server
|
296
|
+
# This gets the right status for the local client.
|
297
|
+
# TODO(zhwu): This is to prevent circular import. We should refactor this.
|
298
|
+
# pylint: disable=import-outside-toplevel
|
299
|
+
from sky.server import common as server_common
|
300
|
+
return not server_common.is_api_server_local()
|
301
|
+
|
302
|
+
|
303
|
+
def get_pretty_entrypoint_cmd() -> str:
|
259
304
|
"""Returns the prettified entry point of this process (sys.argv).
|
260
305
|
|
261
306
|
Example return values:
|
@@ -300,28 +345,51 @@ def user_and_hostname_hash() -> str:
|
|
300
345
|
return f'{getpass.getuser()}-{hostname_hash}'
|
301
346
|
|
302
347
|
|
303
|
-
def read_yaml(path) -> Dict[str, Any]:
|
348
|
+
def read_yaml(path: Optional[str]) -> Dict[str, Any]:
|
349
|
+
if path is None:
|
350
|
+
raise ValueError('Attempted to read a None YAML.')
|
304
351
|
with open(path, 'r', encoding='utf-8') as f:
|
305
352
|
config = yaml.safe_load(f)
|
306
353
|
return config
|
307
354
|
|
308
355
|
|
356
|
+
def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
|
357
|
+
stream = io.StringIO(yaml_str)
|
358
|
+
config = yaml.safe_load_all(stream)
|
359
|
+
configs = list(config)
|
360
|
+
if not configs:
|
361
|
+
# Empty YAML file.
|
362
|
+
return [{}]
|
363
|
+
return configs
|
364
|
+
|
365
|
+
|
309
366
|
def read_yaml_all(path: str) -> List[Dict[str, Any]]:
|
310
367
|
with open(path, 'r', encoding='utf-8') as f:
|
311
|
-
|
312
|
-
|
313
|
-
if not configs:
|
314
|
-
# Empty YAML file.
|
315
|
-
return [{}]
|
316
|
-
return configs
|
368
|
+
return read_yaml_all_str(f.read())
|
369
|
+
|
317
370
|
|
371
|
+
def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
|
372
|
+
Dict[str, Any]]) -> None:
|
373
|
+
"""Dumps a YAML file.
|
318
374
|
|
319
|
-
|
375
|
+
Args:
|
376
|
+
path: the path to the YAML file.
|
377
|
+
config: the configuration to dump.
|
378
|
+
"""
|
320
379
|
with open(path, 'w', encoding='utf-8') as f:
|
321
380
|
f.write(dump_yaml_str(config))
|
322
381
|
|
323
382
|
|
324
|
-
def dump_yaml_str(config):
|
383
|
+
def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
|
384
|
+
"""Dumps a YAML string.
|
385
|
+
|
386
|
+
Args:
|
387
|
+
config: the configuration to dump.
|
388
|
+
|
389
|
+
Returns:
|
390
|
+
The YAML string.
|
391
|
+
"""
|
392
|
+
|
325
393
|
# https://github.com/yaml/pyyaml/issues/127
|
326
394
|
class LineBreakDumper(yaml.SafeDumper):
|
327
395
|
|
@@ -331,9 +399,9 @@ def dump_yaml_str(config):
|
|
331
399
|
super().write_line_break()
|
332
400
|
|
333
401
|
if isinstance(config, list):
|
334
|
-
dump_func = yaml.dump_all
|
402
|
+
dump_func = yaml.dump_all # type: ignore
|
335
403
|
else:
|
336
|
-
dump_func = yaml.dump
|
404
|
+
dump_func = yaml.dump # type: ignore
|
337
405
|
return dump_func(config,
|
338
406
|
Dumper=LineBreakDumper,
|
339
407
|
sort_keys=False,
|
@@ -362,7 +430,6 @@ def make_decorator(cls, name_or_fn: Union[str, Callable],
|
|
362
430
|
|
363
431
|
@functools.wraps(f)
|
364
432
|
def _record(*args, **kwargs):
|
365
|
-
nonlocal name_or_fn
|
366
433
|
with cls(name_or_fn, **ctx_kwargs):
|
367
434
|
return f(*args, **kwargs)
|
368
435
|
|
@@ -376,7 +443,6 @@ def make_decorator(cls, name_or_fn: Union[str, Callable],
|
|
376
443
|
|
377
444
|
@functools.wraps(name_or_fn)
|
378
445
|
def _record(*args, **kwargs):
|
379
|
-
nonlocal name_or_fn
|
380
446
|
f = name_or_fn
|
381
447
|
func_name = getattr(f, '__qualname__', f.__name__)
|
382
448
|
module_name = getattr(f, '__module__', '')
|
@@ -411,43 +477,6 @@ def retry(method, max_retries=3, initial_backoff=1):
|
|
411
477
|
return method_with_retries
|
412
478
|
|
413
479
|
|
414
|
-
def encode_payload(payload: Any) -> str:
|
415
|
-
"""Encode a payload to make it more robust for parsing.
|
416
|
-
|
417
|
-
This makes message transfer more robust to any additional strings added to
|
418
|
-
the message during transfer.
|
419
|
-
|
420
|
-
An example message that is polluted by the system warning:
|
421
|
-
"LC_ALL: cannot change locale (en_US.UTF-8)\n<sky-payload>hello, world</sky-payload>" # pylint: disable=line-too-long
|
422
|
-
|
423
|
-
Args:
|
424
|
-
payload: A str, dict or list to be encoded.
|
425
|
-
|
426
|
-
Returns:
|
427
|
-
A string that is encoded from the payload.
|
428
|
-
"""
|
429
|
-
payload_str = json.dumps(payload)
|
430
|
-
payload_str = _PAYLOAD_STR.format(payload_str)
|
431
|
-
return payload_str
|
432
|
-
|
433
|
-
|
434
|
-
def decode_payload(payload_str: str) -> Any:
|
435
|
-
"""Decode a payload string.
|
436
|
-
|
437
|
-
Args:
|
438
|
-
payload_str: A string that is encoded from a payload.
|
439
|
-
|
440
|
-
Returns:
|
441
|
-
A str, dict or list that is decoded from the payload string.
|
442
|
-
"""
|
443
|
-
matched = _PAYLOAD_PATTERN.findall(payload_str)
|
444
|
-
if not matched:
|
445
|
-
raise ValueError(f'Invalid payload string: \n{payload_str}')
|
446
|
-
payload_str = matched[0]
|
447
|
-
payload = json.loads(payload_str)
|
448
|
-
return payload
|
449
|
-
|
450
|
-
|
451
480
|
def class_fullname(cls, skip_builtins: bool = True):
|
452
481
|
"""Get the full name of a class.
|
453
482
|
|
@@ -478,11 +507,9 @@ def format_exception(e: Union[Exception, SystemExit, KeyboardInterrupt],
|
|
478
507
|
Returns:
|
479
508
|
A string that represents the exception.
|
480
509
|
"""
|
481
|
-
bright = colorama.Style.BRIGHT
|
482
|
-
reset = colorama.Style.RESET_ALL
|
483
510
|
if use_bracket:
|
484
|
-
return f'
|
485
|
-
return f'{
|
511
|
+
return f'[{class_fullname(e.__class__)}] {e}'
|
512
|
+
return f'{class_fullname(e.__class__)}: {e}'
|
486
513
|
|
487
514
|
|
488
515
|
def remove_color(s: str):
|
@@ -497,12 +524,14 @@ def remove_color(s: str):
|
|
497
524
|
return _COLOR_PATTERN.sub('', s)
|
498
525
|
|
499
526
|
|
500
|
-
def remove_file_if_exists(path: str):
|
527
|
+
def remove_file_if_exists(path: Optional[str]):
|
501
528
|
"""Delete a file if it exists.
|
502
529
|
|
503
530
|
Args:
|
504
531
|
path: The path to the file.
|
505
532
|
"""
|
533
|
+
if path is None:
|
534
|
+
return
|
506
535
|
try:
|
507
536
|
os.remove(path)
|
508
537
|
except FileNotFoundError:
|
@@ -581,7 +610,10 @@ def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
|
|
581
610
|
e.message)
|
582
611
|
else:
|
583
612
|
err_msg = err_msg_prefix
|
613
|
+
assert isinstance(e.schema, dict), 'Schema must be a dictionary'
|
584
614
|
known_fields = set(e.schema.get('properties', {}).keys())
|
615
|
+
assert isinstance(e.instance,
|
616
|
+
dict), 'Instance must be a dictionary'
|
585
617
|
for field in e.instance:
|
586
618
|
if field not in known_fields:
|
587
619
|
most_similar_field = difflib.get_close_matches(
|
@@ -602,7 +634,7 @@ def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
|
|
602
634
|
|
603
635
|
if err_msg:
|
604
636
|
with ux_utils.print_exception_no_traceback():
|
605
|
-
raise
|
637
|
+
raise exceptions.InvalidSkyPilotConfigError(err_msg)
|
606
638
|
|
607
639
|
|
608
640
|
def get_cleaned_username(username: str = '') -> str:
|
@@ -634,7 +666,7 @@ def get_cleaned_username(username: str = '') -> str:
|
|
634
666
|
return username
|
635
667
|
|
636
668
|
|
637
|
-
def fill_template(template_name: str, variables: Dict,
|
669
|
+
def fill_template(template_name: str, variables: Dict[str, Any],
|
638
670
|
output_path: str) -> None:
|
639
671
|
"""Create a file from a Jinja template and return the filename."""
|
640
672
|
assert template_name.endswith('.j2'), template_name
|
@@ -678,3 +710,182 @@ def deprecated_function(
|
|
678
710
|
return func(*args, **kwargs)
|
679
711
|
|
680
712
|
return new_func
|
713
|
+
|
714
|
+
|
715
|
+
def truncate_long_string(s: str, max_length: int = 35) -> str:
|
716
|
+
"""Truncate a string to a maximum length, preserving whole words."""
|
717
|
+
if len(s) <= max_length:
|
718
|
+
return s
|
719
|
+
splits = s.split(' ')
|
720
|
+
if len(splits[0]) > max_length:
|
721
|
+
return splits[0][:max_length] + '...' # Use '…'?
|
722
|
+
# Truncate on word boundary.
|
723
|
+
i = 0
|
724
|
+
total = 0
|
725
|
+
for i, part in enumerate(splits):
|
726
|
+
total += len(part)
|
727
|
+
if total >= max_length:
|
728
|
+
break
|
729
|
+
prefix = ' '.join(splits[:i])
|
730
|
+
if len(prefix) < max_length:
|
731
|
+
prefix += s[len(prefix):max_length]
|
732
|
+
return prefix + '...'
|
733
|
+
|
734
|
+
|
735
|
+
def hash_file(path: str, hash_alg: str) -> 'hashlib._Hash':
|
736
|
+
# In python 3.11, hashlib.file_digest is available, but for <3.11 we have to
|
737
|
+
# do it manually.
|
738
|
+
# This implementation is simplified from the implementation in CPython.
|
739
|
+
# TODO(cooperc): Use hashlib.file_digest once we move to 3.11+.
|
740
|
+
# Beware of f.read() as some files may be larger than memory.
|
741
|
+
with open(path, 'rb') as f:
|
742
|
+
file_hash = hashlib.new(hash_alg)
|
743
|
+
buf = bytearray(2**18)
|
744
|
+
view = memoryview(buf)
|
745
|
+
while True:
|
746
|
+
size = f.readinto(buf)
|
747
|
+
if size == 0:
|
748
|
+
# EOF
|
749
|
+
break
|
750
|
+
file_hash.update(view[:size])
|
751
|
+
return file_hash
|
752
|
+
|
753
|
+
|
754
|
+
def is_port_available(port: int, reuse_addr: bool = True) -> bool:
|
755
|
+
"""Check if a TCP port is available for binding on localhost.
|
756
|
+
|
757
|
+
Args:
|
758
|
+
port: The port number to check.
|
759
|
+
reuse_addr: If True, sets SO_REUSEADDR socket option to allow reusing
|
760
|
+
ports in TIME_WAIT state. Servers like multiprocessing.Manager set
|
761
|
+
SO_REUSEADDR by default to accelerate restart. The option should be
|
762
|
+
coordinated in check.
|
763
|
+
|
764
|
+
Returns:
|
765
|
+
bool: True if the port is available for binding, False otherwise.
|
766
|
+
"""
|
767
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
768
|
+
if reuse_addr:
|
769
|
+
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
770
|
+
try:
|
771
|
+
s.bind(('localhost', port))
|
772
|
+
return True
|
773
|
+
except OSError:
|
774
|
+
return False
|
775
|
+
|
776
|
+
|
777
|
+
def get_cpu_count() -> int:
|
778
|
+
"""Get the number of CPUs, with cgroup awareness."""
|
779
|
+
# This env-var is kept since it is still useful for limiting the resource
|
780
|
+
# of SkyPilot in non-containerized environments.
|
781
|
+
cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
|
782
|
+
if cpu_count is not None:
|
783
|
+
try:
|
784
|
+
return int(float(cpu_count))
|
785
|
+
except ValueError as e:
|
786
|
+
with ux_utils.print_exception_no_traceback():
|
787
|
+
raise ValueError(
|
788
|
+
f'Failed to parse the number of CPUs from {cpu_count}'
|
789
|
+
) from e
|
790
|
+
return _cpu_count()
|
791
|
+
|
792
|
+
|
793
|
+
def get_mem_size_gb() -> float:
|
794
|
+
"""Get the memory size in GB, with cgroup awareness."""
|
795
|
+
mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
796
|
+
if mem_size is not None:
|
797
|
+
try:
|
798
|
+
return float(mem_size)
|
799
|
+
except ValueError as e:
|
800
|
+
with ux_utils.print_exception_no_traceback():
|
801
|
+
raise ValueError(
|
802
|
+
f'Failed to parse the memory size from {mem_size}') from e
|
803
|
+
return _mem_size_gb()
|
804
|
+
|
805
|
+
|
806
|
+
def _cpu_count() -> int:
|
807
|
+
# host cpu cores (logical)
|
808
|
+
cpu = psutil.cpu_count()
|
809
|
+
# cpu affinity on Linux
|
810
|
+
if hasattr(os, 'sched_getaffinity'):
|
811
|
+
# just for safe, length of CPU set should always <= logical cpu cores
|
812
|
+
cpu = min(cpu, len(os.sched_getaffinity(0)))
|
813
|
+
cgroup_cpu = _get_cgroup_cpu_limit()
|
814
|
+
if cgroup_cpu is not None:
|
815
|
+
cpu = min(cpu, int(cgroup_cpu))
|
816
|
+
return cpu
|
817
|
+
|
818
|
+
|
819
|
+
def _mem_size_gb() -> float:
|
820
|
+
# host memory limit
|
821
|
+
mem = psutil.virtual_memory().total
|
822
|
+
cgroup_mem = _get_cgroup_memory_limit()
|
823
|
+
if cgroup_mem is not None:
|
824
|
+
mem = min(mem, cgroup_mem)
|
825
|
+
return mem / (1024**3)
|
826
|
+
|
827
|
+
|
828
|
+
# Refer to:
|
829
|
+
# - https://docs.kernel.org/admin-guide/cgroup-v1/index.html
|
830
|
+
# - https://docs.kernel.org/admin-guide/cgroup-v2.html
|
831
|
+
# for the standards of handler files in cgroupv1 and v2.
|
832
|
+
# Since all those paths are well-known standards that are unlikely to change,
|
833
|
+
# we use string literals instead of defining extra constants.
|
834
|
+
def _get_cgroup_cpu_limit() -> Optional[float]:
|
835
|
+
"""Return cpu limit from cgroups in cores.
|
836
|
+
|
837
|
+
Returns:
|
838
|
+
The cpu limit in cores as a float (can be fractional), or None if there
|
839
|
+
is no limit in cgroups.
|
840
|
+
"""
|
841
|
+
try:
|
842
|
+
if _is_cgroup_v2():
|
843
|
+
with open('/sys/fs/cgroup/cpu.max', 'r', encoding='utf-8') as f:
|
844
|
+
quota_str, period_str = f.read().strip().split()
|
845
|
+
if quota_str == 'max':
|
846
|
+
return None
|
847
|
+
quota = float(quota_str)
|
848
|
+
period = float(period_str)
|
849
|
+
return quota / period if quota > 0 else None
|
850
|
+
else:
|
851
|
+
# cgroup v1
|
852
|
+
with open('/sys/fs/cgroup/cpu/cpu.cfs_quota_us',
|
853
|
+
'r',
|
854
|
+
encoding='utf-8') as f:
|
855
|
+
quota = float(f.read().strip())
|
856
|
+
with open('/sys/fs/cgroup/cpu/cpu.cfs_period_us',
|
857
|
+
'r',
|
858
|
+
encoding='utf-8') as f:
|
859
|
+
period = float(f.read().strip())
|
860
|
+
# Return unlimited if cpu quota is not set.
|
861
|
+
# Note that we do not use cpu.shares since it is a relative weight
|
862
|
+
# instead of a hard limit. It is okay to get CPU throttling under
|
863
|
+
# high contention. And unlimited enables the server to use as much
|
864
|
+
# CPU as available if there is no contention.
|
865
|
+
return quota / period if (quota > 0 and period > 0) else None
|
866
|
+
except (OSError, ValueError):
|
867
|
+
return None
|
868
|
+
|
869
|
+
|
870
|
+
def _get_cgroup_memory_limit() -> Optional[int]:
|
871
|
+
"""Return memory limit from cgroups in bytes.
|
872
|
+
|
873
|
+
Returns:
|
874
|
+
The memory limit in bytes, or None if there is no limit in cgroups.
|
875
|
+
"""
|
876
|
+
try:
|
877
|
+
path = ('/sys/fs/cgroup/memory.max' if _is_cgroup_v2() else
|
878
|
+
'/sys/fs/cgroup/memory/memory.limit_in_bytes')
|
879
|
+
with open(path, 'r', encoding='utf-8') as f:
|
880
|
+
value = f.read().strip()
|
881
|
+
if value == 'max' or not value:
|
882
|
+
return None
|
883
|
+
limit = int(value)
|
884
|
+
return limit if limit > 0 else None
|
885
|
+
except (OSError, ValueError):
|
886
|
+
return None
|
887
|
+
|
888
|
+
|
889
|
+
def _is_cgroup_v2() -> bool:
|
890
|
+
"""Return True if the environment is running cgroup v2."""
|
891
|
+
return os.path.isfile('/sys/fs/cgroup/cgroup.controllers')
|