skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
# When using pod@namespace+context, rsync passes args as: {us} -l pod namespace+context
|
2
|
+
# We need to split the pod@namespace+context into pod, namespace and context
|
3
|
+
# For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
|
4
|
+
shift
|
5
|
+
pod=$1
|
6
|
+
shift
|
7
|
+
echo "pod: $pod" >&2
|
8
|
+
encoded_namespace_context=$1
|
9
|
+
# Revert the encoded namespace+context to the original string.
|
10
|
+
namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
|
11
|
+
echo "namespace_context: $namespace_context" >&2
|
12
|
+
namespace=$(echo $namespace_context | cut -d+ -f1)
|
13
|
+
echo "namespace: $namespace" >&2
|
14
|
+
context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
|
15
|
+
echo "context: $context" >&2
|
16
|
+
context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')
|
17
|
+
shift
|
18
|
+
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
|
19
|
+
# If context is none, it means we are using incluster auth. In this case,
|
20
|
+
# we need to set KUBECONFIG to /dev/null to avoid using the kubeconfig file.
|
21
|
+
kubectl exec -i $pod -n $namespace --kubeconfig=/dev/null -- "$@"
|
22
|
+
else
|
23
|
+
kubectl exec -i $pod -n $namespace --context=$context -- "$@"
|
24
|
+
fi
|
sky/utils/log_utils.py
CHANGED
@@ -1,13 +1,18 @@
|
|
1
1
|
"""Logging utils."""
|
2
2
|
import enum
|
3
|
-
|
3
|
+
import time
|
4
|
+
import types
|
5
|
+
from typing import Callable, Iterator, List, Optional, TextIO, Type
|
4
6
|
|
5
7
|
import colorama
|
8
|
+
# slow due to https://github.com/python-pendulum/pendulum/issues/808
|
9
|
+
# FIXME(aylei): bump pendulum if it gets fixed
|
6
10
|
import pendulum
|
7
11
|
import prettytable
|
8
12
|
|
9
13
|
from sky import sky_logging
|
10
14
|
from sky.utils import rich_utils
|
15
|
+
from sky.utils import ux_utils
|
11
16
|
|
12
17
|
logger = sky_logging.init_logger(__name__)
|
13
18
|
|
@@ -15,13 +20,15 @@ logger = sky_logging.init_logger(__name__)
|
|
15
20
|
class LineProcessor(object):
|
16
21
|
"""A processor for log lines."""
|
17
22
|
|
18
|
-
def __enter__(self):
|
23
|
+
def __enter__(self) -> None:
|
19
24
|
pass
|
20
25
|
|
21
|
-
def process_line(self, log_line):
|
26
|
+
def process_line(self, log_line: str) -> None:
|
22
27
|
pass
|
23
28
|
|
24
|
-
def __exit__(self, except_type
|
29
|
+
def __exit__(self, except_type: Optional[Type[BaseException]],
|
30
|
+
except_value: Optional[BaseException],
|
31
|
+
traceback: Optional[types.TracebackType]) -> None:
|
25
32
|
del except_type, except_value, traceback # unused
|
26
33
|
pass
|
27
34
|
|
@@ -34,33 +41,39 @@ class RayUpLineProcessor(LineProcessor):
|
|
34
41
|
RUNTIME_SETUP = 1
|
35
42
|
PULLING_DOCKER_IMAGES = 2
|
36
43
|
|
37
|
-
def
|
44
|
+
def __init__(self, log_path: str):
|
45
|
+
self.log_path = log_path
|
46
|
+
|
47
|
+
def __enter__(self) -> None:
|
38
48
|
self.state = self.ProvisionStatus.LAUNCH
|
39
|
-
self.status_display = rich_utils.safe_status(
|
49
|
+
self.status_display = rich_utils.safe_status(
|
50
|
+
ux_utils.spinner_message('Launching', self.log_path))
|
40
51
|
self.status_display.start()
|
41
52
|
|
42
|
-
def process_line(self, log_line):
|
53
|
+
def process_line(self, log_line: str) -> None:
|
43
54
|
if ('Success.' in log_line and
|
44
55
|
self.state == self.ProvisionStatus.LAUNCH):
|
45
|
-
logger.info(
|
46
|
-
f'{colorama.Style.RESET_ALL}')
|
56
|
+
logger.info(' Head VM is up.')
|
47
57
|
self.status_display.update(
|
48
|
-
|
58
|
+
ux_utils.spinner_message(
|
59
|
+
'Launching - Preparing SkyPilot runtime', self.log_path))
|
49
60
|
self.state = self.ProvisionStatus.RUNTIME_SETUP
|
50
61
|
if ('Pulling from' in log_line and
|
51
62
|
self.state == self.ProvisionStatus.RUNTIME_SETUP):
|
52
63
|
self.status_display.update(
|
53
|
-
|
64
|
+
ux_utils.spinner_message(
|
65
|
+
'Launching - Initializing docker container', self.log_path))
|
54
66
|
self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
|
55
67
|
if ('Status: Downloaded newer image' in log_line and
|
56
68
|
self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
|
57
|
-
logger.info(f'{colorama.Fore.GREEN}Docker image is downloaded.'
|
58
|
-
f'{colorama.Style.RESET_ALL}')
|
59
69
|
self.status_display.update(
|
60
|
-
|
70
|
+
ux_utils.spinner_message(
|
71
|
+
'Launching - Preparing SkyPilot runtime', self.log_path))
|
61
72
|
self.state = self.ProvisionStatus.RUNTIME_SETUP
|
62
73
|
|
63
|
-
def __exit__(self, except_type
|
74
|
+
def __exit__(self, except_type: Optional[Type[BaseException]],
|
75
|
+
except_value: Optional[BaseException],
|
76
|
+
traceback: Optional[types.TracebackType]) -> None:
|
64
77
|
del except_type, except_value, traceback # unused
|
65
78
|
self.status_display.stop()
|
66
79
|
|
@@ -68,42 +81,69 @@ class RayUpLineProcessor(LineProcessor):
|
|
68
81
|
class SkyLocalUpLineProcessor(LineProcessor):
|
69
82
|
"""A processor for `sky local up` log lines."""
|
70
83
|
|
84
|
+
def __init__(self, log_path: str, is_local: bool):
|
85
|
+
self.log_path = log_path
|
86
|
+
self.is_local = is_local
|
87
|
+
|
71
88
|
def __enter__(self):
|
72
|
-
|
73
|
-
|
89
|
+
# TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
|
90
|
+
# messages.
|
91
|
+
msg = 'Creating local cluster - initializing Kubernetes'
|
92
|
+
status = rich_utils.safe_status(
|
93
|
+
ux_utils.spinner_message(msg,
|
94
|
+
log_path=self.log_path,
|
95
|
+
is_local=self.is_local))
|
74
96
|
self.status_display = status
|
75
97
|
self.status_display.start()
|
76
98
|
|
77
|
-
def process_line(self, log_line):
|
99
|
+
def process_line(self, log_line: str) -> None:
|
78
100
|
if 'Kind cluster created.' in log_line:
|
79
101
|
logger.info(f'{colorama.Fore.GREEN}Kubernetes is running.'
|
80
102
|
f'{colorama.Style.RESET_ALL}')
|
81
103
|
if 'Installing NVIDIA GPU operator...' in log_line:
|
82
|
-
self.status_display.update(
|
83
|
-
|
104
|
+
self.status_display.update(
|
105
|
+
ux_utils.spinner_message(
|
106
|
+
'Creating local cluster - '
|
107
|
+
'Installing NVIDIA GPU operator',
|
108
|
+
log_path=self.log_path,
|
109
|
+
is_local=self.is_local))
|
84
110
|
if 'Starting wait for GPU operator installation...' in log_line:
|
85
111
|
self.status_display.update(
|
86
|
-
|
87
|
-
|
112
|
+
ux_utils.spinner_message(
|
113
|
+
'Creating local cluster - '
|
114
|
+
'waiting for NVIDIA GPU operator installation to complete',
|
115
|
+
log_path=self.log_path,
|
116
|
+
is_local=self.is_local))
|
88
117
|
logger.info('To check NVIDIA GPU operator status, '
|
89
118
|
'see pods: kubectl get pods -n gpu-operator')
|
90
119
|
if 'GPU operator installed' in log_line:
|
91
120
|
logger.info(f'{colorama.Fore.GREEN}NVIDIA GPU Operator installed.'
|
92
121
|
f'{colorama.Style.RESET_ALL}')
|
93
122
|
if 'Pulling SkyPilot GPU image...' in log_line:
|
94
|
-
self.status_display.update(
|
95
|
-
|
123
|
+
self.status_display.update(
|
124
|
+
ux_utils.spinner_message(
|
125
|
+
'Creating local cluster - '
|
126
|
+
'pulling and loading SkyPilot GPU image',
|
127
|
+
log_path=self.log_path,
|
128
|
+
is_local=self.is_local))
|
96
129
|
if 'SkyPilot GPU image loaded into kind cluster' in log_line:
|
97
130
|
logger.info(f'{colorama.Fore.GREEN}SkyPilot GPU image pulled.'
|
98
131
|
f'{colorama.Style.RESET_ALL}')
|
99
132
|
if 'Labelling nodes with GPUs...' in log_line:
|
100
|
-
self.status_display.update(
|
101
|
-
|
133
|
+
self.status_display.update(
|
134
|
+
ux_utils.spinner_message(
|
135
|
+
'Creating local cluster - '
|
136
|
+
'launching GPU labelling jobs',
|
137
|
+
log_path=self.log_path,
|
138
|
+
is_local=self.is_local))
|
102
139
|
if ('Starting wait for SkyPilot GPU labeling jobs to complete'
|
103
140
|
in log_line):
|
104
141
|
self.status_display.update(
|
105
|
-
|
106
|
-
|
142
|
+
ux_utils.spinner_message(
|
143
|
+
'Creating local cluster - '
|
144
|
+
'waiting for GPU labelling jobs to complete',
|
145
|
+
log_path=self.log_path,
|
146
|
+
is_local=self.is_local))
|
107
147
|
logger.info(
|
108
148
|
'To check GPU labelling status, see jobs: '
|
109
149
|
'kubectl get jobs -n kube-system -l job=sky-gpu-labeler')
|
@@ -111,20 +151,136 @@ class SkyLocalUpLineProcessor(LineProcessor):
|
|
111
151
|
logger.info(f'{colorama.Fore.GREEN}GPU labelling complete.'
|
112
152
|
f'{colorama.Style.RESET_ALL}')
|
113
153
|
if 'Pulling SkyPilot CPU image...' in log_line:
|
114
|
-
self.status_display.update(
|
115
|
-
|
154
|
+
self.status_display.update(
|
155
|
+
ux_utils.spinner_message(
|
156
|
+
'Creating local cluster - '
|
157
|
+
'pulling and loading SkyPilot CPU image',
|
158
|
+
log_path=self.log_path,
|
159
|
+
is_local=self.is_local))
|
116
160
|
if 'SkyPilot CPU image loaded into kind cluster' in log_line:
|
117
161
|
logger.info(f'{colorama.Fore.GREEN}SkyPilot CPU image pulled.'
|
118
162
|
f'{colorama.Style.RESET_ALL}')
|
119
163
|
if 'Starting installation of Nginx Ingress Controller...' in log_line:
|
120
164
|
self.status_display.update(
|
121
|
-
|
165
|
+
ux_utils.spinner_message(
|
166
|
+
'Creating local cluster - '
|
167
|
+
'creating Nginx Ingress Controller',
|
168
|
+
log_path=self.log_path,
|
169
|
+
is_local=self.is_local))
|
122
170
|
if 'Nginx Ingress Controller installed' in log_line:
|
123
171
|
logger.info(
|
124
172
|
f'{colorama.Fore.GREEN}Nginx Ingress Controller installed.'
|
125
173
|
f'{colorama.Style.RESET_ALL}')
|
174
|
+
self.status_display.update(
|
175
|
+
ux_utils.spinner_message('Wrapping up local cluster setup',
|
176
|
+
log_path=self.log_path,
|
177
|
+
is_local=self.is_local))
|
126
178
|
|
127
|
-
def __exit__(self, except_type
|
179
|
+
def __exit__(self, except_type: Optional[Type[BaseException]],
|
180
|
+
except_value: Optional[BaseException],
|
181
|
+
traceback: Optional[types.TracebackType]) -> None:
|
182
|
+
del except_type, except_value, traceback # unused
|
183
|
+
self.status_display.stop()
|
184
|
+
|
185
|
+
|
186
|
+
class SkyRemoteUpLineProcessor(LineProcessor):
|
187
|
+
"""A processor for deploy_remote_cluster.sh log lines."""
|
188
|
+
|
189
|
+
def __init__(self, log_path: str, is_local: bool):
|
190
|
+
self.log_path = log_path
|
191
|
+
self.is_local = is_local
|
192
|
+
|
193
|
+
def __enter__(self) -> None:
|
194
|
+
# TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
|
195
|
+
# messages.
|
196
|
+
status = rich_utils.safe_status(
|
197
|
+
ux_utils.spinner_message('Creating remote cluster',
|
198
|
+
log_path=self.log_path,
|
199
|
+
is_local=self.is_local))
|
200
|
+
self.status_display = status
|
201
|
+
self.status_display.start()
|
202
|
+
|
203
|
+
def process_line(self, log_line: str) -> None:
|
204
|
+
# Pre-flight checks
|
205
|
+
if 'SSH connection successful' in log_line:
|
206
|
+
logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
|
207
|
+
f'{colorama.Style.RESET_ALL}')
|
208
|
+
|
209
|
+
# Kubernetes installation steps
|
210
|
+
if 'Deploying Kubernetes on head node' in log_line:
|
211
|
+
self.status_display.update(
|
212
|
+
ux_utils.spinner_message(
|
213
|
+
'Creating remote cluster - '
|
214
|
+
'deploying Kubernetes on head node',
|
215
|
+
log_path=self.log_path,
|
216
|
+
is_local=self.is_local))
|
217
|
+
if 'K3s deployed on head node.' in log_line:
|
218
|
+
logger.info(f'{colorama.Fore.GREEN}'
|
219
|
+
'✔ K3s successfully deployed on head node.'
|
220
|
+
f'{colorama.Style.RESET_ALL}')
|
221
|
+
|
222
|
+
# Worker nodes
|
223
|
+
if 'Deploying Kubernetes on worker node' in log_line:
|
224
|
+
self.status_display.update(
|
225
|
+
ux_utils.spinner_message(
|
226
|
+
'Creating remote cluster - '
|
227
|
+
'deploying Kubernetes on worker nodes',
|
228
|
+
log_path=self.log_path,
|
229
|
+
is_local=self.is_local))
|
230
|
+
if 'Kubernetes deployed on worker node' in log_line:
|
231
|
+
logger.info(f'{colorama.Fore.GREEN}'
|
232
|
+
'✔ K3s successfully deployed on worker node.'
|
233
|
+
f'{colorama.Style.RESET_ALL}')
|
234
|
+
|
235
|
+
# Cluster configuration
|
236
|
+
if 'Configuring local kubectl to connect to the cluster...' in log_line:
|
237
|
+
self.status_display.update(
|
238
|
+
ux_utils.spinner_message(
|
239
|
+
'Creating remote cluster - '
|
240
|
+
'configuring local kubectl',
|
241
|
+
log_path=self.log_path,
|
242
|
+
is_local=self.is_local))
|
243
|
+
if 'kubectl configured to connect to the cluster.' in log_line:
|
244
|
+
logger.info(f'{colorama.Fore.GREEN}'
|
245
|
+
'✔ kubectl configured for the remote cluster.'
|
246
|
+
f'{colorama.Style.RESET_ALL}')
|
247
|
+
|
248
|
+
# GPU operator installation
|
249
|
+
if 'Installing Nvidia GPU Operator...' in log_line:
|
250
|
+
self.status_display.update(
|
251
|
+
ux_utils.spinner_message(
|
252
|
+
'Creating remote cluster - '
|
253
|
+
'installing Nvidia GPU Operator',
|
254
|
+
log_path=self.log_path,
|
255
|
+
is_local=self.is_local))
|
256
|
+
if 'GPU Operator installed.' in log_line:
|
257
|
+
logger.info(f'{colorama.Fore.GREEN}'
|
258
|
+
'✔ Nvidia GPU Operator installed successfully.'
|
259
|
+
f'{colorama.Style.RESET_ALL}')
|
260
|
+
|
261
|
+
# Cleanup steps
|
262
|
+
if 'Cleaning up head node' in log_line:
|
263
|
+
self.status_display.update(
|
264
|
+
ux_utils.spinner_message('Cleaning up head node',
|
265
|
+
log_path=self.log_path,
|
266
|
+
is_local=self.is_local))
|
267
|
+
if 'Cleaning up node' in log_line:
|
268
|
+
self.status_display.update(
|
269
|
+
ux_utils.spinner_message('Cleaning up worker node',
|
270
|
+
log_path=self.log_path,
|
271
|
+
is_local=self.is_local))
|
272
|
+
if 'cleaned up successfully' in log_line:
|
273
|
+
logger.info(f'{colorama.Fore.GREEN}'
|
274
|
+
f'{log_line.strip()}{colorama.Style.RESET_ALL}')
|
275
|
+
|
276
|
+
# Final status
|
277
|
+
if 'Cluster deployment completed.' in log_line:
|
278
|
+
logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
|
279
|
+
f'{colorama.Style.RESET_ALL}')
|
280
|
+
|
281
|
+
def __exit__(self, except_type: Optional[Type[BaseException]],
|
282
|
+
except_value: Optional[BaseException],
|
283
|
+
traceback: Optional[types.TracebackType]) -> None:
|
128
284
|
del except_type, except_value, traceback # unused
|
129
285
|
self.status_display.stop()
|
130
286
|
|
@@ -157,7 +313,8 @@ def readable_time_duration(start: Optional[float],
|
|
157
313
|
e.g. "1h 2m 23s"
|
158
314
|
"""
|
159
315
|
# start < 0 means that the starting time is not specified yet.
|
160
|
-
# It is only used in
|
316
|
+
# It is only used in jobs_utils.format_job_table() for job duration
|
317
|
+
# calculation.
|
161
318
|
if start is None or start < 0:
|
162
319
|
return '-'
|
163
320
|
if end == start == 0:
|
@@ -191,3 +348,53 @@ def readable_time_duration(start: Optional[float],
|
|
191
348
|
diff = diff.replace('hour', 'hr')
|
192
349
|
|
193
350
|
return diff
|
351
|
+
|
352
|
+
|
353
|
+
def follow_logs(
|
354
|
+
file: TextIO,
|
355
|
+
*,
|
356
|
+
should_stop: Callable[[], bool],
|
357
|
+
stop_on_eof: bool = False,
|
358
|
+
process_line: Optional[Callable[[str], Iterator[str]]] = None,
|
359
|
+
idle_timeout_seconds: Optional[int] = None,
|
360
|
+
) -> Iterator[str]:
|
361
|
+
"""Streams and processes logs line by line from a file.
|
362
|
+
|
363
|
+
Args:
|
364
|
+
file: File object to read logs from.
|
365
|
+
should_stop: Callback that returns True when streaming should stop.
|
366
|
+
stop_on_eof: If True, stop when reaching end of file.
|
367
|
+
process_line: Optional callback to transform/filter each line.
|
368
|
+
idle_timeout_seconds: If set, stop after this many seconds without
|
369
|
+
new content.
|
370
|
+
|
371
|
+
Yields:
|
372
|
+
Log lines, possibly transformed by process_line if provided.
|
373
|
+
"""
|
374
|
+
current_line: str = ''
|
375
|
+
seconds_without_content: int = 0
|
376
|
+
|
377
|
+
while True:
|
378
|
+
content = file.readline()
|
379
|
+
|
380
|
+
if not content:
|
381
|
+
if stop_on_eof or should_stop():
|
382
|
+
break
|
383
|
+
|
384
|
+
if idle_timeout_seconds is not None:
|
385
|
+
if seconds_without_content >= idle_timeout_seconds:
|
386
|
+
break
|
387
|
+
seconds_without_content += 1
|
388
|
+
|
389
|
+
time.sleep(1)
|
390
|
+
continue
|
391
|
+
|
392
|
+
seconds_without_content = 0
|
393
|
+
current_line += content
|
394
|
+
|
395
|
+
if '\n' in current_line or '\r' in current_line:
|
396
|
+
if process_line is not None:
|
397
|
+
yield from process_line(current_line)
|
398
|
+
else:
|
399
|
+
yield current_line
|
400
|
+
current_line = ''
|
@@ -0,0 +1,81 @@
|
|
1
|
+
"""Utilities for encoding and decoding messages."""
|
2
|
+
import json
|
3
|
+
import re
|
4
|
+
import typing
|
5
|
+
from typing import Any, Literal, Optional, Tuple, Union
|
6
|
+
|
7
|
+
# Matches one encoded payload. Group 1 is the (possibly empty) type suffix
# written directly after `<sky-payload`; group 2 is the JSON body. `.` does
# not match newlines, which is fine: json.dumps() without `indent` escapes
# newlines, so every encoded payload sits on a single line.
_PAYLOAD_PATTERN = re.compile(r'<sky-payload(.*?)>(.*?)</sky-payload>')
_PAYLOAD_STR = '<sky-payload{type}>{content}</sky-payload>\n'


def encode_payload(payload: Any, payload_type: Optional[str] = None) -> str:
    """Encode a payload to make it more robust for parsing.

    The payload is JSON-serialized and wrapped in `<sky-payload...>` tags so
    that it can still be located when extra text is added to the message
    during transfer.

    An example message that is polluted by the system warning:
    "LC_ALL: cannot change locale (en_US.UTF-8)\n<sky-payload>hello, world</sky-payload>" # pylint: disable=line-too-long

    Args:
        payload: A JSON-serializable object (e.g. str, dict or list).
        payload_type: Optional suffix appended verbatim to the opening tag
            name, letting decode_payload() pick one payload among several.
            None is treated as the empty string.

    Returns:
        The tagged string, terminated by a newline.
    """
    if payload_type is None:
        payload_type = ''
    return _PAYLOAD_STR.format(type=payload_type, content=json.dumps(payload))


@typing.overload
def decode_payload(payload_str: str,
                   payload_type: Optional[str] = None,
                   raise_for_mismatch: Literal[True] = True) -> Any:
    ...


@typing.overload
def decode_payload(
        payload_str: str,
        payload_type: Optional[str] = None,
        raise_for_mismatch: Literal[False] = False) -> Tuple[bool, Any]:
    ...


def decode_payload(
        payload_str: str,
        payload_type: Optional[str] = None,
        raise_for_mismatch: bool = True) -> Union[Tuple[bool, Any], Any]:
    """Decode a payload string.

    The first tagged payload whose type suffix equals `payload_type` is
    decoded; when `payload_type` is None the first tagged payload of any type
    is used.

    Args:
        payload_str: A string that contains a payload produced by
            encode_payload(), possibly surrounded by other text.
        payload_type: The type of the payload to look for.
        raise_for_mismatch: Whether to raise an error if no suitable payload
            is found in the string.

    Returns:
        If raise_for_mismatch is True, the decoded payload.
        If raise_for_mismatch is False, a tuple of (bool, Any). The bool
        indicates whether a suitable payload was found. The Any is the decoded
        payload on success, or the unmodified input string otherwise.

    Raises:
        ValueError: If raise_for_mismatch is True and no suitable payload is
            found.
    """
    # Loop variables are deliberately distinct from `payload_str`: the
    # fallback below must report the caller's original string, not the inner
    # content of whichever payload was inspected last.
    for found_type, found_content in _PAYLOAD_PATTERN.findall(payload_str):
        if payload_type is None or payload_type == found_type:
            decoded = json.loads(found_content)
            return decoded if raise_for_mismatch else (True, decoded)

    # Either no payload tag was present or none had the requested type.
    if raise_for_mismatch:
        raise ValueError(f'Invalid payload string: \n{payload_str}')
    return False, payload_str
|
sky/utils/registry.py
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
"""Registry for classes to be discovered"""
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from typing import Callable, Dict, List, Optional, Set, Type, Union
|
5
|
+
|
6
|
+
from sky.utils import ux_utils
|
7
|
+
|
8
|
+
if typing.TYPE_CHECKING:
|
9
|
+
from sky.backends import backend
|
10
|
+
from sky.clouds import cloud
|
11
|
+
from sky.jobs import recovery_strategy
|
12
|
+
|
13
|
+
T = typing.TypeVar('T')


class _Registry(dict, typing.Generic[T]):
    """Case-insensitive mapping from a name to a registered entry.

    A registry works in one of two modes, chosen at construction time:

    * type_register=False: classes are added with `register`; the key is the
      lower-cased class name and the stored value is an instance created by
      calling the class with no arguments. Optional aliases map to that key.
    * type_register=True: classes are added with `type_register` under an
      explicit name and the class itself is stored. One entry may be marked
      as the default.
    """

    def __init__(self,
                 registry_name: str,
                 exclude: Optional[Set[str]],
                 type_register: bool = False):
        """Creates an empty registry.

        Args:
            registry_name: Human-readable name used in error messages.
            exclude: Names for which `from_str` returns None instead of
                raising. Matched case-insensitively. None means no exclusion.
            type_register: Selects the registration mode (see class
                docstring).
        """
        super().__init__()
        self._registry_name = registry_name
        # Lookups lower-case the requested name, so the exclusion set has to
        # be lower-cased too; otherwise a mixed-case entry could never match.
        self._exclude: Set[str] = {name.lower() for name in exclude or ()}
        self._default: Optional[str] = None
        self._type_register: bool = type_register
        # Lower-cased alias -> canonical (lower-cased) registered name.
        self._aliases: Dict[str, str] = {}

    def from_str(self, name: Optional[str]) -> Optional[T]:
        """Returns the entry registered under a canonical name or alias.

        Args:
            name: Name or alias to look up, case-insensitive.

        Returns:
            None if `name` is None or is an excluded name; otherwise the
            registered entry.

        Raises:
            ValueError: If `name` is neither excluded, registered, nor an
                alias.
        """
        if name is None:
            return None

        search_name = name.lower()
        if search_name in self._exclude:
            return None

        # Canonical names take precedence over aliases.
        if search_name in self:
            return self[search_name]

        if search_name in self._aliases:
            return self[self._aliases[search_name]]

        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'{self._registry_name.capitalize()} {name!r} is not a '
                f'valid {self._registry_name} among '
                f'{[*self.keys(), *self._aliases.keys()]}')

    def type_register(self,
                      name: str,
                      default: bool = False) -> Callable[[Type[T]], Type[T]]:
        """Returns a class decorator that stores the class under `name`.

        Only usable on registries created with type_register=True.

        Args:
            name: Key for the class; lower-cased before use.
            default: If True, `name` becomes the value of `default`.
        """

        name = name.lower()

        def decorator(cls: Type[T]) -> Type[T]:
            assert self._type_register, ('type_register can only be used '
                                         'when type_register is True')
            assert name not in self, f'{name} already registered'
            self[name] = cls
            if default:
                self._default = name
            return cls

        return decorator

    @typing.overload
    def register(self, cls: Type[T]) -> Type[T]:
        ...

    @typing.overload
    def register(
        self,
        cls: None = None,
        aliases: Optional[List[str]] = None
    ) -> Callable[[Type[T]], Type[T]]:
        ...

    def register(
        self,
        cls: Optional[Type[T]] = None,
        aliases: Optional[List[str]] = None
    ) -> Union[Type[T], Callable[[Type[T]], Type[T]]]:
        """Registers an instance of a class under its lower-cased class name.

        Usable both as `@register` and as `@register(aliases=[...])`. Only
        usable on registries created with type_register=False.

        Args:
            cls: The class, when used as a bare decorator.
            aliases: Extra case-insensitive names resolving to the same entry.

        Returns:
            The class itself when `cls` is given, otherwise a decorator.
        """
        assert not self._type_register, ('register can only be used when '
                                         'type_register is False')

        def _register(cls: Type[T]) -> Type[T]:
            name = cls.__name__.lower()
            assert name not in self, f'{name} already registered'
            # The registry holds an instance, created with no arguments.
            self[name] = cls()

            for alias in aliases or []:
                alias = alias.lower()
                assert alias not in self._aliases, f'{alias} already registered'
                self._aliases[alias] = name
            return cls

        if cls is not None:
            # Invocation without parentheses (e.g. @register)
            return _register(cls)

        # Invocation with parentheses (e.g. @register(aliases=['alias']))
        return _register

    @property
    def default(self) -> str:
        """Name of the entry registered with default=True."""
        assert self._default is not None, ('default is not set', self)
        return self._default


# Backward compatibility: global_user_state's DB may still contain the
# 'Local' cloud, which has been removed from the registry.
# global_user_state.get_enabled_clouds() calls from_str() on recorded names
# and would fail on it, so 'local' is excluded (from_str() returns None).
CLOUD_REGISTRY: _Registry = _Registry['cloud.Cloud'](registry_name='cloud',
                                                     exclude={'local'})

BACKEND_REGISTRY: _Registry = _Registry['backend.Backend'](
    registry_name='backend', type_register=True, exclude=None)

JOBS_RECOVERY_STRATEGY_REGISTRY: _Registry = (
    _Registry['recovery_strategy.StrategyExecutor'](
        registry_name='jobs recovery strategy',
        exclude=None,
        type_register=True))
|