skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,383 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Helper class for some OCI operations methods which needs to be shared/called
|
3
|
-
by multiple places.
|
4
|
-
|
5
|
-
History:
|
6
|
-
- Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
|
7
|
-
|
8
|
-
"""
|
9
|
-
|
10
|
-
from datetime import datetime
|
11
|
-
import logging
|
12
|
-
import re
|
13
|
-
import time
|
14
|
-
import traceback
|
15
|
-
import typing
|
16
|
-
from typing import Optional
|
17
|
-
|
18
|
-
from sky.adaptors import common as adaptors_common
|
19
|
-
from sky.adaptors import oci as oci_adaptor
|
20
|
-
from sky.clouds.utils import oci_utils
|
21
|
-
from sky.skylet.providers.oci import utils
|
22
|
-
|
23
|
-
if typing.TYPE_CHECKING:
|
24
|
-
import pandas as pd
|
25
|
-
else:
|
26
|
-
pd = adaptors_common.LazyImport('pandas')
|
27
|
-
|
28
|
-
logger = logging.getLogger(__name__)
|
29
|
-
|
30
|
-
|
31
|
-
class oci_query_helper:
|
32
|
-
|
33
|
-
# Call Cloud API to try getting the satisfied nodes.
|
34
|
-
@classmethod
|
35
|
-
@utils.debug_enabled(logger=logger)
|
36
|
-
def query_instances_by_tags(cls, tag_filters, region):
|
37
|
-
|
38
|
-
where_clause_tags = ""
|
39
|
-
for tag_key in tag_filters:
|
40
|
-
if where_clause_tags != "":
|
41
|
-
where_clause_tags += " && "
|
42
|
-
|
43
|
-
tag_value = tag_filters[tag_key]
|
44
|
-
where_clause_tags += (f"(freeformTags.key = '{tag_key}'"
|
45
|
-
f" && freeformTags.value = '{tag_value}')")
|
46
|
-
|
47
|
-
qv_str = (f"query instance resources where {where_clause_tags}"
|
48
|
-
f" && (lifecycleState != 'TERMINATED'"
|
49
|
-
f" && lifecycleState != 'TERMINATING')")
|
50
|
-
|
51
|
-
qv = oci_adaptor.oci.resource_search.models.StructuredSearchDetails(
|
52
|
-
query=qv_str,
|
53
|
-
type="Structured",
|
54
|
-
matching_context_type=oci_adaptor.oci.resource_search.models.
|
55
|
-
SearchDetails.MATCHING_CONTEXT_TYPE_NONE,
|
56
|
-
)
|
57
|
-
|
58
|
-
list_instances_response = oci_adaptor.get_search_client(
|
59
|
-
region, oci_utils.oci_config.get_profile()).search_resources(qv)
|
60
|
-
result_set = list_instances_response.data.items
|
61
|
-
|
62
|
-
return result_set
|
63
|
-
|
64
|
-
@classmethod
|
65
|
-
def terminate_instances_by_tags(cls, tag_filters, region) -> int:
|
66
|
-
logger.debug(f"Terminate instance by tags: {tag_filters}")
|
67
|
-
insts = cls.query_instances_by_tags(tag_filters, region)
|
68
|
-
fail_count = 0
|
69
|
-
for inst in insts:
|
70
|
-
inst_id = inst.identifier
|
71
|
-
logger.debug(f"Got instance(to be terminated): {inst_id}")
|
72
|
-
|
73
|
-
try:
|
74
|
-
oci_adaptor.get_core_client(
|
75
|
-
region,
|
76
|
-
oci_utils.oci_config.get_profile()).terminate_instance(
|
77
|
-
inst_id)
|
78
|
-
except Exception as e:
|
79
|
-
fail_count += 1
|
80
|
-
logger.error(f"Terminate instance failed: {str(e)}\n: {inst}")
|
81
|
-
traceback.print_exc()
|
82
|
-
|
83
|
-
if fail_count == 0:
|
84
|
-
logger.debug(f"Instance teardown result: OK")
|
85
|
-
else:
|
86
|
-
logger.warn(f"Instance teardown result: {fail_count} failed!")
|
87
|
-
|
88
|
-
return fail_count
|
89
|
-
|
90
|
-
@classmethod
|
91
|
-
@utils.debug_enabled(logger=logger)
|
92
|
-
def subscribe_image(cls, compartment_id, listing_id, resource_version,
|
93
|
-
region):
|
94
|
-
if (pd.isna(listing_id) or listing_id.strip() == "None" or
|
95
|
-
listing_id.strip() == "nan"):
|
96
|
-
return
|
97
|
-
|
98
|
-
core_client = oci_adaptor.get_core_client(
|
99
|
-
region, oci_utils.oci_config.get_profile())
|
100
|
-
try:
|
101
|
-
agreements_response = core_client.get_app_catalog_listing_agreements(
|
102
|
-
listing_id=listing_id, resource_version=resource_version)
|
103
|
-
agreements = agreements_response.data
|
104
|
-
|
105
|
-
core_client.create_app_catalog_subscription(
|
106
|
-
create_app_catalog_subscription_details=oci_adaptor.oci.core.
|
107
|
-
models.CreateAppCatalogSubscriptionDetails(
|
108
|
-
compartment_id=compartment_id,
|
109
|
-
listing_id=listing_id,
|
110
|
-
listing_resource_version=agreements.
|
111
|
-
listing_resource_version,
|
112
|
-
oracle_terms_of_use_link=agreements.
|
113
|
-
oracle_terms_of_use_link,
|
114
|
-
time_retrieved=datetime.strptime(
|
115
|
-
re.sub(
|
116
|
-
"\d{3}\+\d{2}\:\d{2}",
|
117
|
-
"Z",
|
118
|
-
str(agreements.time_retrieved),
|
119
|
-
0,
|
120
|
-
),
|
121
|
-
"%Y-%m-%d %H:%M:%S.%fZ",
|
122
|
-
),
|
123
|
-
signature=agreements.signature,
|
124
|
-
eula_link=agreements.eula_link,
|
125
|
-
))
|
126
|
-
except Exception as e:
|
127
|
-
logger.critical(
|
128
|
-
f"subscribe_image: {listing_id} - {resource_version} ... [Failed]"
|
129
|
-
f"Error message: {str(e)}")
|
130
|
-
raise RuntimeError("ERR: Image subscription error!")
|
131
|
-
|
132
|
-
@classmethod
|
133
|
-
@utils.debug_enabled(logger=logger)
|
134
|
-
def find_compartment(cls, region) -> str:
|
135
|
-
""" If compartment is not configured, we use root compartment """
|
136
|
-
# Try to use the configured one first
|
137
|
-
skypilot_compartment = oci_utils.oci_config.get_compartment(region)
|
138
|
-
if skypilot_compartment is not None:
|
139
|
-
return skypilot_compartment
|
140
|
-
|
141
|
-
# If not specified, we try to find the one skypilot-compartment
|
142
|
-
# Pass-in a profile parameter so that multiple profile in oci
|
143
|
-
# config file is supported (2023/06/09).
|
144
|
-
root = oci_adaptor.get_oci_config(
|
145
|
-
region, oci_utils.oci_config.get_profile())['tenancy']
|
146
|
-
list_compartments_response = oci_adaptor.get_identity_client(
|
147
|
-
region, oci_utils.oci_config.get_profile()).list_compartments(
|
148
|
-
compartment_id=root,
|
149
|
-
name=oci_utils.oci_config.COMPARTMENT,
|
150
|
-
lifecycle_state='ACTIVE',
|
151
|
-
limit=1)
|
152
|
-
compartments = list_compartments_response.data
|
153
|
-
if len(compartments) > 0:
|
154
|
-
skypilot_compartment = compartments[0].id
|
155
|
-
return skypilot_compartment
|
156
|
-
|
157
|
-
# Finally, we use root compartment none matches above
|
158
|
-
skypilot_compartment = root
|
159
|
-
return skypilot_compartment
|
160
|
-
|
161
|
-
@classmethod
|
162
|
-
@utils.debug_enabled(logger=logger)
|
163
|
-
def find_create_vcn_subnet(cls, region) -> Optional[str]:
|
164
|
-
""" If sub is not configured, we find/create VCN skypilot_vcn """
|
165
|
-
subnet = oci_utils.oci_config.get_vcn_subnet(region)
|
166
|
-
if subnet is not None:
|
167
|
-
# User explicitly specified the subnet in sky config.
|
168
|
-
return subnet
|
169
|
-
|
170
|
-
# Try to reuse the skypilot_vcn.
|
171
|
-
net_client = oci_adaptor.get_net_client(
|
172
|
-
region, oci_utils.oci_config.get_profile())
|
173
|
-
skypilot_compartment = cls.find_compartment(region)
|
174
|
-
list_vcns_response = net_client.list_vcns(
|
175
|
-
compartment_id=skypilot_compartment,
|
176
|
-
display_name=oci_utils.oci_config.VCN_NAME,
|
177
|
-
lifecycle_state="AVAILABLE")
|
178
|
-
vcns = list_vcns_response.data
|
179
|
-
if len(vcns) > 0:
|
180
|
-
# Found the VCN.
|
181
|
-
skypilot_vcn = vcns[0].id
|
182
|
-
list_subnets_response = net_client.list_subnets(
|
183
|
-
compartment_id=skypilot_compartment,
|
184
|
-
limit=1,
|
185
|
-
vcn_id=skypilot_vcn,
|
186
|
-
display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
|
187
|
-
lifecycle_state="AVAILABLE")
|
188
|
-
logger.debug(f'Got VCN subnet \n{list_subnets_response.data}')
|
189
|
-
if len(list_subnets_response.data) < 1:
|
190
|
-
logger.error(
|
191
|
-
f'No subnet {oci_utils.oci_config.VCN_SUBNET_NAME} '
|
192
|
-
f'found in the VCN {oci_utils.oci_config.VCN_NAME}')
|
193
|
-
raise RuntimeError(
|
194
|
-
f'VcnSubnetNotFound Error: No subnet '
|
195
|
-
f'{oci_utils.oci_config.VCN_SUBNET_NAME} found in '
|
196
|
-
f'the VCN {oci_utils.oci_config.VCN_NAME}')
|
197
|
-
subnet = list_subnets_response.data[0].id
|
198
|
-
return subnet
|
199
|
-
else:
|
200
|
-
# Create the skypilot_vcn and related resources
|
201
|
-
return cls.create_vcn_subnet(net_client, skypilot_compartment)
|
202
|
-
|
203
|
-
@classmethod
|
204
|
-
@utils.debug_enabled(logger=logger)
|
205
|
-
def create_vcn_subnet(cls, net_client,
|
206
|
-
skypilot_compartment) -> Optional[str]:
|
207
|
-
try:
|
208
|
-
create_vcn_response = net_client.create_vcn(
|
209
|
-
create_vcn_details=oci_adaptor.oci.core.models.CreateVcnDetails(
|
210
|
-
compartment_id=skypilot_compartment,
|
211
|
-
cidr_blocks=[oci_utils.oci_config.VCN_CIDR],
|
212
|
-
display_name=oci_utils.oci_config.VCN_NAME,
|
213
|
-
is_ipv6_enabled=False,
|
214
|
-
dns_label=oci_utils.oci_config.VCN_DNS_LABEL))
|
215
|
-
vcn_data = create_vcn_response.data
|
216
|
-
logger.debug(f'Created VCN \n{vcn_data}')
|
217
|
-
skypilot_vcn = vcn_data.id
|
218
|
-
route_table = vcn_data.default_route_table_id
|
219
|
-
security_list = vcn_data.default_security_list_id
|
220
|
-
dhcp_options_id = vcn_data.default_dhcp_options_id
|
221
|
-
|
222
|
-
# Create internet gateway for internet access
|
223
|
-
create_ig_response = net_client.create_internet_gateway(
|
224
|
-
create_internet_gateway_details=oci_adaptor.oci.core.models.
|
225
|
-
CreateInternetGatewayDetails(
|
226
|
-
compartment_id=skypilot_compartment,
|
227
|
-
is_enabled=True,
|
228
|
-
vcn_id=skypilot_vcn,
|
229
|
-
display_name=oci_utils.oci_config.VCN_INTERNET_GATEWAY_NAME
|
230
|
-
))
|
231
|
-
logger.debug(
|
232
|
-
f'Created internet gateway \n{create_ig_response.data}')
|
233
|
-
ig = create_ig_response.data.id
|
234
|
-
|
235
|
-
# Create a public subnet.
|
236
|
-
create_subnet_response = net_client.create_subnet(
|
237
|
-
create_subnet_details=oci_adaptor.oci.core.models.
|
238
|
-
CreateSubnetDetails(
|
239
|
-
cidr_block=oci_utils.oci_config.VCN_SUBNET_CIDR,
|
240
|
-
compartment_id=skypilot_compartment,
|
241
|
-
vcn_id=skypilot_vcn,
|
242
|
-
dhcp_options_id=dhcp_options_id,
|
243
|
-
display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
|
244
|
-
prohibit_internet_ingress=False,
|
245
|
-
prohibit_public_ip_on_vnic=False,
|
246
|
-
route_table_id=route_table,
|
247
|
-
security_list_ids=[security_list]))
|
248
|
-
logger.debug(f'Created subnet \n{create_subnet_response.data}')
|
249
|
-
subnet = create_subnet_response.data.id
|
250
|
-
|
251
|
-
list_services_response = net_client.list_services(limit=100)
|
252
|
-
services = [
|
253
|
-
s for s in list_services_response.data
|
254
|
-
if str(s.cidr_block).startswith('all-') and str(s.cidr_block).
|
255
|
-
endswith('-services-in-oracle-services-network')
|
256
|
-
]
|
257
|
-
if len(services) > 0:
|
258
|
-
# Create service gateway for regional services.
|
259
|
-
create_sg_response = net_client.create_service_gateway(
|
260
|
-
create_service_gateway_details=oci_adaptor.oci.core.models.
|
261
|
-
CreateServiceGatewayDetails(
|
262
|
-
compartment_id=skypilot_compartment,
|
263
|
-
services=[
|
264
|
-
oci_adaptor.oci.core.models.ServiceIdRequestDetails(
|
265
|
-
service_id=services[0].id)
|
266
|
-
],
|
267
|
-
vcn_id=skypilot_vcn))
|
268
|
-
logger.debug(f'Service Gateway: \n{create_sg_response.data}')
|
269
|
-
sg = create_sg_response.data.id
|
270
|
-
|
271
|
-
# Update security list: Allow all traffic in the same subnet
|
272
|
-
update_security_list_response = net_client.update_security_list(
|
273
|
-
security_list_id=security_list,
|
274
|
-
update_security_list_details=oci_adaptor.oci.core.models.
|
275
|
-
UpdateSecurityListDetails(ingress_security_rules=[
|
276
|
-
oci_adaptor.oci.core.models.IngressSecurityRule(
|
277
|
-
protocol="6",
|
278
|
-
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
279
|
-
is_stateless=False,
|
280
|
-
source_type="CIDR_BLOCK",
|
281
|
-
tcp_options=oci_adaptor.oci.core.models.TcpOptions(
|
282
|
-
destination_port_range=oci_adaptor.oci.core.models.
|
283
|
-
PortRange(max=22, min=22),
|
284
|
-
source_port_range=oci_adaptor.oci.core.models.
|
285
|
-
PortRange(max=65535, min=1)),
|
286
|
-
description="Allow SSH port."),
|
287
|
-
oci_adaptor.oci.core.models.IngressSecurityRule(
|
288
|
-
protocol="all",
|
289
|
-
source=oci_utils.oci_config.VCN_SUBNET_CIDR,
|
290
|
-
is_stateless=False,
|
291
|
-
source_type="CIDR_BLOCK",
|
292
|
-
description="Allow all traffic from/to same subnet."),
|
293
|
-
oci_adaptor.oci.core.models.IngressSecurityRule(
|
294
|
-
protocol="1",
|
295
|
-
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
296
|
-
is_stateless=False,
|
297
|
-
source_type="CIDR_BLOCK",
|
298
|
-
icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
|
299
|
-
type=3, code=4),
|
300
|
-
description="ICMP traffic."),
|
301
|
-
oci_adaptor.oci.core.models.IngressSecurityRule(
|
302
|
-
protocol="1",
|
303
|
-
source=oci_utils.oci_config.VCN_CIDR,
|
304
|
-
is_stateless=False,
|
305
|
-
source_type="CIDR_BLOCK",
|
306
|
-
icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
|
307
|
-
type=3),
|
308
|
-
description="ICMP traffic (VCN)."),
|
309
|
-
]))
|
310
|
-
logger.debug(
|
311
|
-
f'Updated security_list: \n{update_security_list_response.data}'
|
312
|
-
)
|
313
|
-
|
314
|
-
# Update route table: bind to the internet gateway
|
315
|
-
update_route_table_response = net_client.update_route_table(
|
316
|
-
rt_id=route_table,
|
317
|
-
update_route_table_details=oci_adaptor.oci.core.models.
|
318
|
-
UpdateRouteTableDetails(route_rules=[
|
319
|
-
oci_adaptor.oci.core.models.RouteRule(
|
320
|
-
network_entity_id=create_ig_response.data.id,
|
321
|
-
destination='0.0.0.0/0',
|
322
|
-
destination_type='CIDR_BLOCK',
|
323
|
-
description='Route table for SkyPilot VCN',
|
324
|
-
route_type='STATIC')
|
325
|
-
]))
|
326
|
-
logger.debug(f'Route table: \n{update_route_table_response.data}')
|
327
|
-
|
328
|
-
except oci_adaptor.service_exception() as e:
|
329
|
-
logger.error(f'Create VCN Error: Create new VCN '
|
330
|
-
f'{oci_utils.oci_config.VCN_NAME} failed: {str(e)}')
|
331
|
-
# In case of partial success while creating vcn
|
332
|
-
cls.delete_vcn(net_client, skypilot_vcn, subnet, ig, sg)
|
333
|
-
subnet = None
|
334
|
-
|
335
|
-
return subnet
|
336
|
-
|
337
|
-
@classmethod
|
338
|
-
@utils.debug_enabled(logger=logger)
|
339
|
-
def delete_vcn(cls, net_client, skypilot_vcn, skypilot_subnet,
|
340
|
-
internet_gateway, service_gateway):
|
341
|
-
if skypilot_vcn is None:
|
342
|
-
return # Nothing to delete
|
343
|
-
try:
|
344
|
-
if internet_gateway is not None:
|
345
|
-
# Delete internet gateway
|
346
|
-
delete_ig_response = net_client.delete_internet_gateway(
|
347
|
-
ig_id=internet_gateway)
|
348
|
-
logger.debug(f'Deleted internet gateway {internet_gateway}'
|
349
|
-
f'-{delete_ig_response.data}')
|
350
|
-
if service_gateway is not None:
|
351
|
-
# Delete service gateway
|
352
|
-
delete_sg_response = net_client.delete_service_gateway(
|
353
|
-
service_gateway_id=service_gateway)
|
354
|
-
logger.debug(f'Deleted service gateway {service_gateway}'
|
355
|
-
f'-{delete_sg_response.data}')
|
356
|
-
if skypilot_subnet is not None:
|
357
|
-
# Delete subnet
|
358
|
-
delete_subnet_response = net_client.delete_subnet(
|
359
|
-
subnet_id=skypilot_subnet)
|
360
|
-
logger.debug(f'Deleted subnet {skypilot_subnet}'
|
361
|
-
f'-{delete_subnet_response.data}')
|
362
|
-
# Delete vcn
|
363
|
-
retry_count = 0
|
364
|
-
while retry_count < oci_utils.oci_config.MAX_RETRY_COUNT:
|
365
|
-
try:
|
366
|
-
delete_vcn_response = net_client.delete_vcn(
|
367
|
-
vcn_id=skypilot_vcn)
|
368
|
-
logger.debug(
|
369
|
-
f'Deleted vcn {skypilot_vcn}-{delete_vcn_response.data}'
|
370
|
-
)
|
371
|
-
break
|
372
|
-
except oci_adaptor.service_exception() as e:
|
373
|
-
logger.info(f'Waiting del SG/IG/Subnet finish: {str(e)}')
|
374
|
-
retry_count = retry_count + 1
|
375
|
-
if retry_count == oci_utils.oci_config.MAX_RETRY_COUNT:
|
376
|
-
raise e
|
377
|
-
else:
|
378
|
-
time.sleep(
|
379
|
-
oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS)
|
380
|
-
|
381
|
-
except oci_adaptor.service_exception() as e:
|
382
|
-
logger.error(
|
383
|
-
f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
|
@@ -1,21 +0,0 @@
|
|
1
|
-
from datetime import datetime
|
2
|
-
import functools
|
3
|
-
from logging import Logger
|
4
|
-
|
5
|
-
|
6
|
-
def debug_enabled(logger: Logger):
|
7
|
-
|
8
|
-
def decorate(f):
|
9
|
-
|
10
|
-
@functools.wraps(f)
|
11
|
-
def wrapper(*args, **kwargs):
|
12
|
-
dt_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
13
|
-
logger.debug(f"{dt_str} Enter {f}, {args}, {kwargs}")
|
14
|
-
try:
|
15
|
-
return f(*args, **kwargs)
|
16
|
-
finally:
|
17
|
-
logger.debug(f"{dt_str} Exit {f}")
|
18
|
-
|
19
|
-
return wrapper
|
20
|
-
|
21
|
-
return decorate
|
sky/utils/cluster_yaml_utils.py
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
"""Utility functions for cluster yaml file."""
|
2
|
-
|
3
|
-
import re
|
4
|
-
|
5
|
-
# The cluster yaml used to create the current cluster where the module is
|
6
|
-
# called.
|
7
|
-
SKY_CLUSTER_YAML_REMOTE_PATH = '~/.sky/sky_ray.yml'
|
8
|
-
|
9
|
-
|
10
|
-
def get_provider_name(config: dict) -> str:
|
11
|
-
"""Return the name of the provider."""
|
12
|
-
|
13
|
-
provider_module = config['provider']['module']
|
14
|
-
# Examples:
|
15
|
-
# 'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws'
|
16
|
-
# 'sky.provision.aws' -> 'aws'
|
17
|
-
provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?',
|
18
|
-
provider_module)
|
19
|
-
assert provider_search is not None, config
|
20
|
-
provider_name = provider_search.group(1).lower()
|
21
|
-
# Special handling for lambda_cloud as Lambda cloud is registered as lambda.
|
22
|
-
if provider_name == 'lambda_cloud':
|
23
|
-
provider_name = 'lambda'
|
24
|
-
return provider_name
|
@@ -1,137 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# This script creates a new k8s Service Account and generates a kubeconfig with
|
3
|
-
# its credentials. This Service Account has all the necessary permissions for
|
4
|
-
# SkyPilot. The kubeconfig is written in the current directory.
|
5
|
-
#
|
6
|
-
# You must configure your local kubectl to point to the right k8s cluster and
|
7
|
-
# have admin-level access.
|
8
|
-
#
|
9
|
-
# Note: all of the k8s resources are created in namespace "default". If you
|
10
|
-
# delete any of these objects, SkyPilot will stop working.
|
11
|
-
#
|
12
|
-
# You can override the default namespace "default" using the
|
13
|
-
# SKYPILOT_NAMESPACE environment variable.
|
14
|
-
# You can override the default service account name "skypilot-sa" using the
|
15
|
-
# SKYPILOT_SA_NAME environment variable.
|
16
|
-
|
17
|
-
set -eu -o pipefail
|
18
|
-
|
19
|
-
# Allow passing in common name and username in environment. If not provided,
|
20
|
-
# use default.
|
21
|
-
SKYPILOT_SA=${SKYPILOT_SA_NAME:-skypilot-sa}
|
22
|
-
NAMESPACE=${SKYPILOT_NAMESPACE:-default}
|
23
|
-
|
24
|
-
# Set OS specific values.
|
25
|
-
if [[ "$OSTYPE" == "linux-gnu" ]]; then
|
26
|
-
BASE64_DECODE_FLAG="-d"
|
27
|
-
elif [[ "$OSTYPE" == "darwin"* ]]; then
|
28
|
-
BASE64_DECODE_FLAG="-D"
|
29
|
-
elif [[ "$OSTYPE" == "linux-musl" ]]; then
|
30
|
-
BASE64_DECODE_FLAG="-d"
|
31
|
-
else
|
32
|
-
echo "Unknown OS ${OSTYPE}"
|
33
|
-
exit 1
|
34
|
-
fi
|
35
|
-
|
36
|
-
echo "Creating the Kubernetes Service Account with minimal RBAC permissions."
|
37
|
-
kubectl apply -f - <<EOF
|
38
|
-
apiVersion: v1
|
39
|
-
kind: Namespace
|
40
|
-
metadata:
|
41
|
-
name: ${NAMESPACE}
|
42
|
-
---
|
43
|
-
apiVersion: v1
|
44
|
-
kind: ServiceAccount
|
45
|
-
metadata:
|
46
|
-
name: ${SKYPILOT_SA}
|
47
|
-
namespace: ${NAMESPACE}
|
48
|
-
---
|
49
|
-
apiVersion: rbac.authorization.k8s.io/v1
|
50
|
-
kind: ClusterRole
|
51
|
-
metadata:
|
52
|
-
name: skypilot-role
|
53
|
-
rules:
|
54
|
-
- apiGroups: ["*"]
|
55
|
-
resources: ["*"]
|
56
|
-
verbs: ["*"]
|
57
|
-
---
|
58
|
-
apiVersion: rbac.authorization.k8s.io/v1
|
59
|
-
kind: ClusterRoleBinding
|
60
|
-
metadata:
|
61
|
-
name: skypilot-crb
|
62
|
-
roleRef:
|
63
|
-
apiGroup: rbac.authorization.k8s.io
|
64
|
-
kind: ClusterRole
|
65
|
-
name: skypilot-role
|
66
|
-
subjects:
|
67
|
-
- kind: ServiceAccount
|
68
|
-
name: ${SKYPILOT_SA}
|
69
|
-
namespace: ${NAMESPACE}
|
70
|
-
EOF
|
71
|
-
|
72
|
-
# Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
|
73
|
-
# version below 1.24; otherwise one must manually create the secret and bind it to the Service account to have a non-expiring token.
|
74
|
-
# After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
|
75
|
-
# We can use kubectl create token, but the token has an expiration time.
|
76
|
-
# https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.24.md#urgent-upgrade-notes
|
77
|
-
SA_SECRET_NAME=$(kubectl get -n ${NAMESPACE} sa/${SKYPILOT_SA} -o "jsonpath={.secrets[0]..name}")
|
78
|
-
if [ -z $SA_SECRET_NAME ]
|
79
|
-
then
|
80
|
-
# Create the secret and bind it to the desired SA
|
81
|
-
kubectl apply -f - <<EOF
|
82
|
-
apiVersion: v1
|
83
|
-
kind: Secret
|
84
|
-
type: kubernetes.io/service-account-token
|
85
|
-
metadata:
|
86
|
-
name: ${SKYPILOT_SA}
|
87
|
-
namespace: ${NAMESPACE}
|
88
|
-
annotations:
|
89
|
-
kubernetes.io/service-account.name: "${SKYPILOT_SA}"
|
90
|
-
EOF
|
91
|
-
|
92
|
-
SA_SECRET_NAME=${SKYPILOT_SA}
|
93
|
-
fi
|
94
|
-
|
95
|
-
# Note: service account token is stored base64-encoded in the secret but must
|
96
|
-
# be plaintext in kubeconfig.
|
97
|
-
SA_TOKEN=$(kubectl get -n ${NAMESPACE} secrets/${SA_SECRET_NAME} -o "jsonpath={.data['token']}" | base64 ${BASE64_DECODE_FLAG})
|
98
|
-
CA_CERT=$(kubectl get -n ${NAMESPACE} secrets/${SA_SECRET_NAME} -o "jsonpath={.data['ca\.crt']}")
|
99
|
-
|
100
|
-
# Extract cluster IP from the current context
|
101
|
-
CURRENT_CONTEXT=$(kubectl config current-context)
|
102
|
-
CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\"})].context.cluster}")
|
103
|
-
CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\"})].cluster.server}")
|
104
|
-
|
105
|
-
echo "Writing kubeconfig."
|
106
|
-
cat > kubeconfig <<EOF
|
107
|
-
apiVersion: v1
|
108
|
-
clusters:
|
109
|
-
- cluster:
|
110
|
-
certificate-authority-data: ${CA_CERT}
|
111
|
-
server: ${CURRENT_CLUSTER_ADDR}
|
112
|
-
name: ${CURRENT_CLUSTER}
|
113
|
-
contexts:
|
114
|
-
- context:
|
115
|
-
cluster: ${CURRENT_CLUSTER}
|
116
|
-
user: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
|
117
|
-
name: ${CURRENT_CONTEXT}
|
118
|
-
current-context: ${CURRENT_CONTEXT}
|
119
|
-
kind: Config
|
120
|
-
preferences: {}
|
121
|
-
users:
|
122
|
-
- name: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
|
123
|
-
user:
|
124
|
-
token: ${SA_TOKEN}
|
125
|
-
EOF
|
126
|
-
|
127
|
-
echo "---
|
128
|
-
Done!
|
129
|
-
|
130
|
-
Copy the generated kubeconfig file to your SkyPilot Proxy server, and set the
|
131
|
-
kubeconfig_file parameter in your skypilot.yaml config file to point to this
|
132
|
-
kubeconfig file.
|
133
|
-
|
134
|
-
If you need access to multiple kubernetes clusters, you can generate additional
|
135
|
-
kubeconfig files using this script and then merge them using merge-kubeconfigs.sh.
|
136
|
-
|
137
|
-
Note: Kubernetes RBAC rules for SkyPilot were created, you won't need to create them manually."
|