skypilot-nightly 1.0.0.dev20241109__py3-none-any.whl → 1.0.0.dev20241111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +0 -19
- sky/clouds/oci.py +11 -21
- sky/clouds/service_catalog/oci_catalog.py +1 -1
- sky/clouds/utils/oci_utils.py +16 -2
- sky/dag.py +19 -15
- sky/provision/__init__.py +1 -0
- sky/provision/docker_utils.py +1 -1
- sky/provision/kubernetes/instance.py +104 -102
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +430 -0
- sky/{skylet/providers/oci/query_helper.py → provision/oci/query_utils.py} +148 -59
- sky/serve/__init__.py +2 -0
- sky/serve/load_balancer.py +34 -8
- sky/serve/load_balancing_policies.py +23 -1
- sky/serve/service.py +4 -1
- sky/serve/service_spec.py +19 -0
- sky/setup_files/MANIFEST.in +0 -1
- sky/skylet/job_lib.py +29 -17
- sky/templates/kubernetes-ray.yml.j2 +21 -1
- sky/templates/oci-ray.yml.j2 +3 -53
- sky/utils/schemas.py +8 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/RECORD +29 -29
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/utils.py +0 -21
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '91323d86baaeb1341c6953e15bbf19f2896b67ad'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241111'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -3979,25 +3979,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3979
3979
|
stdout = ''
|
3980
3980
|
stderr = str(e)
|
3981
3981
|
|
3982
|
-
# Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
|
3983
|
-
# May, 2023 by Hysun: Allow terminate INIT cluster which may have
|
3984
|
-
# some instances provisioning in background but not completed.
|
3985
|
-
elif (isinstance(cloud, clouds.OCI) and terminate and
|
3986
|
-
prev_cluster_status in (status_lib.ClusterStatus.STOPPED,
|
3987
|
-
status_lib.ClusterStatus.INIT)):
|
3988
|
-
region = config['provider']['region']
|
3989
|
-
|
3990
|
-
# pylint: disable=import-outside-toplevel
|
3991
|
-
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
3992
|
-
|
3993
|
-
from sky.skylet.providers.oci.query_helper import oci_query_helper
|
3994
|
-
|
3995
|
-
# 0: All terminated successfully, failed count otherwise
|
3996
|
-
returncode = oci_query_helper.terminate_instances_by_tags(
|
3997
|
-
{TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}, region)
|
3998
|
-
|
3999
|
-
# To avoid undefined local variables error.
|
4000
|
-
stdout = stderr = ''
|
4001
3982
|
else:
|
4002
3983
|
config['provider']['cache_stopped_nodes'] = not terminate
|
4003
3984
|
with tempfile.NamedTemporaryFile('w',
|
sky/clouds/oci.py
CHANGED
@@ -31,6 +31,7 @@ from sky import status_lib
|
|
31
31
|
from sky.adaptors import oci as oci_adaptor
|
32
32
|
from sky.clouds import service_catalog
|
33
33
|
from sky.clouds.utils import oci_utils
|
34
|
+
from sky.provision.oci.query_utils import query_helper
|
34
35
|
from sky.utils import common_utils
|
35
36
|
from sky.utils import resources_utils
|
36
37
|
from sky.utils import ux_utils
|
@@ -60,6 +61,9 @@ class OCI(clouds.Cloud):
|
|
60
61
|
{resources_utils.DiskTier.ULTRA})
|
61
62
|
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH
|
62
63
|
|
64
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
65
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
66
|
+
|
63
67
|
@classmethod
|
64
68
|
def _unsupported_features_for_resources(
|
65
69
|
cls, resources: 'resources_lib.Resources'
|
@@ -433,7 +437,7 @@ class OCI(clouds.Cloud):
|
|
433
437
|
return True, None
|
434
438
|
except (oci_adaptor.oci.exceptions.ConfigFileNotFound,
|
435
439
|
oci_adaptor.oci.exceptions.InvalidConfig,
|
436
|
-
oci_adaptor.
|
440
|
+
oci_adaptor.oci.exceptions.ServiceError) as e:
|
437
441
|
return False, (
|
438
442
|
f'OCI credential is not correctly set. '
|
439
443
|
f'Check the credential file at {conf_file}\n'
|
@@ -597,25 +601,11 @@ class OCI(clouds.Cloud):
|
|
597
601
|
region: Optional[str], zone: Optional[str],
|
598
602
|
**kwargs) -> List[status_lib.ClusterStatus]:
|
599
603
|
del zone, kwargs # Unused.
|
600
|
-
# Check the lifecycleState definition from the page
|
601
|
-
# https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
|
602
|
-
status_map = {
|
603
|
-
'PROVISIONING': status_lib.ClusterStatus.INIT,
|
604
|
-
'STARTING': status_lib.ClusterStatus.INIT,
|
605
|
-
'RUNNING': status_lib.ClusterStatus.UP,
|
606
|
-
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
607
|
-
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
608
|
-
'TERMINATED': None,
|
609
|
-
'TERMINATING': None,
|
610
|
-
}
|
611
|
-
|
612
|
-
# pylint: disable=import-outside-toplevel
|
613
|
-
from sky.skylet.providers.oci.query_helper import oci_query_helper
|
614
604
|
|
615
605
|
status_list = []
|
616
606
|
try:
|
617
|
-
vms =
|
618
|
-
|
607
|
+
vms = query_helper.query_instances_by_tags(tag_filters=tag_filters,
|
608
|
+
region=region)
|
619
609
|
except Exception as e: # pylint: disable=broad-except
|
620
610
|
with ux_utils.print_exception_no_traceback():
|
621
611
|
raise exceptions.ClusterStatusFetchingError(
|
@@ -625,9 +615,9 @@ class OCI(clouds.Cloud):
|
|
625
615
|
|
626
616
|
for node in vms:
|
627
617
|
vm_status = node.lifecycle_state
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
618
|
+
sky_status = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY.get(
|
619
|
+
vm_status, None)
|
620
|
+
if sky_status is not None:
|
621
|
+
status_list.append(sky_status)
|
632
622
|
|
633
623
|
return status_list
|
@@ -66,7 +66,7 @@ def _get_df() -> 'pd.DataFrame':
|
|
66
66
|
logger.debug(f'It is OK goes here when testing: {str(e)}')
|
67
67
|
subscribed_regions = []
|
68
68
|
|
69
|
-
except oci_adaptor.
|
69
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
70
70
|
# Should never expect going here. However, we still catch
|
71
71
|
# it so that if any OCI call failed, the program can still
|
72
72
|
# proceed with try-and-error way.
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -5,13 +5,14 @@ History:
|
|
5
5
|
- Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
|
6
6
|
configuration.
|
7
7
|
"""
|
8
|
-
import logging
|
9
8
|
import os
|
10
9
|
|
10
|
+
from sky import sky_logging
|
11
11
|
from sky import skypilot_config
|
12
|
+
from sky import status_lib
|
12
13
|
from sky.utils import resources_utils
|
13
14
|
|
14
|
-
logger =
|
15
|
+
logger = sky_logging.init_logger(__name__)
|
15
16
|
|
16
17
|
|
17
18
|
class OCIConfig:
|
@@ -77,6 +78,19 @@ class OCIConfig:
|
|
77
78
|
resources_utils.DiskTier.HIGH: DISK_TIER_HIGH,
|
78
79
|
}
|
79
80
|
|
81
|
+
# Oracle instance's lifecycle state to sky state mapping.
|
82
|
+
# For Oracle VM instance's lifecyle state, please refer to the link:
|
83
|
+
# https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
|
84
|
+
STATE_MAPPING_OCI_TO_SKY = {
|
85
|
+
'PROVISIONING': status_lib.ClusterStatus.INIT,
|
86
|
+
'STARTING': status_lib.ClusterStatus.INIT,
|
87
|
+
'RUNNING': status_lib.ClusterStatus.UP,
|
88
|
+
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
89
|
+
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
90
|
+
'TERMINATED': None,
|
91
|
+
'TERMINATING': None,
|
92
|
+
}
|
93
|
+
|
80
94
|
@classmethod
|
81
95
|
def get_compartment(cls, region):
|
82
96
|
# Allow task(cluster)-specific compartment/VCN parameters.
|
sky/dag.py
CHANGED
@@ -56,21 +56,25 @@ class Dag:
|
|
56
56
|
return self.graph
|
57
57
|
|
58
58
|
def is_chain(self) -> bool:
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
59
|
+
"""Check if the DAG is a linear chain of tasks."""
|
60
|
+
|
61
|
+
nodes = list(self.graph.nodes)
|
62
|
+
|
63
|
+
if len(nodes) == 0:
|
64
|
+
return True
|
65
|
+
|
66
|
+
in_degrees = [self.graph.in_degree(node) for node in nodes]
|
67
|
+
out_degrees = [self.graph.out_degree(node) for node in nodes]
|
68
|
+
|
69
|
+
# Check out-degrees: all <= 1 and exactly one node has out_degree == 0
|
70
|
+
out_degree_condition = (all(degree <= 1 for degree in out_degrees) and
|
71
|
+
sum(degree == 0 for degree in out_degrees) == 1)
|
72
|
+
|
73
|
+
# Check in-degrees: all <= 1 and exactly one node has in_degree == 0
|
74
|
+
in_degree_condition = (all(degree <= 1 for degree in in_degrees) and
|
75
|
+
sum(degree == 0 for degree in in_degrees) == 1)
|
76
|
+
|
77
|
+
return out_degree_condition and in_degree_condition
|
74
78
|
|
75
79
|
|
76
80
|
class _DagContext(threading.local):
|
sky/provision/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from sky.provision import fluidstack
|
|
20
20
|
from sky.provision import gcp
|
21
21
|
from sky.provision import kubernetes
|
22
22
|
from sky.provision import lambda_cloud
|
23
|
+
from sky.provision import oci
|
23
24
|
from sky.provision import runpod
|
24
25
|
from sky.provision import vsphere
|
25
26
|
from sky.utils import command_runner
|
sky/provision/docker_utils.py
CHANGED
@@ -20,7 +20,7 @@ SETUP_ENV_VARS_CMD = (
|
|
20
20
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
21
21
|
'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
|
22
22
|
'~/container_env_var.sh && '
|
23
|
-
'$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
|
23
|
+
'$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
|
24
24
|
)
|
25
25
|
|
26
26
|
# Docker daemon may not be ready when the machine is firstly started. The error
|
@@ -333,52 +333,37 @@ def _run_function_with_retries(func: Callable,
|
|
333
333
|
raise
|
334
334
|
|
335
335
|
|
336
|
-
def
|
337
|
-
|
338
|
-
"""Setting environment variables in pods.
|
339
|
-
|
340
|
-
Once all containers are ready, we can exec into them and set env vars.
|
341
|
-
Kubernetes automatically populates containers with critical
|
342
|
-
environment variables, such as those for discovering services running
|
343
|
-
in the cluster and CUDA/nvidia environment variables. We need to
|
344
|
-
make sure these env vars are available in every task and ssh session.
|
345
|
-
This is needed for GPU support and service discovery.
|
346
|
-
See https://github.com/skypilot-org/skypilot/issues/2287 for
|
347
|
-
more details.
|
348
|
-
|
349
|
-
To do so, we capture env vars from the pod's runtime and write them to
|
350
|
-
/etc/profile.d/, making them available for all users in future
|
351
|
-
shell sessions.
|
352
|
-
"""
|
353
|
-
set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
|
336
|
+
def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
|
337
|
+
"""Pre-initialization step for SkyPilot pods.
|
354
338
|
|
355
|
-
|
356
|
-
|
357
|
-
logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
|
358
|
-
f'{"-"*20}')
|
359
|
-
runner = command_runner.KubernetesCommandRunner(
|
360
|
-
((namespace, context), pod_name))
|
339
|
+
This step is run in the pod right after it is created and before the
|
340
|
+
SkyPilot runtime is setup.
|
361
341
|
|
362
|
-
|
363
|
-
rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
|
364
|
-
require_outputs=True,
|
365
|
-
stream_logs=False)
|
366
|
-
_raise_command_running_error('set env vars', set_k8s_env_var_cmd,
|
367
|
-
pod_name, rc, stdout)
|
342
|
+
This step includes three key steps:
|
368
343
|
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
344
|
+
1. Privilege check: Checks if the default user has sufficient privilege
|
345
|
+
to set up the kubernetes instance pod.
|
346
|
+
2. SSH setup: Sets up SSH for the pod instance.
|
347
|
+
3. Environment variable setup to populate k8s env vars in the pod.
|
373
348
|
|
374
|
-
|
375
|
-
|
349
|
+
Make sure commands used in these methods are generic and work
|
350
|
+
on most base images. E.g., do not use Python, since that may not
|
351
|
+
be installed by default.
|
376
352
|
|
353
|
+
If you run any apt commands, be sure to check if the lock is available.
|
354
|
+
It is possible the `apt update` run in the pod container args may still
|
355
|
+
be running.
|
356
|
+
|
357
|
+
Args:
|
358
|
+
namespace (str): Kubernetes namespace.
|
359
|
+
context (Optional[str]): Kubernetes context.
|
360
|
+
new_nodes (List): List of new pod instances.
|
361
|
+
|
362
|
+
Raises:
|
363
|
+
config_lib.KubernetesError: If user privileges are insufficient or
|
364
|
+
setup fails.
|
365
|
+
"""
|
377
366
|
|
378
|
-
def _check_user_privilege(namespace: str, context: Optional[str],
|
379
|
-
new_nodes: List) -> None:
|
380
|
-
# Checks if the default user has sufficient privilege to set up
|
381
|
-
# the kubernetes instance pod.
|
382
367
|
check_k8s_user_sudo_cmd = (
|
383
368
|
'if [ $(id -u) -eq 0 ]; then'
|
384
369
|
# If user is root, create an alias for sudo used in skypilot setup
|
@@ -386,56 +371,67 @@ def _check_user_privilege(namespace: str, context: Optional[str],
|
|
386
371
|
'else '
|
387
372
|
' if command -v sudo >/dev/null 2>&1; then '
|
388
373
|
' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
|
389
|
-
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r};
|
374
|
+
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
|
375
|
+
f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
|
390
376
|
' else '
|
391
|
-
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r};
|
377
|
+
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
|
378
|
+
f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
|
392
379
|
' fi; '
|
393
|
-
'fi')
|
380
|
+
'fi;')
|
381
|
+
|
382
|
+
# Kubernetes automatically populates containers with critical
|
383
|
+
# environment variables, such as those for discovering services running
|
384
|
+
# in the cluster and CUDA/nvidia environment variables. We need to
|
385
|
+
# make sure these env vars are available in every task and ssh session.
|
386
|
+
# This is needed for GPU support and service discovery.
|
387
|
+
# See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
|
388
|
+
# To do so, we capture env vars from the pod's runtime and write them to
|
389
|
+
# /etc/profile.d/, making them available for all users in future
|
390
|
+
# shell sessions.
|
391
|
+
set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
|
394
392
|
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
393
|
+
check_apt_update_complete_cmd = (
|
394
|
+
'echo "Checking if apt update from container init is complete..."; '
|
395
|
+
'timeout_secs=600; '
|
396
|
+
'start_time=$(date +%s); '
|
397
|
+
'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
|
398
|
+
' echo "apt update still running. Logs:"; '
|
399
|
+
' cat /tmp/apt-update.log; '
|
400
|
+
' current_time=$(date +%s); '
|
401
|
+
' elapsed=$((current_time - start_time)); '
|
402
|
+
' if [ $elapsed -ge $timeout_secs ]; then '
|
403
|
+
' echo "Timed out waiting for apt update"; '
|
404
|
+
' exit 1; '
|
405
|
+
' fi; '
|
406
|
+
' sleep 5; '
|
407
|
+
'done; '
|
408
|
+
'echo "apt update complete."; ')
|
399
409
|
|
400
|
-
|
401
|
-
((namespace, context), pod_name))
|
402
|
-
logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
|
403
|
-
f'{"-"*20}')
|
404
|
-
|
405
|
-
def _run_privilege_check():
|
406
|
-
rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
|
407
|
-
require_outputs=True,
|
408
|
-
separate_stderr=True,
|
409
|
-
stream_logs=False)
|
410
|
-
_raise_command_running_error('check user privilege',
|
411
|
-
check_k8s_user_sudo_cmd, pod_name, rc,
|
412
|
-
stdout + stderr)
|
413
|
-
return stdout
|
414
|
-
|
415
|
-
stdout = _run_function_with_retries(
|
416
|
-
_run_privilege_check, f'check user privilege in pod {pod_name!r}')
|
417
|
-
|
418
|
-
if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
|
419
|
-
raise config_lib.KubernetesError(
|
420
|
-
'Insufficient system privileges detected. '
|
421
|
-
'Ensure the default user has root access or '
|
422
|
-
'"sudo" is installed and the user is added to the sudoers '
|
423
|
-
'from the image.')
|
424
|
-
logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
|
425
|
-
f'{"-"*20}')
|
426
|
-
|
427
|
-
|
428
|
-
def _setup_ssh_in_pods(namespace: str, context: Optional[str],
|
429
|
-
new_nodes: List) -> None:
|
430
|
-
# Setting up ssh for the pod instance. This is already setup for
|
431
|
-
# the jump pod so it does not need to be run for it.
|
432
|
-
set_k8s_ssh_cmd = (
|
433
|
-
'set -ex; '
|
410
|
+
install_ssh_k8s_cmd = (
|
434
411
|
'prefix_cmd() '
|
435
412
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
|
436
413
|
'export DEBIAN_FRONTEND=noninteractive;'
|
437
|
-
'
|
438
|
-
'
|
414
|
+
'echo "Installing missing packages..."; '
|
415
|
+
'for i in {1..5}; do '
|
416
|
+
' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
|
417
|
+
' rc=$?; '
|
418
|
+
' if [ $rc -eq 0 ]; then '
|
419
|
+
' break; '
|
420
|
+
' fi; '
|
421
|
+
' echo "$output" | grep -qi "could not get lock" || '
|
422
|
+
' grep -qi "Unable to acquire the dpkg frontend lock"; '
|
423
|
+
' if [ $? -eq 0 ]; then '
|
424
|
+
' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
|
425
|
+
' sleep 5; '
|
426
|
+
' else '
|
427
|
+
' echo "apt install failed for a non-lock reason: $output"; '
|
428
|
+
' exit $rc; '
|
429
|
+
' fi; '
|
430
|
+
'done; '
|
431
|
+
'if [ $rc -ne 0 ]; then '
|
432
|
+
' echo "apt install failed after 5 attempts due to lock errors."; '
|
433
|
+
' exit $rc; '
|
434
|
+
'fi; '
|
439
435
|
'$(prefix_cmd) mkdir -p /var/run/sshd; '
|
440
436
|
'$(prefix_cmd) '
|
441
437
|
'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
|
@@ -456,24 +452,35 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
|
|
456
452
|
# See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
|
457
453
|
'$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
|
458
454
|
|
459
|
-
|
455
|
+
pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
|
456
|
+
set_k8s_env_var_cmd + check_apt_update_complete_cmd +
|
457
|
+
install_ssh_k8s_cmd)
|
458
|
+
|
459
|
+
def _pre_init_thread(new_node):
|
460
460
|
pod_name = new_node.metadata.name
|
461
|
+
logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
|
461
462
|
runner = command_runner.KubernetesCommandRunner(
|
462
463
|
((namespace, context), pod_name))
|
463
|
-
logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
|
464
464
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
465
|
+
# Run the combined pre-init command
|
466
|
+
rc, stdout, _ = runner.run(pre_init_cmd,
|
467
|
+
require_outputs=True,
|
468
|
+
stream_logs=False)
|
469
|
+
if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
|
470
|
+
raise config_lib.KubernetesError(
|
471
|
+
'Insufficient system privileges detected. '
|
472
|
+
'Ensure the default user has root access or '
|
473
|
+
'"sudo" is installed and the user is added to the sudoers '
|
474
|
+
'from the image.')
|
475
|
+
|
476
|
+
op_name = 'pre-init'
|
477
|
+
_raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
|
478
|
+
stdout)
|
471
479
|
|
472
|
-
|
473
|
-
f'setup ssh in pod {pod_name!r}')
|
474
|
-
logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
|
480
|
+
logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
|
475
481
|
|
476
|
-
|
482
|
+
# Run pre_init in parallel across all new_nodes
|
483
|
+
subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
|
477
484
|
|
478
485
|
|
479
486
|
def _label_pod(namespace: str, context: Optional[str], pod_name: str,
|
@@ -724,13 +731,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
724
731
|
f'pods: {list(uninitialized_pods.keys())}')
|
725
732
|
uninitialized_pods_list = list(uninitialized_pods.values())
|
726
733
|
|
727
|
-
#
|
728
|
-
|
729
|
-
# on most base images. E.g., do not use Python, since that may not
|
730
|
-
# be installed by default.
|
731
|
-
_check_user_privilege(namespace, context, uninitialized_pods_list)
|
732
|
-
_setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
|
733
|
-
_set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
|
734
|
+
# Run pre-init steps in the pod.
|
735
|
+
pre_init(namespace, context, uninitialized_pods_list)
|
734
736
|
|
735
737
|
for pod in uninitialized_pods.values():
|
736
738
|
_label_pod(namespace,
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""OCI provisioner for SkyPilot.
|
2
|
+
|
3
|
+
History:
|
4
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
|
5
|
+
"""
|
6
|
+
|
7
|
+
from sky.provision.oci.config import bootstrap_instances
|
8
|
+
from sky.provision.oci.instance import cleanup_ports
|
9
|
+
from sky.provision.oci.instance import get_cluster_info
|
10
|
+
from sky.provision.oci.instance import open_ports
|
11
|
+
from sky.provision.oci.instance import query_instances
|
12
|
+
from sky.provision.oci.instance import run_instances
|
13
|
+
from sky.provision.oci.instance import stop_instances
|
14
|
+
from sky.provision.oci.instance import terminate_instances
|
15
|
+
from sky.provision.oci.instance import wait_instances
|
@@ -0,0 +1,51 @@
|
|
1
|
+
"""OCI configuration bootstrapping.
|
2
|
+
|
3
|
+
Creates the resource group and deploys the configuration template to OCI for
|
4
|
+
a cluster to be launched.
|
5
|
+
|
6
|
+
History:
|
7
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
|
8
|
+
"""
|
9
|
+
|
10
|
+
from sky import exceptions
|
11
|
+
from sky import sky_logging
|
12
|
+
from sky.adaptors import oci as oci_adaptor
|
13
|
+
from sky.clouds.utils import oci_utils
|
14
|
+
from sky.provision import common
|
15
|
+
from sky.provision.oci.query_utils import query_helper
|
16
|
+
|
17
|
+
logger = sky_logging.init_logger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
@common.log_function_start_end
|
21
|
+
def bootstrap_instances(
|
22
|
+
region: str, cluster_name_on_cloud: str,
|
23
|
+
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
24
|
+
"""See sky/provision/__init__.py"""
|
25
|
+
# OCI module import and oci client
|
26
|
+
oci_adaptor.get_core_client(region, oci_utils.oci_config.get_profile())
|
27
|
+
|
28
|
+
# Find / create a compartment for creating instances.
|
29
|
+
compartment = query_helper.find_compartment(region)
|
30
|
+
|
31
|
+
# Find the configured VCN, or create a new one.
|
32
|
+
vcn = query_helper.find_create_vcn_subnet(region)
|
33
|
+
if vcn is None:
|
34
|
+
# pylint: disable=line-too-long
|
35
|
+
raise exceptions.ResourcesUnavailableError(
|
36
|
+
'Failed to create a new VCN, possibly you hit the resource limitation.'
|
37
|
+
)
|
38
|
+
|
39
|
+
node_config = config.node_config
|
40
|
+
|
41
|
+
# Subscribe the image if it is from Marketplace listing.
|
42
|
+
query_helper.subscribe_image(
|
43
|
+
compartment_id=compartment,
|
44
|
+
listing_id=node_config['AppCatalogListingId'],
|
45
|
+
resource_version=node_config['ResourceVersion'],
|
46
|
+
region=region,
|
47
|
+
)
|
48
|
+
|
49
|
+
logger.info(f'Using cluster name: {cluster_name_on_cloud}')
|
50
|
+
|
51
|
+
return config
|