skypilot-nightly 1.0.0.dev20241108__py3-none-any.whl → 1.0.0.dev20241110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +6 -21
- sky/backends/wheel_utils.py +5 -1
- sky/cli.py +25 -1
- sky/clouds/oci.py +11 -21
- sky/clouds/service_catalog/oci_catalog.py +1 -1
- sky/clouds/utils/oci_utils.py +16 -2
- sky/core.py +3 -2
- sky/dag.py +20 -15
- sky/data/mounting_utils.py +4 -16
- sky/exceptions.py +4 -1
- sky/execution.py +10 -8
- sky/jobs/core.py +3 -1
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -5
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +430 -0
- sky/{skylet/providers/oci/query_helper.py → provision/oci/query_utils.py} +148 -59
- sky/serve/core.py +11 -1
- sky/setup_files/MANIFEST.in +0 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +39 -20
- sky/skylet/log_lib.py +77 -8
- sky/templates/kubernetes-ray.yml.j2 +3 -1
- sky/templates/oci-ray.yml.j2 +3 -53
- sky/utils/admin_policy_utils.py +1 -0
- sky/utils/command_runner.py +14 -2
- sky/utils/control_master_utils.py +49 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/RECORD +35 -34
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/utils.py +0 -21
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'dddd65187953a5d6b32f762bea78eed1f109ec3c'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241108'
|
38
|
+
__version__ = '1.0.0.dev20241110'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -3711,7 +3711,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3711
3711
|
handle: CloudVmRayResourceHandle,
|
3712
3712
|
job_id: Optional[int],
|
3713
3713
|
managed_job_id: Optional[int] = None,
|
3714
|
-
follow: bool = True) -> int:
|
3714
|
+
follow: bool = True,
|
3715
|
+
tail: int = 0) -> int:
|
3715
3716
|
"""Tail the logs of a job.
|
3716
3717
|
|
3717
3718
|
Args:
|
@@ -3719,10 +3720,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3719
3720
|
job_id: The job ID to tail the logs of.
|
3720
3721
|
managed_job_id: The managed job ID for display purpose only.
|
3721
3722
|
follow: Whether to follow the logs.
|
3723
|
+
tail: The number of lines to display from the end of the
|
3724
|
+
log file. If 0, print all lines.
|
3722
3725
|
"""
|
3723
3726
|
code = job_lib.JobLibCodeGen.tail_logs(job_id,
|
3724
3727
|
managed_job_id=managed_job_id,
|
3725
|
-
follow=follow)
|
3728
|
+
follow=follow,
|
3729
|
+
tail=tail)
|
3726
3730
|
if job_id is None and managed_job_id is None:
|
3727
3731
|
logger.info(
|
3728
3732
|
'Job ID not provided. Streaming the logs of the latest job.')
|
@@ -3975,25 +3979,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3975
3979
|
stdout = ''
|
3976
3980
|
stderr = str(e)
|
3977
3981
|
|
3978
|
-
# Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
|
3979
|
-
# May, 2023 by Hysun: Allow terminate INIT cluster which may have
|
3980
|
-
# some instances provisioning in background but not completed.
|
3981
|
-
elif (isinstance(cloud, clouds.OCI) and terminate and
|
3982
|
-
prev_cluster_status in (status_lib.ClusterStatus.STOPPED,
|
3983
|
-
status_lib.ClusterStatus.INIT)):
|
3984
|
-
region = config['provider']['region']
|
3985
|
-
|
3986
|
-
# pylint: disable=import-outside-toplevel
|
3987
|
-
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
3988
|
-
|
3989
|
-
from sky.skylet.providers.oci.query_helper import oci_query_helper
|
3990
|
-
|
3991
|
-
# 0: All terminated successfully, failed count otherwise
|
3992
|
-
returncode = oci_query_helper.terminate_instances_by_tags(
|
3993
|
-
{TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}, region)
|
3994
|
-
|
3995
|
-
# To avoid undefined local variables error.
|
3996
|
-
stdout = stderr = ''
|
3997
3982
|
else:
|
3998
3983
|
config['provider']['cache_stopped_nodes'] = not terminate
|
3999
3984
|
with tempfile.NamedTemporaryFile('w',
|
sky/backends/wheel_utils.py
CHANGED
@@ -129,7 +129,11 @@ def _build_sky_wheel() -> pathlib.Path:
|
|
129
129
|
|
130
130
|
wheel_dir = WHEEL_DIR / hash_of_latest_wheel
|
131
131
|
wheel_dir.mkdir(parents=True, exist_ok=True)
|
132
|
-
shutil.move
|
132
|
+
# shutil.move will fail when the file already exists and is being
|
133
|
+
# moved across filesystems.
|
134
|
+
if not os.path.exists(
|
135
|
+
os.path.join(wheel_dir, os.path.basename(wheel_path))):
|
136
|
+
shutil.move(str(wheel_path), wheel_dir)
|
133
137
|
return wheel_dir / wheel_path.name
|
134
138
|
|
135
139
|
|
sky/cli.py
CHANGED
@@ -46,6 +46,7 @@ from rich import progress as rich_progress
|
|
46
46
|
import yaml
|
47
47
|
|
48
48
|
import sky
|
49
|
+
from sky import admin_policy
|
49
50
|
from sky import backends
|
50
51
|
from sky import check as sky_check
|
51
52
|
from sky import clouds as sky_clouds
|
@@ -67,6 +68,7 @@ from sky.skylet import constants
|
|
67
68
|
from sky.skylet import job_lib
|
68
69
|
from sky.skylet import log_lib
|
69
70
|
from sky.usage import usage_lib
|
71
|
+
from sky.utils import admin_policy_utils
|
70
72
|
from sky.utils import common_utils
|
71
73
|
from sky.utils import controller_utils
|
72
74
|
from sky.utils import dag_utils
|
@@ -582,6 +584,15 @@ def _launch_with_confirm(
|
|
582
584
|
with ux_utils.print_exception_no_traceback():
|
583
585
|
raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
|
584
586
|
f'{colorama.Style.RESET_ALL}') from e
|
587
|
+
dag, _ = admin_policy_utils.apply(
|
588
|
+
dag,
|
589
|
+
request_options=admin_policy.RequestOptions(
|
590
|
+
cluster_name=cluster,
|
591
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
592
|
+
down=down,
|
593
|
+
dryrun=dryrun,
|
594
|
+
),
|
595
|
+
)
|
585
596
|
dag = sky.optimize(dag)
|
586
597
|
task = dag.tasks[0]
|
587
598
|
|
@@ -2011,6 +2022,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
2011
2022
|
help=('Follow the logs of a job. '
|
2012
2023
|
'If --no-follow is specified, print the log so far and exit. '
|
2013
2024
|
'[default: --follow]'))
|
2025
|
+
@click.option(
|
2026
|
+
'--tail',
|
2027
|
+
default=0,
|
2028
|
+
type=int,
|
2029
|
+
help=('The number of lines to display from the end of the log file. '
|
2030
|
+
'Default is 0, which means print all lines.'))
|
2014
2031
|
@click.argument('cluster',
|
2015
2032
|
required=True,
|
2016
2033
|
type=str,
|
@@ -2024,6 +2041,7 @@ def logs(
|
|
2024
2041
|
sync_down: bool,
|
2025
2042
|
status: bool, # pylint: disable=redefined-outer-name
|
2026
2043
|
follow: bool,
|
2044
|
+
tail: int,
|
2027
2045
|
):
|
2028
2046
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2029
2047
|
"""Tail the log of a job.
|
@@ -2090,7 +2108,7 @@ def logs(
|
|
2090
2108
|
click.secho(f'Job {id_str}not found', fg='red')
|
2091
2109
|
sys.exit(1)
|
2092
2110
|
|
2093
|
-
core.tail_logs(cluster, job_id, follow)
|
2111
|
+
core.tail_logs(cluster, job_id, follow, tail)
|
2094
2112
|
|
2095
2113
|
|
2096
2114
|
@cli.command()
|
@@ -3667,6 +3685,8 @@ def jobs_launch(
|
|
3667
3685
|
|
3668
3686
|
click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
|
3669
3687
|
fg='cyan')
|
3688
|
+
dag, _ = admin_policy_utils.apply(
|
3689
|
+
dag, use_mutated_config_in_current_request=False)
|
3670
3690
|
dag = sky.optimize(dag)
|
3671
3691
|
|
3672
3692
|
if not yes:
|
@@ -4145,6 +4165,8 @@ def serve_up(
|
|
4145
4165
|
fg='cyan')
|
4146
4166
|
with sky.Dag() as dag:
|
4147
4167
|
dag.add(task)
|
4168
|
+
dag, _ = admin_policy_utils.apply(
|
4169
|
+
dag, use_mutated_config_in_current_request=False)
|
4148
4170
|
sky.optimize(dag)
|
4149
4171
|
|
4150
4172
|
if not yes:
|
@@ -4261,6 +4283,8 @@ def serve_update(
|
|
4261
4283
|
fg='cyan')
|
4262
4284
|
with sky.Dag() as dag:
|
4263
4285
|
dag.add(task)
|
4286
|
+
dag, _ = admin_policy_utils.apply(
|
4287
|
+
dag, use_mutated_config_in_current_request=False)
|
4264
4288
|
sky.optimize(dag)
|
4265
4289
|
|
4266
4290
|
if not yes:
|
sky/clouds/oci.py
CHANGED
@@ -31,6 +31,7 @@ from sky import status_lib
|
|
31
31
|
from sky.adaptors import oci as oci_adaptor
|
32
32
|
from sky.clouds import service_catalog
|
33
33
|
from sky.clouds.utils import oci_utils
|
34
|
+
from sky.provision.oci.query_utils import query_helper
|
34
35
|
from sky.utils import common_utils
|
35
36
|
from sky.utils import resources_utils
|
36
37
|
from sky.utils import ux_utils
|
@@ -60,6 +61,9 @@ class OCI(clouds.Cloud):
|
|
60
61
|
{resources_utils.DiskTier.ULTRA})
|
61
62
|
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH
|
62
63
|
|
64
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
65
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
66
|
+
|
63
67
|
@classmethod
|
64
68
|
def _unsupported_features_for_resources(
|
65
69
|
cls, resources: 'resources_lib.Resources'
|
@@ -433,7 +437,7 @@ class OCI(clouds.Cloud):
|
|
433
437
|
return True, None
|
434
438
|
except (oci_adaptor.oci.exceptions.ConfigFileNotFound,
|
435
439
|
oci_adaptor.oci.exceptions.InvalidConfig,
|
436
|
-
oci_adaptor.
|
440
|
+
oci_adaptor.oci.exceptions.ServiceError) as e:
|
437
441
|
return False, (
|
438
442
|
f'OCI credential is not correctly set. '
|
439
443
|
f'Check the credential file at {conf_file}\n'
|
@@ -597,25 +601,11 @@ class OCI(clouds.Cloud):
|
|
597
601
|
region: Optional[str], zone: Optional[str],
|
598
602
|
**kwargs) -> List[status_lib.ClusterStatus]:
|
599
603
|
del zone, kwargs # Unused.
|
600
|
-
# Check the lifecycleState definition from the page
|
601
|
-
# https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
|
602
|
-
status_map = {
|
603
|
-
'PROVISIONING': status_lib.ClusterStatus.INIT,
|
604
|
-
'STARTING': status_lib.ClusterStatus.INIT,
|
605
|
-
'RUNNING': status_lib.ClusterStatus.UP,
|
606
|
-
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
607
|
-
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
608
|
-
'TERMINATED': None,
|
609
|
-
'TERMINATING': None,
|
610
|
-
}
|
611
|
-
|
612
|
-
# pylint: disable=import-outside-toplevel
|
613
|
-
from sky.skylet.providers.oci.query_helper import oci_query_helper
|
614
604
|
|
615
605
|
status_list = []
|
616
606
|
try:
|
617
|
-
vms =
|
618
|
-
|
607
|
+
vms = query_helper.query_instances_by_tags(tag_filters=tag_filters,
|
608
|
+
region=region)
|
619
609
|
except Exception as e: # pylint: disable=broad-except
|
620
610
|
with ux_utils.print_exception_no_traceback():
|
621
611
|
raise exceptions.ClusterStatusFetchingError(
|
@@ -625,9 +615,9 @@ class OCI(clouds.Cloud):
|
|
625
615
|
|
626
616
|
for node in vms:
|
627
617
|
vm_status = node.lifecycle_state
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
618
|
+
sky_status = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY.get(
|
619
|
+
vm_status, None)
|
620
|
+
if sky_status is not None:
|
621
|
+
status_list.append(sky_status)
|
632
622
|
|
633
623
|
return status_list
|
@@ -66,7 +66,7 @@ def _get_df() -> 'pd.DataFrame':
|
|
66
66
|
logger.debug(f'It is OK goes here when testing: {str(e)}')
|
67
67
|
subscribed_regions = []
|
68
68
|
|
69
|
-
except oci_adaptor.
|
69
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
70
70
|
# Should never expect going here. However, we still catch
|
71
71
|
# it so that if any OCI call failed, the program can still
|
72
72
|
# proceed in a trial-and-error way.
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -5,13 +5,14 @@ History:
|
|
5
5
|
- Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
|
6
6
|
configuration.
|
7
7
|
"""
|
8
|
-
import logging
|
9
8
|
import os
|
10
9
|
|
10
|
+
from sky import sky_logging
|
11
11
|
from sky import skypilot_config
|
12
|
+
from sky import status_lib
|
12
13
|
from sky.utils import resources_utils
|
13
14
|
|
14
|
-
logger =
|
15
|
+
logger = sky_logging.init_logger(__name__)
|
15
16
|
|
16
17
|
|
17
18
|
class OCIConfig:
|
@@ -77,6 +78,19 @@ class OCIConfig:
|
|
77
78
|
resources_utils.DiskTier.HIGH: DISK_TIER_HIGH,
|
78
79
|
}
|
79
80
|
|
81
|
+
# Oracle instance's lifecycle state to sky state mapping.
|
82
|
+
# For Oracle VM instance's lifecycle state, please refer to the link:
|
83
|
+
# https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
|
84
|
+
STATE_MAPPING_OCI_TO_SKY = {
|
85
|
+
'PROVISIONING': status_lib.ClusterStatus.INIT,
|
86
|
+
'STARTING': status_lib.ClusterStatus.INIT,
|
87
|
+
'RUNNING': status_lib.ClusterStatus.UP,
|
88
|
+
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
89
|
+
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
90
|
+
'TERMINATED': None,
|
91
|
+
'TERMINATING': None,
|
92
|
+
}
|
93
|
+
|
80
94
|
@classmethod
|
81
95
|
def get_compartment(cls, region):
|
82
96
|
# Allow task(cluster)-specific compartment/VCN parameters.
|
sky/core.py
CHANGED
@@ -742,7 +742,8 @@ def cancel(
|
|
742
742
|
@usage_lib.entrypoint
|
743
743
|
def tail_logs(cluster_name: str,
|
744
744
|
job_id: Optional[int],
|
745
|
-
follow: bool = True) -> None:
|
745
|
+
follow: bool = True,
|
746
|
+
tail: int = 0) -> None:
|
746
747
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
747
748
|
"""Tail the logs of a job.
|
748
749
|
|
@@ -775,7 +776,7 @@ def tail_logs(cluster_name: str,
|
|
775
776
|
f'{colorama.Style.RESET_ALL}')
|
776
777
|
|
777
778
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
778
|
-
backend.tail_logs(handle, job_id, follow=follow)
|
779
|
+
backend.tail_logs(handle, job_id, follow=follow, tail=tail)
|
779
780
|
|
780
781
|
|
781
782
|
@usage_lib.entrypoint
|
sky/dag.py
CHANGED
@@ -23,6 +23,7 @@ class Dag:
|
|
23
23
|
|
24
24
|
self.graph = nx.DiGraph()
|
25
25
|
self.name: Optional[str] = None
|
26
|
+
self.policy_applied: bool = False
|
26
27
|
|
27
28
|
def add(self, task: 'task.Task') -> None:
|
28
29
|
self.graph.add_node(task)
|
@@ -55,21 +56,25 @@ class Dag:
|
|
55
56
|
return self.graph
|
56
57
|
|
57
58
|
def is_chain(self) -> bool:
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
59
|
+
"""Check if the DAG is a linear chain of tasks."""
|
60
|
+
|
61
|
+
nodes = list(self.graph.nodes)
|
62
|
+
|
63
|
+
if len(nodes) == 0:
|
64
|
+
return True
|
65
|
+
|
66
|
+
in_degrees = [self.graph.in_degree(node) for node in nodes]
|
67
|
+
out_degrees = [self.graph.out_degree(node) for node in nodes]
|
68
|
+
|
69
|
+
# Check out-degrees: all <= 1 and exactly one node has out_degree == 0
|
70
|
+
out_degree_condition = (all(degree <= 1 for degree in out_degrees) and
|
71
|
+
sum(degree == 0 for degree in out_degrees) == 1)
|
72
|
+
|
73
|
+
# Check in-degrees: all <= 1 and exactly one node has in_degree == 0
|
74
|
+
in_degree_condition = (all(degree <= 1 for degree in in_degrees) and
|
75
|
+
sum(degree == 0 for degree in in_degrees) == 1)
|
76
|
+
|
77
|
+
return out_degree_condition and in_degree_condition
|
73
78
|
|
74
79
|
|
75
80
|
class _DagContext(threading.local):
|
sky/data/mounting_utils.py
CHANGED
@@ -276,23 +276,11 @@ def get_mounting_command(
|
|
276
276
|
script = get_mounting_script(mount_path, mount_cmd, install_cmd,
|
277
277
|
version_check_cmd)
|
278
278
|
|
279
|
-
# TODO(romilb): Get direct bash script to work like so:
|
280
|
-
# command = f'bash <<-\EOL' \
|
281
|
-
# f'{script}' \
|
282
|
-
# 'EOL'
|
283
|
-
|
284
|
-
# TODO(romilb): This heredoc should have EOF after script, but it
|
285
|
-
# fails with sky's ssh pipeline. Instead, we don't use EOF and use )
|
286
|
-
# as the end of heredoc. This raises a warning (here-document delimited
|
287
|
-
# by end-of-file) that can be safely ignored.
|
288
|
-
|
289
279
|
# While these commands are run sequentially for each storage object,
|
290
280
|
# we add random int to be on the safer side and avoid collisions.
|
291
281
|
script_path = f'~/.sky/mount_{random.randint(0, 1000000)}.sh'
|
292
|
-
|
293
|
-
|
294
|
-
f'{
|
295
|
-
f'
|
296
|
-
f' && bash {script_path}'
|
297
|
-
f' && rm {script_path}')
|
282
|
+
command = (f'echo {shlex.quote(script)} > {script_path} && '
|
283
|
+
f'chmod +x {script_path} && '
|
284
|
+
f'bash {script_path} && '
|
285
|
+
f'rm {script_path}')
|
298
286
|
return command
|
sky/exceptions.py
CHANGED
@@ -3,6 +3,8 @@ import enum
|
|
3
3
|
import typing
|
4
4
|
from typing import List, Optional, Sequence
|
5
5
|
|
6
|
+
from sky.utils import env_options
|
7
|
+
|
6
8
|
if typing.TYPE_CHECKING:
|
7
9
|
from sky import status_lib
|
8
10
|
from sky.backends import backend
|
@@ -104,7 +106,8 @@ class CommandError(Exception):
|
|
104
106
|
if not command:
|
105
107
|
message = error_msg
|
106
108
|
else:
|
107
|
-
if len(command) > 100:
|
109
|
+
if (len(command) > 100 and
|
110
|
+
not env_options.Options.SHOW_DEBUG_INFO.get()):
|
108
111
|
# Truncate the command to avoid overflow.
|
109
112
|
command = command[:100] + '...'
|
110
113
|
message = (f'Command {command} failed with return code '
|
sky/execution.py
CHANGED
@@ -160,14 +160,16 @@ def _execute(
|
|
160
160
|
"""
|
161
161
|
|
162
162
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
163
|
-
|
164
|
-
dag,
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
163
|
+
if not dag.policy_applied:
|
164
|
+
dag, _ = admin_policy_utils.apply(
|
165
|
+
dag,
|
166
|
+
request_options=admin_policy.RequestOptions(
|
167
|
+
cluster_name=cluster_name,
|
168
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
169
|
+
down=down,
|
170
|
+
dryrun=dryrun,
|
171
|
+
),
|
172
|
+
)
|
171
173
|
assert len(dag) == 1, f'We support 1 task for now. {dag}'
|
172
174
|
task = dag.tasks[0]
|
173
175
|
|
sky/jobs/core.py
CHANGED
@@ -59,8 +59,10 @@ def launch(
|
|
59
59
|
"""
|
60
60
|
entrypoint = task
|
61
61
|
dag_uuid = str(uuid.uuid4().hex[:4])
|
62
|
-
|
63
62
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
63
|
+
# Always apply the policy again here, even though it might have been applied
|
64
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
65
|
+
# and get the mutated config.
|
64
66
|
dag, mutated_user_config = admin_policy_utils.apply(
|
65
67
|
dag, use_mutated_config_in_current_request=False)
|
66
68
|
if not dag.is_chain():
|
sky/provision/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from sky.provision import fluidstack
|
|
20
20
|
from sky.provision import gcp
|
21
21
|
from sky.provision import kubernetes
|
22
22
|
from sky.provision import lambda_cloud
|
23
|
+
from sky.provision import oci
|
23
24
|
from sky.provision import runpod
|
24
25
|
from sky.provision import vsphere
|
25
26
|
from sky.utils import command_runner
|
sky/provision/aws/config.py
CHANGED
@@ -42,8 +42,9 @@ def _skypilot_log_error_and_exit_for_failover(error: str) -> None:
|
|
42
42
|
Mainly used for handling VPC/subnet errors before nodes are launched.
|
43
43
|
"""
|
44
44
|
# NOTE: keep. The backend looks for this to know no nodes are launched.
|
45
|
-
|
46
|
-
|
45
|
+
full_error = f'SKYPILOT_ERROR_NO_NODES_LAUNCHED: {error}'
|
46
|
+
logger.error(full_error)
|
47
|
+
raise RuntimeError(full_error)
|
47
48
|
|
48
49
|
|
49
50
|
def bootstrap_instances(
|
@@ -222,10 +223,27 @@ def _configure_iam_role(iam) -> Dict[str, Any]:
|
|
222
223
|
|
223
224
|
|
224
225
|
@functools.lru_cache(maxsize=128) # Keep bounded.
|
225
|
-
def _get_route_tables(ec2, vpc_id: Optional[str],
|
226
|
+
def _get_route_tables(ec2, vpc_id: Optional[str], region: str,
|
227
|
+
main: bool) -> List[Any]:
|
228
|
+
"""Get route tables associated with a VPC and region
|
229
|
+
|
230
|
+
Args:
|
231
|
+
ec2: ec2 resource object
|
232
|
+
vpc_id: vpc_id is optional, if not provided, all route tables in the
|
233
|
+
region will be returned
|
234
|
+
region: region is mandatory to allow the lru cache
|
235
|
+
to return the correct results
|
236
|
+
main: if True, only main route tables will be returned otherwise
|
237
|
+
only non-main route tables will be returned
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
A list of route tables associated with the options VPC and region
|
241
|
+
"""
|
226
242
|
filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
|
227
243
|
if vpc_id is not None:
|
228
244
|
filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
|
245
|
+
logger.debug(
|
246
|
+
f'Getting route tables with filters: {filters} in region: {region}')
|
229
247
|
return ec2.meta.client.describe_route_tables(Filters=filters).get(
|
230
248
|
'RouteTables', [])
|
231
249
|
|
@@ -238,7 +256,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
|
|
238
256
|
https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Internet_Gateway.html
|
239
257
|
"""
|
240
258
|
# Get the route tables associated with the subnet
|
241
|
-
|
259
|
+
region = ec2.meta.client.meta.region_name
|
260
|
+
all_route_tables = _get_route_tables(ec2, vpc_id, region, main=False)
|
242
261
|
route_tables = [
|
243
262
|
rt for rt in all_route_tables
|
244
263
|
# An RT can be associated with multiple subnets, i.e.,
|
@@ -267,7 +286,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
|
|
267
286
|
# subnets. Since the associations are implicit, the filter above won't find
|
268
287
|
# any. Check there exists a main route table with routes pointing to an IGW.
|
269
288
|
logger.debug('Checking main route table')
|
270
|
-
|
289
|
+
region = ec2.meta.client.meta.region_name
|
290
|
+
main_route_tables = _get_route_tables(ec2, vpc_id, region, main=True)
|
271
291
|
return _has_igw_route(main_route_tables)
|
272
292
|
|
273
293
|
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""OCI provisioner for SkyPilot.
|
2
|
+
|
3
|
+
History:
|
4
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
|
5
|
+
"""
|
6
|
+
|
7
|
+
from sky.provision.oci.config import bootstrap_instances
|
8
|
+
from sky.provision.oci.instance import cleanup_ports
|
9
|
+
from sky.provision.oci.instance import get_cluster_info
|
10
|
+
from sky.provision.oci.instance import open_ports
|
11
|
+
from sky.provision.oci.instance import query_instances
|
12
|
+
from sky.provision.oci.instance import run_instances
|
13
|
+
from sky.provision.oci.instance import stop_instances
|
14
|
+
from sky.provision.oci.instance import terminate_instances
|
15
|
+
from sky.provision.oci.instance import wait_instances
|
@@ -0,0 +1,51 @@
|
|
1
|
+
"""OCI configuration bootstrapping.
|
2
|
+
|
3
|
+
Creates the resource group and deploys the configuration template to OCI for
|
4
|
+
a cluster to be launched.
|
5
|
+
|
6
|
+
History:
|
7
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
|
8
|
+
"""
|
9
|
+
|
10
|
+
from sky import exceptions
|
11
|
+
from sky import sky_logging
|
12
|
+
from sky.adaptors import oci as oci_adaptor
|
13
|
+
from sky.clouds.utils import oci_utils
|
14
|
+
from sky.provision import common
|
15
|
+
from sky.provision.oci.query_utils import query_helper
|
16
|
+
|
17
|
+
logger = sky_logging.init_logger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
@common.log_function_start_end
|
21
|
+
def bootstrap_instances(
|
22
|
+
region: str, cluster_name_on_cloud: str,
|
23
|
+
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
24
|
+
"""See sky/provision/__init__.py"""
|
25
|
+
# OCI module import and oci client
|
26
|
+
oci_adaptor.get_core_client(region, oci_utils.oci_config.get_profile())
|
27
|
+
|
28
|
+
# Find / create a compartment for creating instances.
|
29
|
+
compartment = query_helper.find_compartment(region)
|
30
|
+
|
31
|
+
# Find the configured VCN, or create a new one.
|
32
|
+
vcn = query_helper.find_create_vcn_subnet(region)
|
33
|
+
if vcn is None:
|
34
|
+
# pylint: disable=line-too-long
|
35
|
+
raise exceptions.ResourcesUnavailableError(
|
36
|
+
'Failed to create a new VCN, possibly you hit the resource limitation.'
|
37
|
+
)
|
38
|
+
|
39
|
+
node_config = config.node_config
|
40
|
+
|
41
|
+
# Subscribe the image if it is from Marketplace listing.
|
42
|
+
query_helper.subscribe_image(
|
43
|
+
compartment_id=compartment,
|
44
|
+
listing_id=node_config['AppCatalogListingId'],
|
45
|
+
resource_version=node_config['ResourceVersion'],
|
46
|
+
region=region,
|
47
|
+
)
|
48
|
+
|
49
|
+
logger.info(f'Using cluster name: {cluster_name_on_cloud}')
|
50
|
+
|
51
|
+
return config
|