skypilot-nightly 1.0.0.dev20241114__py3-none-any.whl → 1.0.0.dev20241115__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +7 -4
- sky/backends/cloud_vm_ray_backend.py +14 -10
- sky/clouds/oci.py +0 -2
- sky/clouds/utils/oci_utils.py +5 -0
- sky/execution.py +37 -22
- sky/jobs/core.py +0 -1
- sky/jobs/utils.py +4 -3
- sky/provision/oci/instance.py +12 -11
- sky/provision/oci/query_utils.py +212 -6
- sky/serve/core.py +1 -0
- sky/serve/serve_utils.py +35 -30
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +249 -138
- sky/skylet/log_lib.py +1 -34
- sky/skylet/subprocess_daemon.py +33 -13
- sky/utils/controller_utils.py +10 -9
- sky/utils/subprocess_utils.py +50 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241115.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241115.dist-info}/RECORD +24 -24
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241115.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241115.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241115.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241115.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'a404e3fc9bee7f0865f4118cfdd158de2b51ee28'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241115'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/backends/backend_utils.py
CHANGED
@@ -2604,15 +2604,18 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
|
|
2604
2604
|
pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
|
2605
2605
|
r'attribute \'(.*)\'')
|
2606
2606
|
if returncode != 0:
|
2607
|
+
# TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
|
2608
|
+
# the remote cluster. Remove this after 0.10.0 is released.
|
2607
2609
|
attribute_error = re.findall(pattern, stderr)
|
2608
|
-
if attribute_error:
|
2610
|
+
if attribute_error or 'SkyPilot runtime is too old' in stderr:
|
2609
2611
|
with ux_utils.print_exception_no_traceback():
|
2610
2612
|
raise RuntimeError(
|
2611
2613
|
f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
|
2612
|
-
'on the remote cluster. To update, run
|
2613
|
-
|
2614
|
+
f'on the remote cluster: {cluster_name}. To update, run '
|
2615
|
+
'(existing jobs will not be interrupted): '
|
2616
|
+
f'{colorama.Style.BRIGHT}sky start -f -y '
|
2614
2617
|
f'{cluster_name}{colorama.Style.RESET_ALL}'
|
2615
|
-
f'\n--- Details ---\n{stderr.strip()}\n')
|
2618
|
+
f'\n--- Details ---\n{stderr.strip()}\n') from None
|
2616
2619
|
|
2617
2620
|
|
2618
2621
|
def get_endpoints(cluster: str,
|
@@ -276,6 +276,7 @@ class RayCodeGen:
|
|
276
276
|
from sky.skylet import constants
|
277
277
|
from sky.skylet import job_lib
|
278
278
|
from sky.utils import log_utils
|
279
|
+
from sky.utils import subprocess_utils
|
279
280
|
|
280
281
|
SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
|
281
282
|
|
@@ -3275,14 +3276,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3275
3276
|
encoded_script = shlex.quote(codegen)
|
3276
3277
|
create_script_code = (f'{{ echo {encoded_script} > {script_path}; }}')
|
3277
3278
|
job_submit_cmd = (
|
3278
|
-
|
3279
|
-
|
3280
|
-
'
|
3281
|
-
f'
|
3282
|
-
f'"{constants.SKY_PYTHON_CMD} -u {script_path} '
|
3279
|
+
# JOB_CMD_IDENTIFIER is used for identifying the process retrieved
|
3280
|
+
# with pid is the same driver process.
|
3281
|
+
f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
|
3282
|
+
f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
|
3283
3283
|
# Do not use &>, which is not POSIX and may not work.
|
3284
3284
|
# Note that the order of ">filename 2>&1" matters.
|
3285
|
-
f'> {remote_log_path} 2>&1
|
3285
|
+
f'> {remote_log_path} 2>&1')
|
3286
3286
|
|
3287
3287
|
code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
|
3288
3288
|
job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
|
@@ -3330,6 +3330,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3330
3330
|
job_submit_cmd,
|
3331
3331
|
stream_logs=False,
|
3332
3332
|
require_outputs=True)
|
3333
|
+
# Happens when someone calls `sky exec` but remote is outdated for
|
3334
|
+
# running a job. Necessitating calling `sky launch`.
|
3335
|
+
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
3336
|
+
handle.cluster_name)
|
3333
3337
|
if returncode == 255 and 'too long' in stdout + stderr:
|
3334
3338
|
# If the generated script is too long, we retry it with dumping
|
3335
3339
|
# the script to a file and running it with SSH. We use a general
|
@@ -3344,10 +3348,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3344
3348
|
stream_logs=False,
|
3345
3349
|
require_outputs=True)
|
3346
3350
|
|
3347
|
-
# Happens when someone calls `sky exec` but remote is outdated
|
3348
|
-
# necessitating calling `sky launch`.
|
3349
|
-
backend_utils.check_stale_runtime_on_remote(returncode, stdout,
|
3350
|
-
handle.cluster_name)
|
3351
3351
|
subprocess_utils.handle_returncode(returncode,
|
3352
3352
|
job_submit_cmd,
|
3353
3353
|
f'Failed to submit job {job_id}.',
|
@@ -3417,6 +3417,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3417
3417
|
stream_logs=False,
|
3418
3418
|
require_outputs=True,
|
3419
3419
|
separate_stderr=True)
|
3420
|
+
# Happens when someone calls `sky exec` but remote is outdated for
|
3421
|
+
# adding a job. Necessitating calling `sky launch`.
|
3422
|
+
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
3423
|
+
handle.cluster_name)
|
3420
3424
|
# TODO(zhwu): this sometimes will unexpectedly fail, we can add
|
3421
3425
|
# retry for this, after we figure out the reason.
|
3422
3426
|
subprocess_utils.handle_returncode(returncode, code,
|
sky/clouds/oci.py
CHANGED
@@ -75,8 +75,6 @@ class OCI(clouds.Cloud):
|
|
75
75
|
(f'Docker image is currently not supported on {cls._REPR}. '
|
76
76
|
'You can try running docker command inside the '
|
77
77
|
'`run` section in task.yaml.'),
|
78
|
-
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
79
|
-
(f'Opening ports is currently not supported on {cls._REPR}.'),
|
80
78
|
}
|
81
79
|
if resources.use_spot:
|
82
80
|
features[clouds.CloudImplementationFeatures.STOP] = (
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -4,6 +4,8 @@ History:
|
|
4
4
|
- Zhanghao Wu @ Oct 2023: Formatting and refactoring
|
5
5
|
- Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
|
6
6
|
configuration.
|
7
|
+
- Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant
|
8
|
+
SERVICE_PORT_RULE_TAG
|
7
9
|
"""
|
8
10
|
import os
|
9
11
|
|
@@ -42,6 +44,9 @@ class OCIConfig:
|
|
42
44
|
VCN_CIDR_INTERNET = '0.0.0.0/0'
|
43
45
|
VCN_CIDR = '192.168.0.0/16'
|
44
46
|
VCN_SUBNET_CIDR = '192.168.0.0/18'
|
47
|
+
SERVICE_PORT_RULE_TAG = 'SkyServe-Service-Port'
|
48
|
+
# NSG name template
|
49
|
+
NSG_NAME_TEMPLATE = 'nsg_{cluster_name}'
|
45
50
|
|
46
51
|
MAX_RETRY_COUNT = 3
|
47
52
|
RETRY_INTERVAL_BASE_SECONDS = 5
|
sky/execution.py
CHANGED
@@ -11,10 +11,10 @@ import sky
|
|
11
11
|
from sky import admin_policy
|
12
12
|
from sky import backends
|
13
13
|
from sky import clouds
|
14
|
-
from sky import exceptions
|
15
14
|
from sky import global_user_state
|
16
15
|
from sky import optimizer
|
17
16
|
from sky import sky_logging
|
17
|
+
from sky import status_lib
|
18
18
|
from sky.backends import backend_utils
|
19
19
|
from sky.usage import usage_lib
|
20
20
|
from sky.utils import admin_policy_utils
|
@@ -463,28 +463,43 @@ def launch(
|
|
463
463
|
stages = None
|
464
464
|
# Check if cluster exists and we are doing fast provisioning
|
465
465
|
if fast and cluster_name is not None:
|
466
|
-
maybe_handle =
|
467
|
-
cluster_name)
|
468
|
-
if
|
469
|
-
|
470
|
-
|
471
|
-
|
466
|
+
cluster_status, maybe_handle = (
|
467
|
+
backend_utils.refresh_cluster_status_handle(cluster_name))
|
468
|
+
if cluster_status == status_lib.ClusterStatus.INIT:
|
469
|
+
# If the cluster is INIT, it may be provisioning. We want to prevent
|
470
|
+
# concurrent calls from queueing up many sequential reprovision
|
471
|
+
# attempts. Since provisioning will hold the cluster status lock, we
|
472
|
+
# wait to hold that lock by force refreshing the status. This will
|
473
|
+
# block until the cluster finishes provisioning, then correctly see
|
474
|
+
# that it is UP.
|
475
|
+
# TODO(cooperc): If multiple processes launched in parallel see that
|
476
|
+
# the cluster is STOPPED or does not exist, they will still all try
|
477
|
+
# to provision it, since we do not hold the lock continuously from
|
478
|
+
# the status check until the provision call. Fixing this requires a
|
479
|
+
# bigger refactor.
|
480
|
+
cluster_status, maybe_handle = (
|
481
|
+
backend_utils.refresh_cluster_status_handle(
|
472
482
|
cluster_name,
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
483
|
+
force_refresh_statuses=[
|
484
|
+
# If the cluster is INIT, we want to try to grab the
|
485
|
+
# status lock, which should block until provisioning is
|
486
|
+
# finished.
|
487
|
+
status_lib.ClusterStatus.INIT,
|
488
|
+
],
|
489
|
+
# Wait indefinitely to obtain the lock, so that we don't
|
490
|
+
# have multiple processes launching the same cluster at
|
491
|
+
# once.
|
492
|
+
cluster_status_lock_timeout=-1,
|
493
|
+
))
|
494
|
+
if cluster_status == status_lib.ClusterStatus.UP:
|
495
|
+
handle = maybe_handle
|
496
|
+
stages = [
|
497
|
+
Stage.SYNC_WORKDIR,
|
498
|
+
Stage.SYNC_FILE_MOUNTS,
|
499
|
+
Stage.PRE_EXEC,
|
500
|
+
Stage.EXEC,
|
501
|
+
Stage.DOWN,
|
502
|
+
]
|
488
503
|
|
489
504
|
return _execute(
|
490
505
|
entrypoint=entrypoint,
|
sky/jobs/core.py
CHANGED
sky/jobs/utils.py
CHANGED
@@ -85,7 +85,8 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
|
|
85
85
|
cluster_name: str) -> Optional['job_lib.JobStatus']:
|
86
86
|
"""Check the status of the job running on a managed job cluster.
|
87
87
|
|
88
|
-
It can be None, INIT, RUNNING, SUCCEEDED, FAILED,
|
88
|
+
It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
|
89
|
+
FAILED_SETUP or CANCELLED.
|
89
90
|
"""
|
90
91
|
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
|
91
92
|
assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
|
@@ -866,7 +867,7 @@ class ManagedJobCodeGen:
|
|
866
867
|
code += inspect.getsource(stream_logs)
|
867
868
|
code += textwrap.dedent(f"""\
|
868
869
|
|
869
|
-
msg = stream_logs({job_id!r}, {job_name!r},
|
870
|
+
msg = stream_logs({job_id!r}, {job_name!r},
|
870
871
|
follow={follow}, controller={controller})
|
871
872
|
print(msg, flush=True)
|
872
873
|
""")
|
@@ -883,7 +884,7 @@ class ManagedJobCodeGen:
|
|
883
884
|
resources_str = backend_utils.get_task_resources_str(
|
884
885
|
task, is_managed_job=True)
|
885
886
|
code += textwrap.dedent(f"""\
|
886
|
-
managed_job_state.set_pending({job_id}, {task_id},
|
887
|
+
managed_job_state.set_pending({job_id}, {task_id},
|
887
888
|
{task.name!r}, {resources_str!r})
|
888
889
|
""")
|
889
890
|
return cls._build(code)
|
sky/provision/oci/instance.py
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
History:
|
4
4
|
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
|
5
|
+
- Hysun He (hysun.he@oracle.com) @ Nov.13, 2024: Implement open_ports
|
6
|
+
and cleanup_ports for supporting SkyServe.
|
5
7
|
"""
|
6
8
|
|
7
9
|
import copy
|
@@ -292,11 +294,11 @@ def open_ports(
|
|
292
294
|
provider_config: Optional[Dict[str, Any]] = None,
|
293
295
|
) -> None:
|
294
296
|
"""Open ports for inbound traffic."""
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
297
|
+
assert provider_config is not None, cluster_name_on_cloud
|
298
|
+
region = provider_config['region']
|
299
|
+
query_helper.create_nsg_rules(region=region,
|
300
|
+
cluster_name=cluster_name_on_cloud,
|
301
|
+
ports=ports)
|
300
302
|
|
301
303
|
|
302
304
|
@query_utils.debug_enabled(logger)
|
@@ -306,12 +308,11 @@ def cleanup_ports(
|
|
306
308
|
provider_config: Optional[Dict[str, Any]] = None,
|
307
309
|
) -> None:
|
308
310
|
"""Delete any opened ports."""
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
# to delete the VCN or not from OCI console, for example.
|
311
|
+
assert provider_config is not None, cluster_name_on_cloud
|
312
|
+
region = provider_config['region']
|
313
|
+
del ports
|
314
|
+
query_helper.remove_cluster_nsg(region=region,
|
315
|
+
cluster_name=cluster_name_on_cloud)
|
315
316
|
|
316
317
|
|
317
318
|
@query_utils.debug_enabled(logger)
|
sky/provision/oci/query_utils.py
CHANGED
@@ -5,6 +5,8 @@ History:
|
|
5
5
|
migrated from the old provisioning API.
|
6
6
|
- Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
|
7
7
|
find_compartment: allow search subtree when find a compartment.
|
8
|
+
- Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
|
9
|
+
Add/remove security rules: create_nsg_rules & remove_nsg
|
8
10
|
"""
|
9
11
|
from datetime import datetime
|
10
12
|
import functools
|
@@ -13,12 +15,15 @@ import re
|
|
13
15
|
import time
|
14
16
|
import traceback
|
15
17
|
import typing
|
16
|
-
from typing import Optional
|
18
|
+
from typing import List, Optional, Tuple
|
17
19
|
|
20
|
+
from sky import exceptions
|
18
21
|
from sky import sky_logging
|
19
22
|
from sky.adaptors import common as adaptors_common
|
20
23
|
from sky.adaptors import oci as oci_adaptor
|
21
24
|
from sky.clouds.utils import oci_utils
|
25
|
+
from sky.provision import constants
|
26
|
+
from sky.utils import resources_utils
|
22
27
|
|
23
28
|
if typing.TYPE_CHECKING:
|
24
29
|
import pandas as pd
|
@@ -81,19 +86,33 @@ class QueryHelper:
|
|
81
86
|
return result_set
|
82
87
|
|
83
88
|
@classmethod
|
89
|
+
@debug_enabled(logger)
|
84
90
|
def terminate_instances_by_tags(cls, tag_filters, region) -> int:
|
85
91
|
logger.debug(f'Terminate instance by tags: {tag_filters}')
|
92
|
+
|
93
|
+
cluster_name = tag_filters[constants.TAG_RAY_CLUSTER_NAME]
|
94
|
+
nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
|
95
|
+
cluster_name=cluster_name)
|
96
|
+
nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
|
97
|
+
|
98
|
+
core_client = oci_adaptor.get_core_client(
|
99
|
+
region, oci_utils.oci_config.get_profile())
|
100
|
+
|
86
101
|
insts = cls.query_instances_by_tags(tag_filters, region)
|
87
102
|
fail_count = 0
|
88
103
|
for inst in insts:
|
89
104
|
inst_id = inst.identifier
|
90
|
-
logger.debug(f'
|
105
|
+
logger.debug(f'Terminating instance {inst_id}')
|
91
106
|
|
92
107
|
try:
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
108
|
+
# Release the NSG reference so that the NSG can be
|
109
|
+
# deleted without waiting for the instance to be terminated.
|
110
|
+
if nsg_id is not None:
|
111
|
+
cls.detach_nsg(region, inst, nsg_id)
|
112
|
+
|
113
|
+
# Terminate the instance
|
114
|
+
core_client.terminate_instance(inst_id)
|
115
|
+
|
97
116
|
except oci_adaptor.oci.exceptions.ServiceError as e:
|
98
117
|
fail_count += 1
|
99
118
|
logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
|
@@ -468,5 +487,192 @@ class QueryHelper:
|
|
468
487
|
logger.error(
|
469
488
|
f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
|
470
489
|
|
490
|
+
@classmethod
|
491
|
+
@debug_enabled(logger)
|
492
|
+
def find_nsg(cls, region: str, nsg_name: str,
|
493
|
+
create_if_not_exist: bool) -> Optional[str]:
|
494
|
+
net_client = oci_adaptor.get_net_client(
|
495
|
+
region, oci_utils.oci_config.get_profile())
|
496
|
+
|
497
|
+
compartment = cls.find_compartment(region)
|
498
|
+
|
499
|
+
list_vcns_resp = net_client.list_vcns(
|
500
|
+
compartment_id=compartment,
|
501
|
+
display_name=oci_utils.oci_config.VCN_NAME,
|
502
|
+
lifecycle_state='AVAILABLE',
|
503
|
+
)
|
504
|
+
|
505
|
+
if not list_vcns_resp:
|
506
|
+
raise exceptions.ResourcesUnavailableError(
|
507
|
+
'The VCN is not available')
|
508
|
+
|
509
|
+
# Use the first VCN returned by the lookup.
|
510
|
+
assert len(list_vcns_resp.data) > 0
|
511
|
+
vcn = list_vcns_resp.data[0]
|
512
|
+
|
513
|
+
list_nsg_resp = net_client.list_network_security_groups(
|
514
|
+
compartment_id=compartment,
|
515
|
+
vcn_id=vcn.id,
|
516
|
+
limit=1,
|
517
|
+
display_name=nsg_name,
|
518
|
+
)
|
519
|
+
|
520
|
+
nsgs = list_nsg_resp.data
|
521
|
+
if nsgs:
|
522
|
+
assert len(nsgs) == 1
|
523
|
+
return nsgs[0].id
|
524
|
+
elif not create_if_not_exist:
|
525
|
+
return None
|
526
|
+
|
527
|
+
# Continue to create new NSG if not exists
|
528
|
+
create_nsg_resp = net_client.create_network_security_group(
|
529
|
+
create_network_security_group_details=oci_adaptor.oci.core.models.
|
530
|
+
CreateNetworkSecurityGroupDetails(
|
531
|
+
compartment_id=compartment,
|
532
|
+
vcn_id=vcn.id,
|
533
|
+
display_name=nsg_name,
|
534
|
+
))
|
535
|
+
get_nsg_resp = net_client.get_network_security_group(
|
536
|
+
network_security_group_id=create_nsg_resp.data.id)
|
537
|
+
oci_adaptor.oci.wait_until(
|
538
|
+
net_client,
|
539
|
+
get_nsg_resp,
|
540
|
+
'lifecycle_state',
|
541
|
+
'AVAILABLE',
|
542
|
+
)
|
543
|
+
|
544
|
+
return get_nsg_resp.data.id
|
545
|
+
|
546
|
+
@classmethod
|
547
|
+
def get_range_min_max(cls, port_range: str) -> Tuple[int, int]:
|
548
|
+
range_list = port_range.split('-')
|
549
|
+
if len(range_list) == 1:
|
550
|
+
return (int(range_list[0]), int(range_list[0]))
|
551
|
+
from_port, to_port = range_list
|
552
|
+
return (int(from_port), int(to_port))
|
553
|
+
|
554
|
+
@classmethod
|
555
|
+
@debug_enabled(logger)
|
556
|
+
def create_nsg_rules(cls, region: str, cluster_name: str,
|
557
|
+
ports: List[str]) -> None:
|
558
|
+
""" Create per-cluster NSG with ingress rules """
|
559
|
+
if not ports:
|
560
|
+
return
|
561
|
+
|
562
|
+
net_client = oci_adaptor.get_net_client(
|
563
|
+
region, oci_utils.oci_config.get_profile())
|
564
|
+
|
565
|
+
nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
|
566
|
+
cluster_name=cluster_name)
|
567
|
+
nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=True)
|
568
|
+
|
569
|
+
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name}
|
570
|
+
insts = query_helper.query_instances_by_tags(filters, region)
|
571
|
+
for inst in insts:
|
572
|
+
vnic = cls.get_instance_primary_vnic(
|
573
|
+
region=region,
|
574
|
+
inst_info={
|
575
|
+
'inst_id': inst.identifier,
|
576
|
+
'ad': inst.availability_domain,
|
577
|
+
'compartment': inst.compartment_id,
|
578
|
+
})
|
579
|
+
nsg_ids = vnic.nsg_ids
|
580
|
+
if not nsg_ids:
|
581
|
+
net_client.update_vnic(
|
582
|
+
vnic_id=vnic.id,
|
583
|
+
update_vnic_details=oci_adaptor.oci.core.models.
|
584
|
+
UpdateVnicDetails(nsg_ids=[nsg_id],
|
585
|
+
skip_source_dest_check=False),
|
586
|
+
)
|
587
|
+
|
588
|
+
# pylint: disable=line-too-long
|
589
|
+
list_nsg_rules_resp = net_client.list_network_security_group_security_rules(
|
590
|
+
network_security_group_id=nsg_id,
|
591
|
+
direction='INGRESS',
|
592
|
+
sort_by='TIMECREATED',
|
593
|
+
sort_order='DESC',
|
594
|
+
)
|
595
|
+
|
596
|
+
ingress_rules: List = list_nsg_rules_resp.data
|
597
|
+
existing_port_ranges: List[str] = []
|
598
|
+
for r in ingress_rules:
|
599
|
+
if r.tcp_options:
|
600
|
+
options_range = r.tcp_options.destination_port_range
|
601
|
+
rule_port_range = f'{options_range.min}-{options_range.max}'
|
602
|
+
existing_port_ranges.append(rule_port_range)
|
603
|
+
|
604
|
+
new_ports = resources_utils.port_ranges_to_set(ports)
|
605
|
+
existing_ports = resources_utils.port_ranges_to_set(
|
606
|
+
existing_port_ranges)
|
607
|
+
if new_ports.issubset(existing_ports):
|
608
|
+
# The ports are already contained in the existing rules; nothing to add.
|
609
|
+
return
|
610
|
+
|
611
|
+
# Determine the ports to be added, without overlapping.
|
612
|
+
ports_to_open = new_ports - existing_ports
|
613
|
+
port_ranges_to_open = resources_utils.port_set_to_ranges(ports_to_open)
|
614
|
+
|
615
|
+
new_rules = []
|
616
|
+
for port_range in port_ranges_to_open:
|
617
|
+
port_range_min, port_range_max = cls.get_range_min_max(port_range)
|
618
|
+
new_rules.append(
|
619
|
+
oci_adaptor.oci.core.models.AddSecurityRuleDetails(
|
620
|
+
direction='INGRESS',
|
621
|
+
protocol='6',
|
622
|
+
is_stateless=False,
|
623
|
+
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
624
|
+
source_type='CIDR_BLOCK',
|
625
|
+
tcp_options=oci_adaptor.oci.core.models.TcpOptions(
|
626
|
+
destination_port_range=oci_adaptor.oci.core.models.
|
627
|
+
PortRange(min=port_range_min, max=port_range_max),),
|
628
|
+
description=oci_utils.oci_config.SERVICE_PORT_RULE_TAG,
|
629
|
+
))
|
630
|
+
|
631
|
+
net_client.add_network_security_group_security_rules(
|
632
|
+
network_security_group_id=nsg_id,
|
633
|
+
add_network_security_group_security_rules_details=oci_adaptor.oci.
|
634
|
+
core.models.AddNetworkSecurityGroupSecurityRulesDetails(
|
635
|
+
security_rules=new_rules),
|
636
|
+
)
|
637
|
+
|
638
|
+
@classmethod
|
639
|
+
@debug_enabled(logger)
|
640
|
+
def detach_nsg(cls, region: str, inst, nsg_id: Optional[str]) -> None:
|
641
|
+
if nsg_id is None:
|
642
|
+
return
|
643
|
+
|
644
|
+
vnic = cls.get_instance_primary_vnic(
|
645
|
+
region=region,
|
646
|
+
inst_info={
|
647
|
+
'inst_id': inst.identifier,
|
648
|
+
'ad': inst.availability_domain,
|
649
|
+
'compartment': inst.compartment_id,
|
650
|
+
})
|
651
|
+
|
652
|
+
# Detach the NSG before removing it.
|
653
|
+
oci_adaptor.get_net_client(region, oci_utils.oci_config.get_profile(
|
654
|
+
)).update_vnic(
|
655
|
+
vnic_id=vnic.id,
|
656
|
+
update_vnic_details=oci_adaptor.oci.core.models.UpdateVnicDetails(
|
657
|
+
nsg_ids=[], skip_source_dest_check=False),
|
658
|
+
)
|
659
|
+
|
660
|
+
@classmethod
|
661
|
+
@debug_enabled(logger)
|
662
|
+
def remove_cluster_nsg(cls, region: str, cluster_name: str) -> None:
|
663
|
+
""" Remove NSG of the cluster """
|
664
|
+
net_client = oci_adaptor.get_net_client(
|
665
|
+
region, oci_utils.oci_config.get_profile())
|
666
|
+
|
667
|
+
nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
|
668
|
+
cluster_name=cluster_name)
|
669
|
+
nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
|
670
|
+
if nsg_id is None:
|
671
|
+
return
|
672
|
+
|
673
|
+
# Delete the NSG
|
674
|
+
net_client.delete_network_security_group(
|
675
|
+
network_security_group_id=nsg_id)
|
676
|
+
|
471
677
|
|
472
678
|
query_helper = QueryHelper()
|
sky/serve/core.py
CHANGED
@@ -701,6 +701,7 @@ def tail_logs(
|
|
701
701
|
with ux_utils.print_exception_no_traceback():
|
702
702
|
raise ValueError(f'`target` must be a string or '
|
703
703
|
f'sky.serve.ServiceComponent, got {type(target)}.')
|
704
|
+
|
704
705
|
if target == serve_utils.ServiceComponent.REPLICA:
|
705
706
|
if replica_id is None:
|
706
707
|
with ux_utils.print_exception_no_traceback():
|