skypilot-nightly 1.0.0.dev20241114__py3-none-any.whl → 1.0.0.dev20241116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +142 -74
- sky/backends/cloud_vm_ray_backend.py +15 -11
- sky/cli.py +15 -4
- sky/clouds/aws.py +1 -0
- sky/clouds/oci.py +0 -2
- sky/clouds/service_catalog/aws_catalog.py +2 -0
- sky/clouds/utils/oci_utils.py +5 -0
- sky/execution.py +43 -22
- sky/global_user_state.py +36 -16
- sky/jobs/core.py +0 -1
- sky/jobs/utils.py +4 -3
- sky/provision/kubernetes/utils.py +2 -0
- sky/provision/oci/instance.py +12 -11
- sky/provision/oci/query_utils.py +212 -6
- sky/serve/core.py +1 -0
- sky/serve/serve_utils.py +35 -30
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +249 -138
- sky/skylet/log_lib.py +1 -34
- sky/skylet/subprocess_daemon.py +33 -13
- sky/utils/controller_utils.py +10 -9
- sky/utils/schemas.py +1 -0
- sky/utils/subprocess_utils.py +50 -0
- sky/utils/timeline.py +2 -4
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD +31 -31
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
```diff
@@ -60,7 +60,8 @@ def create_table(cursor, conn):
         owner TEXT DEFAULT null,
         cluster_hash TEXT DEFAULT null,
         storage_mounts_metadata BLOB DEFAULT null,
-        cluster_ever_up INTEGER DEFAULT 0
+        cluster_ever_up INTEGER DEFAULT 0,
+        status_updated_at INTEGER DEFAULT null)""")
 
     # Table for Cluster History
     # usage_intervals: List[Tuple[int, int]]
@@ -130,6 +131,10 @@ def create_table(cursor, conn):
         # clusters were never really UP, setting it to 1 means they won't be
         # auto-deleted during any failover.
         value_to_replace_existing_entries=1)
+
+    db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
+                                 'INTEGER DEFAULT null')
+
     conn.commit()
 
 
@@ -159,6 +164,7 @@ def add_or_update_cluster(cluster_name: str,
         status = status_lib.ClusterStatus.INIT
     if ready:
         status = status_lib.ClusterStatus.UP
+    status_updated_at = int(time.time())
 
     # TODO (sumanth): Cluster history table will have multiple entries
     # when the cluster failover through multiple regions (one entry per region).
@@ -191,7 +197,7 @@ def add_or_update_cluster(cluster_name: str,
         # specified.
         '(name, launched_at, handle, last_use, status, '
         'autostop, to_down, metadata, owner, cluster_hash, '
-        'storage_mounts_metadata, cluster_ever_up) '
+        'storage_mounts_metadata, cluster_ever_up, status_updated_at) '
         'VALUES ('
         # name
         '?, '
@@ -228,7 +234,9 @@ def add_or_update_cluster(cluster_name: str,
         'COALESCE('
         '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
         # cluster_ever_up
-        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
+        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
+        # status_updated_at
+        '?'
         ')',
         (
             # name
@@ -260,6 +268,8 @@ def add_or_update_cluster(cluster_name: str,
             # cluster_ever_up
             cluster_name,
             int(ready),
+            # status_updated_at
+            status_updated_at,
         ))
 
     launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -330,11 +340,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
         # stopped VM, which leads to timeout.
         if hasattr(handle, 'stable_internal_external_ips'):
             handle.stable_internal_external_ips = None
+        current_time = int(time.time())
         _DB.cursor.execute(
-            'UPDATE clusters SET handle=(?), status=(?) '
-            'WHERE name=(?)', (
+            'UPDATE clusters SET handle=(?), status=(?), '
+            'status_updated_at=(?) WHERE name=(?)', (
                 pickle.dumps(handle),
                 status_lib.ClusterStatus.STOPPED.value,
+                current_time,
                 cluster_name,
             ))
     _DB.conn.commit()
@@ -359,10 +371,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
 
 def set_cluster_status(cluster_name: str,
                        status: status_lib.ClusterStatus) -> None:
-
-
-
-
+    current_time = int(time.time())
+    _DB.cursor.execute(
+        'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
+        (status.value, current_time, cluster_name))
     count = _DB.cursor.rowcount
     _DB.conn.commit()
     assert count <= 1, count
@@ -570,15 +582,18 @@ def _load_storage_mounts_metadata(
 
 def get_cluster_from_name(
         cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
-    rows = _DB.cursor.execute(
-
+    rows = _DB.cursor.execute(
+        'SELECT name, launched_at, handle, last_use, status, autostop, '
+        'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
+        'cluster_ever_up, status_updated_at FROM clusters WHERE name=(?)',
+        (cluster_name,)).fetchall()
     for row in rows:
         # Explicitly specify the number of fields to unpack, so that
         # we can add new fields to the database in the future without
         # breaking the previous code.
         (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata,
-
+         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
+         status_updated_at) = row[:13]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -594,6 +609,7 @@ def get_cluster_from_name(
             'storage_mounts_metadata':
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
+            'status_updated_at': status_updated_at,
         }
         return record
     return None
@@ -601,12 +617,15 @@ def get_cluster_from_name(
 
 def get_clusters() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
-        'select
+        'select name, launched_at, handle, last_use, status, autostop, '
+        'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
+        'cluster_ever_up, status_updated_at from clusters '
+        'order by launched_at desc').fetchall()
     records = []
     for row in rows:
         (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata,
-
+         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
+         status_updated_at) = row[:13]
        # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -622,6 +641,7 @@ def get_clusters() -> List[Dict[str, Any]]:
             'storage_mounts_metadata':
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
+            'status_updated_at': status_updated_at,
         }
 
         records.append(record)
```
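Taken together, these hunks add a `status_updated_at` Unix-timestamp column, backfill it on existing databases via `db_utils.add_column_to_table`, and stamp it on every write path that changes a cluster's status. A minimal sketch of how a caller might consume the new field; the threshold and helper below are illustrative, not part of the package:

```python
import time

from sky import global_user_state

STALE_AFTER_SECONDS = 300  # hypothetical threshold, not defined by SkyPilot


def stale_cluster_names():
    """Yield clusters whose status has not been refreshed recently.

    `status_updated_at` is None for rows written before the migration,
    since the new column defaults to null.
    """
    now = int(time.time())
    for record in global_user_state.get_clusters():
        updated = record['status_updated_at']
        if updated is not None and now - updated > STALE_AFTER_SECONDS:
            yield record['name']
```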
sky/jobs/core.py
CHANGED
sky/jobs/utils.py
CHANGED
```diff
@@ -85,7 +85,8 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
                    cluster_name: str) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
 
-    It can be None, INIT, RUNNING, SUCCEEDED, FAILED,
+    It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
+    FAILED_SETUP or CANCELLED.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
@@ -866,7 +867,7 @@ class ManagedJobCodeGen:
         code += inspect.getsource(stream_logs)
         code += textwrap.dedent(f"""\
 
-        msg = stream_logs({job_id!r}, {job_name!r},
+        msg = stream_logs({job_id!r}, {job_name!r},
                           follow={follow}, controller={controller})
         print(msg, flush=True)
         """)
@@ -883,7 +884,7 @@ class ManagedJobCodeGen:
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
         code += textwrap.dedent(f"""\
-        managed_job_state.set_pending({job_id}, {task_id},
+        managed_job_state.set_pending({job_id}, {task_id},
                                       {task.name!r}, {resources_str!r})
         """)
         return cls._build(code)
```
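The docstring now enumerates every status `get_job_status` can return, including the `FAILED_DRIVER` and `FAILED_SETUP` failure modes. A hedged sketch of consumer-side handling, assuming `job_lib.JobStatus` defines members with exactly these names:

```python
from sky.skylet import job_lib

# Statuses after which the managed job will make no further progress,
# per the docstring above.
_TERMINAL_STATUSES = {
    job_lib.JobStatus.SUCCEEDED,
    job_lib.JobStatus.FAILED,
    job_lib.JobStatus.FAILED_DRIVER,
    job_lib.JobStatus.FAILED_SETUP,
    job_lib.JobStatus.CANCELLED,
}


def job_is_done(status) -> bool:
    # None means the status could not be fetched (e.g., the cluster is
    # unreachable); treat that as "not known to be done".
    return status in _TERMINAL_STATUSES
```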
sky/provision/oci/instance.py
CHANGED
```diff
@@ -2,6 +2,8 @@
 
 History:
  - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
+ - Hysun He (hysun.he@oracle.com) @ Nov.13, 2024: Implement open_ports
+   and cleanup_ports for supporting SkyServe.
 """
 
 import copy
@@ -292,11 +294,11 @@ def open_ports(
     provider_config: Optional[Dict[str, Any]] = None,
 ) -> None:
     """Open ports for inbound traffic."""
-
-
-
-
-
+    assert provider_config is not None, cluster_name_on_cloud
+    region = provider_config['region']
+    query_helper.create_nsg_rules(region=region,
+                                  cluster_name=cluster_name_on_cloud,
+                                  ports=ports)
 
 
 @query_utils.debug_enabled(logger)
@@ -306,12 +308,11 @@ def cleanup_ports(
     provider_config: Optional[Dict[str, Any]] = None,
 ) -> None:
     """Delete any opened ports."""
-
-
-
-
-
-    # to delete the VCN or not from OCI console, for example.
+    assert provider_config is not None, cluster_name_on_cloud
+    region = provider_config['region']
+    del ports
+    query_helper.remove_cluster_nsg(region=region,
+                                    cluster_name=cluster_name_on_cloud)
 
 
 @query_utils.debug_enabled(logger)
```
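With these two functions filled in, the OCI provisioner gains the port management SkyServe needs: `open_ports` creates ingress rules on a per-cluster network security group (NSG), and `cleanup_ports` removes the NSG on teardown. A hedged sketch of a direct call; the region, cluster name, and ports are illustrative, and a real `provider_config` carries more keys than shown here:

```python
from sky.provision.oci import instance

provider_config = {'region': 'us-sanjose-1'}  # illustrative
cluster = 'sky-serve-abcd1234'                # illustrative

# Creates (or reuses) the cluster's NSG and adds TCP ingress rules.
instance.open_ports(cluster, ['8080', '30001-30010'], provider_config)

# On teardown: detaches the NSG from the instances and deletes it.
instance.cleanup_ports(cluster, ['8080', '30001-30010'], provider_config)
```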
sky/provision/oci/query_utils.py
CHANGED
```diff
@@ -5,6 +5,8 @@ History:
    migrated from the old provisioning API.
  - Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
    find_compartment: allow search subtree when find a compartment.
+ - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
+   Add/remove security rules: create_nsg_rules & remove_nsg
 """
 from datetime import datetime
 import functools
@@ -13,12 +15,15 @@ import re
 import time
 import traceback
 import typing
-from typing import Optional
+from typing import List, Optional, Tuple
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import oci as oci_adaptor
 from sky.clouds.utils import oci_utils
+from sky.provision import constants
+from sky.utils import resources_utils
 
 if typing.TYPE_CHECKING:
     import pandas as pd
@@ -81,19 +86,33 @@ class QueryHelper:
         return result_set
 
     @classmethod
+    @debug_enabled(logger)
     def terminate_instances_by_tags(cls, tag_filters, region) -> int:
         logger.debug(f'Terminate instance by tags: {tag_filters}')
+
+        cluster_name = tag_filters[constants.TAG_RAY_CLUSTER_NAME]
+        nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
+            cluster_name=cluster_name)
+        nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
+
+        core_client = oci_adaptor.get_core_client(
+            region, oci_utils.oci_config.get_profile())
+
         insts = cls.query_instances_by_tags(tag_filters, region)
         fail_count = 0
         for inst in insts:
             inst_id = inst.identifier
-            logger.debug(f'
+            logger.debug(f'Terminating instance {inst_id}')
 
             try:
-
-
-
-
+                # Release the NSG reference so that the NSG can be
+                # deleted without waiting the instance being terminated.
+                if nsg_id is not None:
+                    cls.detach_nsg(region, inst, nsg_id)
+
+                # Terminate the instance
+                core_client.terminate_instance(inst_id)
+
             except oci_adaptor.oci.exceptions.ServiceError as e:
                 fail_count += 1
                 logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
@@ -468,5 +487,192 @@ class QueryHelper:
             logger.error(
                 f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
 
+    @classmethod
+    @debug_enabled(logger)
+    def find_nsg(cls, region: str, nsg_name: str,
+                 create_if_not_exist: bool) -> Optional[str]:
+        net_client = oci_adaptor.get_net_client(
+            region, oci_utils.oci_config.get_profile())
+
+        compartment = cls.find_compartment(region)
+
+        list_vcns_resp = net_client.list_vcns(
+            compartment_id=compartment,
+            display_name=oci_utils.oci_config.VCN_NAME,
+            lifecycle_state='AVAILABLE',
+        )
+
+        if not list_vcns_resp:
+            raise exceptions.ResourcesUnavailableError(
+                'The VCN is not available')
+
+        # Get the primary vnic.
+        assert len(list_vcns_resp.data) > 0
+        vcn = list_vcns_resp.data[0]
+
+        list_nsg_resp = net_client.list_network_security_groups(
+            compartment_id=compartment,
+            vcn_id=vcn.id,
+            limit=1,
+            display_name=nsg_name,
+        )
+
+        nsgs = list_nsg_resp.data
+        if nsgs:
+            assert len(nsgs) == 1
+            return nsgs[0].id
+        elif not create_if_not_exist:
+            return None
+
+        # Continue to create new NSG if not exists
+        create_nsg_resp = net_client.create_network_security_group(
+            create_network_security_group_details=oci_adaptor.oci.core.models.
+            CreateNetworkSecurityGroupDetails(
+                compartment_id=compartment,
+                vcn_id=vcn.id,
+                display_name=nsg_name,
+            ))
+        get_nsg_resp = net_client.get_network_security_group(
+            network_security_group_id=create_nsg_resp.data.id)
+        oci_adaptor.oci.wait_until(
+            net_client,
+            get_nsg_resp,
+            'lifecycle_state',
+            'AVAILABLE',
+        )
+
+        return get_nsg_resp.data.id
+
+    @classmethod
+    def get_range_min_max(cls, port_range: str) -> Tuple[int, int]:
+        range_list = port_range.split('-')
+        if len(range_list) == 1:
+            return (int(range_list[0]), int(range_list[0]))
+        from_port, to_port = range_list
+        return (int(from_port), int(to_port))
+
+    @classmethod
+    @debug_enabled(logger)
+    def create_nsg_rules(cls, region: str, cluster_name: str,
+                         ports: List[str]) -> None:
+        """ Create per-cluster NSG with ingress rules """
+        if not ports:
+            return
+
+        net_client = oci_adaptor.get_net_client(
+            region, oci_utils.oci_config.get_profile())
+
+        nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
+            cluster_name=cluster_name)
+        nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=True)
+
+        filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name}
+        insts = query_helper.query_instances_by_tags(filters, region)
+        for inst in insts:
+            vnic = cls.get_instance_primary_vnic(
+                region=region,
+                inst_info={
+                    'inst_id': inst.identifier,
+                    'ad': inst.availability_domain,
+                    'compartment': inst.compartment_id,
+                })
+            nsg_ids = vnic.nsg_ids
+            if not nsg_ids:
+                net_client.update_vnic(
+                    vnic_id=vnic.id,
+                    update_vnic_details=oci_adaptor.oci.core.models.
+                    UpdateVnicDetails(nsg_ids=[nsg_id],
+                                      skip_source_dest_check=False),
+                )
+
+        # pylint: disable=line-too-long
+        list_nsg_rules_resp = net_client.list_network_security_group_security_rules(
+            network_security_group_id=nsg_id,
+            direction='INGRESS',
+            sort_by='TIMECREATED',
+            sort_order='DESC',
+        )
+
+        ingress_rules: List = list_nsg_rules_resp.data
+        existing_port_ranges: List[str] = []
+        for r in ingress_rules:
+            if r.tcp_options:
+                options_range = r.tcp_options.destination_port_range
+                rule_port_range = f'{options_range.min}-{options_range.max}'
+                existing_port_ranges.append(rule_port_range)
+
+        new_ports = resources_utils.port_ranges_to_set(ports)
+        existing_ports = resources_utils.port_ranges_to_set(
+            existing_port_ranges)
+        if new_ports.issubset(existing_ports):
+            # ports already contains in the existing rules, nothing to add.
+            return
+
+        # Determine the ports to be added, without overlapping.
+        ports_to_open = new_ports - existing_ports
+        port_ranges_to_open = resources_utils.port_set_to_ranges(ports_to_open)
+
+        new_rules = []
+        for port_range in port_ranges_to_open:
+            port_range_min, port_range_max = cls.get_range_min_max(port_range)
+            new_rules.append(
+                oci_adaptor.oci.core.models.AddSecurityRuleDetails(
+                    direction='INGRESS',
+                    protocol='6',
+                    is_stateless=False,
+                    source=oci_utils.oci_config.VCN_CIDR_INTERNET,
+                    source_type='CIDR_BLOCK',
+                    tcp_options=oci_adaptor.oci.core.models.TcpOptions(
+                        destination_port_range=oci_adaptor.oci.core.models.
+                        PortRange(min=port_range_min, max=port_range_max),),
+                    description=oci_utils.oci_config.SERVICE_PORT_RULE_TAG,
+                ))
+
+        net_client.add_network_security_group_security_rules(
+            network_security_group_id=nsg_id,
+            add_network_security_group_security_rules_details=oci_adaptor.oci.
+            core.models.AddNetworkSecurityGroupSecurityRulesDetails(
+                security_rules=new_rules),
+        )
+
+    @classmethod
+    @debug_enabled(logger)
+    def detach_nsg(cls, region: str, inst, nsg_id: Optional[str]) -> None:
+        if nsg_id is None:
+            return
+
+        vnic = cls.get_instance_primary_vnic(
+            region=region,
+            inst_info={
+                'inst_id': inst.identifier,
+                'ad': inst.availability_domain,
+                'compartment': inst.compartment_id,
+            })
+
+        # Detatch the NSG before removing it.
+        oci_adaptor.get_net_client(region, oci_utils.oci_config.get_profile(
+        )).update_vnic(
+            vnic_id=vnic.id,
+            update_vnic_details=oci_adaptor.oci.core.models.UpdateVnicDetails(
+                nsg_ids=[], skip_source_dest_check=False),
+        )
+
+    @classmethod
+    @debug_enabled(logger)
+    def remove_cluster_nsg(cls, region: str, cluster_name: str) -> None:
+        """ Remove NSG of the cluster """
+        net_client = oci_adaptor.get_net_client(
+            region, oci_utils.oci_config.get_profile())
+
+        nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
+            cluster_name=cluster_name)
+        nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
+        if nsg_id is None:
+            return
+
+        # Delete the NSG
+        net_client.delete_network_security_group(
+            network_security_group_id=nsg_id)
+
 
 query_helper = QueryHelper()
```
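The dedupe logic in `create_nsg_rules` is worth spelling out: requested and existing port ranges are expanded into integer sets, diffed, and the difference is collapsed back into contiguous ranges, so overlapping rules are never created. A self-contained sketch of that round-trip; these helpers are illustrative stand-ins for `resources_utils.port_ranges_to_set` and `resources_utils.port_set_to_ranges`, not the library's own implementations:

```python
from typing import List, Set


def port_ranges_to_set(ranges: List[str]) -> Set[int]:
    """Expand ['22', '6000-6002'] into {22, 6000, 6001, 6002}."""
    ports: Set[int] = set()
    for r in ranges:
        if '-' in r:
            lo, hi = r.split('-')
            ports.update(range(int(lo), int(hi) + 1))
        else:
            ports.add(int(r))
    return ports


def port_set_to_ranges(ports: Set[int]) -> List[str]:
    """Collapse a port set back into minimal contiguous ranges."""
    ranges: List[str] = []
    start = prev = None
    for p in sorted(ports):
        if prev is None:
            start = prev = p
        elif p == prev + 1:
            prev = p
        else:
            ranges.append(str(start) if start == prev else f'{start}-{prev}')
            start = prev = p
    if prev is not None:
        ranges.append(str(start) if start == prev else f'{start}-{prev}')
    return ranges


# The core of the dedupe step in create_nsg_rules:
new_ports = port_ranges_to_set(['8080', '30000-30010'])
existing_ports = port_ranges_to_set(['8080'])
ports_to_open = new_ports - existing_ports
print(port_set_to_ranges(ports_to_open))  # ['30000-30010']
```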
sky/serve/core.py
CHANGED
```diff
@@ -701,6 +701,7 @@ def tail_logs(
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'`target` must be a string or '
                              f'sky.serve.ServiceComponent, got {type(target)}.')
+
     if target == serve_utils.ServiceComponent.REPLICA:
         if replica_id is None:
             with ux_utils.print_exception_no_traceback():
```
sky/serve/serve_utils.py
CHANGED
```diff
@@ -46,8 +46,14 @@ NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB //
                          constants.CONTROLLER_MEMORY_USAGE_GB)
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
-
-
+# NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
+# and always appear after a space. Be careful when changing UX as this
+# assumption is used to expand some log files while ignoring others.
+_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
+_SKYPILOT_PROVISION_LOG_PATTERN = (
+    fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
+_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
+
 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (
     f'{colorama.Fore.RED}Failed to find replica '
@@ -591,7 +597,7 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None
 
 
-def
+def _follow_logs_with_provision_expanding(
     file: TextIO,
     cluster_name: str,
     *,
@@ -599,7 +605,7 @@ def _follow_replica_logs(
     stop_on_eof: bool = False,
     idle_timeout_seconds: Optional[int] = None,
 ) -> Iterator[str]:
-    """Follows logs
+    """Follows logs and expands any provision.log references found.
 
     Args:
         file: Log file to read from.
@@ -610,7 +616,7 @@ def _follow_replica_logs(
             new content.
 
     Yields:
-        Log lines
+        Log lines, including expanded content from referenced provision logs.
     """
 
     def cluster_is_up() -> bool:
@@ -620,36 +626,35 @@ def _follow_replica_logs(
         return cluster_record['status'] == status_lib.ClusterStatus.UP
 
     def process_line(line: str) -> Iterator[str]:
-        #
-        #
-        #
+        # The line might be directing users to view logs, like
+        # `✓ Cluster launched: new-http. View logs at: *.log`
+        # We should tail the detailed logs for user.
         provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
-
+        log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
 
         if provision_log_prompt is not None:
             nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            yield
-
-
-
-            idle_timeout_seconds=10)
+
+            try:
+                with open(nested_log_path, 'r', newline='',
+                          encoding='utf-8') as f:
+                    # We still exit if more than 10 seconds without new content
+                    # to avoid any internal bug that causes the launch to fail
+                    # while cluster status remains INIT.
+                    yield from log_utils.follow_logs(f,
+                                                     should_stop=cluster_is_up,
+                                                     stop_on_eof=stop_on_eof,
+                                                     idle_timeout_seconds=10)
+            except FileNotFoundError:
+                yield line
+
+                yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
+                       f'Try to expand log file {nested_log_path} but not '
+                       f'found. Skipping...{colorama.Style.RESET_ALL}')
+                pass
             return
 
-        if
+        if log_prompt is not None:
             # Now we skip other logs (file sync logs) since we lack
             # utility to determine when these log files are finished
             # writing.
@@ -702,7 +707,7 @@ def stream_replica_logs(service_name: str, replica_id: int,
     replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
     with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
-        for line in
+        for line in _follow_logs_with_provision_expanding(
             f,
             replica_cluster_name,
             should_stop=replica_provisioned,
```
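The two module-level patterns drive the branch in `process_line`: lines pointing at a `provision.log` are expanded inline, while other log references under the SkyPilot log directories are recognized and skipped. A quick hedged check of that behavior; the sample lines are illustrative, not the exact strings SkyPilot prints:

```python
import re

_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
_SKYPILOT_PROVISION_LOG_PATTERN = (
    fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'

provision_line = ('Cluster launched: my-svc.  View logs at: '
                  '~/sky_logs/sky-2024-11-15-00-00-00-000000/provision.log')
sync_line = ('To view progress: tail -f '
             '~/sky_logs/sky-2024-11-15-00-00-00-000000/file_mounts.log')

# Provision logs match the first pattern and get expanded inline.
m = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, provision_line)
print(m.group(1))  # ~/sky_logs/.../provision.log

# Other .log references only match the broader pattern and are skipped.
print(re.match(_SKYPILOT_PROVISION_LOG_PATTERN, sync_line))  # None
print(re.match(_SKYPILOT_LOG_PATTERN, sync_line).group(1))
```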
sky/skylet/constants.py
CHANGED
```diff
@@ -75,7 +75,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '9'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
```
|