skypilot-nightly 1.0.0.dev20241114__py3-none-any.whl → 1.0.0.dev20241116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +142 -74
  3. sky/backends/cloud_vm_ray_backend.py +15 -11
  4. sky/cli.py +15 -4
  5. sky/clouds/aws.py +1 -0
  6. sky/clouds/oci.py +0 -2
  7. sky/clouds/service_catalog/aws_catalog.py +2 -0
  8. sky/clouds/utils/oci_utils.py +5 -0
  9. sky/execution.py +43 -22
  10. sky/global_user_state.py +36 -16
  11. sky/jobs/core.py +0 -1
  12. sky/jobs/utils.py +4 -3
  13. sky/provision/kubernetes/utils.py +2 -0
  14. sky/provision/oci/instance.py +12 -11
  15. sky/provision/oci/query_utils.py +212 -6
  16. sky/serve/core.py +1 -0
  17. sky/serve/serve_utils.py +35 -30
  18. sky/skylet/constants.py +1 -1
  19. sky/skylet/job_lib.py +249 -138
  20. sky/skylet/log_lib.py +1 -34
  21. sky/skylet/subprocess_daemon.py +33 -13
  22. sky/utils/controller_utils.py +10 -9
  23. sky/utils/schemas.py +1 -0
  24. sky/utils/subprocess_utils.py +50 -0
  25. sky/utils/timeline.py +2 -4
  26. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/METADATA +1 -1
  27. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD +31 -31
  28. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -60,7 +60,8 @@ def create_table(cursor, conn):
60
60
  owner TEXT DEFAULT null,
61
61
  cluster_hash TEXT DEFAULT null,
62
62
  storage_mounts_metadata BLOB DEFAULT null,
63
- cluster_ever_up INTEGER DEFAULT 0)""")
63
+ cluster_ever_up INTEGER DEFAULT 0,
64
+ status_updated_at INTEGER DEFAULT null)""")
64
65
 
65
66
  # Table for Cluster History
66
67
  # usage_intervals: List[Tuple[int, int]]
@@ -130,6 +131,10 @@ def create_table(cursor, conn):
130
131
  # clusters were never really UP, setting it to 1 means they won't be
131
132
  # auto-deleted during any failover.
132
133
  value_to_replace_existing_entries=1)
134
+
135
+ db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
136
+ 'INTEGER DEFAULT null')
137
+
133
138
  conn.commit()
134
139
 
135
140
 
@@ -159,6 +164,7 @@ def add_or_update_cluster(cluster_name: str,
159
164
  status = status_lib.ClusterStatus.INIT
160
165
  if ready:
161
166
  status = status_lib.ClusterStatus.UP
167
+ status_updated_at = int(time.time())
162
168
 
163
169
  # TODO (sumanth): Cluster history table will have multiple entries
164
170
  # when the cluster failover through multiple regions (one entry per region).
@@ -191,7 +197,7 @@ def add_or_update_cluster(cluster_name: str,
191
197
  # specified.
192
198
  '(name, launched_at, handle, last_use, status, '
193
199
  'autostop, to_down, metadata, owner, cluster_hash, '
194
- 'storage_mounts_metadata, cluster_ever_up) '
200
+ 'storage_mounts_metadata, cluster_ever_up, status_updated_at) '
195
201
  'VALUES ('
196
202
  # name
197
203
  '?, '
@@ -228,7 +234,9 @@ def add_or_update_cluster(cluster_name: str,
228
234
  'COALESCE('
229
235
  '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
230
236
  # cluster_ever_up
231
- '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
237
+ '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
238
+ # status_updated_at
239
+ '?'
232
240
  ')',
233
241
  (
234
242
  # name
@@ -260,6 +268,8 @@ def add_or_update_cluster(cluster_name: str,
260
268
  # cluster_ever_up
261
269
  cluster_name,
262
270
  int(ready),
271
+ # status_updated_at
272
+ status_updated_at,
263
273
  ))
264
274
 
265
275
  launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -330,11 +340,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
330
340
  # stopped VM, which leads to timeout.
331
341
  if hasattr(handle, 'stable_internal_external_ips'):
332
342
  handle.stable_internal_external_ips = None
343
+ current_time = int(time.time())
333
344
  _DB.cursor.execute(
334
- 'UPDATE clusters SET handle=(?), status=(?) '
335
- 'WHERE name=(?)', (
345
+ 'UPDATE clusters SET handle=(?), status=(?), '
346
+ 'status_updated_at=(?) WHERE name=(?)', (
336
347
  pickle.dumps(handle),
337
348
  status_lib.ClusterStatus.STOPPED.value,
349
+ current_time,
338
350
  cluster_name,
339
351
  ))
340
352
  _DB.conn.commit()
@@ -359,10 +371,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
359
371
 
360
372
  def set_cluster_status(cluster_name: str,
361
373
  status: status_lib.ClusterStatus) -> None:
362
- _DB.cursor.execute('UPDATE clusters SET status=(?) WHERE name=(?)', (
363
- status.value,
364
- cluster_name,
365
- ))
374
+ current_time = int(time.time())
375
+ _DB.cursor.execute(
376
+ 'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
377
+ (status.value, current_time, cluster_name))
366
378
  count = _DB.cursor.rowcount
367
379
  _DB.conn.commit()
368
380
  assert count <= 1, count
@@ -570,15 +582,18 @@ def _load_storage_mounts_metadata(
570
582
 
571
583
  def get_cluster_from_name(
572
584
  cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
573
- rows = _DB.cursor.execute('SELECT * FROM clusters WHERE name=(?)',
574
- (cluster_name,)).fetchall()
585
+ rows = _DB.cursor.execute(
586
+ 'SELECT name, launched_at, handle, last_use, status, autostop, '
587
+ 'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
588
+ 'cluster_ever_up, status_updated_at FROM clusters WHERE name=(?)',
589
+ (cluster_name,)).fetchall()
575
590
  for row in rows:
576
591
  # Explicitly specify the number of fields to unpack, so that
577
592
  # we can add new fields to the database in the future without
578
593
  # breaking the previous code.
579
594
  (name, launched_at, handle, last_use, status, autostop, metadata,
580
- to_down, owner, cluster_hash, storage_mounts_metadata,
581
- cluster_ever_up) = row[:12]
595
+ to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
596
+ status_updated_at) = row[:13]
582
597
  # TODO: use namedtuple instead of dict
583
598
  record = {
584
599
  'name': name,
@@ -594,6 +609,7 @@ def get_cluster_from_name(
594
609
  'storage_mounts_metadata':
595
610
  _load_storage_mounts_metadata(storage_mounts_metadata),
596
611
  'cluster_ever_up': bool(cluster_ever_up),
612
+ 'status_updated_at': status_updated_at,
597
613
  }
598
614
  return record
599
615
  return None
@@ -601,12 +617,15 @@ def get_cluster_from_name(
601
617
 
602
618
  def get_clusters() -> List[Dict[str, Any]]:
603
619
  rows = _DB.cursor.execute(
604
- 'select * from clusters order by launched_at desc').fetchall()
620
+ 'select name, launched_at, handle, last_use, status, autostop, '
621
+ 'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
622
+ 'cluster_ever_up, status_updated_at from clusters '
623
+ 'order by launched_at desc').fetchall()
605
624
  records = []
606
625
  for row in rows:
607
626
  (name, launched_at, handle, last_use, status, autostop, metadata,
608
- to_down, owner, cluster_hash, storage_mounts_metadata,
609
- cluster_ever_up) = row[:12]
627
+ to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
628
+ status_updated_at) = row[:13]
610
629
  # TODO: use namedtuple instead of dict
611
630
  record = {
612
631
  'name': name,
@@ -622,6 +641,7 @@ def get_clusters() -> List[Dict[str, Any]]:
622
641
  'storage_mounts_metadata':
623
642
  _load_storage_mounts_metadata(storage_mounts_metadata),
624
643
  'cluster_ever_up': bool(cluster_ever_up),
644
+ 'status_updated_at': status_updated_at,
625
645
  }
626
646
 
627
647
  records.append(record)
sky/jobs/core.py CHANGED
@@ -133,7 +133,6 @@ def launch(
133
133
  controller_task.set_resources(controller_resources)
134
134
 
135
135
  controller_task.managed_job_dag = dag
136
- assert len(controller_task.resources) == 1, controller_task
137
136
 
138
137
  sky_logging.print(
139
138
  f'{colorama.Fore.YELLOW}'
sky/jobs/utils.py CHANGED
@@ -85,7 +85,8 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
85
85
  cluster_name: str) -> Optional['job_lib.JobStatus']:
86
86
  """Check the status of the job running on a managed job cluster.
87
87
 
88
- It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_SETUP or CANCELLED.
88
+ It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
89
+ FAILED_SETUP or CANCELLED.
89
90
  """
90
91
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
91
92
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
@@ -866,7 +867,7 @@ class ManagedJobCodeGen:
866
867
  code += inspect.getsource(stream_logs)
867
868
  code += textwrap.dedent(f"""\
868
869
 
869
- msg = stream_logs({job_id!r}, {job_name!r},
870
+ msg = stream_logs({job_id!r}, {job_name!r},
870
871
  follow={follow}, controller={controller})
871
872
  print(msg, flush=True)
872
873
  """)
@@ -883,7 +884,7 @@ class ManagedJobCodeGen:
883
884
  resources_str = backend_utils.get_task_resources_str(
884
885
  task, is_managed_job=True)
885
886
  code += textwrap.dedent(f"""\
886
- managed_job_state.set_pending({job_id}, {task_id},
887
+ managed_job_state.set_pending({job_id}, {task_id},
887
888
  {task.name!r}, {resources_str!r})
888
889
  """)
889
890
  return cls._build(code)
@@ -1693,6 +1693,8 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
1693
1693
  else:
1694
1694
  destination[key].extend(value)
1695
1695
  else:
1696
+ if destination is None:
1697
+ destination = {}
1696
1698
  destination[key] = value
1697
1699
 
1698
1700
 
@@ -2,6 +2,8 @@
2
2
 
3
3
  History:
4
4
  - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
5
+ - Hysun He (hysun.he@oracle.com) @ Nov.13, 2024: Implement open_ports
6
+ and cleanup_ports for supporting SkyServe.
5
7
  """
6
8
 
7
9
  import copy
@@ -292,11 +294,11 @@ def open_ports(
292
294
  provider_config: Optional[Dict[str, Any]] = None,
293
295
  ) -> None:
294
296
  """Open ports for inbound traffic."""
295
- # OCI ports in security groups are opened while creating the new
296
- # VCN (skypilot_vcn). If user configure to use existing VCN, it is
297
- # intended to let user to manage the ports instead of automatically
298
- # opening ports here.
299
- del cluster_name_on_cloud, ports, provider_config
297
+ assert provider_config is not None, cluster_name_on_cloud
298
+ region = provider_config['region']
299
+ query_helper.create_nsg_rules(region=region,
300
+ cluster_name=cluster_name_on_cloud,
301
+ ports=ports)
300
302
 
301
303
 
302
304
  @query_utils.debug_enabled(logger)
@@ -306,12 +308,11 @@ def cleanup_ports(
306
308
  provider_config: Optional[Dict[str, Any]] = None,
307
309
  ) -> None:
308
310
  """Delete any opened ports."""
309
- del cluster_name_on_cloud, ports, provider_config
310
- # OCI ports in security groups are opened while creating the new
311
- # VCN (skypilot_vcn). The VCN will only be created at the first
312
- # time when it is not existed. We'll not automatically delete the
313
- # VCN while teardown clusters. it is intended to let user to decide
314
- # to delete the VCN or not from OCI console, for example.
311
+ assert provider_config is not None, cluster_name_on_cloud
312
+ region = provider_config['region']
313
+ del ports
314
+ query_helper.remove_cluster_nsg(region=region,
315
+ cluster_name=cluster_name_on_cloud)
315
316
 
316
317
 
317
318
  @query_utils.debug_enabled(logger)
@@ -5,6 +5,8 @@ History:
5
5
  migrated from the old provisioning API.
6
6
  - Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
7
7
  find_compartment: allow search subtree when find a compartment.
8
+ - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
9
+ Add/remove security rules: create_nsg_rules & remove_nsg
8
10
  """
9
11
  from datetime import datetime
10
12
  import functools
@@ -13,12 +15,15 @@ import re
13
15
  import time
14
16
  import traceback
15
17
  import typing
16
- from typing import Optional
18
+ from typing import List, Optional, Tuple
17
19
 
20
+ from sky import exceptions
18
21
  from sky import sky_logging
19
22
  from sky.adaptors import common as adaptors_common
20
23
  from sky.adaptors import oci as oci_adaptor
21
24
  from sky.clouds.utils import oci_utils
25
+ from sky.provision import constants
26
+ from sky.utils import resources_utils
22
27
 
23
28
  if typing.TYPE_CHECKING:
24
29
  import pandas as pd
@@ -81,19 +86,33 @@ class QueryHelper:
81
86
  return result_set
82
87
 
83
88
  @classmethod
89
+ @debug_enabled(logger)
84
90
  def terminate_instances_by_tags(cls, tag_filters, region) -> int:
85
91
  logger.debug(f'Terminate instance by tags: {tag_filters}')
92
+
93
+ cluster_name = tag_filters[constants.TAG_RAY_CLUSTER_NAME]
94
+ nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
95
+ cluster_name=cluster_name)
96
+ nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
97
+
98
+ core_client = oci_adaptor.get_core_client(
99
+ region, oci_utils.oci_config.get_profile())
100
+
86
101
  insts = cls.query_instances_by_tags(tag_filters, region)
87
102
  fail_count = 0
88
103
  for inst in insts:
89
104
  inst_id = inst.identifier
90
- logger.debug(f'Got instance(to be terminated): {inst_id}')
105
+ logger.debug(f'Terminating instance {inst_id}')
91
106
 
92
107
  try:
93
- oci_adaptor.get_core_client(
94
- region,
95
- oci_utils.oci_config.get_profile()).terminate_instance(
96
- inst_id)
108
+ # Release the NSG reference so that the NSG can be
109
+ # deleted without waiting the instance being terminated.
110
+ if nsg_id is not None:
111
+ cls.detach_nsg(region, inst, nsg_id)
112
+
113
+ # Terminate the instance
114
+ core_client.terminate_instance(inst_id)
115
+
97
116
  except oci_adaptor.oci.exceptions.ServiceError as e:
98
117
  fail_count += 1
99
118
  logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
@@ -468,5 +487,192 @@ class QueryHelper:
468
487
  logger.error(
469
488
  f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
470
489
 
490
+ @classmethod
491
+ @debug_enabled(logger)
492
+ def find_nsg(cls, region: str, nsg_name: str,
493
+ create_if_not_exist: bool) -> Optional[str]:
494
+ net_client = oci_adaptor.get_net_client(
495
+ region, oci_utils.oci_config.get_profile())
496
+
497
+ compartment = cls.find_compartment(region)
498
+
499
+ list_vcns_resp = net_client.list_vcns(
500
+ compartment_id=compartment,
501
+ display_name=oci_utils.oci_config.VCN_NAME,
502
+ lifecycle_state='AVAILABLE',
503
+ )
504
+
505
+ if not list_vcns_resp:
506
+ raise exceptions.ResourcesUnavailableError(
507
+ 'The VCN is not available')
508
+
509
+ # Get the primary vnic.
510
+ assert len(list_vcns_resp.data) > 0
511
+ vcn = list_vcns_resp.data[0]
512
+
513
+ list_nsg_resp = net_client.list_network_security_groups(
514
+ compartment_id=compartment,
515
+ vcn_id=vcn.id,
516
+ limit=1,
517
+ display_name=nsg_name,
518
+ )
519
+
520
+ nsgs = list_nsg_resp.data
521
+ if nsgs:
522
+ assert len(nsgs) == 1
523
+ return nsgs[0].id
524
+ elif not create_if_not_exist:
525
+ return None
526
+
527
+ # Continue to create new NSG if not exists
528
+ create_nsg_resp = net_client.create_network_security_group(
529
+ create_network_security_group_details=oci_adaptor.oci.core.models.
530
+ CreateNetworkSecurityGroupDetails(
531
+ compartment_id=compartment,
532
+ vcn_id=vcn.id,
533
+ display_name=nsg_name,
534
+ ))
535
+ get_nsg_resp = net_client.get_network_security_group(
536
+ network_security_group_id=create_nsg_resp.data.id)
537
+ oci_adaptor.oci.wait_until(
538
+ net_client,
539
+ get_nsg_resp,
540
+ 'lifecycle_state',
541
+ 'AVAILABLE',
542
+ )
543
+
544
+ return get_nsg_resp.data.id
545
+
546
+ @classmethod
547
+ def get_range_min_max(cls, port_range: str) -> Tuple[int, int]:
548
+ range_list = port_range.split('-')
549
+ if len(range_list) == 1:
550
+ return (int(range_list[0]), int(range_list[0]))
551
+ from_port, to_port = range_list
552
+ return (int(from_port), int(to_port))
553
+
554
+ @classmethod
555
+ @debug_enabled(logger)
556
+ def create_nsg_rules(cls, region: str, cluster_name: str,
557
+ ports: List[str]) -> None:
558
+ """ Create per-cluster NSG with ingress rules """
559
+ if not ports:
560
+ return
561
+
562
+ net_client = oci_adaptor.get_net_client(
563
+ region, oci_utils.oci_config.get_profile())
564
+
565
+ nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
566
+ cluster_name=cluster_name)
567
+ nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=True)
568
+
569
+ filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name}
570
+ insts = query_helper.query_instances_by_tags(filters, region)
571
+ for inst in insts:
572
+ vnic = cls.get_instance_primary_vnic(
573
+ region=region,
574
+ inst_info={
575
+ 'inst_id': inst.identifier,
576
+ 'ad': inst.availability_domain,
577
+ 'compartment': inst.compartment_id,
578
+ })
579
+ nsg_ids = vnic.nsg_ids
580
+ if not nsg_ids:
581
+ net_client.update_vnic(
582
+ vnic_id=vnic.id,
583
+ update_vnic_details=oci_adaptor.oci.core.models.
584
+ UpdateVnicDetails(nsg_ids=[nsg_id],
585
+ skip_source_dest_check=False),
586
+ )
587
+
588
+ # pylint: disable=line-too-long
589
+ list_nsg_rules_resp = net_client.list_network_security_group_security_rules(
590
+ network_security_group_id=nsg_id,
591
+ direction='INGRESS',
592
+ sort_by='TIMECREATED',
593
+ sort_order='DESC',
594
+ )
595
+
596
+ ingress_rules: List = list_nsg_rules_resp.data
597
+ existing_port_ranges: List[str] = []
598
+ for r in ingress_rules:
599
+ if r.tcp_options:
600
+ options_range = r.tcp_options.destination_port_range
601
+ rule_port_range = f'{options_range.min}-{options_range.max}'
602
+ existing_port_ranges.append(rule_port_range)
603
+
604
+ new_ports = resources_utils.port_ranges_to_set(ports)
605
+ existing_ports = resources_utils.port_ranges_to_set(
606
+ existing_port_ranges)
607
+ if new_ports.issubset(existing_ports):
608
+ # ports are already contained in the existing rules; nothing to add.
609
+ return
610
+
611
+ # Determine the ports to be added, without overlapping.
612
+ ports_to_open = new_ports - existing_ports
613
+ port_ranges_to_open = resources_utils.port_set_to_ranges(ports_to_open)
614
+
615
+ new_rules = []
616
+ for port_range in port_ranges_to_open:
617
+ port_range_min, port_range_max = cls.get_range_min_max(port_range)
618
+ new_rules.append(
619
+ oci_adaptor.oci.core.models.AddSecurityRuleDetails(
620
+ direction='INGRESS',
621
+ protocol='6',
622
+ is_stateless=False,
623
+ source=oci_utils.oci_config.VCN_CIDR_INTERNET,
624
+ source_type='CIDR_BLOCK',
625
+ tcp_options=oci_adaptor.oci.core.models.TcpOptions(
626
+ destination_port_range=oci_adaptor.oci.core.models.
627
+ PortRange(min=port_range_min, max=port_range_max),),
628
+ description=oci_utils.oci_config.SERVICE_PORT_RULE_TAG,
629
+ ))
630
+
631
+ net_client.add_network_security_group_security_rules(
632
+ network_security_group_id=nsg_id,
633
+ add_network_security_group_security_rules_details=oci_adaptor.oci.
634
+ core.models.AddNetworkSecurityGroupSecurityRulesDetails(
635
+ security_rules=new_rules),
636
+ )
637
+
638
+ @classmethod
639
+ @debug_enabled(logger)
640
+ def detach_nsg(cls, region: str, inst, nsg_id: Optional[str]) -> None:
641
+ if nsg_id is None:
642
+ return
643
+
644
+ vnic = cls.get_instance_primary_vnic(
645
+ region=region,
646
+ inst_info={
647
+ 'inst_id': inst.identifier,
648
+ 'ad': inst.availability_domain,
649
+ 'compartment': inst.compartment_id,
650
+ })
651
+
652
+ # Detach the NSG before removing it.
653
+ oci_adaptor.get_net_client(region, oci_utils.oci_config.get_profile(
654
+ )).update_vnic(
655
+ vnic_id=vnic.id,
656
+ update_vnic_details=oci_adaptor.oci.core.models.UpdateVnicDetails(
657
+ nsg_ids=[], skip_source_dest_check=False),
658
+ )
659
+
660
+ @classmethod
661
+ @debug_enabled(logger)
662
+ def remove_cluster_nsg(cls, region: str, cluster_name: str) -> None:
663
+ """ Remove NSG of the cluster """
664
+ net_client = oci_adaptor.get_net_client(
665
+ region, oci_utils.oci_config.get_profile())
666
+
667
+ nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
668
+ cluster_name=cluster_name)
669
+ nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
670
+ if nsg_id is None:
671
+ return
672
+
673
+ # Delete the NSG
674
+ net_client.delete_network_security_group(
675
+ network_security_group_id=nsg_id)
676
+
471
677
 
472
678
  query_helper = QueryHelper()
sky/serve/core.py CHANGED
@@ -701,6 +701,7 @@ def tail_logs(
701
701
  with ux_utils.print_exception_no_traceback():
702
702
  raise ValueError(f'`target` must be a string or '
703
703
  f'sky.serve.ServiceComponent, got {type(target)}.')
704
+
704
705
  if target == serve_utils.ServiceComponent.REPLICA:
705
706
  if replica_id is None:
706
707
  with ux_utils.print_exception_no_traceback():
sky/serve/serve_utils.py CHANGED
@@ -46,8 +46,14 @@ NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB //
46
46
  constants.CONTROLLER_MEMORY_USAGE_GB)
47
47
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
48
48
 
49
- _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*'
50
- _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*'
49
+ # NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
50
+ # and always appear after a space. Be careful when changing UX as this
51
+ # assumption is used to expand some log files while ignoring others.
52
+ _SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
53
+ _SKYPILOT_PROVISION_LOG_PATTERN = (
54
+ fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
55
+ _SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
56
+
51
57
  # TODO(tian): Find all existing replica id and print here.
52
58
  _FAILED_TO_FIND_REPLICA_MSG = (
53
59
  f'{colorama.Fore.RED}Failed to find replica '
@@ -591,7 +597,7 @@ def get_latest_version_with_min_replicas(
591
597
  return active_versions[-1] if active_versions else None
592
598
 
593
599
 
594
- def _follow_replica_logs(
600
+ def _follow_logs_with_provision_expanding(
595
601
  file: TextIO,
596
602
  cluster_name: str,
597
603
  *,
@@ -599,7 +605,7 @@ def _follow_replica_logs(
599
605
  stop_on_eof: bool = False,
600
606
  idle_timeout_seconds: Optional[int] = None,
601
607
  ) -> Iterator[str]:
602
- """Follows logs for a replica, handling nested log files.
608
+ """Follows logs and expands any provision.log references found.
603
609
 
604
610
  Args:
605
611
  file: Log file to read from.
@@ -610,7 +616,7 @@ def _follow_replica_logs(
610
616
  new content.
611
617
 
612
618
  Yields:
613
- Log lines from the main file and any nested log files.
619
+ Log lines, including expanded content from referenced provision logs.
614
620
  """
615
621
 
616
622
  def cluster_is_up() -> bool:
@@ -620,36 +626,35 @@ def _follow_replica_logs(
620
626
  return cluster_record['status'] == status_lib.ClusterStatus.UP
621
627
 
622
628
  def process_line(line: str) -> Iterator[str]:
623
- # Tailing detailed progress for user. All logs in skypilot is
624
- # of format `To view detailed progress: tail -n100 -f *.log`.
625
- # Check if the line is directing users to view logs
629
+ # The line might be directing users to view logs, like
630
+ # `✓ Cluster launched: new-http. View logs at: *.log`
631
+ # We should tail the detailed logs for user.
626
632
  provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
627
- other_log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
633
+ log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
628
634
 
629
635
  if provision_log_prompt is not None:
630
636
  nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
631
- with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
632
- # We still exit if more than 10 seconds without new content
633
- # to avoid any internal bug that causes the launch to fail
634
- # while cluster status remains INIT.
635
- # Originally, we output the next line first before printing
636
- # the launching logs. Since the next line is always
637
- # `Launching on <cloud> <region> (<zone>)`, we output it first
638
- # to indicate the process is starting.
639
- # TODO(andyl): After refactor #4323, the above logic is broken,
640
- # but coincidentally with the new UX 3.0, the `Cluster launched`
641
- # message is printed first, making the output appear correct.
642
- # Explaining this since it's technically a breaking change
643
- # for this refactor PR #4323. Will remove soon in a fix PR
644
- # for adapting the serve.follow_logs to the new UX.
645
- yield from _follow_replica_logs(f,
646
- cluster_name,
647
- should_stop=cluster_is_up,
648
- stop_on_eof=stop_on_eof,
649
- idle_timeout_seconds=10)
637
+
638
+ try:
639
+ with open(nested_log_path, 'r', newline='',
640
+ encoding='utf-8') as f:
641
+ # We still exit if more than 10 seconds without new content
642
+ # to avoid any internal bug that causes the launch to fail
643
+ # while cluster status remains INIT.
644
+ yield from log_utils.follow_logs(f,
645
+ should_stop=cluster_is_up,
646
+ stop_on_eof=stop_on_eof,
647
+ idle_timeout_seconds=10)
648
+ except FileNotFoundError:
649
+ yield line
650
+
651
+ yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
652
+ f'Try to expand log file {nested_log_path} but not '
653
+ f'found. Skipping...{colorama.Style.RESET_ALL}')
654
+ pass
650
655
  return
651
656
 
652
- if other_log_prompt is not None:
657
+ if log_prompt is not None:
653
658
  # Now we skip other logs (file sync logs) since we lack
654
659
  # utility to determine when these log files are finished
655
660
  # writing.
@@ -702,7 +707,7 @@ def stream_replica_logs(service_name: str, replica_id: int,
702
707
  replica_provisioned = (
703
708
  lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
704
709
  with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
705
- for line in _follow_replica_logs(
710
+ for line in _follow_logs_with_provision_expanding(
706
711
  f,
707
712
  replica_cluster_name,
708
713
  should_stop=replica_provisioned,
sky/skylet/constants.py CHANGED
@@ -75,7 +75,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
75
75
  # cluster yaml is updated.
76
76
  #
77
77
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
78
- SKYLET_VERSION = '8'
78
+ SKYLET_VERSION = '9'
79
79
  # The version of the lib files that skylet/jobs use. Whenever there is an API
80
80
  # change for the job_lib or log_lib, we need to bump this version, so that the
81
81
  # user can be notified to update their SkyPilot version on the remote cluster.