skypilot-nightly 1.0.0.dev20241109__py3-none-any.whl → 1.0.0.dev20241110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,56 +1,75 @@
-"""
-Helper class for some OCI operations methods which needs to be shared/called
-by multiple places.
+"""OCI query helper class
 
 History:
- - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
-
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Code here mainly
+   migrated from the old provisioning API.
+ - Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
+   find_compartment: allow search subtree when find a compartment.
 """
-
 from datetime import datetime
-import logging
+import functools
+from logging import Logger
 import re
 import time
 import traceback
 import typing
 from typing import Optional
 
+from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import oci as oci_adaptor
 from sky.clouds.utils import oci_utils
-from sky.skylet.providers.oci import utils
 
 if typing.TYPE_CHECKING:
     import pandas as pd
 else:
     pd = adaptors_common.LazyImport('pandas')
 
-logger = logging.getLogger(__name__)
+logger = sky_logging.init_logger(__name__)
+
+
+def debug_enabled(log: Logger):
+
+    def decorate(f):
+
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+            dt_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            log.debug(f'{dt_str} Enter {f}, {args}, {kwargs}')
+            try:
+                return f(*args, **kwargs)
+            finally:
+                log.debug(f'{dt_str} Exit {f}')
+
+        return wrapper
 
+    return decorate
 
-class oci_query_helper:
 
+class QueryHelper:
+    """Helper class for some OCI operations
+    """
     # Call Cloud API to try getting the satisfied nodes.
     @classmethod
-    @utils.debug_enabled(logger=logger)
+    @debug_enabled(logger)
     def query_instances_by_tags(cls, tag_filters, region):
 
-        where_clause_tags = ""
+        where_clause_tags = ''
         for tag_key in tag_filters:
-            if where_clause_tags != "":
-                where_clause_tags += " && "
+            if where_clause_tags != '':
+                where_clause_tags += ' && '
 
             tag_value = tag_filters[tag_key]
-            where_clause_tags += (f"(freeformTags.key = '{tag_key}'"
-                                  f" && freeformTags.value = '{tag_value}')")
+            where_clause_tags += (f'(freeformTags.key = \'{tag_key}\''
+                                  f' && freeformTags.value = \'{tag_value}\')')
 
-        qv_str = (f"query instance resources where {where_clause_tags}"
-                  f" && (lifecycleState != 'TERMINATED'"
-                  f" && lifecycleState != 'TERMINATING')")
+        qv_str = (f'query instance resources where {where_clause_tags}'
+                  f' && (lifecycleState != \'TERMINATED\''
+                  f' && lifecycleState != \'TERMINATING\')')
 
         qv = oci_adaptor.oci.resource_search.models.StructuredSearchDetails(
             query=qv_str,
-            type="Structured",
+            type='Structured',
             matching_context_type=oci_adaptor.oci.resource_search.models.
             SearchDetails.MATCHING_CONTEXT_TYPE_NONE,
         )
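For reference, the new module-level `debug_enabled` decorator replaces the `utils.debug_enabled(logger=logger)` helper that was previously imported from `sky.skylet.providers.oci`. A minimal, self-contained sketch of the same entry/exit logging pattern; the logger setup and the `demo_query` function below are illustrative only and not part of the package:

    import functools
    import logging
    from datetime import datetime
    from logging import Logger

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    def debug_enabled(log: Logger):
        """Log entry and exit of the wrapped call at DEBUG level."""

        def decorate(f):

            @functools.wraps(f)
            def wrapper(*args, **kwargs):
                dt_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                log.debug(f'{dt_str} Enter {f}, {args}, {kwargs}')
                try:
                    return f(*args, **kwargs)
                finally:
                    log.debug(f'{dt_str} Exit {f}')

            return wrapper

        return decorate

    @debug_enabled(logger)
    def demo_query(tag_filters: dict) -> int:  # hypothetical example target
        return len(tag_filters)

    demo_query({'skypilot-cluster': 'demo'})  # logs Enter/Exit around the call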
@@ -63,44 +82,98 @@ class oci_query_helper:
 
     @classmethod
     def terminate_instances_by_tags(cls, tag_filters, region) -> int:
-        logger.debug(f"Terminate instance by tags: {tag_filters}")
+        logger.debug(f'Terminate instance by tags: {tag_filters}')
         insts = cls.query_instances_by_tags(tag_filters, region)
         fail_count = 0
         for inst in insts:
             inst_id = inst.identifier
-            logger.debug(f"Got instance(to be terminated): {inst_id}")
+            logger.debug(f'Got instance(to be terminated): {inst_id}')
 
             try:
                 oci_adaptor.get_core_client(
                     region,
                     oci_utils.oci_config.get_profile()).terminate_instance(
                         inst_id)
-            except Exception as e:
+            except oci_adaptor.oci.exceptions.ServiceError as e:
                 fail_count += 1
-                logger.error(f"Terminate instance failed: {str(e)}\n: {inst}")
+                logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
                 traceback.print_exc()
 
         if fail_count == 0:
-            logger.debug(f"Instance teardown result: OK")
+            logger.debug('Instance teardown result: OK')
         else:
-            logger.warn(f"Instance teardown result: {fail_count} failed!")
+            logger.warning(f'Instance teardown result: {fail_count} failed!')
 
         return fail_count
 
     @classmethod
-    @utils.debug_enabled(logger=logger)
+    @debug_enabled(logger)
+    def launch_instance(cls, region, launch_config):
+        """ To create a new instance """
+        return oci_adaptor.get_core_client(
+            region, oci_utils.oci_config.get_profile()).launch_instance(
+                launch_instance_details=launch_config)
+
+    @classmethod
+    @debug_enabled(logger)
+    def start_instance(cls, region, instance_id):
+        """ To start an existing instance """
+        return oci_adaptor.get_core_client(
+            region, oci_utils.oci_config.get_profile()).instance_action(
+                instance_id=instance_id, action='START')
+
+    @classmethod
+    @debug_enabled(logger)
+    def stop_instance(cls, region, instance_id):
+        """ To stop an instance """
+        return oci_adaptor.get_core_client(
+            region, oci_utils.oci_config.get_profile()).instance_action(
+                instance_id=instance_id, action='STOP')
+
+    @classmethod
+    @debug_enabled(logger)
+    def wait_instance_until_status(cls, region, node_id, status):
+        """ To wait a instance becoming the specified state """
+        compute_client = oci_adaptor.get_core_client(
+            region, oci_utils.oci_config.get_profile())
+
+        resp = compute_client.get_instance(instance_id=node_id)
+
+        oci_adaptor.oci.wait_until(
+            compute_client,
+            resp,
+            'lifecycle_state',
+            status,
+        )
+
+    @classmethod
+    def get_instance_primary_vnic(cls, region, inst_info):
+        """ Get the primary vnic infomation of the instance """
+        list_vnic_attachments_response = oci_adaptor.get_core_client(
+            region, oci_utils.oci_config.get_profile()).list_vnic_attachments(
+                availability_domain=inst_info['ad'],
+                compartment_id=inst_info['compartment'],
+                instance_id=inst_info['inst_id'],
+            )
+        vnic = list_vnic_attachments_response.data[0]
+        return oci_adaptor.get_net_client(
+            region, oci_utils.oci_config.get_profile()).get_vnic(
+                vnic_id=vnic.vnic_id).data
+
+    @classmethod
+    @debug_enabled(logger)
     def subscribe_image(cls, compartment_id, listing_id, resource_version,
                         region):
-        if (pd.isna(listing_id) or listing_id.strip() == "None" or
-                listing_id.strip() == "nan"):
+        if (pd.isna(listing_id) or listing_id.strip() == 'None' or
+                listing_id.strip() == 'nan'):
             return
 
         core_client = oci_adaptor.get_core_client(
             region, oci_utils.oci_config.get_profile())
         try:
-            agreements_response = core_client.get_app_catalog_listing_agreements(
+            agreements_resp = core_client.get_app_catalog_listing_agreements(
                 listing_id=listing_id, resource_version=resource_version)
-            agreements = agreements_response.data
+            agreements = agreements_resp.data
 
             core_client.create_app_catalog_subscription(
                 create_app_catalog_subscription_details=oci_adaptor.oci.core.
@@ -113,24 +186,24 @@ class oci_query_helper:
                     oracle_terms_of_use_link,
                     time_retrieved=datetime.strptime(
                         re.sub(
-                            "\d{3}\+\d{2}\:\d{2}",
-                            "Z",
+                            r'\d{3}\+\d{2}\:\d{2}',
+                            'Z',
                             str(agreements.time_retrieved),
                             0,
                         ),
-                        "%Y-%m-%d %H:%M:%S.%fZ",
+                        '%Y-%m-%d %H:%M:%S.%fZ',
                     ),
                     signature=agreements.signature,
                     eula_link=agreements.eula_link,
                 ))
-        except Exception as e:
+        except oci_adaptor.oci.exceptions.ServiceError as e:
             logger.critical(
-                f"subscribe_image: {listing_id} - {resource_version} ... [Failed]"
-                f"Error message: {str(e)}")
-            raise RuntimeError("ERR: Image subscription error!")
+                f'[Failed] subscribe_image: {listing_id} - {resource_version}'
+                f'Error message: {str(e)}')
+            raise RuntimeError('ERR: Image subscription error!') from e
 
     @classmethod
-    @utils.debug_enabled(logger=logger)
+    @debug_enabled(logger)
     def find_compartment(cls, region) -> str:
         """ If compartment is not configured, we use root compartment """
         # Try to use the configured one first
@@ -143,12 +216,18 @@ class oci_query_helper:
         # config file is supported (2023/06/09).
         root = oci_adaptor.get_oci_config(
             region, oci_utils.oci_config.get_profile())['tenancy']
+
         list_compartments_response = oci_adaptor.get_identity_client(
             region, oci_utils.oci_config.get_profile()).list_compartments(
                 compartment_id=root,
                 name=oci_utils.oci_config.COMPARTMENT,
+                compartment_id_in_subtree=True,
+                access_level='ACCESSIBLE',
                 lifecycle_state='ACTIVE',
+                sort_by='TIMECREATED',
+                sort_order='DESC',
                 limit=1)
+
         compartments = list_compartments_response.data
         if len(compartments) > 0:
             skypilot_compartment = compartments[0].id
@@ -159,7 +238,7 @@
         return skypilot_compartment
 
     @classmethod
-    @utils.debug_enabled(logger=logger)
+    @debug_enabled(logger)
     def find_create_vcn_subnet(cls, region) -> Optional[str]:
         """ If sub is not configured, we find/create VCN skypilot_vcn """
         subnet = oci_utils.oci_config.get_vcn_subnet(region)
@@ -174,7 +253,7 @@
         list_vcns_response = net_client.list_vcns(
             compartment_id=skypilot_compartment,
             display_name=oci_utils.oci_config.VCN_NAME,
-            lifecycle_state="AVAILABLE")
+            lifecycle_state='AVAILABLE')
         vcns = list_vcns_response.data
         if len(vcns) > 0:
             # Found the VCN.
@@ -184,7 +263,7 @@
                 limit=1,
                 vcn_id=skypilot_vcn,
                 display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
-                lifecycle_state="AVAILABLE")
+                lifecycle_state='AVAILABLE')
             logger.debug(f'Got VCN subnet \n{list_subnets_response.data}')
             if len(list_subnets_response.data) < 1:
                 logger.error(
@@ -201,10 +280,17 @@
             return cls.create_vcn_subnet(net_client, skypilot_compartment)
 
     @classmethod
-    @utils.debug_enabled(logger=logger)
+    @debug_enabled(logger)
     def create_vcn_subnet(cls, net_client,
                           skypilot_compartment) -> Optional[str]:
+
+        skypilot_vcn = None  # VCN for the resources
+        subnet = None  # Subnet for the VMs
+        ig = None  # Internet gateway
+        sg = None  # Service gateway
+
         try:
+            # pylint: disable=line-too-long
             create_vcn_response = net_client.create_vcn(
                 create_vcn_details=oci_adaptor.oci.core.models.CreateVcnDetails(
                     compartment_id=skypilot_compartment,
@@ -274,38 +360,38 @@
                 update_security_list_details=oci_adaptor.oci.core.models.
                 UpdateSecurityListDetails(ingress_security_rules=[
                     oci_adaptor.oci.core.models.IngressSecurityRule(
-                        protocol="6",
+                        protocol='6',
                         source=oci_utils.oci_config.VCN_CIDR_INTERNET,
                         is_stateless=False,
-                        source_type="CIDR_BLOCK",
+                        source_type='CIDR_BLOCK',
                         tcp_options=oci_adaptor.oci.core.models.TcpOptions(
                             destination_port_range=oci_adaptor.oci.core.models.
                             PortRange(max=22, min=22),
                             source_port_range=oci_adaptor.oci.core.models.
                             PortRange(max=65535, min=1)),
-                        description="Allow SSH port."),
+                        description='Allow SSH port.'),
                     oci_adaptor.oci.core.models.IngressSecurityRule(
-                        protocol="all",
+                        protocol='all',
                         source=oci_utils.oci_config.VCN_SUBNET_CIDR,
                         is_stateless=False,
-                        source_type="CIDR_BLOCK",
-                        description="Allow all traffic from/to same subnet."),
+                        source_type='CIDR_BLOCK',
+                        description='Allow all traffic from/to same subnet.'),
                     oci_adaptor.oci.core.models.IngressSecurityRule(
-                        protocol="1",
+                        protocol='1',
                         source=oci_utils.oci_config.VCN_CIDR_INTERNET,
                         is_stateless=False,
-                        source_type="CIDR_BLOCK",
+                        source_type='CIDR_BLOCK',
                         icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
                             type=3, code=4),
-                        description="ICMP traffic."),
+                        description='ICMP traffic.'),
                     oci_adaptor.oci.core.models.IngressSecurityRule(
-                        protocol="1",
+                        protocol='1',
                         source=oci_utils.oci_config.VCN_CIDR,
                         is_stateless=False,
-                        source_type="CIDR_BLOCK",
+                        source_type='CIDR_BLOCK',
                         icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
                             type=3),
-                        description="ICMP traffic (VCN)."),
+                        description='ICMP traffic (VCN).'),
                 ]))
             logger.debug(
                 f'Updated security_list: \n{update_security_list_response.data}'
@@ -325,7 +411,7 @@
                 ]))
             logger.debug(f'Route table: \n{update_route_table_response.data}')
 
-        except oci_adaptor.service_exception() as e:
+        except oci_adaptor.oci.exceptions.ServiceError as e:
             logger.error(f'Create VCN Error: Create new VCN '
                          f'{oci_utils.oci_config.VCN_NAME} failed: {str(e)}')
             # In case of partial success while creating vcn
@@ -335,7 +421,7 @@
         return subnet
 
     @classmethod
-    @utils.debug_enabled(logger=logger)
+    @debug_enabled(logger)
     def delete_vcn(cls, net_client, skypilot_vcn, skypilot_subnet,
                    internet_gateway, service_gateway):
         if skypilot_vcn is None:
@@ -369,7 +455,7 @@
                         f'Deleted vcn {skypilot_vcn}-{delete_vcn_response.data}'
                     )
                     break
-                except oci_adaptor.service_exception() as e:
+                except oci_adaptor.oci.exceptions.ServiceError as e:
                     logger.info(f'Waiting del SG/IG/Subnet finish: {str(e)}')
                     retry_count = retry_count + 1
                     if retry_count == oci_utils.oci_config.MAX_RETRY_COUNT:
@@ -378,6 +464,9 @@
                     time.sleep(
                         oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS)
 
-        except oci_adaptor.service_exception() as e:
+        except oci_adaptor.oci.exceptions.ServiceError as e:
             logger.error(
                 f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
+
+
+query_helper = QueryHelper()
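A hedged usage sketch for the new module-level `query_helper` singleton defined above. The import path below is an assumption (the diff does not show the file name of this module), and the region and tag values are illustrative only:

    # Assumed import path -- the diff does not name the module that defines
    # QueryHelper/query_helper, so this import line is hypothetical.
    from sky.provision.oci import query_utils

    region = 'us-sanjose-1'  # illustrative region
    tag_filters = {'sky-cluster-name': 'my-cluster'}  # illustrative tags

    # Find non-terminated instances carrying the given freeform tags.
    instances = query_utils.query_helper.query_instances_by_tags(
        tag_filters, region)
    for inst in instances:
        print('Matched instance:', inst.identifier)

    # Tear down whatever matched; the return value counts failed terminations.
    fail_count = query_utils.query_helper.terminate_instances_by_tags(
        tag_filters, region)
    print(f'{fail_count} instances failed to terminate')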
@@ -6,7 +6,6 @@ include sky/setup_files/*
 include sky/skylet/*.sh
 include sky/skylet/LICENSE
 include sky/skylet/providers/ibm/*
-include sky/skylet/providers/oci/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
sky/skylet/job_lib.py CHANGED
@@ -11,8 +11,7 @@ import shlex
 import sqlite3
 import subprocess
 import time
-import typing
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 
 import colorama
 import filelock
@@ -24,9 +23,6 @@ from sky.utils import common_utils
 from sky.utils import db_utils
 from sky.utils import log_utils
 
-if typing.TYPE_CHECKING:
-    from ray.dashboard.modules.job import pydantic_models as ray_pydantic
-
 logger = sky_logging.init_logger(__name__)
 
 _LINUX_NEW_LINE = '\n'
@@ -184,12 +180,20 @@ class JobScheduler:
     def schedule_step(self, force_update_jobs: bool = False) -> None:
         if force_update_jobs:
             update_status()
-        pending_jobs = self._get_pending_jobs()
+        pending_job_ids = self._get_pending_job_ids()
         # TODO(zhwu, mraheja): One optimization can be allowing more than one
        # job staying in the pending state after ray job submit, so that to be
        # faster to schedule a large amount of jobs.
-        for job_id, run_cmd, submit, created_time in pending_jobs:
+        for job_id in pending_job_ids:
             with filelock.FileLock(_get_lock_path(job_id)):
+                pending_job = _get_pending_job(job_id)
+                if pending_job is None:
+                    # Pending job can be removed by another thread, due to the
+                    # job being scheduled already.
+                    continue
+                run_cmd = pending_job['run_cmd']
+                submit = pending_job['submit']
+                created_time = pending_job['created_time']
                 # We don't have to refresh the job status before checking, as
                 # the job status will only be stale in rare cases where ray job
                 # crashes; or the job stays in INIT state for a long time.
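The comment added above explains the race this restructuring avoids: `schedule_step` now fetches only the pending job ids, then re-reads each row with `_get_pending_job` after the per-job file lock is held, so a job that another thread has already scheduled (and removed from the table) is simply skipped. A minimal sketch of that lock-then-recheck pattern; the lock path helper and the injected callables below are stand-ins for illustration, not the real job_lib internals:

    import filelock

    def _get_lock_path(job_id: int) -> str:  # stand-in for job_lib's helper
        return f'/tmp/.sky_job_{job_id}.lock'

    def schedule_pending(pending_job_ids, get_pending_job, run_job):
        """Re-check each pending job under its lock before running it."""
        for job_id in pending_job_ids:
            with filelock.FileLock(_get_lock_path(job_id)):
                pending_job = get_pending_job(job_id)
                if pending_job is None:
                    # Another thread scheduled this job between our id query
                    # and acquiring the lock; nothing left to do.
                    continue
                run_job(job_id, pending_job['run_cmd'])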
@@ -208,8 +212,8 @@ class JobScheduler:
                 self._run_job(job_id, run_cmd)
                 return
 
-    def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
-        """Returns the metadata for jobs in the pending jobs table
+    def _get_pending_job_ids(self) -> List[int]:
+        """Returns the job ids in the pending jobs table
 
         The information contains job_id, run command, submit time,
         creation time.
@@ -220,9 +224,10 @@
 class FIFOScheduler(JobScheduler):
     """First in first out job scheduler"""
 
-    def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]:
-        return list(
-            _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
+    def _get_pending_job_ids(self) -> List[int]:
+        rows = _CURSOR.execute(
+            'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
+        return [row[0] for row in rows]
 
 
 scheduler = FIFOScheduler()
@@ -519,11 +524,16 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
 
 
 def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
-    rows = _CURSOR.execute('SELECT created_time, submit FROM pending_jobs '
-                           f'WHERE job_id={job_id!r}')
+    rows = _CURSOR.execute(
+        'SELECT created_time, submit, run_cmd FROM pending_jobs '
+        f'WHERE job_id={job_id!r}')
     for row in rows:
-        created_time, submit = row
-        return {'created_time': created_time, 'submit': submit}
+        created_time, submit, run_cmd = row
+        return {
+            'created_time': created_time,
+            'submit': submit,
+            'run_cmd': run_cmd
+        }
     return None
 
 
@@ -794,7 +804,9 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
                 logger.warning(str(e))
                 continue
 
-            if job['status'] in [
+            # Get the job status again to avoid race condition.
+            job_status = get_status_no_lock(job['job_id'])
+            if job_status in [
                     JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING
             ]:
                 _set_status_no_lock(job['job_id'], JobStatus.CANCELLED)
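The cancellation change follows the same defensive pattern: the status is re-read with `get_status_no_lock` immediately before the transition, so a job that finished between the initial listing and this point is not marked CANCELLED. A schematic sketch of that check, with statuses abbreviated to strings and the two callables assumed to behave like their job_lib counterparts:

    CANCELLABLE_STATUSES = ('PENDING', 'SETTING_UP', 'RUNNING')

    def cancel_one(job_id, get_status_no_lock, set_status_no_lock) -> bool:
        # Re-read the status at the last moment to avoid racing with a job
        # that just finished or failed; only still-active jobs are cancelled.
        status = get_status_no_lock(job_id)
        if status in CANCELLABLE_STATUSES:
            set_status_no_lock(job_id, 'CANCELLED')
            return True
        return False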
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
 
 provider:
   type: external
-  module: sky.skylet.providers.oci.OCINodeProvider
+  module: sky.provision.oci
   region: {{region}}
   cache_stopped_nodes: True
   # Disable launch config check for worker nodes as it can cause resource leakage.
@@ -39,25 +39,6 @@ available_node_types:
       Preemptible: {{use_spot}}
       AuthorizedKey: |
         skypilot:ssh_public_key_content
-{% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      InstanceType: {{instance_type}}
-      VCPUs: {{cpus}}
-      MemoryInGbs: {{memory}}
-      BootVolumeSize: {{disk_size}}
-      BootVolumePerf: {{vpu}}
-      AvailabilityDomain: {{zone}}
-      ImageId: {{image}}
-      AppCatalogListingId: {{app_catalog_listing_id}}
-      ResourceVersion: {{resource_version}}
-      Preemptible: {{use_spot}}
-      AuthorizedKey: |
-        skypilot:ssh_public_key_content
-{%- endif %}
 
 head_node_type: ray_head_default
 
@@ -70,9 +51,6 @@ file_mounts: {
 {%- endfor %}
 }
 
-rsync_exclude: []
-
-initialization_commands: []
 
 # List of shell commands to run to set up nodes.
 # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
@@ -113,34 +91,6 @@ setup_commands:
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
     sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT;
 
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 2
-head_start_ray_commands:
-  # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
-  # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
-  # all the sessions to be reloaded. This is a workaround.
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
 
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skypilot-nightly
-Version: 1.0.0.dev20241109
+Version: 1.0.0.dev20241110
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0