skypilot-nightly 1.0.0.dev20241109__py3-none-any.whl → 1.0.0.dev20241110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +0 -19
- sky/clouds/oci.py +11 -21
- sky/clouds/service_catalog/oci_catalog.py +1 -1
- sky/clouds/utils/oci_utils.py +16 -2
- sky/dag.py +19 -15
- sky/provision/__init__.py +1 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +430 -0
- sky/{skylet/providers/oci/query_helper.py → provision/oci/query_utils.py} +148 -59
- sky/setup_files/MANIFEST.in +0 -1
- sky/skylet/job_lib.py +29 -17
- sky/templates/oci-ray.yml.j2 +3 -53
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/RECORD +20 -20
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/utils.py +0 -21
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/top_level.txt +0 -0
@@ -1,56 +1,75 @@
|
|
1
|
-
"""
|
2
|
-
Helper class for some OCI operations methods which needs to be shared/called
|
3
|
-
by multiple places.
|
1
|
+
"""OCI query helper class
|
4
2
|
|
5
3
|
History:
|
6
|
-
- Hysun He (hysun.he@oracle.com) @
|
7
|
-
|
4
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Code here mainly
|
5
|
+
migrated from the old provisioning API.
|
6
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
|
7
|
+
find_compartment: allow search subtree when find a compartment.
|
8
8
|
"""
|
9
|
-
|
10
9
|
from datetime import datetime
|
11
|
-
import
|
10
|
+
import functools
|
11
|
+
from logging import Logger
|
12
12
|
import re
|
13
13
|
import time
|
14
14
|
import traceback
|
15
15
|
import typing
|
16
16
|
from typing import Optional
|
17
17
|
|
18
|
+
from sky import sky_logging
|
18
19
|
from sky.adaptors import common as adaptors_common
|
19
20
|
from sky.adaptors import oci as oci_adaptor
|
20
21
|
from sky.clouds.utils import oci_utils
|
21
|
-
from sky.skylet.providers.oci import utils
|
22
22
|
|
23
23
|
if typing.TYPE_CHECKING:
|
24
24
|
import pandas as pd
|
25
25
|
else:
|
26
26
|
pd = adaptors_common.LazyImport('pandas')
|
27
27
|
|
28
|
-
logger =
|
28
|
+
logger = sky_logging.init_logger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
def debug_enabled(log: Logger):
|
32
|
+
|
33
|
+
def decorate(f):
|
34
|
+
|
35
|
+
@functools.wraps(f)
|
36
|
+
def wrapper(*args, **kwargs):
|
37
|
+
dt_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
38
|
+
log.debug(f'{dt_str} Enter {f}, {args}, {kwargs}')
|
39
|
+
try:
|
40
|
+
return f(*args, **kwargs)
|
41
|
+
finally:
|
42
|
+
log.debug(f'{dt_str} Exit {f}')
|
43
|
+
|
44
|
+
return wrapper
|
29
45
|
|
46
|
+
return decorate
|
30
47
|
|
31
|
-
class oci_query_helper:
|
32
48
|
|
49
|
+
class QueryHelper:
|
50
|
+
"""Helper class for some OCI operations
|
51
|
+
"""
|
33
52
|
# Call Cloud API to try getting the satisfied nodes.
|
34
53
|
@classmethod
|
35
|
-
@
|
54
|
+
@debug_enabled(logger)
|
36
55
|
def query_instances_by_tags(cls, tag_filters, region):
|
37
56
|
|
38
|
-
where_clause_tags =
|
57
|
+
where_clause_tags = ''
|
39
58
|
for tag_key in tag_filters:
|
40
|
-
if where_clause_tags !=
|
41
|
-
where_clause_tags +=
|
59
|
+
if where_clause_tags != '':
|
60
|
+
where_clause_tags += ' && '
|
42
61
|
|
43
62
|
tag_value = tag_filters[tag_key]
|
44
|
-
where_clause_tags += (f
|
45
|
-
f
|
63
|
+
where_clause_tags += (f'(freeformTags.key = \'{tag_key}\''
|
64
|
+
f' && freeformTags.value = \'{tag_value}\')')
|
46
65
|
|
47
|
-
qv_str = (f
|
48
|
-
f
|
49
|
-
f
|
66
|
+
qv_str = (f'query instance resources where {where_clause_tags}'
|
67
|
+
f' && (lifecycleState != \'TERMINATED\''
|
68
|
+
f' && lifecycleState != \'TERMINATING\')')
|
50
69
|
|
51
70
|
qv = oci_adaptor.oci.resource_search.models.StructuredSearchDetails(
|
52
71
|
query=qv_str,
|
53
|
-
type=
|
72
|
+
type='Structured',
|
54
73
|
matching_context_type=oci_adaptor.oci.resource_search.models.
|
55
74
|
SearchDetails.MATCHING_CONTEXT_TYPE_NONE,
|
56
75
|
)
|
@@ -63,44 +82,98 @@ class oci_query_helper:
|
|
63
82
|
|
64
83
|
@classmethod
|
65
84
|
def terminate_instances_by_tags(cls, tag_filters, region) -> int:
|
66
|
-
logger.debug(f
|
85
|
+
logger.debug(f'Terminate instance by tags: {tag_filters}')
|
67
86
|
insts = cls.query_instances_by_tags(tag_filters, region)
|
68
87
|
fail_count = 0
|
69
88
|
for inst in insts:
|
70
89
|
inst_id = inst.identifier
|
71
|
-
logger.debug(f
|
90
|
+
logger.debug(f'Got instance(to be terminated): {inst_id}')
|
72
91
|
|
73
92
|
try:
|
74
93
|
oci_adaptor.get_core_client(
|
75
94
|
region,
|
76
95
|
oci_utils.oci_config.get_profile()).terminate_instance(
|
77
96
|
inst_id)
|
78
|
-
except
|
97
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
79
98
|
fail_count += 1
|
80
|
-
logger.error(f
|
99
|
+
logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
|
81
100
|
traceback.print_exc()
|
82
101
|
|
83
102
|
if fail_count == 0:
|
84
|
-
logger.debug(
|
103
|
+
logger.debug('Instance teardown result: OK')
|
85
104
|
else:
|
86
|
-
logger.
|
105
|
+
logger.warning(f'Instance teardown result: {fail_count} failed!')
|
87
106
|
|
88
107
|
return fail_count
|
89
108
|
|
90
109
|
@classmethod
|
91
|
-
@
|
110
|
+
@debug_enabled(logger)
|
111
|
+
def launch_instance(cls, region, launch_config):
|
112
|
+
""" To create a new instance """
|
113
|
+
return oci_adaptor.get_core_client(
|
114
|
+
region, oci_utils.oci_config.get_profile()).launch_instance(
|
115
|
+
launch_instance_details=launch_config)
|
116
|
+
|
117
|
+
@classmethod
|
118
|
+
@debug_enabled(logger)
|
119
|
+
def start_instance(cls, region, instance_id):
|
120
|
+
""" To start an existing instance """
|
121
|
+
return oci_adaptor.get_core_client(
|
122
|
+
region, oci_utils.oci_config.get_profile()).instance_action(
|
123
|
+
instance_id=instance_id, action='START')
|
124
|
+
|
125
|
+
@classmethod
|
126
|
+
@debug_enabled(logger)
|
127
|
+
def stop_instance(cls, region, instance_id):
|
128
|
+
""" To stop an instance """
|
129
|
+
return oci_adaptor.get_core_client(
|
130
|
+
region, oci_utils.oci_config.get_profile()).instance_action(
|
131
|
+
instance_id=instance_id, action='STOP')
|
132
|
+
|
133
|
+
@classmethod
|
134
|
+
@debug_enabled(logger)
|
135
|
+
def wait_instance_until_status(cls, region, node_id, status):
|
136
|
+
""" To wait a instance becoming the specified state """
|
137
|
+
compute_client = oci_adaptor.get_core_client(
|
138
|
+
region, oci_utils.oci_config.get_profile())
|
139
|
+
|
140
|
+
resp = compute_client.get_instance(instance_id=node_id)
|
141
|
+
|
142
|
+
oci_adaptor.oci.wait_until(
|
143
|
+
compute_client,
|
144
|
+
resp,
|
145
|
+
'lifecycle_state',
|
146
|
+
status,
|
147
|
+
)
|
148
|
+
|
149
|
+
@classmethod
|
150
|
+
def get_instance_primary_vnic(cls, region, inst_info):
|
151
|
+
""" Get the primary vnic infomation of the instance """
|
152
|
+
list_vnic_attachments_response = oci_adaptor.get_core_client(
|
153
|
+
region, oci_utils.oci_config.get_profile()).list_vnic_attachments(
|
154
|
+
availability_domain=inst_info['ad'],
|
155
|
+
compartment_id=inst_info['compartment'],
|
156
|
+
instance_id=inst_info['inst_id'],
|
157
|
+
)
|
158
|
+
vnic = list_vnic_attachments_response.data[0]
|
159
|
+
return oci_adaptor.get_net_client(
|
160
|
+
region, oci_utils.oci_config.get_profile()).get_vnic(
|
161
|
+
vnic_id=vnic.vnic_id).data
|
162
|
+
|
163
|
+
@classmethod
|
164
|
+
@debug_enabled(logger)
|
92
165
|
def subscribe_image(cls, compartment_id, listing_id, resource_version,
|
93
166
|
region):
|
94
|
-
if (pd.isna(listing_id) or listing_id.strip() ==
|
95
|
-
listing_id.strip() ==
|
167
|
+
if (pd.isna(listing_id) or listing_id.strip() == 'None' or
|
168
|
+
listing_id.strip() == 'nan'):
|
96
169
|
return
|
97
170
|
|
98
171
|
core_client = oci_adaptor.get_core_client(
|
99
172
|
region, oci_utils.oci_config.get_profile())
|
100
173
|
try:
|
101
|
-
|
174
|
+
agreements_resp = core_client.get_app_catalog_listing_agreements(
|
102
175
|
listing_id=listing_id, resource_version=resource_version)
|
103
|
-
agreements =
|
176
|
+
agreements = agreements_resp.data
|
104
177
|
|
105
178
|
core_client.create_app_catalog_subscription(
|
106
179
|
create_app_catalog_subscription_details=oci_adaptor.oci.core.
|
@@ -113,24 +186,24 @@ class oci_query_helper:
|
|
113
186
|
oracle_terms_of_use_link,
|
114
187
|
time_retrieved=datetime.strptime(
|
115
188
|
re.sub(
|
116
|
-
|
117
|
-
|
189
|
+
r'\d{3}\+\d{2}\:\d{2}',
|
190
|
+
'Z',
|
118
191
|
str(agreements.time_retrieved),
|
119
192
|
0,
|
120
193
|
),
|
121
|
-
|
194
|
+
'%Y-%m-%d %H:%M:%S.%fZ',
|
122
195
|
),
|
123
196
|
signature=agreements.signature,
|
124
197
|
eula_link=agreements.eula_link,
|
125
198
|
))
|
126
|
-
except
|
199
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
127
200
|
logger.critical(
|
128
|
-
f
|
129
|
-
f
|
130
|
-
raise RuntimeError(
|
201
|
+
f'[Failed] subscribe_image: {listing_id} - {resource_version}'
|
202
|
+
f'Error message: {str(e)}')
|
203
|
+
raise RuntimeError('ERR: Image subscription error!') from e
|
131
204
|
|
132
205
|
@classmethod
|
133
|
-
@
|
206
|
+
@debug_enabled(logger)
|
134
207
|
def find_compartment(cls, region) -> str:
|
135
208
|
""" If compartment is not configured, we use root compartment """
|
136
209
|
# Try to use the configured one first
|
@@ -143,12 +216,18 @@ class oci_query_helper:
|
|
143
216
|
# config file is supported (2023/06/09).
|
144
217
|
root = oci_adaptor.get_oci_config(
|
145
218
|
region, oci_utils.oci_config.get_profile())['tenancy']
|
219
|
+
|
146
220
|
list_compartments_response = oci_adaptor.get_identity_client(
|
147
221
|
region, oci_utils.oci_config.get_profile()).list_compartments(
|
148
222
|
compartment_id=root,
|
149
223
|
name=oci_utils.oci_config.COMPARTMENT,
|
224
|
+
compartment_id_in_subtree=True,
|
225
|
+
access_level='ACCESSIBLE',
|
150
226
|
lifecycle_state='ACTIVE',
|
227
|
+
sort_by='TIMECREATED',
|
228
|
+
sort_order='DESC',
|
151
229
|
limit=1)
|
230
|
+
|
152
231
|
compartments = list_compartments_response.data
|
153
232
|
if len(compartments) > 0:
|
154
233
|
skypilot_compartment = compartments[0].id
|
@@ -159,7 +238,7 @@ class oci_query_helper:
|
|
159
238
|
return skypilot_compartment
|
160
239
|
|
161
240
|
@classmethod
|
162
|
-
@
|
241
|
+
@debug_enabled(logger)
|
163
242
|
def find_create_vcn_subnet(cls, region) -> Optional[str]:
|
164
243
|
""" If sub is not configured, we find/create VCN skypilot_vcn """
|
165
244
|
subnet = oci_utils.oci_config.get_vcn_subnet(region)
|
@@ -174,7 +253,7 @@ class oci_query_helper:
|
|
174
253
|
list_vcns_response = net_client.list_vcns(
|
175
254
|
compartment_id=skypilot_compartment,
|
176
255
|
display_name=oci_utils.oci_config.VCN_NAME,
|
177
|
-
lifecycle_state=
|
256
|
+
lifecycle_state='AVAILABLE')
|
178
257
|
vcns = list_vcns_response.data
|
179
258
|
if len(vcns) > 0:
|
180
259
|
# Found the VCN.
|
@@ -184,7 +263,7 @@ class oci_query_helper:
|
|
184
263
|
limit=1,
|
185
264
|
vcn_id=skypilot_vcn,
|
186
265
|
display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
|
187
|
-
lifecycle_state=
|
266
|
+
lifecycle_state='AVAILABLE')
|
188
267
|
logger.debug(f'Got VCN subnet \n{list_subnets_response.data}')
|
189
268
|
if len(list_subnets_response.data) < 1:
|
190
269
|
logger.error(
|
@@ -201,10 +280,17 @@ class oci_query_helper:
|
|
201
280
|
return cls.create_vcn_subnet(net_client, skypilot_compartment)
|
202
281
|
|
203
282
|
@classmethod
|
204
|
-
@
|
283
|
+
@debug_enabled(logger)
|
205
284
|
def create_vcn_subnet(cls, net_client,
|
206
285
|
skypilot_compartment) -> Optional[str]:
|
286
|
+
|
287
|
+
skypilot_vcn = None # VCN for the resources
|
288
|
+
subnet = None # Subnet for the VMs
|
289
|
+
ig = None # Internet gateway
|
290
|
+
sg = None # Service gateway
|
291
|
+
|
207
292
|
try:
|
293
|
+
# pylint: disable=line-too-long
|
208
294
|
create_vcn_response = net_client.create_vcn(
|
209
295
|
create_vcn_details=oci_adaptor.oci.core.models.CreateVcnDetails(
|
210
296
|
compartment_id=skypilot_compartment,
|
@@ -274,38 +360,38 @@ class oci_query_helper:
|
|
274
360
|
update_security_list_details=oci_adaptor.oci.core.models.
|
275
361
|
UpdateSecurityListDetails(ingress_security_rules=[
|
276
362
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
277
|
-
protocol=
|
363
|
+
protocol='6',
|
278
364
|
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
279
365
|
is_stateless=False,
|
280
|
-
source_type=
|
366
|
+
source_type='CIDR_BLOCK',
|
281
367
|
tcp_options=oci_adaptor.oci.core.models.TcpOptions(
|
282
368
|
destination_port_range=oci_adaptor.oci.core.models.
|
283
369
|
PortRange(max=22, min=22),
|
284
370
|
source_port_range=oci_adaptor.oci.core.models.
|
285
371
|
PortRange(max=65535, min=1)),
|
286
|
-
description=
|
372
|
+
description='Allow SSH port.'),
|
287
373
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
288
|
-
protocol=
|
374
|
+
protocol='all',
|
289
375
|
source=oci_utils.oci_config.VCN_SUBNET_CIDR,
|
290
376
|
is_stateless=False,
|
291
|
-
source_type=
|
292
|
-
description=
|
377
|
+
source_type='CIDR_BLOCK',
|
378
|
+
description='Allow all traffic from/to same subnet.'),
|
293
379
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
294
|
-
protocol=
|
380
|
+
protocol='1',
|
295
381
|
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
296
382
|
is_stateless=False,
|
297
|
-
source_type=
|
383
|
+
source_type='CIDR_BLOCK',
|
298
384
|
icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
|
299
385
|
type=3, code=4),
|
300
|
-
description=
|
386
|
+
description='ICMP traffic.'),
|
301
387
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
302
|
-
protocol=
|
388
|
+
protocol='1',
|
303
389
|
source=oci_utils.oci_config.VCN_CIDR,
|
304
390
|
is_stateless=False,
|
305
|
-
source_type=
|
391
|
+
source_type='CIDR_BLOCK',
|
306
392
|
icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
|
307
393
|
type=3),
|
308
|
-
description=
|
394
|
+
description='ICMP traffic (VCN).'),
|
309
395
|
]))
|
310
396
|
logger.debug(
|
311
397
|
f'Updated security_list: \n{update_security_list_response.data}'
|
@@ -325,7 +411,7 @@ class oci_query_helper:
|
|
325
411
|
]))
|
326
412
|
logger.debug(f'Route table: \n{update_route_table_response.data}')
|
327
413
|
|
328
|
-
except oci_adaptor.
|
414
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
329
415
|
logger.error(f'Create VCN Error: Create new VCN '
|
330
416
|
f'{oci_utils.oci_config.VCN_NAME} failed: {str(e)}')
|
331
417
|
# In case of partial success while creating vcn
|
@@ -335,7 +421,7 @@ class oci_query_helper:
|
|
335
421
|
return subnet
|
336
422
|
|
337
423
|
@classmethod
|
338
|
-
@
|
424
|
+
@debug_enabled(logger)
|
339
425
|
def delete_vcn(cls, net_client, skypilot_vcn, skypilot_subnet,
|
340
426
|
internet_gateway, service_gateway):
|
341
427
|
if skypilot_vcn is None:
|
@@ -369,7 +455,7 @@ class oci_query_helper:
|
|
369
455
|
f'Deleted vcn {skypilot_vcn}-{delete_vcn_response.data}'
|
370
456
|
)
|
371
457
|
break
|
372
|
-
except oci_adaptor.
|
458
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
373
459
|
logger.info(f'Waiting del SG/IG/Subnet finish: {str(e)}')
|
374
460
|
retry_count = retry_count + 1
|
375
461
|
if retry_count == oci_utils.oci_config.MAX_RETRY_COUNT:
|
@@ -378,6 +464,9 @@ class oci_query_helper:
|
|
378
464
|
time.sleep(
|
379
465
|
oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS)
|
380
466
|
|
381
|
-
except oci_adaptor.
|
467
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
382
468
|
logger.error(
|
383
469
|
f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
|
470
|
+
|
471
|
+
|
472
|
+
query_helper = QueryHelper()
|
sky/setup_files/MANIFEST.in
CHANGED
@@ -6,7 +6,6 @@ include sky/setup_files/*
|
|
6
6
|
include sky/skylet/*.sh
|
7
7
|
include sky/skylet/LICENSE
|
8
8
|
include sky/skylet/providers/ibm/*
|
9
|
-
include sky/skylet/providers/oci/*
|
10
9
|
include sky/skylet/providers/scp/*
|
11
10
|
include sky/skylet/providers/*.py
|
12
11
|
include sky/skylet/ray_patches/*.patch
|
sky/skylet/job_lib.py
CHANGED
@@ -11,8 +11,7 @@ import shlex
|
|
11
11
|
import sqlite3
|
12
12
|
import subprocess
|
13
13
|
import time
|
14
|
-
import
|
15
|
-
from typing import Any, Dict, List, Optional, Tuple
|
14
|
+
from typing import Any, Dict, List, Optional
|
16
15
|
|
17
16
|
import colorama
|
18
17
|
import filelock
|
@@ -24,9 +23,6 @@ from sky.utils import common_utils
|
|
24
23
|
from sky.utils import db_utils
|
25
24
|
from sky.utils import log_utils
|
26
25
|
|
27
|
-
if typing.TYPE_CHECKING:
|
28
|
-
from ray.dashboard.modules.job import pydantic_models as ray_pydantic
|
29
|
-
|
30
26
|
logger = sky_logging.init_logger(__name__)
|
31
27
|
|
32
28
|
_LINUX_NEW_LINE = '\n'
|
@@ -184,12 +180,20 @@ class JobScheduler:
|
|
184
180
|
def schedule_step(self, force_update_jobs: bool = False) -> None:
|
185
181
|
if force_update_jobs:
|
186
182
|
update_status()
|
187
|
-
|
183
|
+
pending_job_ids = self._get_pending_job_ids()
|
188
184
|
# TODO(zhwu, mraheja): One optimization can be allowing more than one
|
189
185
|
# job staying in the pending state after ray job submit, so that to be
|
190
186
|
# faster to schedule a large amount of jobs.
|
191
|
-
for job_id
|
187
|
+
for job_id in pending_job_ids:
|
192
188
|
with filelock.FileLock(_get_lock_path(job_id)):
|
189
|
+
pending_job = _get_pending_job(job_id)
|
190
|
+
if pending_job is None:
|
191
|
+
# Pending job can be removed by another thread, due to the
|
192
|
+
# job being scheduled already.
|
193
|
+
continue
|
194
|
+
run_cmd = pending_job['run_cmd']
|
195
|
+
submit = pending_job['submit']
|
196
|
+
created_time = pending_job['created_time']
|
193
197
|
# We don't have to refresh the job status before checking, as
|
194
198
|
# the job status will only be stale in rare cases where ray job
|
195
199
|
# crashes; or the job stays in INIT state for a long time.
|
@@ -208,8 +212,8 @@ class JobScheduler:
|
|
208
212
|
self._run_job(job_id, run_cmd)
|
209
213
|
return
|
210
214
|
|
211
|
-
def
|
212
|
-
"""Returns the
|
215
|
+
def _get_pending_job_ids(self) -> List[int]:
|
216
|
+
"""Returns the job ids in the pending jobs table
|
213
217
|
|
214
218
|
The information contains job_id, run command, submit time,
|
215
219
|
creation time.
|
@@ -220,9 +224,10 @@ class JobScheduler:
|
|
220
224
|
class FIFOScheduler(JobScheduler):
|
221
225
|
"""First in first out job scheduler"""
|
222
226
|
|
223
|
-
def
|
224
|
-
|
225
|
-
|
227
|
+
def _get_pending_job_ids(self) -> List[int]:
|
228
|
+
rows = _CURSOR.execute(
|
229
|
+
'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
|
230
|
+
return [row[0] for row in rows]
|
226
231
|
|
227
232
|
|
228
233
|
scheduler = FIFOScheduler()
|
@@ -519,11 +524,16 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
|
|
519
524
|
|
520
525
|
|
521
526
|
def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
|
522
|
-
rows = _CURSOR.execute(
|
523
|
-
|
527
|
+
rows = _CURSOR.execute(
|
528
|
+
'SELECT created_time, submit, run_cmd FROM pending_jobs '
|
529
|
+
f'WHERE job_id={job_id!r}')
|
524
530
|
for row in rows:
|
525
|
-
created_time, submit = row
|
526
|
-
return {
|
531
|
+
created_time, submit, run_cmd = row
|
532
|
+
return {
|
533
|
+
'created_time': created_time,
|
534
|
+
'submit': submit,
|
535
|
+
'run_cmd': run_cmd
|
536
|
+
}
|
527
537
|
return None
|
528
538
|
|
529
539
|
|
@@ -794,7 +804,9 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
|
|
794
804
|
logger.warning(str(e))
|
795
805
|
continue
|
796
806
|
|
797
|
-
|
807
|
+
# Get the job status again to avoid race condition.
|
808
|
+
job_status = get_status_no_lock(job['job_id'])
|
809
|
+
if job_status in [
|
798
810
|
JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING
|
799
811
|
]:
|
800
812
|
_set_status_no_lock(job['job_id'], JobStatus.CANCELLED)
|
sky/templates/oci-ray.yml.j2
CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
|
|
7
7
|
|
8
8
|
provider:
|
9
9
|
type: external
|
10
|
-
module: sky.
|
10
|
+
module: sky.provision.oci
|
11
11
|
region: {{region}}
|
12
12
|
cache_stopped_nodes: True
|
13
13
|
# Disable launch config check for worker nodes as it can cause resource leakage.
|
@@ -39,25 +39,6 @@ available_node_types:
|
|
39
39
|
Preemptible: {{use_spot}}
|
40
40
|
AuthorizedKey: |
|
41
41
|
skypilot:ssh_public_key_content
|
42
|
-
{% if num_nodes > 1 %}
|
43
|
-
ray_worker_default:
|
44
|
-
min_workers: {{num_nodes - 1}}
|
45
|
-
max_workers: {{num_nodes - 1}}
|
46
|
-
resources: {}
|
47
|
-
node_config:
|
48
|
-
InstanceType: {{instance_type}}
|
49
|
-
VCPUs: {{cpus}}
|
50
|
-
MemoryInGbs: {{memory}}
|
51
|
-
BootVolumeSize: {{disk_size}}
|
52
|
-
BootVolumePerf: {{vpu}}
|
53
|
-
AvailabilityDomain: {{zone}}
|
54
|
-
ImageId: {{image}}
|
55
|
-
AppCatalogListingId: {{app_catalog_listing_id}}
|
56
|
-
ResourceVersion: {{resource_version}}
|
57
|
-
Preemptible: {{use_spot}}
|
58
|
-
AuthorizedKey: |
|
59
|
-
skypilot:ssh_public_key_content
|
60
|
-
{%- endif %}
|
61
42
|
|
62
43
|
head_node_type: ray_head_default
|
63
44
|
|
@@ -70,9 +51,6 @@ file_mounts: {
|
|
70
51
|
{%- endfor %}
|
71
52
|
}
|
72
53
|
|
73
|
-
rsync_exclude: []
|
74
|
-
|
75
|
-
initialization_commands: []
|
76
54
|
|
77
55
|
# List of shell commands to run to set up nodes.
|
78
56
|
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
@@ -113,34 +91,6 @@ setup_commands:
|
|
113
91
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
114
92
|
sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT;
|
115
93
|
|
116
|
-
# Command to start ray
|
117
|
-
#
|
118
|
-
# connection, which is expensive. Try your best to co-locate commands into fewer
|
119
|
-
# items! The same comment applies for worker_start_ray_commands.
|
120
|
-
#
|
121
|
-
# Increment the following for catching performance bugs easier:
|
122
|
-
# current num items (num SSH connections): 2
|
123
|
-
head_start_ray_commands:
|
124
|
-
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
|
125
|
-
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
|
126
|
-
# all the sessions to be reloaded. This is a workaround.
|
127
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
128
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
129
|
-
{{dump_port_command}}; {{ray_head_wait_initialized_command}}
|
130
|
-
|
131
|
-
{%- if num_nodes > 1 %}
|
132
|
-
worker_start_ray_commands:
|
133
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
134
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
135
|
-
{%- else %}
|
136
|
-
worker_start_ray_commands: []
|
137
|
-
{%- endif %}
|
138
|
-
|
139
|
-
head_node: {}
|
140
|
-
worker_nodes: {}
|
94
|
+
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
95
|
+
# We do not need to list it here anymore.
|
141
96
|
|
142
|
-
# These fields are required for external cloud providers.
|
143
|
-
head_setup_commands: []
|
144
|
-
worker_setup_commands: []
|
145
|
-
cluster_synced_files: []
|
146
|
-
file_mounts_sync_continuously: False
|