skypilot-nightly 1.0.0.dev20241108__py3-none-any.whl → 1.0.0.dev20241110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +6 -21
- sky/backends/wheel_utils.py +5 -1
- sky/cli.py +25 -1
- sky/clouds/oci.py +11 -21
- sky/clouds/service_catalog/oci_catalog.py +1 -1
- sky/clouds/utils/oci_utils.py +16 -2
- sky/core.py +3 -2
- sky/dag.py +20 -15
- sky/data/mounting_utils.py +4 -16
- sky/exceptions.py +4 -1
- sky/execution.py +10 -8
- sky/jobs/core.py +3 -1
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -5
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +430 -0
- sky/{skylet/providers/oci/query_helper.py → provision/oci/query_utils.py} +148 -59
- sky/serve/core.py +11 -1
- sky/setup_files/MANIFEST.in +0 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +39 -20
- sky/skylet/log_lib.py +77 -8
- sky/templates/kubernetes-ray.yml.j2 +3 -1
- sky/templates/oci-ray.yml.j2 +3 -53
- sky/utils/admin_policy_utils.py +1 -0
- sky/utils/command_runner.py +14 -2
- sky/utils/control_master_utils.py +49 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/RECORD +35 -34
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/utils.py +0 -21
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241108.dist-info → skypilot_nightly-1.0.0.dev20241110.dist-info}/top_level.txt +0 -0
@@ -1,56 +1,75 @@
|
|
1
|
-
"""
|
2
|
-
Helper class for some OCI operations methods which needs to be shared/called
|
3
|
-
by multiple places.
|
1
|
+
"""OCI query helper class
|
4
2
|
|
5
3
|
History:
|
6
|
-
- Hysun He (hysun.he@oracle.com) @
|
7
|
-
|
4
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Code here mainly
|
5
|
+
migrated from the old provisioning API.
|
6
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
|
7
|
+
find_compartment: allow search subtree when find a compartment.
|
8
8
|
"""
|
9
|
-
|
10
9
|
from datetime import datetime
|
11
|
-
import
|
10
|
+
import functools
|
11
|
+
from logging import Logger
|
12
12
|
import re
|
13
13
|
import time
|
14
14
|
import traceback
|
15
15
|
import typing
|
16
16
|
from typing import Optional
|
17
17
|
|
18
|
+
from sky import sky_logging
|
18
19
|
from sky.adaptors import common as adaptors_common
|
19
20
|
from sky.adaptors import oci as oci_adaptor
|
20
21
|
from sky.clouds.utils import oci_utils
|
21
|
-
from sky.skylet.providers.oci import utils
|
22
22
|
|
23
23
|
if typing.TYPE_CHECKING:
|
24
24
|
import pandas as pd
|
25
25
|
else:
|
26
26
|
pd = adaptors_common.LazyImport('pandas')
|
27
27
|
|
28
|
-
logger =
|
28
|
+
logger = sky_logging.init_logger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
def debug_enabled(log: Logger):
|
32
|
+
|
33
|
+
def decorate(f):
|
34
|
+
|
35
|
+
@functools.wraps(f)
|
36
|
+
def wrapper(*args, **kwargs):
|
37
|
+
dt_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
38
|
+
log.debug(f'{dt_str} Enter {f}, {args}, {kwargs}')
|
39
|
+
try:
|
40
|
+
return f(*args, **kwargs)
|
41
|
+
finally:
|
42
|
+
log.debug(f'{dt_str} Exit {f}')
|
43
|
+
|
44
|
+
return wrapper
|
29
45
|
|
46
|
+
return decorate
|
30
47
|
|
31
|
-
class oci_query_helper:
|
32
48
|
|
49
|
+
class QueryHelper:
|
50
|
+
"""Helper class for some OCI operations
|
51
|
+
"""
|
33
52
|
# Call Cloud API to try getting the satisfied nodes.
|
34
53
|
@classmethod
|
35
|
-
@
|
54
|
+
@debug_enabled(logger)
|
36
55
|
def query_instances_by_tags(cls, tag_filters, region):
|
37
56
|
|
38
|
-
where_clause_tags =
|
57
|
+
where_clause_tags = ''
|
39
58
|
for tag_key in tag_filters:
|
40
|
-
if where_clause_tags !=
|
41
|
-
where_clause_tags +=
|
59
|
+
if where_clause_tags != '':
|
60
|
+
where_clause_tags += ' && '
|
42
61
|
|
43
62
|
tag_value = tag_filters[tag_key]
|
44
|
-
where_clause_tags += (f
|
45
|
-
f
|
63
|
+
where_clause_tags += (f'(freeformTags.key = \'{tag_key}\''
|
64
|
+
f' && freeformTags.value = \'{tag_value}\')')
|
46
65
|
|
47
|
-
qv_str = (f
|
48
|
-
f
|
49
|
-
f
|
66
|
+
qv_str = (f'query instance resources where {where_clause_tags}'
|
67
|
+
f' && (lifecycleState != \'TERMINATED\''
|
68
|
+
f' && lifecycleState != \'TERMINATING\')')
|
50
69
|
|
51
70
|
qv = oci_adaptor.oci.resource_search.models.StructuredSearchDetails(
|
52
71
|
query=qv_str,
|
53
|
-
type=
|
72
|
+
type='Structured',
|
54
73
|
matching_context_type=oci_adaptor.oci.resource_search.models.
|
55
74
|
SearchDetails.MATCHING_CONTEXT_TYPE_NONE,
|
56
75
|
)
|
@@ -63,44 +82,98 @@ class oci_query_helper:
|
|
63
82
|
|
64
83
|
@classmethod
|
65
84
|
def terminate_instances_by_tags(cls, tag_filters, region) -> int:
|
66
|
-
logger.debug(f
|
85
|
+
logger.debug(f'Terminate instance by tags: {tag_filters}')
|
67
86
|
insts = cls.query_instances_by_tags(tag_filters, region)
|
68
87
|
fail_count = 0
|
69
88
|
for inst in insts:
|
70
89
|
inst_id = inst.identifier
|
71
|
-
logger.debug(f
|
90
|
+
logger.debug(f'Got instance(to be terminated): {inst_id}')
|
72
91
|
|
73
92
|
try:
|
74
93
|
oci_adaptor.get_core_client(
|
75
94
|
region,
|
76
95
|
oci_utils.oci_config.get_profile()).terminate_instance(
|
77
96
|
inst_id)
|
78
|
-
except
|
97
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
79
98
|
fail_count += 1
|
80
|
-
logger.error(f
|
99
|
+
logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
|
81
100
|
traceback.print_exc()
|
82
101
|
|
83
102
|
if fail_count == 0:
|
84
|
-
logger.debug(
|
103
|
+
logger.debug('Instance teardown result: OK')
|
85
104
|
else:
|
86
|
-
logger.
|
105
|
+
logger.warning(f'Instance teardown result: {fail_count} failed!')
|
87
106
|
|
88
107
|
return fail_count
|
89
108
|
|
90
109
|
@classmethod
|
91
|
-
@
|
110
|
+
@debug_enabled(logger)
|
111
|
+
def launch_instance(cls, region, launch_config):
|
112
|
+
""" To create a new instance """
|
113
|
+
return oci_adaptor.get_core_client(
|
114
|
+
region, oci_utils.oci_config.get_profile()).launch_instance(
|
115
|
+
launch_instance_details=launch_config)
|
116
|
+
|
117
|
+
@classmethod
|
118
|
+
@debug_enabled(logger)
|
119
|
+
def start_instance(cls, region, instance_id):
|
120
|
+
""" To start an existing instance """
|
121
|
+
return oci_adaptor.get_core_client(
|
122
|
+
region, oci_utils.oci_config.get_profile()).instance_action(
|
123
|
+
instance_id=instance_id, action='START')
|
124
|
+
|
125
|
+
@classmethod
|
126
|
+
@debug_enabled(logger)
|
127
|
+
def stop_instance(cls, region, instance_id):
|
128
|
+
""" To stop an instance """
|
129
|
+
return oci_adaptor.get_core_client(
|
130
|
+
region, oci_utils.oci_config.get_profile()).instance_action(
|
131
|
+
instance_id=instance_id, action='STOP')
|
132
|
+
|
133
|
+
@classmethod
|
134
|
+
@debug_enabled(logger)
|
135
|
+
def wait_instance_until_status(cls, region, node_id, status):
|
136
|
+
""" To wait a instance becoming the specified state """
|
137
|
+
compute_client = oci_adaptor.get_core_client(
|
138
|
+
region, oci_utils.oci_config.get_profile())
|
139
|
+
|
140
|
+
resp = compute_client.get_instance(instance_id=node_id)
|
141
|
+
|
142
|
+
oci_adaptor.oci.wait_until(
|
143
|
+
compute_client,
|
144
|
+
resp,
|
145
|
+
'lifecycle_state',
|
146
|
+
status,
|
147
|
+
)
|
148
|
+
|
149
|
+
@classmethod
|
150
|
+
def get_instance_primary_vnic(cls, region, inst_info):
|
151
|
+
""" Get the primary vnic infomation of the instance """
|
152
|
+
list_vnic_attachments_response = oci_adaptor.get_core_client(
|
153
|
+
region, oci_utils.oci_config.get_profile()).list_vnic_attachments(
|
154
|
+
availability_domain=inst_info['ad'],
|
155
|
+
compartment_id=inst_info['compartment'],
|
156
|
+
instance_id=inst_info['inst_id'],
|
157
|
+
)
|
158
|
+
vnic = list_vnic_attachments_response.data[0]
|
159
|
+
return oci_adaptor.get_net_client(
|
160
|
+
region, oci_utils.oci_config.get_profile()).get_vnic(
|
161
|
+
vnic_id=vnic.vnic_id).data
|
162
|
+
|
163
|
+
@classmethod
|
164
|
+
@debug_enabled(logger)
|
92
165
|
def subscribe_image(cls, compartment_id, listing_id, resource_version,
|
93
166
|
region):
|
94
|
-
if (pd.isna(listing_id) or listing_id.strip() ==
|
95
|
-
listing_id.strip() ==
|
167
|
+
if (pd.isna(listing_id) or listing_id.strip() == 'None' or
|
168
|
+
listing_id.strip() == 'nan'):
|
96
169
|
return
|
97
170
|
|
98
171
|
core_client = oci_adaptor.get_core_client(
|
99
172
|
region, oci_utils.oci_config.get_profile())
|
100
173
|
try:
|
101
|
-
|
174
|
+
agreements_resp = core_client.get_app_catalog_listing_agreements(
|
102
175
|
listing_id=listing_id, resource_version=resource_version)
|
103
|
-
agreements =
|
176
|
+
agreements = agreements_resp.data
|
104
177
|
|
105
178
|
core_client.create_app_catalog_subscription(
|
106
179
|
create_app_catalog_subscription_details=oci_adaptor.oci.core.
|
@@ -113,24 +186,24 @@ class oci_query_helper:
|
|
113
186
|
oracle_terms_of_use_link,
|
114
187
|
time_retrieved=datetime.strptime(
|
115
188
|
re.sub(
|
116
|
-
|
117
|
-
|
189
|
+
r'\d{3}\+\d{2}\:\d{2}',
|
190
|
+
'Z',
|
118
191
|
str(agreements.time_retrieved),
|
119
192
|
0,
|
120
193
|
),
|
121
|
-
|
194
|
+
'%Y-%m-%d %H:%M:%S.%fZ',
|
122
195
|
),
|
123
196
|
signature=agreements.signature,
|
124
197
|
eula_link=agreements.eula_link,
|
125
198
|
))
|
126
|
-
except
|
199
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
127
200
|
logger.critical(
|
128
|
-
f
|
129
|
-
f
|
130
|
-
raise RuntimeError(
|
201
|
+
f'[Failed] subscribe_image: {listing_id} - {resource_version}'
|
202
|
+
f'Error message: {str(e)}')
|
203
|
+
raise RuntimeError('ERR: Image subscription error!') from e
|
131
204
|
|
132
205
|
@classmethod
|
133
|
-
@
|
206
|
+
@debug_enabled(logger)
|
134
207
|
def find_compartment(cls, region) -> str:
|
135
208
|
""" If compartment is not configured, we use root compartment """
|
136
209
|
# Try to use the configured one first
|
@@ -143,12 +216,18 @@ class oci_query_helper:
|
|
143
216
|
# config file is supported (2023/06/09).
|
144
217
|
root = oci_adaptor.get_oci_config(
|
145
218
|
region, oci_utils.oci_config.get_profile())['tenancy']
|
219
|
+
|
146
220
|
list_compartments_response = oci_adaptor.get_identity_client(
|
147
221
|
region, oci_utils.oci_config.get_profile()).list_compartments(
|
148
222
|
compartment_id=root,
|
149
223
|
name=oci_utils.oci_config.COMPARTMENT,
|
224
|
+
compartment_id_in_subtree=True,
|
225
|
+
access_level='ACCESSIBLE',
|
150
226
|
lifecycle_state='ACTIVE',
|
227
|
+
sort_by='TIMECREATED',
|
228
|
+
sort_order='DESC',
|
151
229
|
limit=1)
|
230
|
+
|
152
231
|
compartments = list_compartments_response.data
|
153
232
|
if len(compartments) > 0:
|
154
233
|
skypilot_compartment = compartments[0].id
|
@@ -159,7 +238,7 @@ class oci_query_helper:
|
|
159
238
|
return skypilot_compartment
|
160
239
|
|
161
240
|
@classmethod
|
162
|
-
@
|
241
|
+
@debug_enabled(logger)
|
163
242
|
def find_create_vcn_subnet(cls, region) -> Optional[str]:
|
164
243
|
""" If sub is not configured, we find/create VCN skypilot_vcn """
|
165
244
|
subnet = oci_utils.oci_config.get_vcn_subnet(region)
|
@@ -174,7 +253,7 @@ class oci_query_helper:
|
|
174
253
|
list_vcns_response = net_client.list_vcns(
|
175
254
|
compartment_id=skypilot_compartment,
|
176
255
|
display_name=oci_utils.oci_config.VCN_NAME,
|
177
|
-
lifecycle_state=
|
256
|
+
lifecycle_state='AVAILABLE')
|
178
257
|
vcns = list_vcns_response.data
|
179
258
|
if len(vcns) > 0:
|
180
259
|
# Found the VCN.
|
@@ -184,7 +263,7 @@ class oci_query_helper:
|
|
184
263
|
limit=1,
|
185
264
|
vcn_id=skypilot_vcn,
|
186
265
|
display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
|
187
|
-
lifecycle_state=
|
266
|
+
lifecycle_state='AVAILABLE')
|
188
267
|
logger.debug(f'Got VCN subnet \n{list_subnets_response.data}')
|
189
268
|
if len(list_subnets_response.data) < 1:
|
190
269
|
logger.error(
|
@@ -201,10 +280,17 @@ class oci_query_helper:
|
|
201
280
|
return cls.create_vcn_subnet(net_client, skypilot_compartment)
|
202
281
|
|
203
282
|
@classmethod
|
204
|
-
@
|
283
|
+
@debug_enabled(logger)
|
205
284
|
def create_vcn_subnet(cls, net_client,
|
206
285
|
skypilot_compartment) -> Optional[str]:
|
286
|
+
|
287
|
+
skypilot_vcn = None # VCN for the resources
|
288
|
+
subnet = None # Subnet for the VMs
|
289
|
+
ig = None # Internet gateway
|
290
|
+
sg = None # Service gateway
|
291
|
+
|
207
292
|
try:
|
293
|
+
# pylint: disable=line-too-long
|
208
294
|
create_vcn_response = net_client.create_vcn(
|
209
295
|
create_vcn_details=oci_adaptor.oci.core.models.CreateVcnDetails(
|
210
296
|
compartment_id=skypilot_compartment,
|
@@ -274,38 +360,38 @@ class oci_query_helper:
|
|
274
360
|
update_security_list_details=oci_adaptor.oci.core.models.
|
275
361
|
UpdateSecurityListDetails(ingress_security_rules=[
|
276
362
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
277
|
-
protocol=
|
363
|
+
protocol='6',
|
278
364
|
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
279
365
|
is_stateless=False,
|
280
|
-
source_type=
|
366
|
+
source_type='CIDR_BLOCK',
|
281
367
|
tcp_options=oci_adaptor.oci.core.models.TcpOptions(
|
282
368
|
destination_port_range=oci_adaptor.oci.core.models.
|
283
369
|
PortRange(max=22, min=22),
|
284
370
|
source_port_range=oci_adaptor.oci.core.models.
|
285
371
|
PortRange(max=65535, min=1)),
|
286
|
-
description=
|
372
|
+
description='Allow SSH port.'),
|
287
373
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
288
|
-
protocol=
|
374
|
+
protocol='all',
|
289
375
|
source=oci_utils.oci_config.VCN_SUBNET_CIDR,
|
290
376
|
is_stateless=False,
|
291
|
-
source_type=
|
292
|
-
description=
|
377
|
+
source_type='CIDR_BLOCK',
|
378
|
+
description='Allow all traffic from/to same subnet.'),
|
293
379
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
294
|
-
protocol=
|
380
|
+
protocol='1',
|
295
381
|
source=oci_utils.oci_config.VCN_CIDR_INTERNET,
|
296
382
|
is_stateless=False,
|
297
|
-
source_type=
|
383
|
+
source_type='CIDR_BLOCK',
|
298
384
|
icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
|
299
385
|
type=3, code=4),
|
300
|
-
description=
|
386
|
+
description='ICMP traffic.'),
|
301
387
|
oci_adaptor.oci.core.models.IngressSecurityRule(
|
302
|
-
protocol=
|
388
|
+
protocol='1',
|
303
389
|
source=oci_utils.oci_config.VCN_CIDR,
|
304
390
|
is_stateless=False,
|
305
|
-
source_type=
|
391
|
+
source_type='CIDR_BLOCK',
|
306
392
|
icmp_options=oci_adaptor.oci.core.models.IcmpOptions(
|
307
393
|
type=3),
|
308
|
-
description=
|
394
|
+
description='ICMP traffic (VCN).'),
|
309
395
|
]))
|
310
396
|
logger.debug(
|
311
397
|
f'Updated security_list: \n{update_security_list_response.data}'
|
@@ -325,7 +411,7 @@ class oci_query_helper:
|
|
325
411
|
]))
|
326
412
|
logger.debug(f'Route table: \n{update_route_table_response.data}')
|
327
413
|
|
328
|
-
except oci_adaptor.
|
414
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
329
415
|
logger.error(f'Create VCN Error: Create new VCN '
|
330
416
|
f'{oci_utils.oci_config.VCN_NAME} failed: {str(e)}')
|
331
417
|
# In case of partial success while creating vcn
|
@@ -335,7 +421,7 @@ class oci_query_helper:
|
|
335
421
|
return subnet
|
336
422
|
|
337
423
|
@classmethod
|
338
|
-
@
|
424
|
+
@debug_enabled(logger)
|
339
425
|
def delete_vcn(cls, net_client, skypilot_vcn, skypilot_subnet,
|
340
426
|
internet_gateway, service_gateway):
|
341
427
|
if skypilot_vcn is None:
|
@@ -369,7 +455,7 @@ class oci_query_helper:
|
|
369
455
|
f'Deleted vcn {skypilot_vcn}-{delete_vcn_response.data}'
|
370
456
|
)
|
371
457
|
break
|
372
|
-
except oci_adaptor.
|
458
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
373
459
|
logger.info(f'Waiting del SG/IG/Subnet finish: {str(e)}')
|
374
460
|
retry_count = retry_count + 1
|
375
461
|
if retry_count == oci_utils.oci_config.MAX_RETRY_COUNT:
|
@@ -378,6 +464,9 @@ class oci_query_helper:
|
|
378
464
|
time.sleep(
|
379
465
|
oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS)
|
380
466
|
|
381
|
-
except oci_adaptor.
|
467
|
+
except oci_adaptor.oci.exceptions.ServiceError as e:
|
382
468
|
logger.error(
|
383
469
|
f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
|
470
|
+
|
471
|
+
|
472
|
+
query_helper = QueryHelper()
|
sky/serve/core.py
CHANGED
@@ -124,7 +124,9 @@ def up(
|
|
124
124
|
f'{constants.CLUSTER_NAME_VALID_REGEX}')
|
125
125
|
|
126
126
|
_validate_service_task(task)
|
127
|
-
|
127
|
+
# Always apply the policy again here, even though it might have been applied
|
128
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
129
|
+
# and get the mutated config.
|
128
130
|
dag, mutated_user_config = admin_policy_utils.apply(
|
129
131
|
task, use_mutated_config_in_current_request=False)
|
130
132
|
task = dag.tasks[0]
|
@@ -319,6 +321,14 @@ def update(
|
|
319
321
|
service_name: Name of the service.
|
320
322
|
"""
|
321
323
|
_validate_service_task(task)
|
324
|
+
# Always apply the policy again here, even though it might have been applied
|
325
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
326
|
+
# and get the mutated config.
|
327
|
+
# TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
|
328
|
+
# will not apply the config.
|
329
|
+
dag, _ = admin_policy_utils.apply(
|
330
|
+
task, use_mutated_config_in_current_request=False)
|
331
|
+
task = dag.tasks[0]
|
322
332
|
handle = backend_utils.is_controller_accessible(
|
323
333
|
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
324
334
|
stopped_message=
|
sky/setup_files/MANIFEST.in
CHANGED
@@ -6,7 +6,6 @@ include sky/setup_files/*
|
|
6
6
|
include sky/skylet/*.sh
|
7
7
|
include sky/skylet/LICENSE
|
8
8
|
include sky/skylet/providers/ibm/*
|
9
|
-
include sky/skylet/providers/oci/*
|
10
9
|
include sky/skylet/providers/scp/*
|
11
10
|
include sky/skylet/providers/*.py
|
12
11
|
include sky/skylet/ray_patches/*.patch
|
sky/skylet/constants.py
CHANGED
@@ -79,7 +79,7 @@ SKYLET_VERSION = '8'
|
|
79
79
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
80
80
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
81
81
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
82
|
-
SKYLET_LIB_VERSION =
|
82
|
+
SKYLET_LIB_VERSION = 2
|
83
83
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
84
84
|
|
85
85
|
# `sky jobs dashboard`-related
|
sky/skylet/job_lib.py
CHANGED
@@ -11,8 +11,7 @@ import shlex
|
|
11
11
|
import sqlite3
|
12
12
|
import subprocess
|
13
13
|
import time
|
14
|
-
import
|
15
|
-
from typing import Any, Dict, List, Optional, Tuple
|
14
|
+
from typing import Any, Dict, List, Optional
|
16
15
|
|
17
16
|
import colorama
|
18
17
|
import filelock
|
@@ -24,11 +23,9 @@ from sky.utils import common_utils
|
|
24
23
|
from sky.utils import db_utils
|
25
24
|
from sky.utils import log_utils
|
26
25
|
|
27
|
-
if typing.TYPE_CHECKING:
|
28
|
-
from ray.dashboard.modules.job import pydantic_models as ray_pydantic
|
29
|
-
|
30
26
|
logger = sky_logging.init_logger(__name__)
|
31
27
|
|
28
|
+
_LINUX_NEW_LINE = '\n'
|
32
29
|
_JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
|
33
30
|
|
34
31
|
|
@@ -183,12 +180,20 @@ class JobScheduler:
|
|
183
180
|
def schedule_step(self, force_update_jobs: bool = False) -> None:
|
184
181
|
if force_update_jobs:
|
185
182
|
update_status()
|
186
|
-
|
183
|
+
pending_job_ids = self._get_pending_job_ids()
|
187
184
|
# TODO(zhwu, mraheja): One optimization can be allowing more than one
|
188
185
|
# job staying in the pending state after ray job submit, so that to be
|
189
186
|
# faster to schedule a large amount of jobs.
|
190
|
-
for job_id
|
187
|
+
for job_id in pending_job_ids:
|
191
188
|
with filelock.FileLock(_get_lock_path(job_id)):
|
189
|
+
pending_job = _get_pending_job(job_id)
|
190
|
+
if pending_job is None:
|
191
|
+
# Pending job can be removed by another thread, due to the
|
192
|
+
# job being scheduled already.
|
193
|
+
continue
|
194
|
+
run_cmd = pending_job['run_cmd']
|
195
|
+
submit = pending_job['submit']
|
196
|
+
created_time = pending_job['created_time']
|
192
197
|
# We don't have to refresh the job status before checking, as
|
193
198
|
# the job status will only be stale in rare cases where ray job
|
194
199
|
# crashes; or the job stays in INIT state for a long time.
|
@@ -207,8 +212,8 @@ class JobScheduler:
|
|
207
212
|
self._run_job(job_id, run_cmd)
|
208
213
|
return
|
209
214
|
|
210
|
-
def
|
211
|
-
"""Returns the
|
215
|
+
def _get_pending_job_ids(self) -> List[int]:
|
216
|
+
"""Returns the job ids in the pending jobs table
|
212
217
|
|
213
218
|
The information contains job_id, run command, submit time,
|
214
219
|
creation time.
|
@@ -219,9 +224,10 @@ class JobScheduler:
|
|
219
224
|
class FIFOScheduler(JobScheduler):
|
220
225
|
"""First in first out job scheduler"""
|
221
226
|
|
222
|
-
def
|
223
|
-
|
224
|
-
|
227
|
+
def _get_pending_job_ids(self) -> List[int]:
|
228
|
+
rows = _CURSOR.execute(
|
229
|
+
'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
|
230
|
+
return [row[0] for row in rows]
|
225
231
|
|
226
232
|
|
227
233
|
scheduler = FIFOScheduler()
|
@@ -518,11 +524,16 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
|
|
518
524
|
|
519
525
|
|
520
526
|
def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
|
521
|
-
rows = _CURSOR.execute(
|
522
|
-
|
527
|
+
rows = _CURSOR.execute(
|
528
|
+
'SELECT created_time, submit, run_cmd FROM pending_jobs '
|
529
|
+
f'WHERE job_id={job_id!r}')
|
523
530
|
for row in rows:
|
524
|
-
created_time, submit = row
|
525
|
-
return {
|
531
|
+
created_time, submit, run_cmd = row
|
532
|
+
return {
|
533
|
+
'created_time': created_time,
|
534
|
+
'submit': submit,
|
535
|
+
'run_cmd': run_cmd
|
536
|
+
}
|
526
537
|
return None
|
527
538
|
|
528
539
|
|
@@ -602,6 +613,7 @@ def update_job_status(job_ids: List[int],
|
|
602
613
|
# the pending table until appearing in ray jobs. For jobs
|
603
614
|
# submitted outside of the grace period, we will consider the
|
604
615
|
# ray job status.
|
616
|
+
|
605
617
|
if not (pending_job['submit'] > 0 and pending_job['submit'] <
|
606
618
|
ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
|
607
619
|
# Reset the job status to PENDING even though it may not
|
@@ -792,7 +804,9 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
|
|
792
804
|
logger.warning(str(e))
|
793
805
|
continue
|
794
806
|
|
795
|
-
|
807
|
+
# Get the job status again to avoid race condition.
|
808
|
+
job_status = get_status_no_lock(job['job_id'])
|
809
|
+
if job_status in [
|
796
810
|
JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING
|
797
811
|
]:
|
798
812
|
_set_status_no_lock(job['job_id'], JobStatus.CANCELLED)
|
@@ -903,14 +917,19 @@ class JobLibCodeGen:
|
|
903
917
|
def tail_logs(cls,
|
904
918
|
job_id: Optional[int],
|
905
919
|
managed_job_id: Optional[int],
|
906
|
-
follow: bool = True
|
920
|
+
follow: bool = True,
|
921
|
+
tail: int = 0) -> str:
|
907
922
|
# pylint: disable=line-too-long
|
923
|
+
|
908
924
|
code = [
|
925
|
+
# We use != instead of is not because 1 is not None will print a warning:
|
926
|
+
# <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
|
909
927
|
f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
|
910
928
|
'run_timestamp = job_lib.get_run_timestamp(job_id)',
|
911
929
|
f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
|
912
|
-
f'
|
913
|
-
f'
|
930
|
+
f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
|
931
|
+
f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
|
932
|
+
f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
|
914
933
|
]
|
915
934
|
return cls._build(code)
|
916
935
|
|