skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged by the registry as potentially problematic.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -93,6 +93,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
+    sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
 )
 
 job_info_table = sqlalchemy.Table(
@@ -108,6 +109,8 @@ job_info_table = sqlalchemy.Table(
                       server_default=None),
     sqlalchemy.Column('dag_yaml_path', sqlalchemy.Text),
     sqlalchemy.Column('env_file_path', sqlalchemy.Text),
+    sqlalchemy.Column('dag_yaml_content', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('env_file_content', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('priority',
@@ -117,6 +120,9 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('original_user_yaml_content',
+                      sqlalchemy.Text,
+                      server_default=None),
     sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('current_cluster_name',
                       sqlalchemy.Text,
@@ -125,6 +131,9 @@ job_info_table = sqlalchemy.Table(
                       sqlalchemy.Integer,
                       server_default=None),
     sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('controller_logs_cleaned_at',
+                      sqlalchemy.Float,
+                      server_default=None),
 )
 
 ha_recovery_script_table = sqlalchemy.Table(
@@ -313,6 +322,8 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
 # column names in the DB and it corresponds to the combined view
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
+    # WARNING: If you update these you may also need to update GetJobTable in
+    # the skylet ManagedJobsServiceImpl.
     return {
         '_job_id': r.get('job_id'),  # from spot table
         '_task_name': r.get('job_name'),  # deprecated, from spot table
@@ -339,13 +350,18 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'job_name': r.get('name'),  # from job_info table
         'schedule_state': r.get('schedule_state'),
         'controller_pid': r.get('controller_pid'),
+        # the _path columns are for backwards compatibility, use the _content
+        # columns instead
         'dag_yaml_path': r.get('dag_yaml_path'),
         'env_file_path': r.get('env_file_path'),
+        'dag_yaml_content': r.get('dag_yaml_content'),
+        'env_file_content': r.get('env_file_content'),
         'user_hash': r.get('user_hash'),
         'workspace': r.get('workspace'),
         'priority': r.get('priority'),
         'entrypoint': r.get('entrypoint'),
         'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'original_user_yaml_content': r.get('original_user_yaml_content'),
         'pool': r.get('pool'),
         'current_cluster_name': r.get('current_cluster_name'),
         'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
@@ -1076,7 +1092,8 @@ def _get_all_task_ids_statuses(
 
 @_init_db
 def get_all_task_ids_names_statuses_logs(
-
+        job_id: int
+) -> List[Tuple[int, str, ManagedJobStatus, str, Optional[float]]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         id_names = session.execute(
@@ -1085,9 +1102,10 @@ def get_all_task_ids_names_statuses_logs(
                 spot_table.c.task_name,
                 spot_table.c.status,
                 spot_table.c.local_log_file,
+                spot_table.c.logs_cleaned_at,
             ).where(spot_table.c.spot_job_id == job_id).order_by(
                 spot_table.c.task_id.asc())).fetchall()
-        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3], row[4])
                 for row in id_names]
 
 
@@ -1152,8 +1170,8 @@ def get_failure_reason(job_id: int) -> Optional[str]:
 
 
 @_init_db
-def
-    """Get managed
+def get_managed_job_tasks(job_id: int) -> List[Dict[str, Any]]:
+    """Get managed job tasks for a specific managed job id from the database."""
     assert _SQLALCHEMY_ENGINE is not None
 
     # Join spot and job_info tables to get the job name for each task.
@@ -1168,10 +1186,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         spot_table.outerjoin(
             job_info_table,
             spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-
-
-    query = query.order_by(spot_table.c.spot_job_id.desc(),
-                           spot_table.c.task_id.asc())
+    query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.task_id.asc())
     rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.execute(query).fetchall()
@@ -1186,15 +1202,17 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
             job_dict['metadata'] = json.loads(job_dict['metadata'])
 
         # Add user YAML content for managed jobs.
-
-        if
-        try
-
-
-
-
-
-
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    logger.debug('Failed to read original user YAML for job '
+                                 f'{job_id} from {yaml_path}: {e}')
 
         jobs.append(job_dict)
     return jobs
@@ -1408,7 +1426,13 @@ def get_managed_jobs_with_filters(
     page: Optional[int] = None,
     limit: Optional[int] = None,
 ) -> Tuple[List[Dict[str, Any]], int]:
-    """Get managed jobs from the database with filters.
+    """Get managed jobs from the database with filters.
+
+    Returns:
+        A tuple containing
+        - the list of managed jobs
+        - the total number of managed jobs
+    """
     assert _SQLALCHEMY_ENGINE is not None
 
     count_query = build_managed_jobs_with_filters_query(
@@ -1447,7 +1471,8 @@ def get_managed_jobs_with_filters(
     jobs = []
     for row in rows:
         job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
-        job_dict
+        if job_dict.get('status') is not None:
+            job_dict['status'] = ManagedJobStatus(job_dict['status'])
         if job_dict.get('schedule_state') is not None:
             job_dict['schedule_state'] = ManagedJobScheduleState(
                 job_dict['schedule_state'])
@@ -1457,15 +1482,22 @@ def get_managed_jobs_with_filters(
             job_dict['metadata'] = json.loads(job_dict['metadata'])
 
         # Add user YAML content for managed jobs.
-
-        if
-        try
-
-
-
-
-
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    job_id = job_dict.get('job_id')
+                    if job_id is not None:
+                        logger.debug('Failed to read original user YAML for '
+                                     f'job {job_id} from {yaml_path}: {e}')
+                    else:
+                        logger.debug('Failed to read original user YAML from '
+                                     f'{yaml_path}: {e}')
 
         jobs.append(job_dict)
     return jobs, total
@@ -1511,9 +1543,9 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
 
 
 @_init_db
-def scheduler_set_waiting(job_id: int,
-
-                          priority: int):
+def scheduler_set_waiting(job_id: int, dag_yaml_content: str,
+                          original_user_yaml_content: str,
+                          env_file_content: str, priority: int):
     """Do not call without holding the scheduler lock.
 
     Returns: Whether this is a recovery run or not.
@@ -1525,19 +1557,48 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
-
-
-
-
-
-
-
-
+            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)).update({
+                job_info_table.c.schedule_state:
+                    ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.dag_yaml_content: dag_yaml_content,
+                job_info_table.c.original_user_yaml_content:
+                    (original_user_yaml_content),
+                job_info_table.c.env_file_content: env_file_content,
+                job_info_table.c.priority: priority,
+            })
         session.commit()
         assert updated_count <= 1, (job_id, updated_count)
 
 
+@_init_db
+def get_job_file_contents(job_id: int) -> Dict[str, Optional[str]]:
+    """Return file information and stored contents for a managed job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.dag_yaml_path,
+                job_info_table.c.env_file_path,
+                job_info_table.c.dag_yaml_content,
+                job_info_table.c.env_file_content,
+            ).where(job_info_table.c.spot_job_id == job_id)).fetchone()
+
+        if row is None:
+            return {
+                'dag_yaml_path': None,
+                'env_file_path': None,
+                'dag_yaml_content': None,
+                'env_file_content': None,
+            }
+
+        return {
+            'dag_yaml_path': row[0],
+            'env_file_path': row[1],
+            'dag_yaml_content': row[2],
+            'env_file_content': row[3],
+        }
+
+
 @_init_db
 def get_pool_from_job_id(job_id: int) -> Optional[str]:
     """Get the pool from the job id."""
@@ -2331,3 +2392,118 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
         rows = session.execute(query).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
         return job_ids
+
+
+@_init_db_async
+async def get_task_logs_to_clean_async(retention_seconds: int,
+                                       batch_size) -> List[Dict[str, Any]]:
+    """Get the logs of job tasks to clean.
+
+    The logs of a task will only cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+        result = await session.execute(
+            sqlalchemy.select(
+                spot_table.c.spot_job_id,
+                spot_table.c.task_id,
+                spot_table.c.local_log_file,
+            ).select_from(
+                spot_table.join(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id,
+                )).
+            where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state.is_(
+                        ManagedJobScheduleState.DONE.value),
+                    spot_table.c.end_at.isnot(None),
+                    spot_table.c.end_at < (now - retention_seconds),
+                    spot_table.c.logs_cleaned_at.is_(None),
+                    # The local log file is set AFTER the task is finished,
+                    # add this condition to ensure the entire log file has
+                    # been written.
+                    spot_table.c.local_log_file.isnot(None),
+                )).limit(batch_size))
+        rows = result.fetchall()
+        return [{
+            'job_id': row[0],
+            'task_id': row[1],
+            'local_log_file': row[2]
+        } for row in rows]
+
+
+@_init_db_async
+async def get_controller_logs_to_clean_async(
+        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
+    """Get the controller logs to clean.
+
+    The controller logs will only cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the latest task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+
+        result = await session.execute(
+            sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
+                job_info_table.join(
+                    spot_table,
+                    job_info_table.c.spot_job_id == spot_table.c.spot_job_id,
+                )).where(
+                    sqlalchemy.and_(
+                        job_info_table.c.schedule_state.is_(
+                            ManagedJobScheduleState.DONE.value),
+                        spot_table.c.local_log_file.isnot(None),
+                        job_info_table.c.controller_logs_cleaned_at.is_(None),
+                    )).group_by(
+                        job_info_table.c.spot_job_id,
+                        job_info_table.c.current_cluster_name,
+                    ).having(
+                        sqlalchemy.func.max(
+                            spot_table.c.end_at).isnot(None),).having(
+                                sqlalchemy.func.max(spot_table.c.end_at) < (
+                                    now - retention_seconds)).limit(batch_size))
+        rows = result.fetchall()
+        return [{'job_id': row[0]} for row in rows]
+
+
+@_init_db_async
+async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
+                                      logs_cleaned_at: float):
+    """Set the task logs cleaned at."""
+    if not tasks:
+        return
+    # Deduplicate
+    task_keys = list(dict.fromkeys(tasks))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(spot_table).where(
                sqlalchemy.tuple_(spot_table.c.spot_job_id,
+                                  spot_table.c.task_id).in_(task_keys)).values(
+                                      logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
+
+
+@_init_db_async
+async def set_controller_logs_cleaned_async(job_ids: List[int],
+                                            logs_cleaned_at: float):
+    """Set the controller logs cleaned at."""
+    if not job_ids:
+        return
+    # Deduplicate
+    job_ids = list(dict.fromkeys(job_ids))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                job_info_table.c.spot_job_id.in_(job_ids)).values(
+                    controller_logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
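The async helpers appended above back the new managed-job log garbage collection (see the new sky/jobs/log_gc.py and the 005_logs_gc.py migration in the file list). As a rough sketch of how a cleanup pass might consume them — the loop, the retention and batch constants, and the function name below are illustrative assumptions, not the actual log_gc implementation:

# Hypothetical consumer of the helpers above; only the sky.jobs.state function
# signatures are taken from this diff, everything else is illustrative.
import asyncio
import os
import time

from sky.jobs import state as managed_job_state

TASK_LOG_RETENTION_SECONDS = 7 * 24 * 3600  # assumed retention window
BATCH_SIZE = 100  # assumed batch size


async def cleanup_task_logs_once() -> int:
    """Remove local log files of finished tasks past the retention window."""
    tasks = await managed_job_state.get_task_logs_to_clean_async(
        TASK_LOG_RETENTION_SECONDS, BATCH_SIZE)
    cleaned = []
    for task in tasks:
        log_file = os.path.expanduser(task['local_log_file'])
        try:
            os.remove(log_file)
        except FileNotFoundError:
            pass  # Already gone; still record it as cleaned.
        cleaned.append((task['job_id'], task['task_id']))
    # Mark the rows so the next scan skips them (logs_cleaned_at is non-NULL).
    await managed_job_state.set_task_logs_cleaned_async(cleaned, time.time())
    return len(cleaned)


if __name__ == '__main__':
    print(asyncio.run(cleanup_task_logs_once()), 'task log files cleaned')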
sky/jobs/utils.py
CHANGED
@@ -6,7 +6,7 @@ ManagedJobCodeGen.
 """
 import asyncio
 import collections
-import datetime
+from datetime import datetime
 import enum
 import os
 import pathlib
@@ -195,8 +195,8 @@ def _validate_consolidation_mode_config(
                 'terminate the controller cluster first.'
                 f'{colorama.Style.RESET_ALL}')
     else:
-
-        if
+        total_jobs = managed_job_state.get_managed_jobs_total()
+        if total_jobs > 0:
             nonterminal_jobs = (
                 managed_job_state.get_nonterminal_job_ids_by_name(
                     None, None, all_users=True))
@@ -211,7 +211,7 @@ def _validate_consolidation_mode_config(
             else:
                 logger.warning(
                     f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
-                    f'but there are {
+                    f'but there are {total_jobs} jobs from previous '
                     'consolidation mode. Reset the `jobs.controller.'
                     'consolidation_mode` to `true` and run `sky jobs queue` '
                     'to see those jobs. Switching to normal mode will '
@@ -266,6 +266,12 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
 
 def ha_recovery_for_consolidation_mode():
     """Recovery logic for HA mode."""
+    # Touch the signal file here to avoid conflict with
+    # update_managed_jobs_statuses. Although we run this first and then start
+    # the deamon, this function is also called in cancel_jobs_by_id.
+    signal_file = pathlib.Path(
+        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
+    signal_file.touch()
     # No setup recovery is needed in consolidation mode, as the API server
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
@@ -276,7 +282,9 @@ def ha_recovery_for_consolidation_mode():
               encoding='utf-8') as f:
         start = time.time()
         f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
-
+        jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+            fields=['job_id', 'controller_pid', 'schedule_state', 'status'])
+        for job in jobs:
             job_id = job['job_id']
             controller_pid = job['controller_pid']
 
@@ -312,6 +320,7 @@ def ha_recovery_for_consolidation_mode():
                         f'{datetime.datetime.now()}\n')
         f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
         f.write(f'Total recovery time: {time.time() - start} seconds\n')
+    signal_file.unlink()
 
 
 async def get_job_status(
@@ -456,7 +465,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     """
     managed_job_state.remove_ha_recovery_script(job_id)
    error_msg = None
-    tasks = managed_job_state.
+    tasks = managed_job_state.get_managed_job_tasks(job_id)
     for task in tasks:
         pool = task.get('pool', None)
         if pool is None:
@@ -525,7 +534,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
 
     for job_id in job_ids:
         assert job_id is not None
-        tasks = managed_job_state.
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         # Note: controller_pid and schedule_state are in the job_info table
         # which is joined to the spot table, so all tasks with the same job_id
         # will have the same value for these columns. This is what lets us just
@@ -545,9 +554,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
             # There are two cases where we could get a job that is DONE.
             # 1. At query time (get_jobs_to_check_status), the job was not yet
-            #    DONE, but since then (before
-            #    hit a terminal status, marked itself done, and exited.
-            #    fine.
+            #    DONE, but since then (before get_managed_job_tasks is called)
+            #    it has hit a terminal status, marked itself done, and exited.
+            #    This is fine.
             # 2. The job is DONE, but in a non-terminal status. This is
             #    unexpected. For instance, the task status is RUNNING, but the
             #    job schedule_state is DONE.
@@ -901,6 +910,14 @@ def cancel_jobs_by_pool(pool_name: str,
     return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
 
 
+def controller_log_file_for_job(job_id: int,
+                                create_if_not_exists: bool = False) -> str:
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    if create_if_not_exists:
+        os.makedirs(log_dir, exist_ok=True)
+    return os.path.join(log_dir, f'{job_id}.log')
+
+
 def stream_logs_by_id(job_id: int,
                       follow: bool = True,
                       tail: Optional[int] = None) -> Tuple[str, int]:
@@ -933,13 +950,20 @@ def stream_logs_by_id(job_id: int,
     if managed_job_status.is_failed():
         job_msg = ('\nFailure reason: '
                    f'{managed_job_state.get_failure_reason(job_id)}')
-
+    log_file_ever_existed = False
     task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
         job_id)
     num_tasks = len(task_info)
-    for task_id, task_name, task_status, log_file
+    for (task_id, task_name, task_status, log_file,
+         logs_cleaned_at) in task_info:
         if log_file:
-
+            log_file_ever_existed = True
+            if logs_cleaned_at is not None:
+                ts_str = datetime.fromtimestamp(
+                    logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
+                print(f'Task {task_name}({task_id}) log has been '
+                      f'cleaned at {ts_str}.')
+                continue
             task_str = (f'Task {task_name}({task_id})'
                         if task_name else f'Task {task_id}')
             if num_tasks > 1:
@@ -974,7 +998,7 @@ def stream_logs_by_id(job_id: int,
                       f'{task_str} finished '
                       f'(status: {task_status.value}).'),
                   flush=True)
-    if
+    if log_file_ever_existed:
         # Add the "Job finished" message for terminal states
         if managed_job_status.is_terminal():
             print(ux_utils.finishing_message(
@@ -1202,7 +1226,8 @@ def stream_logs(job_id: Optional[int],
     if controller:
         if job_id is None:
             assert job_name is not None
-            managed_jobs = managed_job_state.
+            managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+                name_match=job_name, fields=['job_id', 'job_name', 'status'])
             # We manually filter the jobs by name, instead of using
             # get_nonterminal_job_ids_by_name, as with `controller=True`, we
             # should be able to show the logs for jobs in terminal states.
@@ -1225,9 +1250,7 @@ def stream_logs(job_id: Optional[int],
             job_id = managed_job_ids.pop()
             assert job_id is not None, (job_id, job_name)
 
-        controller_log_path =
-            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
-            f'{job_id}.log')
+        controller_log_path = controller_log_file_for_job(job_id)
         job_status = None
 
         # Wait for the log file to be written
@@ -1378,9 +1401,11 @@ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
         new_fields.append('priority')
     if 'failure_reason' not in new_fields:
         new_fields.append('failure_reason')
-    if
-
-
+    if 'user_yaml' in new_fields:
+        if 'original_user_yaml_path' not in new_fields:
+            new_fields.append('original_user_yaml_path')
+        if 'original_user_yaml_content' not in new_fields:
+            new_fields.append('original_user_yaml_content')
     if cluster_handle_required:
         if 'task_name' not in new_fields:
             new_fields.append('task_name')
@@ -1522,12 +1547,11 @@ def get_managed_job_queue(
             handle = cluster_name_to_handle.get(
                 cluster_name, None) if cluster_name is not None else None
             if isinstance(handle, backends.CloudVmRayResourceHandle):
-
-
-
-
-
-                job['cluster_resources'] = resources_str
+                resources_str_simple, resources_str_full = (
+                    resources_utils.get_readable_resources_repr(
+                        handle, simplified_only=False))
+                assert resources_str_full is not None
+                job['cluster_resources'] = resources_str_simple
                 job['cluster_resources_full'] = resources_str_full
                 job['cloud'] = str(handle.launched_resources.cloud)
                 job['region'] = handle.launched_resources.region
@@ -2110,7 +2134,8 @@ def _job_proto_to_dict(
         # and Protobuf encodes int64 as decimal strings in JSON,
         # so we need to convert them back to ints.
         # https://protobuf.dev/programming-guides/json/#field-representation
-        if field.type == descriptor.FieldDescriptor.TYPE_INT64
+        if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
+                job_dict.get(field.name) is not None):
             job_dict[field.name] = int(job_dict[field.name])
     job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
         job_dict['status'])
@@ -2265,6 +2290,18 @@ class ManagedJobCodeGen:
             """)
         return cls._build(code)
 
+    @classmethod
+    def get_version(cls) -> str:
+        """Generate code to get controller version."""
+        code = textwrap.dedent("""\
+            from sky.skylet import constants as controller_constants
+
+            # Get controller version
+            controller_version = controller_constants.SKYLET_VERSION
+            print(f"controller_version:{controller_version}", flush=True)
+            """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
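The new controller_log_file_for_job helper replaces the path construction that stream_logs previously built inline, and stream_logs_by_id now reports when a task log has already been garbage-collected instead of tailing a missing file. A small usage sketch; the job id and timestamp below are made up, and this is not code from the diff itself:

# Illustrative use of the helper introduced above.
from datetime import datetime

from sky.jobs import utils as managed_job_utils

job_id = 42  # hypothetical managed job id
log_path = managed_job_utils.controller_log_file_for_job(
    job_id, create_if_not_exists=True)
print(f'Controller log for job {job_id}: {log_path}')

# stream_logs_by_id formats logs_cleaned_at the same way before skipping a
# cleaned task log; shown here with an arbitrary epoch timestamp.
logs_cleaned_at = 1730419200.0
ts = datetime.fromtimestamp(logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
print(f'Task log was cleaned at {ts}.')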
sky/metrics/utils.py
CHANGED
@@ -143,6 +143,24 @@ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
     'RSS increment after requests', ['name'],
     buckets=_MEM_BUCKETS)
 
+SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
+    'sky_apiserver_websocket_ssh_latency_seconds',
+    ('Time taken for ssh message to go from client to API server and back'
+     'to the client. This does not include: latency to reach the pod, '
+     'overhead from sending through the k8s port-forward tunnel, or '
+     'ssh server lag on the destination pod.'),
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+
 
 @contextlib.contextmanager
 def time_it(name: str, group: str = 'default'):
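The new histogram is labeled by process id and uses an explicit bucket ladder from 1 ms up to 1000 s. Roughly how a caller such as the websocket proxy could record one round-trip observation; the timing wrapper below is a sketch under that assumption, not the proxy's actual code:

# Sketch of recording a round-trip latency sample into the new histogram.
import os
import time

from sky.metrics import utils as metrics_utils


def record_ssh_roundtrip(send_and_wait) -> None:
    """Time a client -> API server -> client exchange and observe it per pid."""
    start = time.perf_counter()
    send_and_wait()  # hypothetical callable performing one echo round trip
    elapsed = time.perf_counter() - start
    metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(
        pid=str(os.getpid())).observe(elapsed)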
sky/optimizer.py
CHANGED
@@ -1019,7 +1019,7 @@ class Optimizer:
                     if res.instance_type is not None
                 ])
                 candidate_str = resources_utils.format_resource(
-                    best_resources,
+                    best_resources, simplified_only=True)[0]
 
                 logger.info(
                     f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '