skypilot-nightly 1.0.0.dev20250708__py3-none-any.whl → 1.0.0.dev20250710__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +6 -4
- sky/clouds/kubernetes.py +2 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{O3wBEOmvYEVEqZxAP7Czn → P2Di1JdUlHuKN2lBws4Mr}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-fd62f17bd9ce1fcc.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +7 -4
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +7 -0
- sky/jobs/server/core.py +2 -1
- sky/jobs/server/utils.py +81 -0
- sky/jobs/state.py +55 -33
- sky/jobs/utils.py +34 -3
- sky/provision/kubernetes/instance.py +17 -0
- sky/provision/kubernetes/utils.py +5 -0
- sky/provision/provisioner.py +20 -0
- sky/server/metrics.py +2 -3
- sky/server/requests/executor.py +2 -5
- sky/server/requests/payloads.py +1 -0
- sky/server/requests/requests.py +94 -4
- sky/server/server.py +19 -5
- sky/server/uvicorn.py +4 -1
- sky/skylet/constants.py +1 -6
- sky/skylet/job_lib.py +30 -8
- sky/skypilot_config.py +4 -2
- sky/task.py +17 -0
- sky/users/permission.py +3 -0
- sky/utils/common_utils.py +13 -0
- sky/utils/db_utils.py +16 -0
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +2 -4
- {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/RECORD +53 -52
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
- /sky/dashboard/out/_next/static/{O3wBEOmvYEVEqZxAP7Czn → P2Di1JdUlHuKN2lBws4Mr}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -77,6 +77,7 @@ spot_table = sqlalchemy.Table(
|
|
77
77
|
sqlalchemy.Column('task_name', sqlalchemy.Text),
|
78
78
|
sqlalchemy.Column('specs', sqlalchemy.Text),
|
79
79
|
sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
|
80
|
+
sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
|
80
81
|
)
|
81
82
|
|
82
83
|
job_info_table = sqlalchemy.Table(
|
@@ -131,7 +132,7 @@ def create_table():
|
|
131
132
|
# is not critical and is likely to be enabled by other processes.
|
132
133
|
|
133
134
|
# Create tables if they don't exist
|
134
|
-
Base.metadata
|
135
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, _SQLALCHEMY_ENGINE)
|
135
136
|
|
136
137
|
# Backward compatibility: add columns that not exist in older databases
|
137
138
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
@@ -170,6 +171,14 @@ def create_table():
|
|
170
171
|
sqlalchemy.Text(),
|
171
172
|
default_statement='DEFAULT NULL')
|
172
173
|
|
174
|
+
db_utils.add_column_to_table_sqlalchemy(
|
175
|
+
session,
|
176
|
+
'spot',
|
177
|
+
'metadata',
|
178
|
+
sqlalchemy.Text(),
|
179
|
+
default_statement='DEFAULT \'{}\'',
|
180
|
+
value_to_replace_existing_entries='{}')
|
181
|
+
|
173
182
|
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
174
183
|
'schedule_state',
|
175
184
|
sqlalchemy.Text())
|
@@ -219,7 +228,8 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
219
228
|
conn_string = skypilot_config.get_nested(('db',), None)
|
220
229
|
if conn_string:
|
221
230
|
logger.debug(f'using db URI from {conn_string}')
|
222
|
-
_SQLALCHEMY_ENGINE = sqlalchemy.create_engine(
|
231
|
+
_SQLALCHEMY_ENGINE = sqlalchemy.create_engine(
|
232
|
+
conn_string, poolclass=sqlalchemy.NullPool)
|
223
233
|
else:
|
224
234
|
db_path = os.path.expanduser('~/.sky/spot_jobs.db')
|
225
235
|
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
@@ -272,6 +282,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
|
|
272
282
|
'task_name': r['task_name'],
|
273
283
|
'specs': r['specs'],
|
274
284
|
'local_log_file': r['local_log_file'],
|
285
|
+
'metadata': r['metadata'],
|
275
286
|
# columns from job_info table (some may be None for legacy jobs)
|
276
287
|
'_job_info_job_id': r[job_info_table.c.spot_job_id
|
277
288
|
], # ambiguous, use table.column
|
@@ -544,20 +555,28 @@ def set_job_info_without_job_id(name: str, workspace: str,
|
|
544
555
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
545
556
|
db_utils.SQLAlchemyDialect.SQLITE.value):
|
546
557
|
result = session.execute(insert_stmt)
|
558
|
+
ret = result.lastrowid
|
547
559
|
session.commit()
|
548
|
-
return
|
560
|
+
return ret
|
549
561
|
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
550
562
|
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
551
563
|
result = session.execute(
|
552
564
|
insert_stmt.returning(job_info_table.c.spot_job_id))
|
565
|
+
ret = result.scalar()
|
553
566
|
session.commit()
|
554
|
-
return
|
567
|
+
return ret
|
555
568
|
else:
|
556
569
|
raise ValueError('Unsupported database dialect')
|
557
570
|
|
558
571
|
|
559
572
|
@_init_db
|
560
|
-
def set_pending(
|
573
|
+
def set_pending(
|
574
|
+
job_id: int,
|
575
|
+
task_id: int,
|
576
|
+
task_name: str,
|
577
|
+
resources_str: str,
|
578
|
+
metadata: str,
|
579
|
+
):
|
561
580
|
"""Set the task to pending state."""
|
562
581
|
assert _SQLALCHEMY_ENGINE is not None
|
563
582
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
@@ -567,6 +586,7 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
|
567
586
|
task_id=task_id,
|
568
587
|
task_name=task_name,
|
569
588
|
resources=resources_str,
|
589
|
+
metadata=metadata,
|
570
590
|
status=ManagedJobStatus.PENDING.value,
|
571
591
|
))
|
572
592
|
session.commit()
|
@@ -1192,38 +1212,40 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
1192
1212
|
# Note: we will get the user_hash here, but don't try to call
|
1193
1213
|
# global_user_state.get_user() on it. This runs on the controller, which may
|
1194
1214
|
# not have the user info. Prefer to do it on the API server side.
|
1215
|
+
query = sqlalchemy.select(spot_table, job_info_table).select_from(
|
1216
|
+
spot_table.outerjoin(
|
1217
|
+
job_info_table,
|
1218
|
+
spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
|
1219
|
+
if job_id is not None:
|
1220
|
+
query = query.where(spot_table.c.spot_job_id == job_id)
|
1221
|
+
query = query.order_by(spot_table.c.spot_job_id.desc(),
|
1222
|
+
spot_table.c.task_id.asc())
|
1223
|
+
rows = None
|
1195
1224
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1196
|
-
query = sqlalchemy.select(spot_table, job_info_table).select_from(
|
1197
|
-
spot_table.outerjoin(
|
1198
|
-
job_info_table,
|
1199
|
-
spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
|
1200
|
-
if job_id is not None:
|
1201
|
-
query = query.where(spot_table.c.spot_job_id == job_id)
|
1202
|
-
query = query.order_by(spot_table.c.spot_job_id.desc(),
|
1203
|
-
spot_table.c.task_id.asc())
|
1204
1225
|
rows = session.execute(query).fetchall()
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1211
|
-
|
1212
|
-
|
1213
|
-
|
1214
|
-
|
1215
|
-
|
1216
|
-
|
1217
|
-
|
1218
|
-
|
1219
|
-
|
1220
|
-
|
1221
|
-
|
1222
|
-
else:
|
1226
|
+
jobs = []
|
1227
|
+
for row in rows:
|
1228
|
+
job_dict = _get_jobs_dict(row._mapping) # pylint: disable=protected-access
|
1229
|
+
job_dict['status'] = ManagedJobStatus(job_dict['status'])
|
1230
|
+
job_dict['schedule_state'] = ManagedJobScheduleState(
|
1231
|
+
job_dict['schedule_state'])
|
1232
|
+
if job_dict['job_name'] is None:
|
1233
|
+
job_dict['job_name'] = job_dict['task_name']
|
1234
|
+
job_dict['metadata'] = json.loads(job_dict['metadata'])
|
1235
|
+
|
1236
|
+
# Add user YAML content for managed jobs.
|
1237
|
+
yaml_path = job_dict.get('original_user_yaml_path')
|
1238
|
+
if yaml_path:
|
1239
|
+
try:
|
1240
|
+
with open(yaml_path, 'r', encoding='utf-8') as f:
|
1241
|
+
job_dict['user_yaml'] = f.read()
|
1242
|
+
except (FileNotFoundError, IOError, OSError):
|
1223
1243
|
job_dict['user_yaml'] = None
|
1244
|
+
else:
|
1245
|
+
job_dict['user_yaml'] = None
|
1224
1246
|
|
1225
|
-
|
1226
|
-
|
1247
|
+
jobs.append(job_dict)
|
1248
|
+
return jobs
|
1227
1249
|
|
1228
1250
|
|
1229
1251
|
@_init_db
|
sky/jobs/utils.py
CHANGED
@@ -1249,7 +1249,14 @@ def format_job_table(
|
|
1249
1249
|
]
|
1250
1250
|
if show_all:
|
1251
1251
|
# TODO: move SCHED. STATE to a separate flag (e.g. --debug)
|
1252
|
-
columns += [
|
1252
|
+
columns += [
|
1253
|
+
'STARTED',
|
1254
|
+
'INFRA',
|
1255
|
+
'RESOURCES',
|
1256
|
+
'SCHED. STATE',
|
1257
|
+
'DETAILS',
|
1258
|
+
'GIT_COMMIT',
|
1259
|
+
]
|
1253
1260
|
if tasks_have_k8s_user:
|
1254
1261
|
columns.insert(0, 'USER')
|
1255
1262
|
job_table = log_utils.create_table(columns)
|
@@ -1362,6 +1369,7 @@ def format_job_table(
|
|
1362
1369
|
'-',
|
1363
1370
|
job_tasks[0]['schedule_state'],
|
1364
1371
|
generate_details(details, failure_reason),
|
1372
|
+
job_tasks[0].get('metadata', {}).get('git_commit', '-'),
|
1365
1373
|
])
|
1366
1374
|
if tasks_have_k8s_user:
|
1367
1375
|
job_values.insert(0, job_tasks[0].get('user', '-'))
|
@@ -1427,6 +1435,8 @@ def format_job_table(
|
|
1427
1435
|
generate_details(task.get('details'),
|
1428
1436
|
task['failure_reason']),
|
1429
1437
|
])
|
1438
|
+
|
1439
|
+
values.append(task.get('metadata', {}).get('git_commit', '-'))
|
1430
1440
|
if tasks_have_k8s_user:
|
1431
1441
|
values.insert(0, task.get('user', '-'))
|
1432
1442
|
job_table.add_row(values)
|
@@ -1511,6 +1521,22 @@ class ManagedJobCodeGen:
|
|
1511
1521
|
""")
|
1512
1522
|
return cls._build(code)
|
1513
1523
|
|
1524
|
+
@classmethod
|
1525
|
+
def get_version_and_job_table(cls) -> str:
|
1526
|
+
"""Generate code to get controller version and raw job table."""
|
1527
|
+
code = textwrap.dedent("""\
|
1528
|
+
from sky.skylet import constants as controller_constants
|
1529
|
+
|
1530
|
+
# Get controller version
|
1531
|
+
controller_version = controller_constants.SKYLET_VERSION
|
1532
|
+
print(f"controller_version:{controller_version}", flush=True)
|
1533
|
+
|
1534
|
+
# Get and print raw job table (load_managed_job_queue can parse this directly)
|
1535
|
+
job_table = utils.dump_managed_job_queue()
|
1536
|
+
print(job_table, flush=True)
|
1537
|
+
""")
|
1538
|
+
return cls._build(code)
|
1539
|
+
|
1514
1540
|
@classmethod
|
1515
1541
|
def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
|
1516
1542
|
code = textwrap.dedent(f"""\
|
@@ -1565,8 +1591,13 @@ class ManagedJobCodeGen:
|
|
1565
1591
|
resources_str = backend_utils.get_task_resources_str(
|
1566
1592
|
task, is_managed_job=True)
|
1567
1593
|
code += textwrap.dedent(f"""\
|
1568
|
-
|
1569
|
-
|
1594
|
+
if managed_job_version < 7:
|
1595
|
+
managed_job_state.set_pending({job_id}, {task_id},
|
1596
|
+
{task.name!r}, {resources_str!r})
|
1597
|
+
else:
|
1598
|
+
managed_job_state.set_pending({job_id}, {task_id},
|
1599
|
+
{task.name!r}, {resources_str!r},
|
1600
|
+
{task.metadata_json!r})
|
1570
1601
|
""")
|
1571
1602
|
return cls._build(code)
|
1572
1603
|
|
@@ -825,6 +825,23 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
825
825
|
return
|
826
826
|
pod_spec_copy['metadata']['name'] = pod_name
|
827
827
|
pod_spec_copy['metadata']['labels']['component'] = pod_name
|
828
|
+
|
829
|
+
# We need to keep the following fields in the pod spec to be same for
|
830
|
+
# head and worker pods.
|
831
|
+
# So that Kueue can merge them into a single PodSet when creating
|
832
|
+
# ProvisioningRequest to trigger scale up of the cluster autoscaler,
|
833
|
+
# this is especially required for DWS queued provisioning mode in GKE.
|
834
|
+
# spec.containers[*].resources.requests
|
835
|
+
# spec.initContainers[*].resources.requests
|
836
|
+
# spec.resources
|
837
|
+
# spec.nodeSelector
|
838
|
+
# spec.tolerations
|
839
|
+
# spec.affinity
|
840
|
+
# resourceClaims
|
841
|
+
# Refer to the following links for more details:
|
842
|
+
# https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
|
843
|
+
# https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
|
844
|
+
if config.count > 1:
|
828
845
|
# For multi-node support, we put a soft-constraint to schedule
|
829
846
|
# worker pods on different nodes than the head pod.
|
830
847
|
# This is not set as a hard constraint because if different nodes
|
@@ -313,6 +313,9 @@ def get_gke_accelerator_name(accelerator: str) -> str:
|
|
313
313
|
# A100-80GB, L4, H100-80GB and H100-MEGA-80GB
|
314
314
|
# have a different name pattern.
|
315
315
|
return 'nvidia-{}'.format(accelerator.lower())
|
316
|
+
elif accelerator == 'H200':
|
317
|
+
# H200s on GCP use this label format
|
318
|
+
return 'nvidia-h200-141gb'
|
316
319
|
elif accelerator.startswith('tpu-'):
|
317
320
|
return accelerator
|
318
321
|
else:
|
@@ -482,6 +485,8 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
482
485
|
# we map H100 ---> H100-80GB and keep H100-MEGA-80GB
|
483
486
|
# to distinguish between a3-high and a3-mega instances
|
484
487
|
return 'H100'
|
488
|
+
elif acc == 'H200-141GB':
|
489
|
+
return 'H200'
|
485
490
|
return acc
|
486
491
|
elif is_tpu_on_gke(value):
|
487
492
|
return value
|
sky/provision/provisioner.py
CHANGED
@@ -22,11 +22,13 @@ from sky import sky_logging
|
|
22
22
|
from sky import skypilot_config
|
23
23
|
from sky.adaptors import aws
|
24
24
|
from sky.backends import backend_utils
|
25
|
+
from sky.jobs.server import utils as server_jobs_utils
|
25
26
|
from sky.provision import common as provision_common
|
26
27
|
from sky.provision import instance_setup
|
27
28
|
from sky.provision import logging as provision_logging
|
28
29
|
from sky.provision import metadata_utils
|
29
30
|
from sky.skylet import constants
|
31
|
+
from sky.utils import common
|
30
32
|
from sky.utils import common_utils
|
31
33
|
from sky.utils import message_utils
|
32
34
|
from sky.utils import resources_utils
|
@@ -502,6 +504,24 @@ def _post_provision_setup(
|
|
502
504
|
logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
|
503
505
|
f'Docker container is up.{colorama.Style.RESET_ALL}')
|
504
506
|
|
507
|
+
# Check version compatibility for jobs controller clusters
|
508
|
+
if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
|
509
|
+
# TODO(zeping): remove this in v0.12.0
|
510
|
+
# This only happens in upgrade from <0.9.3 to > 0.10.0
|
511
|
+
# After 0.10.0 no incompatibility issue
|
512
|
+
# See https://github.com/skypilot-org/skypilot/pull/6096
|
513
|
+
# For more details
|
514
|
+
status.update(
|
515
|
+
ux_utils.spinner_message(
|
516
|
+
'Checking controller version compatibility'))
|
517
|
+
try:
|
518
|
+
server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
|
519
|
+
except exceptions.ClusterNotUpError:
|
520
|
+
# Controller is not up yet during initial provisioning, that
|
521
|
+
# also means no non-terminal jobs, so no incompatibility in
|
522
|
+
# this case.
|
523
|
+
pass
|
524
|
+
|
505
525
|
# We mount the metadata with sky wheel for speedup.
|
506
526
|
# NOTE: currently we mount all credentials for all nodes, because
|
507
527
|
# (1) jobs controllers need permission to launch/down nodes of
|
sky/server/metrics.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
"""Instrumentation for the API server."""
|
2
2
|
|
3
|
-
import asyncio
|
4
3
|
import os
|
5
4
|
import time
|
6
5
|
|
@@ -50,7 +49,7 @@ async def metrics() -> fastapi.Response:
|
|
50
49
|
headers={'Cache-Control': 'no-cache'})
|
51
50
|
|
52
51
|
|
53
|
-
def
|
52
|
+
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
|
54
53
|
metrics_config = uvicorn.Config(
|
55
54
|
'sky.server.metrics:metrics_app',
|
56
55
|
host=host,
|
@@ -58,7 +57,7 @@ def run_metrics_server(host: str, port: int):
|
|
58
57
|
workers=1,
|
59
58
|
)
|
60
59
|
metrics_server_instance = uvicorn.Server(metrics_config)
|
61
|
-
|
60
|
+
return metrics_server_instance
|
62
61
|
|
63
62
|
|
64
63
|
def _get_status_code_group(status_code: int) -> str:
|
sky/server/requests/executor.py
CHANGED
@@ -399,11 +399,8 @@ def _request_execution_wrapper(request_id: str,
|
|
399
399
|
f'{common_utils.format_exception(e)}')
|
400
400
|
return
|
401
401
|
else:
|
402
|
-
|
403
|
-
|
404
|
-
request_task.status = api_requests.RequestStatus.SUCCEEDED
|
405
|
-
if not ignore_return_value:
|
406
|
-
request_task.set_return_value(return_value)
|
402
|
+
api_requests.set_request_succeeded(
|
403
|
+
request_id, return_value if not ignore_return_value else None)
|
407
404
|
_restore_output(original_stdout, original_stderr)
|
408
405
|
logger.info(f'Request {request_id} finished')
|
409
406
|
|
sky/server/requests/payloads.py
CHANGED
sky/server/requests/requests.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Utilities for REST API."""
|
2
|
+
import asyncio
|
2
3
|
import contextlib
|
3
4
|
import dataclasses
|
4
5
|
import enum
|
@@ -20,6 +21,7 @@ import filelock
|
|
20
21
|
from sky import exceptions
|
21
22
|
from sky import global_user_state
|
22
23
|
from sky import sky_logging
|
24
|
+
from sky import skypilot_config
|
23
25
|
from sky.server import common as server_common
|
24
26
|
from sky.server import constants as server_constants
|
25
27
|
from sky.server.requests import payloads
|
@@ -29,6 +31,7 @@ from sky.utils import common
|
|
29
31
|
from sky.utils import common_utils
|
30
32
|
from sky.utils import db_utils
|
31
33
|
from sky.utils import env_options
|
34
|
+
from sky.utils import subprocess_utils
|
32
35
|
from sky.utils import ux_utils
|
33
36
|
|
34
37
|
logger = sky_logging.init_logger(__name__)
|
@@ -39,8 +42,11 @@ COL_CLUSTER_NAME = 'cluster_name'
|
|
39
42
|
COL_USER_ID = 'user_id'
|
40
43
|
COL_STATUS_MSG = 'status_msg'
|
41
44
|
COL_SHOULD_RETRY = 'should_retry'
|
45
|
+
COL_FINISHED_AT = 'finished_at'
|
42
46
|
REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
|
43
47
|
|
48
|
+
DEFAULT_REQUESTS_RETENTION_HOURS = 24 # 1 day
|
49
|
+
|
44
50
|
# TODO(zhwu): For scalability, there are several TODOs:
|
45
51
|
# [x] Have a way to queue requests.
|
46
52
|
# [ ] Move logs to persistent place.
|
@@ -64,6 +70,10 @@ class RequestStatus(enum.Enum):
|
|
64
70
|
color = _STATUS_TO_COLOR[self]
|
65
71
|
return f'{color}{self.value}{colorama.Style.RESET_ALL}'
|
66
72
|
|
73
|
+
@classmethod
|
74
|
+
def finished_status(cls) -> List['RequestStatus']:
|
75
|
+
return [cls.SUCCEEDED, cls.FAILED, cls.CANCELLED]
|
76
|
+
|
67
77
|
|
68
78
|
_STATUS_TO_COLOR = {
|
69
79
|
RequestStatus.PENDING: colorama.Fore.BLUE,
|
@@ -88,6 +98,7 @@ REQUEST_COLUMNS = [
|
|
88
98
|
COL_USER_ID,
|
89
99
|
COL_STATUS_MSG,
|
90
100
|
COL_SHOULD_RETRY,
|
101
|
+
COL_FINISHED_AT,
|
91
102
|
]
|
92
103
|
|
93
104
|
|
@@ -120,6 +131,8 @@ class Request:
|
|
120
131
|
status_msg: Optional[str] = None
|
121
132
|
# Whether the request should be retried.
|
122
133
|
should_retry: bool = False
|
134
|
+
# When the request finished.
|
135
|
+
finished_at: Optional[float] = None
|
123
136
|
|
124
137
|
@property
|
125
138
|
def log_path(self) -> pathlib.Path:
|
@@ -206,6 +219,7 @@ class Request:
|
|
206
219
|
cluster_name=self.cluster_name,
|
207
220
|
status_msg=self.status_msg,
|
208
221
|
should_retry=self.should_retry,
|
222
|
+
finished_at=self.finished_at,
|
209
223
|
)
|
210
224
|
|
211
225
|
def encode(self) -> payloads.RequestPayload:
|
@@ -228,6 +242,7 @@ class Request:
|
|
228
242
|
cluster_name=self.cluster_name,
|
229
243
|
status_msg=self.status_msg,
|
230
244
|
should_retry=self.should_retry,
|
245
|
+
finished_at=self.finished_at,
|
231
246
|
)
|
232
247
|
except (TypeError, ValueError) as e:
|
233
248
|
# The error is unexpected, so we don't suppress the stack trace.
|
@@ -260,6 +275,7 @@ class Request:
|
|
260
275
|
cluster_name=payload.cluster_name,
|
261
276
|
status_msg=payload.status_msg,
|
262
277
|
should_retry=payload.should_retry,
|
278
|
+
finished_at=payload.finished_at,
|
263
279
|
)
|
264
280
|
except (TypeError, ValueError) as e:
|
265
281
|
logger.error(
|
@@ -439,6 +455,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
|
|
439
455
|
# process for each request.
|
440
456
|
os.kill(request_record.pid, signal.SIGTERM)
|
441
457
|
request_record.status = RequestStatus.CANCELLED
|
458
|
+
request_record.finished_at = time.time()
|
442
459
|
cancelled_request_ids.append(request_id)
|
443
460
|
return cancelled_request_ids
|
444
461
|
|
@@ -474,13 +491,16 @@ def create_table(cursor, conn):
|
|
474
491
|
schedule_type TEXT,
|
475
492
|
{COL_USER_ID} TEXT,
|
476
493
|
{COL_STATUS_MSG} TEXT,
|
477
|
-
{COL_SHOULD_RETRY} INTEGER
|
494
|
+
{COL_SHOULD_RETRY} INTEGER,
|
495
|
+
{COL_FINISHED_AT} REAL
|
478
496
|
)""")
|
479
497
|
|
480
498
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
|
481
499
|
'TEXT')
|
482
500
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
|
483
501
|
'INTEGER')
|
502
|
+
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
|
503
|
+
'REAL')
|
484
504
|
|
485
505
|
|
486
506
|
_DB = None
|
@@ -583,6 +603,7 @@ def get_request_tasks(
|
|
583
603
|
user_id: Optional[str] = None,
|
584
604
|
exclude_request_names: Optional[List[str]] = None,
|
585
605
|
include_request_names: Optional[List[str]] = None,
|
606
|
+
finished_before: Optional[float] = None,
|
586
607
|
) -> List[Request]:
|
587
608
|
"""Get a list of requests that match the given filters.
|
588
609
|
|
@@ -595,6 +616,8 @@ def get_request_tasks(
|
|
595
616
|
If None, all users are included.
|
596
617
|
include_request_names: a list of request names to filter on.
|
597
618
|
Mutually exclusive with exclude_request_names.
|
619
|
+
finished_before: if provided, only include requests finished before this
|
620
|
+
timestamp.
|
598
621
|
|
599
622
|
Raises:
|
600
623
|
ValueError: If both exclude_request_names and include_request_names are
|
@@ -606,7 +629,7 @@ def get_request_tasks(
|
|
606
629
|
'provided, not both.')
|
607
630
|
|
608
631
|
filters = []
|
609
|
-
filter_params = []
|
632
|
+
filter_params: List[Any] = []
|
610
633
|
if status is not None:
|
611
634
|
status_list_str = ','.join(repr(status.value) for status in status)
|
612
635
|
filters.append(f'status IN ({status_list_str})')
|
@@ -624,6 +647,9 @@ def get_request_tasks(
|
|
624
647
|
request_names_str = ','.join(
|
625
648
|
repr(name) for name in include_request_names)
|
626
649
|
filters.append(f'name IN ({request_names_str})')
|
650
|
+
if finished_before is not None:
|
651
|
+
filters.append('finished_at < ?')
|
652
|
+
filter_params.append(finished_before)
|
627
653
|
assert _DB is not None
|
628
654
|
with _DB.conn:
|
629
655
|
cursor = _DB.conn.cursor()
|
@@ -665,19 +691,83 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
665
691
|
with update_request(request_id) as request_task:
|
666
692
|
assert request_task is not None, request_id
|
667
693
|
request_task.status = RequestStatus.FAILED
|
694
|
+
request_task.finished_at = time.time()
|
668
695
|
request_task.set_error(e)
|
669
696
|
|
670
697
|
|
671
|
-
def set_request_succeeded(request_id: str, result: Any) -> None:
|
698
|
+
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
672
699
|
"""Set a request to succeeded and populate the result."""
|
673
700
|
with update_request(request_id) as request_task:
|
674
701
|
assert request_task is not None, request_id
|
675
702
|
request_task.status = RequestStatus.SUCCEEDED
|
676
|
-
request_task.
|
703
|
+
request_task.finished_at = time.time()
|
704
|
+
if result is not None:
|
705
|
+
request_task.set_return_value(result)
|
677
706
|
|
678
707
|
|
679
708
|
def set_request_cancelled(request_id: str) -> None:
|
680
709
|
"""Set a request to cancelled."""
|
681
710
|
with update_request(request_id) as request_task:
|
682
711
|
assert request_task is not None, request_id
|
712
|
+
request_task.finished_at = time.time()
|
683
713
|
request_task.status = RequestStatus.CANCELLED
|
714
|
+
|
715
|
+
|
716
|
+
@init_db
|
717
|
+
def _delete_requests(requests: List[Request]):
|
718
|
+
"""Clean up requests by their IDs."""
|
719
|
+
id_list_str = ','.join(repr(req.request_id) for req in requests)
|
720
|
+
assert _DB is not None
|
721
|
+
with _DB.conn:
|
722
|
+
cursor = _DB.conn.cursor()
|
723
|
+
cursor.execute(
|
724
|
+
f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
|
725
|
+
|
726
|
+
|
727
|
+
def clean_finished_requests_with_retention(retention_seconds: int):
|
728
|
+
"""Clean up finished requests older than the retention period.
|
729
|
+
|
730
|
+
This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
|
731
|
+
from the database and cleans up their associated log files.
|
732
|
+
|
733
|
+
Args:
|
734
|
+
retention_seconds: Requests older than this many seconds will be
|
735
|
+
deleted.
|
736
|
+
"""
|
737
|
+
reqs = get_request_tasks(status=RequestStatus.finished_status(),
|
738
|
+
finished_before=time.time() - retention_seconds)
|
739
|
+
|
740
|
+
subprocess_utils.run_in_parallel(
|
741
|
+
func=lambda req: req.log_path.unlink(missing_ok=True),
|
742
|
+
args=reqs,
|
743
|
+
num_threads=len(reqs))
|
744
|
+
|
745
|
+
_delete_requests(reqs)
|
746
|
+
|
747
|
+
# To avoid leakage of the log file, logs must be deleted before the
|
748
|
+
# request task in the database.
|
749
|
+
logger.info(f'Cleaned up {len(reqs)} finished requests '
|
750
|
+
f'older than {retention_seconds} seconds')
|
751
|
+
|
752
|
+
|
753
|
+
async def requests_gc_daemon():
|
754
|
+
"""Garbage collect finished requests periodically."""
|
755
|
+
while True:
|
756
|
+
logger.info('Running requests GC daemon...')
|
757
|
+
# Use the latest config.
|
758
|
+
skypilot_config.reload_config()
|
759
|
+
retention_seconds = skypilot_config.get_nested(
|
760
|
+
('api_server', 'requests_retention_hours'),
|
761
|
+
DEFAULT_REQUESTS_RETENTION_HOURS) * 3600
|
762
|
+
try:
|
763
|
+
# Negative value disables the requests GC
|
764
|
+
if retention_seconds >= 0:
|
765
|
+
clean_finished_requests_with_retention(retention_seconds)
|
766
|
+
except asyncio.CancelledError:
|
767
|
+
logger.info('Requests GC daemon cancelled')
|
768
|
+
break
|
769
|
+
except Exception as e: # pylint: disable=broad-except
|
770
|
+
logger.error(f'Error running requests GC daemon: {e}')
|
771
|
+
# Run the daemon at most once every hour to avoid too frequent
|
772
|
+
# cleanup.
|
773
|
+
await asyncio.sleep(max(retention_seconds, 3600))
|
sky/server/server.py
CHANGED
@@ -26,6 +26,7 @@ import fastapi
|
|
26
26
|
from fastapi.middleware import cors
|
27
27
|
from passlib.hash import apr_md5_crypt
|
28
28
|
import starlette.middleware.base
|
29
|
+
import uvloop
|
29
30
|
|
30
31
|
import sky
|
31
32
|
from sky import catalog
|
@@ -1461,6 +1462,12 @@ async def stream(
|
|
1461
1462
|
raise fastapi.HTTPException(
|
1462
1463
|
status_code=404, detail=f'Request {request_id!r} not found')
|
1463
1464
|
log_path_to_stream = request_task.log_path
|
1465
|
+
if not log_path_to_stream.exists():
|
1466
|
+
# The log file might be deleted by the request GC daemon but the
|
1467
|
+
# request task is still in the database.
|
1468
|
+
raise fastapi.HTTPException(
|
1469
|
+
status_code=404,
|
1470
|
+
detail=f'Log of request {request_id!r} has been deleted')
|
1464
1471
|
else:
|
1465
1472
|
assert log_path is not None, (request_id, log_path)
|
1466
1473
|
if log_path == constants.API_SERVER_LOGS:
|
@@ -1775,13 +1782,18 @@ if __name__ == '__main__':
|
|
1775
1782
|
|
1776
1783
|
queue_server: Optional[multiprocessing.Process] = None
|
1777
1784
|
workers: List[executor.RequestWorker] = []
|
1785
|
+
# Global background tasks that will be scheduled in a separate event loop.
|
1786
|
+
global_tasks: List[asyncio.Task] = []
|
1778
1787
|
try:
|
1788
|
+
background = uvloop.new_event_loop()
|
1779
1789
|
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1790
|
+
metrics_server = metrics.build_metrics_server(
|
1791
|
+
cmd_args.host, cmd_args.metrics_port)
|
1792
|
+
global_tasks.append(background.create_task(metrics_server.serve()))
|
1793
|
+
global_tasks.append(
|
1794
|
+
background.create_task(requests_lib.requests_gc_daemon()))
|
1795
|
+
threading.Thread(target=background.run_forever, daemon=True).start()
|
1796
|
+
|
1785
1797
|
queue_server, workers = executor.start(config)
|
1786
1798
|
|
1787
1799
|
logger.info(f'Starting SkyPilot API server, workers={num_workers}')
|
@@ -1799,6 +1811,8 @@ if __name__ == '__main__':
|
|
1799
1811
|
finally:
|
1800
1812
|
logger.info('Shutting down SkyPilot API server...')
|
1801
1813
|
|
1814
|
+
for gt in global_tasks:
|
1815
|
+
gt.cancel()
|
1802
1816
|
subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
|
1803
1817
|
workers,
|
1804
1818
|
num_threads=len(workers))
|
sky/server/uvicorn.py
CHANGED
@@ -150,7 +150,10 @@ class Server(uvicorn.Server):
|
|
150
150
|
if req is None:
|
151
151
|
return
|
152
152
|
if req.pid is not None:
|
153
|
-
|
153
|
+
try:
|
154
|
+
os.kill(req.pid, signal.SIGTERM)
|
155
|
+
except ProcessLookupError:
|
156
|
+
logger.debug(f'Process {req.pid} already finished.')
|
154
157
|
req.status = requests_lib.RequestStatus.CANCELLED
|
155
158
|
req.should_retry = True
|
156
159
|
logger.info(
|
sky/skylet/constants.py
CHANGED
@@ -89,18 +89,13 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
89
89
|
# cluster yaml is updated.
|
90
90
|
#
|
91
91
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
92
|
-
SKYLET_VERSION = '
|
92
|
+
SKYLET_VERSION = '15'
|
93
93
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
94
94
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
95
95
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
96
96
|
SKYLET_LIB_VERSION = 3
|
97
97
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
98
98
|
|
99
|
-
# `sky jobs dashboard`-related
|
100
|
-
#
|
101
|
-
# Port on the remote jobs controller that the dashboard is running on.
|
102
|
-
SPOT_DASHBOARD_REMOTE_PORT = 5000
|
103
|
-
|
104
99
|
# Docker default options
|
105
100
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
106
101
|
DEFAULT_DOCKER_PORT = 10022
|