skypilot-nightly 1.0.0.dev20250709__py3-none-any.whl → 1.0.0.dev20250710__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +6 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → P2Di1JdUlHuKN2lBws4Mr}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-fd62f17bd9ce1fcc.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +7 -0
- sky/jobs/server/core.py +2 -1
- sky/jobs/server/utils.py +81 -0
- sky/jobs/state.py +49 -30
- sky/jobs/utils.py +34 -3
- sky/provision/kubernetes/instance.py +17 -0
- sky/provision/kubernetes/utils.py +2 -0
- sky/provision/provisioner.py +20 -0
- sky/skylet/constants.py +1 -6
- sky/skylet/job_lib.py +30 -8
- sky/skypilot_config.py +4 -2
- sky/task.py +17 -0
- sky/users/permission.py +3 -0
- sky/utils/common_utils.py +13 -0
- sky/utils/db_utils.py +16 -0
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +2 -4
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/RECORD +46 -45
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
- /sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → P2Di1JdUlHuKN2lBws4Mr}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -77,6 +77,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('task_name', sqlalchemy.Text),
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
 )
 
 job_info_table = sqlalchemy.Table(
@@ -131,7 +132,7 @@ def create_table():
     # is not critical and is likely to be enabled by other processes.
 
     # Create tables if they don't exist
-    Base.metadata
+    db_utils.add_tables_to_db_sqlalchemy(Base.metadata, _SQLALCHEMY_ENGINE)
 
     # Backward compatibility: add columns that not exist in older databases
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -170,6 +171,14 @@ def create_table():
             sqlalchemy.Text(),
             default_statement='DEFAULT NULL')
 
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'spot',
+            'metadata',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT \'{}\'',
+            value_to_replace_existing_entries='{}')
+
         db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
                                                 'schedule_state',
                                                 sqlalchemy.Text())
@@ -273,6 +282,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'task_name': r['task_name'],
         'specs': r['specs'],
         'local_log_file': r['local_log_file'],
+        'metadata': r['metadata'],
         # columns from job_info table (some may be None for legacy jobs)
         '_job_info_job_id': r[job_info_table.c.spot_job_id
                              ],  # ambiguous, use table.column
@@ -560,7 +570,13 @@ def set_job_info_without_job_id(name: str, workspace: str,
 
 
 @_init_db
-def set_pending(
+def set_pending(
+    job_id: int,
+    task_id: int,
+    task_name: str,
+    resources_str: str,
+    metadata: str,
+):
     """Set the task to pending state."""
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -570,6 +586,7 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
             task_id=task_id,
             task_name=task_name,
             resources=resources_str,
+            metadata=metadata,
             status=ManagedJobStatus.PENDING.value,
         ))
         session.commit()
@@ -1195,38 +1212,40 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     # Note: we will get the user_hash here, but don't try to call
    # global_user_state.get_user() on it. This runs on the controller, which may
    # not have the user info. Prefer to do it on the API server side.
+    query = sqlalchemy.select(spot_table, job_info_table).select_from(
+        spot_table.outerjoin(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+    if job_id is not None:
+        query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.spot_job_id.desc(),
+                           spot_table.c.task_id.asc())
+    rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        query = sqlalchemy.select(spot_table, job_info_table).select_from(
-            spot_table.outerjoin(
-                job_info_table,
-                spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-        if job_id is not None:
-            query = query.where(spot_table.c.spot_job_id == job_id)
-        query = query.order_by(spot_table.c.spot_job_id.desc(),
-                               spot_table.c.task_id.asc())
         rows = session.execute(query).fetchall()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            else:
+    jobs = []
+    for row in rows:
+        job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
+        job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        job_dict['schedule_state'] = ManagedJobScheduleState(
+            job_dict['schedule_state'])
+        if job_dict['job_name'] is None:
+            job_dict['job_name'] = job_dict['task_name']
+        job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+        # Add user YAML content for managed jobs.
+        yaml_path = job_dict.get('original_user_yaml_path')
+        if yaml_path:
+            try:
+                with open(yaml_path, 'r', encoding='utf-8') as f:
+                    job_dict['user_yaml'] = f.read()
+            except (FileNotFoundError, IOError, OSError):
                 job_dict['user_yaml'] = None
+        else:
+            job_dict['user_yaml'] = None
 
-
-
+        jobs.append(job_dict)
+    return jobs
 
 
 @_init_db
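The new `metadata` column above carries a JSON-encoded dict per task (server default `'{}'`): `set_pending` now receives it already serialized, and `get_managed_jobs` parses it back with `json.loads` before returning job dicts. A minimal sketch of that round trip, using a made-up `git_commit` value:

```python
import json

# Hypothetical metadata attached to a managed job task at submission time.
metadata = {'git_commit': 'abc1234'}

# set_pending() takes the dict pre-serialized, matching the TEXT column whose
# server_default is '{}'.
metadata_str = json.dumps(metadata)

# get_managed_jobs() does the inverse: each row's 'metadata' field is parsed
# before the job dict is handed to callers.
job_dict = {'metadata': metadata_str}
job_dict['metadata'] = json.loads(job_dict['metadata'])
assert job_dict['metadata'].get('git_commit') == 'abc1234'
```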
sky/jobs/utils.py
CHANGED
@@ -1249,7 +1249,14 @@ def format_job_table(
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
-        columns += [
+        columns += [
+            'STARTED',
+            'INFRA',
+            'RESOURCES',
+            'SCHED. STATE',
+            'DETAILS',
+            'GIT_COMMIT',
+        ]
     if tasks_have_k8s_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)
@@ -1362,6 +1369,7 @@ def format_job_table(
             '-',
             job_tasks[0]['schedule_state'],
             generate_details(details, failure_reason),
+            job_tasks[0].get('metadata', {}).get('git_commit', '-'),
         ])
         if tasks_have_k8s_user:
             job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1427,6 +1435,8 @@ def format_job_table(
                 generate_details(task.get('details'),
                                  task['failure_reason']),
             ])
+
+            values.append(task.get('metadata', {}).get('git_commit', '-'))
             if tasks_have_k8s_user:
                 values.insert(0, task.get('user', '-'))
             job_table.add_row(values)
@@ -1511,6 +1521,22 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)
 
+    @classmethod
+    def get_version_and_job_table(cls) -> str:
+        """Generate code to get controller version and raw job table."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+
+        # Get and print raw job table (load_managed_job_queue can parse this directly)
+        job_table = utils.dump_managed_job_queue()
+        print(job_table, flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
@@ -1565,8 +1591,13 @@ class ManagedJobCodeGen:
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
         code += textwrap.dedent(f"""\
-
-
+            if managed_job_version < 7:
+                managed_job_state.set_pending({job_id}, {task_id},
+                                  {task.name!r}, {resources_str!r})
+            else:
+                managed_job_state.set_pending({job_id}, {task_id},
+                                  {task.name!r}, {resources_str!r},
+                                  {task.metadata_json!r})
             """)
         return cls._build(code)
 
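The `set_pending` codegen above gates the new metadata argument on `managed_job_version < 7`, so code shipped to an older controller still calls the pre-metadata signature. A rough standalone illustration of that pattern (the helper name and inputs here are invented for illustration):

```python
import textwrap


def build_set_pending_code(job_id: int, task_id: int, name: str,
                           resources_str: str, metadata_json: str) -> str:
    # The generated snippet runs on the controller, where
    # `managed_job_version` reflects the controller's own library version, so
    # an old controller never sees the extra argument it cannot accept.
    return textwrap.dedent(f"""\
        if managed_job_version < 7:
            managed_job_state.set_pending({job_id}, {task_id},
                                          {name!r}, {resources_str!r})
        else:
            managed_job_state.set_pending({job_id}, {task_id},
                                          {name!r}, {resources_str!r},
                                          {metadata_json!r})
        """)


print(build_set_pending_code(1, 0, 'train', '1x[CPU:1]',
                             '{"git_commit": "abc1234"}'))
```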
sky/provision/kubernetes/instance.py
CHANGED
@@ -825,6 +825,23 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             return
         pod_spec_copy['metadata']['name'] = pod_name
         pod_spec_copy['metadata']['labels']['component'] = pod_name
+
+        # We need to keep the following fields in the pod spec to be same for
+        # head and worker pods.
+        # So that Kueue can merge them into a single PodSet when creating
+        # ProvisioningRequest to trigger scale up of the cluster autoscaler,
+        # this is especially required for DWS queued provisioning mode in GKE.
+        # spec.containers[*].resources.requests
+        # spec.initContainers[*].resources.requests
+        # spec.resources
+        # spec.nodeSelector
+        # spec.tolerations
+        # spec.affinity
+        # resourceClaims
+        # Refer to the following links for more details:
+        # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
+        # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
+        if config.count > 1:
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
sky/provision/kubernetes/utils.py
CHANGED
@@ -485,6 +485,8 @@ class GKELabelFormatter(GPULabelFormatter):
             # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
             # to distinguish between a3-high and a3-mega instances
             return 'H100'
+        elif acc == 'H200-141GB':
+            return 'H200'
         return acc
     elif is_tpu_on_gke(value):
         return value
sky/provision/provisioner.py
CHANGED
@@ -22,11 +22,13 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import aws
 from sky.backends import backend_utils
+from sky.jobs.server import utils as server_jobs_utils
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
 from sky.skylet import constants
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
@@ -502,6 +504,24 @@ def _post_provision_setup(
         logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
                     f'Docker container is up.{colorama.Style.RESET_ALL}')
 
+    # Check version compatibility for jobs controller clusters
+    if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
+        # TODO(zeping): remove this in v0.12.0
+        # This only happens in upgrade from <0.9.3 to > 0.10.0
+        # After 0.10.0 no incompatibility issue
+        # See https://github.com/skypilot-org/skypilot/pull/6096
+        # For more details
+        status.update(
+            ux_utils.spinner_message(
+                'Checking controller version compatibility'))
+        try:
+            server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
+        except exceptions.ClusterNotUpError:
+            # Controller is not up yet during initial provisioning, that
+            # also means no non-terminal jobs, so no incompatibility in
+            # this case.
+            pass
+
     # We mount the metadata with sky wheel for speedup.
     # NOTE: currently we mount all credentials for all nodes, because
     # (1) jobs controllers need permission to launch/down nodes of
sky/skylet/constants.py
CHANGED
@@ -89,18 +89,13 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '15'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
 SKYLET_LIB_VERSION = 3
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
 
-# `sky jobs dashboard`-related
-#
-# Port on the remote jobs controller that the dashboard is running on.
-SPOT_DASHBOARD_REMOTE_PORT = 5000
-
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
 DEFAULT_DOCKER_PORT = 10022
sky/skylet/job_lib.py
CHANGED
@@ -63,6 +63,7 @@ class JobInfoLoc(enum.IntEnum):
     RESOURCES = 8
     PID = 9
     LOG_PATH = 10
+    METADATA = 11
 
 
 def create_table(cursor, conn):
@@ -103,7 +104,8 @@ def create_table(cursor, conn):
         end_at FLOAT DEFAULT NULL,
         resources TEXT DEFAULT NULL,
         pid INTEGER DEFAULT -1,
-        log_dir TEXT DEFAULT NULL
+        log_dir TEXT DEFAULT NULL,
+        metadata TEXT DEFAULT '{}')""")
 
     cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
         job_id INTEGER,
@@ -118,6 +120,12 @@ def create_table(cursor, conn):
                                  'INTEGER DEFAULT -1')
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
                                  'TEXT DEFAULT NULL')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'jobs',
+                                 'metadata',
+                                 'TEXT DEFAULT \'{}\'',
+                                 value_to_replace_existing_entries='{}')
     conn.commit()
 
 
@@ -338,16 +346,19 @@ def make_job_command_with_user_switching(username: str,
 
 
 @init_db
-def add_job(job_name: str,
-
+def add_job(job_name: str,
+            username: str,
+            run_timestamp: str,
+            resources_str: str,
+            metadata: str = '{}') -> Tuple[int, str]:
     """Atomically reserve the next available job id for the user."""
     assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
     _DB.cursor.execute(
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null)',
+        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
         (job_name, username, job_submitted_at, JobStatus.INIT.value,
-         run_timestamp, None, resources_str))
+         run_timestamp, None, resources_str, metadata))
     _DB.conn.commit()
     rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
                               (run_timestamp,))
@@ -569,6 +580,7 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
             'end_at': row[JobInfoLoc.END_AT.value],
             'resources': row[JobInfoLoc.RESOURCES.value],
             'pid': row[JobInfoLoc.PID.value],
+            'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
         })
     return records
 
@@ -839,7 +851,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
     """
     job_table = log_utils.create_table([
         'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG'
+        'STATUS', 'LOG', 'GIT COMMIT'
     ])
     for job in jobs:
         job_table.add_row([
@@ -854,6 +866,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
             job['resources'],
             job['status'].colored_str(),
             job['log_path'],
+            job.get('metadata', {}).get('git_commit', '-'),
         ])
     return job_table
 
@@ -1055,7 +1068,7 @@ class JobLibCodeGen:
 
     @classmethod
     def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
-                resources_str: str) -> str:
+                resources_str: str, metadata: str) -> str:
         if job_name is None:
             job_name = '-'
         code = [
@@ -1066,11 +1079,20 @@ class JobLibCodeGen:
            '\nif int(constants.SKYLET_VERSION) < 9: '
            'raise RuntimeError("SkyPilot runtime is too old, which does not '
            'support submitting jobs.")',
-           '\nresult =
+           '\nresult = None',
+           '\nif int(constants.SKYLET_VERSION) < 15: '
+           '\n result = job_lib.add_job('
            f'{job_name!r},'
            f'{username!r},'
            f'{run_timestamp!r},'
            f'{resources_str!r})',
+           '\nelse: '
+           '\n result = job_lib.add_job('
+           f'{job_name!r},'
+           f'{username!r},'
+           f'{run_timestamp!r},'
+           f'{resources_str!r},'
+           f'metadata={metadata!r})',
           ('\nif isinstance(result, tuple):'
            '\n print("Job ID: " + str(result[0]), flush=True)'
            '\n print("Log Dir: " + str(result[1]), flush=True)'
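The job_lib changes follow the same shape: the `jobs` table gains a trailing `metadata TEXT DEFAULT '{}'` column, the INSERT statement gains one more placeholder, and reads index the new column through `JobInfoLoc.METADATA`. A self-contained SQLite sketch of that layout (a toy schema, not the real one):

```python
import enum
import json
import sqlite3


class JobInfoLoc(enum.IntEnum):
    # Toy layout; in the real enum the new entry is METADATA = 11.
    JOB_ID = 0
    JOB_NAME = 1
    METADATA = 2


conn = sqlite3.connect(':memory:')
conn.execute("""CREATE TABLE jobs(
    job_id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_name TEXT,
    metadata TEXT DEFAULT '{}')""")
# One extra placeholder carries the JSON-encoded metadata string.
conn.execute('INSERT INTO jobs VALUES (null, ?, ?)',
             ('demo-job', json.dumps({'git_commit': 'abc1234'})))
row = conn.execute('SELECT * FROM jobs').fetchone()
print(json.loads(row[JobInfoLoc.METADATA.value]).get('git_commit', '-'))
```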
sky/skypilot_config.py
CHANGED
@@ -573,7 +573,8 @@ def _reload_config_as_server() -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                      poolclass=NullPool)
-        Base.metadata
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                             sqlalchemy_engine)
 
     def _get_config_yaml_from_db(
             key: str) -> Optional[config_utils.Config]:
@@ -859,7 +860,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                      poolclass=NullPool)
-        Base.metadata
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                             sqlalchemy_engine)
 
     def _set_config_yaml_to_db(key: str,
                                config: config_utils.Config):
sky/task.py
CHANGED
@@ -255,6 +255,7 @@ class Task:
         # Internal use only.
         file_mounts_mapping: Optional[Dict[str, str]] = None,
         volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
     ):
         """Initializes a Task.
 
@@ -313,6 +314,7 @@ class Task:
             is used.) The base docker image that this Task will be built on.
             Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
           blocked_resources: A set of resources that this task cannot run on.
+          metadata: A dictionary of metadata to be added to the task.
         """
         self.name = name
         self.run = run
@@ -369,6 +371,8 @@ class Task:
         self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
             volume_mounts)
 
+        self._metadata = metadata if metadata is not None else {}
+
         dag = sky.dag.get_current_dag()
         if dag is not None:
             dag.add(self)
@@ -503,6 +507,8 @@ class Task:
                     'Workdir must be a valid directory (or '
                     f'a symlink to a directory). {user_workdir} not found.')
 
+        self._metadata['git_commit'] = common_utils.get_git_commit(self.workdir)
+
     @staticmethod
     def from_yaml_config(
         config: Dict[str, Any],
@@ -599,6 +605,7 @@ class Task:
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
             volumes=config.pop('volumes', None),
+            metadata=config.pop('_metadata', None),
         )
 
         # Create lists to store storage objects inlined in file_mounts.
@@ -872,6 +879,14 @@ class Task:
                 f'num_nodes should be a positive int. Got: {num_nodes}')
         self._num_nodes = num_nodes
 
+    @property
+    def metadata(self) -> Dict[str, Any]:
+        return self._metadata
+
+    @property
+    def metadata_json(self) -> str:
+        return json.dumps(self._metadata)
+
     @property
     def envs(self) -> Dict[str, str]:
         return self._envs
@@ -1588,6 +1603,8 @@ class Task:
                 volume_mount.to_yaml_config()
                 for volume_mount in self.volume_mounts
             ]
+        # we manually check if its empty to not clog up the generated yaml
+        add_if_not_none('_metadata', self._metadata if self._metadata else None)
         return config
 
     def get_required_cloud_features(
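On the `Task` side the feature is a plain dict carried on the object, exposed both as a dict and as a JSON string for codegen. A minimal stand-in (not the real `sky.Task`) showing the new surface:

```python
import json
from typing import Any, Dict, Optional


class TaskSketch:
    """Stand-in for the metadata surface added to Task; not the real class."""

    def __init__(self, metadata: Optional[Dict[str, Any]] = None):
        self._metadata = metadata if metadata is not None else {}

    @property
    def metadata(self) -> Dict[str, Any]:
        return self._metadata

    @property
    def metadata_json(self) -> str:
        return json.dumps(self._metadata)


task = TaskSketch()
task.metadata['git_commit'] = 'abc1234'  # recorded when the workdir is validated
print(task.metadata_json)  # {"git_commit": "abc1234"}
```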
sky/users/permission.py
CHANGED
@@ -15,6 +15,7 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.users import rbac
 from sky.utils import common_utils
+from sky.utils import db_utils
 
 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
@@ -38,6 +39,8 @@ class PermissionService:
         if _enforcer_instance is None:
             _enforcer_instance = self
             engine = global_user_state.initialize_and_get_db()
+            db_utils.add_tables_to_db_sqlalchemy(
+                sqlalchemy_adapter.Base.metadata, engine)
             adapter = sqlalchemy_adapter.Adapter(engine)
             model_path = os.path.join(os.path.dirname(__file__),
                                       'model.conf')
sky/utils/common_utils.py
CHANGED
@@ -11,6 +11,7 @@ import platform
 import random
 import re
 import socket
+import subprocess
 import sys
 import time
 import typing
@@ -87,6 +88,18 @@ def generate_user_hash() -> str:
     return user_hash
 
 
+def get_git_commit(path: Optional[str] = None) -> Optional[str]:
+    try:
+        result = subprocess.run(['git', 'rev-parse', 'HEAD'],
+                                capture_output=True,
+                                text=True,
+                                cwd=path,
+                                check=True)
+        return result.stdout.strip()
+    except subprocess.CalledProcessError:
+        return None
+
+
 def get_user_hash() -> str:
     """Returns a unique user-machine specific hash as a user id.
 
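`get_git_commit` shells out to `git rev-parse HEAD` in the given directory and returns `None` when the command fails; note that only `CalledProcessError` is caught, so a missing `git` binary would still raise. Assuming the wheel above is installed, usage looks roughly like:

```python
from sky.utils import common_utils

# None if '.' is not inside a git repository.
commit = common_utils.get_git_commit('.')
print(commit[:7] if commit else 'not a git repository')
```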
sky/utils/db_utils.py
CHANGED
@@ -84,6 +84,22 @@ def add_column_to_table(
     conn.commit()
 
 
+def add_tables_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+):
+    """Add tables to the database."""
+    for table in metadata.tables.values():
+        try:
+            table.create(bind=engine, checkfirst=True)
+        except (sqlalchemy_exc.OperationalError,
+                sqlalchemy_exc.ProgrammingError) as e:
+            if 'already exists' in str(e):
+                pass
+            else:
+                raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,
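`add_tables_to_db_sqlalchemy` is the helper now called from `state.py`, `skypilot_config.py`, and `permission.py`: it creates every table declared on the metadata with `checkfirst=True` and swallows `already exists` errors raised when another process wins the race. A hedged usage sketch against an in-memory SQLite engine (the `Example` model is invented for illustration):

```python
import sqlalchemy
from sqlalchemy import orm

from sky.utils import db_utils

Base = orm.declarative_base()


class Example(Base):
    __tablename__ = 'example'
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)


engine = sqlalchemy.create_engine('sqlite://')
# Safe to call repeatedly and from multiple processes.
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
```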
sky/utils/schemas.py
CHANGED
sky/utils/ux_utils.py
CHANGED
@@ -253,9 +253,7 @@ def command_hint_messages(hint_type: CommandHintType,
                 f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
                 f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
                 f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
-                f'\n{
-                f'{BOLD}sky jobs queue{RESET_BOLD}'
-                f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
-                f'{BOLD}sky jobs dashboard{RESET_BOLD}')
+                f'\n{INDENT_LAST_SYMBOL}To view all managed jobs:\t\t'
+                f'{BOLD}sky jobs queue{RESET_BOLD}')
     else:
         raise ValueError(f'Invalid hint type: {hint_type}')