skypilot-nightly 1.0.0.dev20250709__py3-none-any.whl → 1.0.0.dev20250710__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (47)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +6 -4
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → P2Di1JdUlHuKN2lBws4Mr}/_buildManifest.js +1 -1
  5. sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
  6. sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-fd62f17bd9ce1fcc.js} +1 -1
  7. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  8. sky/dashboard/out/clusters/[cluster].html +1 -1
  9. sky/dashboard/out/clusters.html +1 -1
  10. sky/dashboard/out/config.html +1 -1
  11. sky/dashboard/out/index.html +1 -1
  12. sky/dashboard/out/infra/[context].html +1 -1
  13. sky/dashboard/out/infra.html +1 -1
  14. sky/dashboard/out/jobs/[job].html +1 -1
  15. sky/dashboard/out/jobs.html +1 -1
  16. sky/dashboard/out/users.html +1 -1
  17. sky/dashboard/out/volumes.html +1 -1
  18. sky/dashboard/out/workspace/new.html +1 -1
  19. sky/dashboard/out/workspaces/[name].html +1 -1
  20. sky/dashboard/out/workspaces.html +1 -1
  21. sky/global_user_state.py +1 -1
  22. sky/jobs/constants.py +1 -1
  23. sky/jobs/controller.py +7 -0
  24. sky/jobs/server/core.py +2 -1
  25. sky/jobs/server/utils.py +81 -0
  26. sky/jobs/state.py +49 -30
  27. sky/jobs/utils.py +34 -3
  28. sky/provision/kubernetes/instance.py +17 -0
  29. sky/provision/kubernetes/utils.py +2 -0
  30. sky/provision/provisioner.py +20 -0
  31. sky/skylet/constants.py +1 -6
  32. sky/skylet/job_lib.py +30 -8
  33. sky/skypilot_config.py +4 -2
  34. sky/task.py +17 -0
  35. sky/users/permission.py +3 -0
  36. sky/utils/common_utils.py +13 -0
  37. sky/utils/db_utils.py +16 -0
  38. sky/utils/schemas.py +3 -0
  39. sky/utils/ux_utils.py +2 -4
  40. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/METADATA +1 -1
  41. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/RECORD +46 -45
  42. sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
  43. /sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → P2Di1JdUlHuKN2lBws4Mr}/_ssgManifest.js +0 -0
  44. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/WHEEL +0 -0
  45. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/entry_points.txt +0 -0
  46. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/licenses/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -77,6 +77,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('task_name', sqlalchemy.Text),
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
 )
 
 job_info_table = sqlalchemy.Table(
@@ -131,7 +132,7 @@ def create_table():
     # is not critical and is likely to be enabled by other processes.
 
     # Create tables if they don't exist
-    Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
+    db_utils.add_tables_to_db_sqlalchemy(Base.metadata, _SQLALCHEMY_ENGINE)
 
     # Backward compatibility: add columns that not exist in older databases
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -170,6 +171,14 @@ def create_table():
                                                 sqlalchemy.Text(),
                                                 default_statement='DEFAULT NULL')
 
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'spot',
+            'metadata',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT \'{}\'',
+            value_to_replace_existing_entries='{}')
+
         db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
                                                 'schedule_state',
                                                 sqlalchemy.Text())
@@ -273,6 +282,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'task_name': r['task_name'],
         'specs': r['specs'],
         'local_log_file': r['local_log_file'],
+        'metadata': r['metadata'],
         # columns from job_info table (some may be None for legacy jobs)
         '_job_info_job_id': r[job_info_table.c.spot_job_id
                              ],  # ambiguous, use table.column
@@ -560,7 +570,13 @@ def set_job_info_without_job_id(name: str, workspace: str,
 
 
 @_init_db
-def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
+def set_pending(
+    job_id: int,
+    task_id: int,
+    task_name: str,
+    resources_str: str,
+    metadata: str,
+):
     """Set the task to pending state."""
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -570,6 +586,7 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
                 task_id=task_id,
                 task_name=task_name,
                 resources=resources_str,
+                metadata=metadata,
                 status=ManagedJobStatus.PENDING.value,
             ))
         session.commit()
@@ -1195,38 +1212,40 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     # Note: we will get the user_hash here, but don't try to call
     # global_user_state.get_user() on it. This runs on the controller, which may
     # not have the user info. Prefer to do it on the API server side.
+    query = sqlalchemy.select(spot_table, job_info_table).select_from(
+        spot_table.outerjoin(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+    if job_id is not None:
+        query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.spot_job_id.desc(),
+                           spot_table.c.task_id.asc())
+    rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        query = sqlalchemy.select(spot_table, job_info_table).select_from(
-            spot_table.outerjoin(
-                job_info_table,
-                spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-        if job_id is not None:
-            query = query.where(spot_table.c.spot_job_id == job_id)
-        query = query.order_by(spot_table.c.spot_job_id.desc(),
-                               spot_table.c.task_id.asc())
         rows = session.execute(query).fetchall()
-        jobs = []
-        for row in rows:
-            job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
-            job_dict['status'] = ManagedJobStatus(job_dict['status'])
-            job_dict['schedule_state'] = ManagedJobScheduleState(
-                job_dict['schedule_state'])
-            if job_dict['job_name'] is None:
-                job_dict['job_name'] = job_dict['task_name']
-
-            # Add user YAML content for managed jobs.
-            yaml_path = job_dict.get('original_user_yaml_path')
-            if yaml_path:
-                try:
-                    with open(yaml_path, 'r', encoding='utf-8') as f:
-                        job_dict['user_yaml'] = f.read()
-                except (FileNotFoundError, IOError, OSError):
-                    job_dict['user_yaml'] = None
-            else:
+    jobs = []
+    for row in rows:
+        job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
+        job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        job_dict['schedule_state'] = ManagedJobScheduleState(
+            job_dict['schedule_state'])
+        if job_dict['job_name'] is None:
+            job_dict['job_name'] = job_dict['task_name']
+        job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+        # Add user YAML content for managed jobs.
+        yaml_path = job_dict.get('original_user_yaml_path')
+        if yaml_path:
+            try:
+                with open(yaml_path, 'r', encoding='utf-8') as f:
+                    job_dict['user_yaml'] = f.read()
+            except (FileNotFoundError, IOError, OSError):
                 job_dict['user_yaml'] = None
+        else:
+            job_dict['user_yaml'] = None
 
-            jobs.append(job_dict)
-    return jobs
+        jobs.append(job_dict)
+    return jobs
 
 
 @_init_db
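
The new `metadata` column stores a JSON-encoded dict: `set_pending()` now receives it as a string, and `get_managed_jobs()` decodes it with `json.loads` on every returned row. A minimal sketch of that round-trip, with illustrative values:

```python
import json

# Illustrative round-trip for the new 'metadata' column: the caller passes a
# JSON string to set_pending(), and get_managed_jobs() decodes it back into a
# dict. The '{}' server default keeps rows from older databases decodable.
metadata = {'git_commit': 'abc1234'}
stored = json.dumps(metadata)   # what set_pending(..., metadata=...) receives
row = {'metadata': stored}      # as fetched from the spot table
assert json.loads(row['metadata']) == metadata
```
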
sky/jobs/utils.py CHANGED
@@ -1249,7 +1249,14 @@ def format_job_table(
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
-        columns += ['STARTED', 'INFRA', 'RESOURCES', 'SCHED. STATE', 'DETAILS']
+        columns += [
+            'STARTED',
+            'INFRA',
+            'RESOURCES',
+            'SCHED. STATE',
+            'DETAILS',
+            'GIT_COMMIT',
+        ]
     if tasks_have_k8s_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)
@@ -1362,6 +1369,7 @@
                 '-',
                 job_tasks[0]['schedule_state'],
                 generate_details(details, failure_reason),
+                job_tasks[0].get('metadata', {}).get('git_commit', '-'),
             ])
         if tasks_have_k8s_user:
             job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1427,6 +1435,8 @@
                     generate_details(task.get('details'),
                                      task['failure_reason']),
                 ])
+
+            values.append(task.get('metadata', {}).get('git_commit', '-'))
             if tasks_have_k8s_user:
                 values.insert(0, task.get('user', '-'))
             job_table.add_row(values)
@@ -1511,6 +1521,22 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)
 
+    @classmethod
+    def get_version_and_job_table(cls) -> str:
+        """Generate code to get controller version and raw job table."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+
+        # Get and print raw job table (load_managed_job_queue can parse this directly)
+        job_table = utils.dump_managed_job_queue()
+        print(job_table, flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
@@ -1565,8 +1591,13 @@ class ManagedJobCodeGen:
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
         code += textwrap.dedent(f"""\
-            managed_job_state.set_pending({job_id}, {task_id},
-                              {task.name!r}, {resources_str!r})
+            if managed_job_version < 7:
+                managed_job_state.set_pending({job_id}, {task_id},
+                                  {task.name!r}, {resources_str!r})
+            else:
+                managed_job_state.set_pending({job_id}, {task_id},
+                                  {task.name!r}, {resources_str!r},
+                                  {task.metadata_json!r})
             """)
         return cls._build(code)
 
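
`get_version_and_job_table()` prints a `controller_version:<v>` line followed by the raw dump from `utils.dump_managed_job_queue()`. A hypothetical parser for that framing (not the package's actual consumer, which the comment above says is `load_managed_job_queue`):

```python
from typing import Tuple

# Hypothetical sketch: split the output of get_version_and_job_table() into
# the controller's skylet version (first line) and the raw job table payload
# (everything after the first newline).
def split_version_and_table(output: str) -> Tuple[str, str]:
    first_line, _, rest = output.partition('\n')
    prefix = 'controller_version:'
    if not first_line.startswith(prefix):
        raise ValueError(f'unexpected first line: {first_line!r}')
    return first_line[len(prefix):], rest
```
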
sky/provision/kubernetes/instance.py CHANGED
@@ -825,6 +825,23 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             return
         pod_spec_copy['metadata']['name'] = pod_name
         pod_spec_copy['metadata']['labels']['component'] = pod_name
+
+        # We need to keep the following fields in the pod spec to be same for
+        # head and worker pods.
+        # So that Kueue can merge them into a single PodSet when creating
+        # ProvisioningRequest to trigger scale up of the cluster autoscaler,
+        # this is especially required for DWS queued provisioning mode in GKE.
+        # spec.containers[*].resources.requests
+        # spec.initContainers[*].resources.requests
+        # spec.resources
+        # spec.nodeSelector
+        # spec.tolerations
+        # spec.affinity
+        # resourceClaims
+        # Refer to the following links for more details:
+        # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
+        # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
+        if config.count > 1:
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
sky/provision/kubernetes/utils.py CHANGED
@@ -485,6 +485,8 @@ class GKELabelFormatter(GPULabelFormatter):
             # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
             # to distinguish between a3-high and a3-mega instances
             return 'H100'
+        elif acc == 'H200-141GB':
+            return 'H200'
         return acc
     elif is_tpu_on_gke(value):
         return value
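
The new branch mirrors the H100 handling right above it: GKE node labels carry the memory suffix while SkyPilot's canonical accelerator name does not. Sketched in isolation with a stand-in function name (the real logic lives in `GKELabelFormatter`):

```python
# Stand-in for the mapping in GKELabelFormatter: GKE advertises 'H200-141GB',
# SkyPilot's canonical accelerator name is plain 'H200'.
def canonical_gke_accelerator(acc: str) -> str:
    if acc == 'H200-141GB':
        return 'H200'
    return acc

assert canonical_gke_accelerator('H200-141GB') == 'H200'
```
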
sky/provision/provisioner.py CHANGED
@@ -22,11 +22,13 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import aws
 from sky.backends import backend_utils
+from sky.jobs.server import utils as server_jobs_utils
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
 from sky.skylet import constants
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
@@ -502,6 +504,24 @@ def _post_provision_setup(
         logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
                     f'Docker container is up.{colorama.Style.RESET_ALL}')
 
+    # Check version compatibility for jobs controller clusters
+    if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
+        # TODO(zeping): remove this in v0.12.0
+        # This only happens in upgrade from <0.9.3 to > 0.10.0
+        # After 0.10.0 no incompatibility issue
+        # See https://github.com/skypilot-org/skypilot/pull/6096
+        # For more details
+        status.update(
+            ux_utils.spinner_message(
+                'Checking controller version compatibility'))
+        try:
+            server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
+        except exceptions.ClusterNotUpError:
+            # Controller is not up yet during initial provisioning, that
+            # also means no non-terminal jobs, so no incompatibility in
+            # this case.
+            pass
+
     # We mount the metadata with sky wheel for speedup.
     # NOTE: currently we mount all credentials for all nodes, because
     # (1) jobs controllers need permission to launch/down nodes of
sky/skylet/constants.py CHANGED
@@ -89,18 +89,13 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '14'
+SKYLET_VERSION = '15'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
 SKYLET_LIB_VERSION = 3
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
 
-# `sky jobs dashboard`-related
-#
-# Port on the remote jobs controller that the dashboard is running on.
-SPOT_DASHBOARD_REMOTE_PORT = 5000
-
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
 DEFAULT_DOCKER_PORT = 10022
sky/skylet/job_lib.py CHANGED
@@ -63,6 +63,7 @@ class JobInfoLoc(enum.IntEnum):
     RESOURCES = 8
     PID = 9
     LOG_PATH = 10
+    METADATA = 11
 
 
 def create_table(cursor, conn):
@@ -103,7 +104,8 @@ def create_table(cursor, conn):
         end_at FLOAT DEFAULT NULL,
         resources TEXT DEFAULT NULL,
         pid INTEGER DEFAULT -1,
-        log_dir TEXT DEFAULT NULL)""")
+        log_dir TEXT DEFAULT NULL,
+        metadata TEXT DEFAULT '{}')""")
 
     cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
         job_id INTEGER,
@@ -118,6 +120,12 @@ def create_table(cursor, conn):
                                  'INTEGER DEFAULT -1')
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
                                  'TEXT DEFAULT NULL')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'jobs',
+                                 'metadata',
+                                 'TEXT DEFAULT \'{}\'',
+                                 value_to_replace_existing_entries='{}')
     conn.commit()
 
 
@@ -338,16 +346,19 @@ def make_job_command_with_user_switching(username: str,
 
 
 @init_db
-def add_job(job_name: str, username: str, run_timestamp: str,
-            resources_str: str) -> Tuple[int, str]:
+def add_job(job_name: str,
+            username: str,
+            run_timestamp: str,
+            resources_str: str,
+            metadata: str = '{}') -> Tuple[int, str]:
     """Atomically reserve the next available job id for the user."""
     assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
     _DB.cursor.execute(
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null)',
+        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
         (job_name, username, job_submitted_at, JobStatus.INIT.value,
-         run_timestamp, None, resources_str))
+         run_timestamp, None, resources_str, metadata))
     _DB.conn.commit()
     rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
                               (run_timestamp,))
@@ -569,6 +580,7 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
             'end_at': row[JobInfoLoc.END_AT.value],
             'resources': row[JobInfoLoc.RESOURCES.value],
             'pid': row[JobInfoLoc.PID.value],
+            'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
         })
     return records
 
@@ -839,7 +851,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
     """
     job_table = log_utils.create_table([
         'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG'
+        'STATUS', 'LOG', 'GIT COMMIT'
    ])
     for job in jobs:
         job_table.add_row([
@@ -854,6 +866,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
             job['resources'],
             job['status'].colored_str(),
             job['log_path'],
+            job.get('metadata', {}).get('git_commit', '-'),
         ])
     return job_table
 
@@ -1055,7 +1068,7 @@ class JobLibCodeGen:
 
     @classmethod
     def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
-                resources_str: str) -> str:
+                resources_str: str, metadata: str) -> str:
         if job_name is None:
             job_name = '-'
         code = [
@@ -1066,11 +1079,20 @@ class JobLibCodeGen:
             '\nif int(constants.SKYLET_VERSION) < 9: '
             'raise RuntimeError("SkyPilot runtime is too old, which does not '
             'support submitting jobs.")',
-            '\nresult = job_lib.add_job('
+            '\nresult = None',
+            '\nif int(constants.SKYLET_VERSION) < 15: '
+            '\n  result = job_lib.add_job('
             f'{job_name!r},'
            f'{username!r},'
            f'{run_timestamp!r},'
            f'{resources_str!r})',
+            '\nelse: '
+            '\n  result = job_lib.add_job('
+            f'{job_name!r},'
+            f'{username!r},'
+            f'{run_timestamp!r},'
+            f'{resources_str!r},'
+            f'metadata={metadata!r})',
             ('\nif isinstance(result, tuple):'
              '\n  print("Job ID: " + str(result[0]), flush=True)'
              '\n  print("Log Dir: " + str(result[1]), flush=True)'
sky/skypilot_config.py CHANGED
@@ -573,7 +573,8 @@ def _reload_config_as_server() -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                      poolclass=NullPool)
-        Base.metadata.create_all(bind=sqlalchemy_engine)
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                             sqlalchemy_engine)
 
 def _get_config_yaml_from_db(
         key: str) -> Optional[config_utils.Config]:
@@ -859,7 +860,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                      poolclass=NullPool)
-        Base.metadata.create_all(bind=sqlalchemy_engine)
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                             sqlalchemy_engine)
 
 def _set_config_yaml_to_db(key: str,
                            config: config_utils.Config):
sky/task.py CHANGED
@@ -255,6 +255,7 @@ class Task:
         # Internal use only.
         file_mounts_mapping: Optional[Dict[str, str]] = None,
         volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
     ):
         """Initializes a Task.
 
@@ -313,6 +314,7 @@
             is used.) The base docker image that this Task will be built on.
             Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
           blocked_resources: A set of resources that this task cannot run on.
+          metadata: A dictionary of metadata to be added to the task.
         """
         self.name = name
         self.run = run
@@ -369,6 +371,8 @@
         self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
             volume_mounts)
 
+        self._metadata = metadata if metadata is not None else {}
+
         dag = sky.dag.get_current_dag()
         if dag is not None:
             dag.add(self)
@@ -503,6 +507,8 @@
                     'Workdir must be a valid directory (or '
                     f'a symlink to a directory). {user_workdir} not found.')
 
+        self._metadata['git_commit'] = common_utils.get_git_commit(self.workdir)
+
     @staticmethod
     def from_yaml_config(
         config: Dict[str, Any],
@@ -599,6 +605,7 @@
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
             volumes=config.pop('volumes', None),
+            metadata=config.pop('_metadata', None),
         )
 
         # Create lists to store storage objects inlined in file_mounts.
@@ -872,6 +879,14 @@
                 f'num_nodes should be a positive int. Got: {num_nodes}')
         self._num_nodes = num_nodes
 
+    @property
+    def metadata(self) -> Dict[str, Any]:
+        return self._metadata
+
+    @property
+    def metadata_json(self) -> str:
+        return json.dumps(self._metadata)
+
     @property
     def envs(self) -> Dict[str, str]:
         return self._envs
@@ -1588,6 +1603,8 @@
                 volume_mount.to_yaml_config()
                 for volume_mount in self.volume_mounts
             ]
+        # we manually check if its empty to not clog up the generated yaml
+        add_if_not_none('_metadata', self._metadata if self._metadata else None)
         return config
 
     def get_required_cloud_features(
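
A minimal sketch of the new plumbing, with illustrative values: a metadata dict seeds `Task` construction, `to_yaml_config()` writes it under the internal `_metadata` key only when non-empty, and `from_yaml_config()` pops that key back into the constructor. (The workdir hunk above additionally records `_metadata['git_commit']` during workdir validation.)

```python
import sky

# Sketch of the metadata round-trip introduced in this release; the
# 'owner' key is an arbitrary example, not a key SkyPilot defines.
task = sky.Task(run='echo hello', metadata={'owner': 'infra-team'})
config = task.to_yaml_config()
assert config.get('_metadata') == {'owner': 'infra-team'}

restored = sky.Task.from_yaml_config(config)
print(restored.metadata)       # {'owner': 'infra-team'}
print(restored.metadata_json)  # '{"owner": "infra-team"}'
```
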
sky/users/permission.py CHANGED
@@ -15,6 +15,7 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.users import rbac
 from sky.utils import common_utils
+from sky.utils import db_utils
 
 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
@@ -38,6 +39,8 @@ class PermissionService:
             if _enforcer_instance is None:
                 _enforcer_instance = self
                 engine = global_user_state.initialize_and_get_db()
+                db_utils.add_tables_to_db_sqlalchemy(
+                    sqlalchemy_adapter.Base.metadata, engine)
                 adapter = sqlalchemy_adapter.Adapter(engine)
                 model_path = os.path.join(os.path.dirname(__file__),
                                           'model.conf')
sky/utils/common_utils.py CHANGED
@@ -11,6 +11,7 @@ import platform
 import random
 import re
 import socket
+import subprocess
 import sys
 import time
 import typing
@@ -87,6 +88,18 @@ def generate_user_hash() -> str:
     return user_hash
 
 
+def get_git_commit(path: Optional[str] = None) -> Optional[str]:
+    try:
+        result = subprocess.run(['git', 'rev-parse', 'HEAD'],
+                                capture_output=True,
+                                text=True,
+                                cwd=path,
+                                check=True)
+        return result.stdout.strip()
+    except subprocess.CalledProcessError:
+        return None
+
+
 def get_user_hash() -> str:
     """Returns a unique user-machine specific hash as a user id.
 
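
Usage sketch for the new helper: it shells out to `git rev-parse HEAD` in the given directory and, because `check=True` raises on a non-zero exit, returns `None` when the command fails (for example, when the path is not inside a git repository):

```python
from sky.utils import common_utils

# Returns the HEAD commit hash of the repository containing the given path,
# or None when `git rev-parse HEAD` exits non-zero there.
commit = common_utils.get_git_commit('.')
print(commit if commit is not None else 'not a git repository')
```
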
sky/utils/db_utils.py CHANGED
@@ -84,6 +84,22 @@ def add_column_to_table(
     conn.commit()
 
 
+def add_tables_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+):
+    """Add tables to the database."""
+    for table in metadata.tables.values():
+        try:
+            table.create(bind=engine, checkfirst=True)
+        except (sqlalchemy_exc.OperationalError,
+                sqlalchemy_exc.ProgrammingError) as e:
+            if 'already exists' in str(e):
+                pass
+            else:
+                raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,
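
This helper replaces the bare `Base.metadata.create_all()` calls elsewhere in the diff. Creating each table individually with `checkfirst=True` and swallowing "already exists" errors is presumably aimed at several processes initializing the same database concurrently. A minimal usage sketch with a throwaway table:

```python
import sqlalchemy

from sky.utils import db_utils

# Create a table twice: the second call is a no-op thanks to checkfirst=True,
# and an "already exists" error from a concurrent creator would be swallowed
# rather than raised.
engine = sqlalchemy.create_engine('sqlite://')
metadata = sqlalchemy.MetaData()
sqlalchemy.Table('example', metadata,
                 sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True))
db_utils.add_tables_to_db_sqlalchemy(metadata, engine)
db_utils.add_tables_to_db_sqlalchemy(metadata, engine)  # idempotent
```
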
sky/utils/schemas.py CHANGED
@@ -870,6 +870,9 @@ def get_task_schema():
             'type': 'array',
             'items': get_volume_mount_schema(),
         },
+        '_metadata': {
+            'type': 'object',
+        },
         **_experimental_task_schema(),
     }
 }
sky/utils/ux_utils.py CHANGED
@@ -253,9 +253,7 @@ def command_hint_messages(hint_type: CommandHintType,
                 f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
                 f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
                 f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
-                f'\n{INDENT_SYMBOL}To view all managed jobs:\t\t'
-                f'{BOLD}sky jobs queue{RESET_BOLD}'
-                f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
-                f'{BOLD}sky jobs dashboard{RESET_BOLD}')
+                f'\n{INDENT_LAST_SYMBOL}To view all managed jobs:\t\t'
+                f'{BOLD}sky jobs queue{RESET_BOLD}')
     else:
         raise ValueError(f'Invalid hint type: {hint_type}')
{skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250709
+Version: 1.0.0.dev20250710
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0