skypilot-nightly 1.0.0.dev20250709__py3-none-any.whl → 1.0.0.dev20250711__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +6 -4
  3. sky/clouds/kubernetes.py +137 -23
  4. sky/core.py +3 -1
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/chunks/1871-3a0f047988be65cd.js +6 -0
  7. sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-60070a62f55486a6.js} +1 -1
  9. sky/dashboard/out/_next/static/css/6cbd41a88d2e9e1c.css +3 -0
  10. sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_buildManifest.js +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/global_user_state.py +10 -11
  26. sky/jobs/constants.py +1 -1
  27. sky/jobs/controller.py +7 -0
  28. sky/jobs/server/core.py +2 -1
  29. sky/jobs/server/utils.py +81 -0
  30. sky/jobs/state.py +58 -40
  31. sky/jobs/utils.py +45 -6
  32. sky/provision/kubernetes/instance.py +17 -0
  33. sky/provision/kubernetes/utils.py +134 -0
  34. sky/provision/provisioner.py +20 -0
  35. sky/skylet/constants.py +1 -6
  36. sky/skylet/job_lib.py +30 -8
  37. sky/skypilot_config.py +8 -3
  38. sky/task.py +17 -0
  39. sky/templates/kubernetes-ray.yml.j2 +298 -10
  40. sky/users/permission.py +18 -1
  41. sky/users/token_service.py +25 -3
  42. sky/utils/common_utils.py +13 -0
  43. sky/utils/db_utils.py +16 -0
  44. sky/utils/schemas.py +6 -0
  45. sky/utils/ux_utils.py +2 -4
  46. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA +1 -1
  47. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/RECORD +55 -54
  48. sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
  49. sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
  50. sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
  51. /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-e6e82dc8abb50c4f.js} +0 -0
  52. /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
  53. /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
  54. /sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -30,6 +30,7 @@ from sky.backends import backend_utils
30
30
  from sky.jobs import constants as managed_job_constants
31
31
  from sky.jobs import scheduler
32
32
  from sky.jobs import state as managed_job_state
33
+ from sky.server import common as server_common
33
34
  from sky.skylet import constants
34
35
  from sky.skylet import job_lib
35
36
  from sky.skylet import log_lib
@@ -38,6 +39,7 @@ from sky.utils import annotations
38
39
  from sky.utils import command_runner
39
40
  from sky.utils import common_utils
40
41
  from sky.utils import controller_utils
42
+ from sky.utils import env_options
41
43
  from sky.utils import infra_utils
42
44
  from sky.utils import log_utils
43
45
  from sky.utils import message_utils
@@ -128,9 +130,15 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
128
130
  time.sleep(backoff.current_backoff())
129
131
 
130
132
 
131
- def _check_consolidation_mode_consistency(
133
+ def _validate_consolidation_mode_config(
132
134
  current_is_consolidation_mode: bool) -> None:
133
- """Check the consistency of the consolidation mode."""
135
+ """Validate the consolidation mode config."""
136
+ if (current_is_consolidation_mode and
137
+ not env_options.Options.IS_DEVELOPER.get() and
138
+ server_common.is_api_server_local()):
139
+ with ux_utils.print_exception_no_traceback():
140
+ raise exceptions.NotSupportedError(
141
+ 'Consolidation mode is not supported when running locally.')
134
142
  # Check whether the consolidation mode config is changed.
135
143
  if current_is_consolidation_mode:
136
144
  controller_cn = (
@@ -176,7 +184,7 @@ def _check_consolidation_mode_consistency(
176
184
  def is_consolidation_mode() -> bool:
177
185
  consolidation_mode = skypilot_config.get_nested(
178
186
  ('jobs', 'controller', 'consolidation_mode'), default_value=False)
179
- _check_consolidation_mode_consistency(consolidation_mode)
187
+ _validate_consolidation_mode_config(consolidation_mode)
180
188
  return consolidation_mode
181
189
 
182
190
 
@@ -1249,7 +1257,14 @@ def format_job_table(
1249
1257
  ]
1250
1258
  if show_all:
1251
1259
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
1252
- columns += ['STARTED', 'INFRA', 'RESOURCES', 'SCHED. STATE', 'DETAILS']
1260
+ columns += [
1261
+ 'STARTED',
1262
+ 'INFRA',
1263
+ 'RESOURCES',
1264
+ 'SCHED. STATE',
1265
+ 'DETAILS',
1266
+ 'GIT_COMMIT',
1267
+ ]
1253
1268
  if tasks_have_k8s_user:
1254
1269
  columns.insert(0, 'USER')
1255
1270
  job_table = log_utils.create_table(columns)
@@ -1362,6 +1377,7 @@ def format_job_table(
1362
1377
  '-',
1363
1378
  job_tasks[0]['schedule_state'],
1364
1379
  generate_details(details, failure_reason),
1380
+ job_tasks[0].get('metadata', {}).get('git_commit', '-'),
1365
1381
  ])
1366
1382
  if tasks_have_k8s_user:
1367
1383
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1427,6 +1443,8 @@ def format_job_table(
1427
1443
  generate_details(task.get('details'),
1428
1444
  task['failure_reason']),
1429
1445
  ])
1446
+
1447
+ values.append(task.get('metadata', {}).get('git_commit', '-'))
1430
1448
  if tasks_have_k8s_user:
1431
1449
  values.insert(0, task.get('user', '-'))
1432
1450
  job_table.add_row(values)
@@ -1511,6 +1529,22 @@ class ManagedJobCodeGen:
1511
1529
  """)
1512
1530
  return cls._build(code)
1513
1531
 
1532
+ @classmethod
1533
+ def get_version_and_job_table(cls) -> str:
1534
+ """Generate code to get controller version and raw job table."""
1535
+ code = textwrap.dedent("""\
1536
+ from sky.skylet import constants as controller_constants
1537
+
1538
+ # Get controller version
1539
+ controller_version = controller_constants.SKYLET_VERSION
1540
+ print(f"controller_version:{controller_version}", flush=True)
1541
+
1542
+ # Get and print raw job table (load_managed_job_queue can parse this directly)
1543
+ job_table = utils.dump_managed_job_queue()
1544
+ print(job_table, flush=True)
1545
+ """)
1546
+ return cls._build(code)
1547
+
1514
1548
  @classmethod
1515
1549
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
1516
1550
  code = textwrap.dedent(f"""\
@@ -1565,8 +1599,13 @@ class ManagedJobCodeGen:
1565
1599
  resources_str = backend_utils.get_task_resources_str(
1566
1600
  task, is_managed_job=True)
1567
1601
  code += textwrap.dedent(f"""\
1568
- managed_job_state.set_pending({job_id}, {task_id},
1569
- {task.name!r}, {resources_str!r})
1602
+ if managed_job_version < 7:
1603
+ managed_job_state.set_pending({job_id}, {task_id},
1604
+ {task.name!r}, {resources_str!r})
1605
+ else:
1606
+ managed_job_state.set_pending({job_id}, {task_id},
1607
+ {task.name!r}, {resources_str!r},
1608
+ {task.metadata_json!r})
1570
1609
  """)
1571
1610
  return cls._build(code)
1572
1611
 
@@ -825,6 +825,23 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
825
825
  return
826
826
  pod_spec_copy['metadata']['name'] = pod_name
827
827
  pod_spec_copy['metadata']['labels']['component'] = pod_name
828
+
829
+ # We need to keep the following fields in the pod spec to be same for
830
+ # head and worker pods.
831
+ # So that Kueue can merge them into a single PodSet when creating
832
+ # ProvisioningRequest to trigger scale up of the cluster autoscaler,
833
+ # this is especially required for DWS queued provisioning mode in GKE.
834
+ # spec.containers[*].resources.requests
835
+ # spec.initContainers[*].resources.requests
836
+ # spec.resources
837
+ # spec.nodeSelector
838
+ # spec.tolerations
839
+ # spec.affinity
840
+ # resourceClaims
841
+ # Refer to the following links for more details:
842
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
843
+ # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
844
+ if config.count > 1:
828
845
  # For multi-node support, we put a soft-constraint to schedule
829
846
  # worker pods on different nodes than the head pod.
830
847
  # This is not set as a hard constraint because if different nodes
@@ -1,5 +1,6 @@
1
1
  """Kubernetes utilities for SkyPilot."""
2
2
  import dataclasses
3
+ import enum
3
4
  import functools
4
5
  import hashlib
5
6
  import json
@@ -57,6 +58,69 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
57
58
  # and store all data that needs to be persisted in future.
58
59
  HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
59
60
 
61
+
62
+ class KubernetesHighPerformanceNetworkType(enum.Enum):
63
+ """Enum for different Kubernetes cluster types with high performance
64
+ network configurations.
65
+
66
+ This enum defines cluster types that support optimized networking for
67
+ distributed ML workloads:
68
+ - GCP_TCPX: GKE clusters with GPUDirect-TCPX support
69
+ (A3 High instances: a3-highgpu-8g)
70
+ - GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
71
+ (A3 Mega instances: a3-megagpu-8g)
72
+ - GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
73
+ (A4/A3 Ultra instances)
74
+ - NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
75
+ low-latency networking
76
+ - NONE: Standard clusters without specialized networking optimizations
77
+
78
+ The network configurations align with corresponding VM-based
79
+ implementations:
80
+ - GCP settings match
81
+ sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
82
+ - Nebius settings match the InfiniBand configuration used in Nebius VMs
83
+ """
84
+
85
+ GCP_TCPX = 'gcp_tcpx'
86
+ GCP_TCPXO = 'gcp_tcpxo'
87
+ GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
88
+ NEBIUS = 'nebius'
89
+ NONE = 'none'
90
+
91
+ def get_network_env_vars(self) -> Dict[str, str]:
92
+ """Get network environment variables for this cluster type."""
93
+ if self == KubernetesHighPerformanceNetworkType.NEBIUS:
94
+ # Nebius cluster with InfiniBand - use InfiniBand optimizations
95
+ return {
96
+ 'NCCL_IB_HCA': 'mlx5',
97
+ 'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
98
+ 'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
99
+ }
100
+ else:
101
+ # GCP clusters and generic clusters - environment variables are
102
+ # handled directly in the template
103
+ return {}
104
+
105
+ def supports_high_performance_networking(self) -> bool:
106
+ """Check if this cluster type supports high performance networking."""
107
+ return self is not KubernetesHighPerformanceNetworkType.NONE
108
+
109
+ def supports_gpu_direct(self) -> bool:
110
+ """Check if this cluster type supports GPUDirect networking."""
111
+ return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
112
+ KubernetesHighPerformanceNetworkType.GCP_TCPXO,
113
+ KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
114
+
115
+ def requires_ipc_lock_capability(self) -> bool:
116
+ """Check if this cluster type requires IPC_LOCK capability."""
117
+ return self.supports_high_performance_networking()
118
+
119
+ def requires_tcpxo_daemon(self) -> bool:
120
+ """Check if this cluster type requires TCPXO daemon."""
121
+ return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
122
+
123
+
60
124
  # TODO(romilb): Move constants to constants.py
61
125
  DEFAULT_NAMESPACE = 'default'
62
126
 
@@ -485,6 +549,8 @@ class GKELabelFormatter(GPULabelFormatter):
485
549
  # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
486
550
  # to distinguish between a3-high and a3-mega instances
487
551
  return 'H100'
552
+ elif acc == 'H200-141GB':
553
+ return 'H200'
488
554
  return acc
489
555
  elif is_tpu_on_gke(value):
490
556
  return value
@@ -756,6 +822,74 @@ class GKEAutoscaler(Autoscaler):
756
822
  return True
757
823
  return False
758
824
 
825
+ @classmethod
826
+ @annotations.lru_cache(scope='request', maxsize=10)
827
+ def get_available_machine_types(cls, context: str) -> List[str]:
828
+ """Returns the list of machine types that are available in the cluster.
829
+ """
830
+ # Assume context naming convention of
831
+ # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
832
+ valid, project_id, location, cluster_name = cls._validate_context_name(
833
+ context)
834
+ if not valid:
835
+ # Context name is not in the format of
836
+ # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
837
+ # Cannot determine if the context can autoscale.
838
+ # Return empty list.
839
+ logger.debug(f'Context {context} is not in the format of '
840
+ f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
841
+ 'Returning empty machine type list.')
842
+ return []
843
+ try:
844
+ logger.debug(
845
+ f'Attempting to get information about cluster {cluster_name}')
846
+ container_service = gcp.build('container',
847
+ 'v1',
848
+ credentials=None,
849
+ cache_discovery=False)
850
+ cluster = container_service.projects().locations().clusters().get(
851
+ name=f'projects/{project_id}'
852
+ f'/locations/{location}'
853
+ f'/clusters/{cluster_name}').execute()
854
+ except ImportError:
855
+ # If the gcp module is not installed, return empty list.
856
+ # Remind the user once per day to install the gcp module for better
857
+ # pod scheduling with GKE autoscaler.
858
+ if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
859
+ logger.info(
860
+ 'Could not fetch autoscaler information from GKE. '
861
+ 'Run pip install "skypilot[gcp]" for more intelligent pod '
862
+ 'scheduling with GKE autoscaler.')
863
+ cls._pip_install_gcp_hint_last_sent = time.time()
864
+ return []
865
+ except gcp.http_error_exception() as e:
866
+ # Cluster information is not available.
867
+ # Return empty list.
868
+ logger.debug(f'{e.message}', exc_info=True)
869
+ return []
870
+
871
+ machine_types = []
872
+ # Get the list of machine types that are available in the cluster.
873
+ node_pools = cluster.get('nodePools', [])
874
+ for node_pool in node_pools:
875
+ name = node_pool.get('name', '')
876
+ logger.debug(f'Checking if node pool {name} '
877
+ 'has autoscaling enabled.')
878
+ autoscaling_enabled = (node_pool.get('autoscaling',
879
+ {}).get('enabled', False))
880
+ if autoscaling_enabled:
881
+ logger.debug(f'Node pool {name} has autoscaling enabled.')
882
+ try:
883
+ machine_type = node_pool.get('config',
884
+ {}).get('machineType', '')
885
+ if machine_type:
886
+ machine_types.append(machine_type)
887
+ except KeyError:
888
+ logger.debug(f'Encountered KeyError while checking machine '
889
+ f'type of node pool {name}.')
890
+ continue
891
+ return machine_types
892
+
759
893
  @classmethod
760
894
  def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
761
895
  """Validates the context name is in the format of
@@ -22,11 +22,13 @@ from sky import sky_logging
22
22
  from sky import skypilot_config
23
23
  from sky.adaptors import aws
24
24
  from sky.backends import backend_utils
25
+ from sky.jobs.server import utils as server_jobs_utils
25
26
  from sky.provision import common as provision_common
26
27
  from sky.provision import instance_setup
27
28
  from sky.provision import logging as provision_logging
28
29
  from sky.provision import metadata_utils
29
30
  from sky.skylet import constants
31
+ from sky.utils import common
30
32
  from sky.utils import common_utils
31
33
  from sky.utils import message_utils
32
34
  from sky.utils import resources_utils
@@ -502,6 +504,24 @@ def _post_provision_setup(
502
504
  logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
503
505
  f'Docker container is up.{colorama.Style.RESET_ALL}')
504
506
 
507
+ # Check version compatibility for jobs controller clusters
508
+ if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
509
+ # TODO(zeping): remove this in v0.12.0
510
+ # This only happens in upgrade from <0.9.3 to > 0.10.0
511
+ # After 0.10.0 no incompatibility issue
512
+ # See https://github.com/skypilot-org/skypilot/pull/6096
513
+ # For more details
514
+ status.update(
515
+ ux_utils.spinner_message(
516
+ 'Checking controller version compatibility'))
517
+ try:
518
+ server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
519
+ except exceptions.ClusterNotUpError:
520
+ # Controller is not up yet during initial provisioning, that
521
+ # also means no non-terminal jobs, so no incompatibility in
522
+ # this case.
523
+ pass
524
+
505
525
  # We mount the metadata with sky wheel for speedup.
506
526
  # NOTE: currently we mount all credentials for all nodes, because
507
527
  # (1) jobs controllers need permission to launch/down nodes of
sky/skylet/constants.py CHANGED
@@ -89,18 +89,13 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
89
89
  # cluster yaml is updated.
90
90
  #
91
91
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
92
- SKYLET_VERSION = '14'
92
+ SKYLET_VERSION = '15'
93
93
  # The version of the lib files that skylet/jobs use. Whenever there is an API
94
94
  # change for the job_lib or log_lib, we need to bump this version, so that the
95
95
  # user can be notified to update their SkyPilot version on the remote cluster.
96
96
  SKYLET_LIB_VERSION = 3
97
97
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
98
98
 
99
- # `sky jobs dashboard`-related
100
- #
101
- # Port on the remote jobs controller that the dashboard is running on.
102
- SPOT_DASHBOARD_REMOTE_PORT = 5000
103
-
104
99
  # Docker default options
105
100
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
106
101
  DEFAULT_DOCKER_PORT = 10022
sky/skylet/job_lib.py CHANGED
@@ -63,6 +63,7 @@ class JobInfoLoc(enum.IntEnum):
63
63
  RESOURCES = 8
64
64
  PID = 9
65
65
  LOG_PATH = 10
66
+ METADATA = 11
66
67
 
67
68
 
68
69
  def create_table(cursor, conn):
@@ -103,7 +104,8 @@ def create_table(cursor, conn):
103
104
  end_at FLOAT DEFAULT NULL,
104
105
  resources TEXT DEFAULT NULL,
105
106
  pid INTEGER DEFAULT -1,
106
- log_dir TEXT DEFAULT NULL)""")
107
+ log_dir TEXT DEFAULT NULL,
108
+ metadata TEXT DEFAULT '{}')""")
107
109
 
108
110
  cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
109
111
  job_id INTEGER,
@@ -118,6 +120,12 @@ def create_table(cursor, conn):
118
120
  'INTEGER DEFAULT -1')
119
121
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
120
122
  'TEXT DEFAULT NULL')
123
+ db_utils.add_column_to_table(cursor,
124
+ conn,
125
+ 'jobs',
126
+ 'metadata',
127
+ 'TEXT DEFAULT \'{}\'',
128
+ value_to_replace_existing_entries='{}')
121
129
  conn.commit()
122
130
 
123
131
 
@@ -338,16 +346,19 @@ def make_job_command_with_user_switching(username: str,
338
346
 
339
347
 
340
348
  @init_db
341
- def add_job(job_name: str, username: str, run_timestamp: str,
342
- resources_str: str) -> Tuple[int, str]:
349
+ def add_job(job_name: str,
350
+ username: str,
351
+ run_timestamp: str,
352
+ resources_str: str,
353
+ metadata: str = '{}') -> Tuple[int, str]:
343
354
  """Atomically reserve the next available job id for the user."""
344
355
  assert _DB is not None
345
356
  job_submitted_at = time.time()
346
357
  # job_id will autoincrement with the null value
347
358
  _DB.cursor.execute(
348
- 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null)',
359
+ 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
349
360
  (job_name, username, job_submitted_at, JobStatus.INIT.value,
350
- run_timestamp, None, resources_str))
361
+ run_timestamp, None, resources_str, metadata))
351
362
  _DB.conn.commit()
352
363
  rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
353
364
  (run_timestamp,))
@@ -569,6 +580,7 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
569
580
  'end_at': row[JobInfoLoc.END_AT.value],
570
581
  'resources': row[JobInfoLoc.RESOURCES.value],
571
582
  'pid': row[JobInfoLoc.PID.value],
583
+ 'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
572
584
  })
573
585
  return records
574
586
 
@@ -839,7 +851,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
839
851
  """
840
852
  job_table = log_utils.create_table([
841
853
  'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
842
- 'STATUS', 'LOG'
854
+ 'STATUS', 'LOG', 'GIT COMMIT'
843
855
  ])
844
856
  for job in jobs:
845
857
  job_table.add_row([
@@ -854,6 +866,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
854
866
  job['resources'],
855
867
  job['status'].colored_str(),
856
868
  job['log_path'],
869
+ job.get('metadata', {}).get('git_commit', '-'),
857
870
  ])
858
871
  return job_table
859
872
 
@@ -1055,7 +1068,7 @@ class JobLibCodeGen:
1055
1068
 
1056
1069
  @classmethod
1057
1070
  def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
1058
- resources_str: str) -> str:
1071
+ resources_str: str, metadata: str) -> str:
1059
1072
  if job_name is None:
1060
1073
  job_name = '-'
1061
1074
  code = [
@@ -1066,11 +1079,20 @@ class JobLibCodeGen:
1066
1079
  '\nif int(constants.SKYLET_VERSION) < 9: '
1067
1080
  'raise RuntimeError("SkyPilot runtime is too old, which does not '
1068
1081
  'support submitting jobs.")',
1069
- '\nresult = job_lib.add_job('
1082
+ '\nresult = None',
1083
+ '\nif int(constants.SKYLET_VERSION) < 15: '
1084
+ '\n result = job_lib.add_job('
1070
1085
  f'{job_name!r},'
1071
1086
  f'{username!r},'
1072
1087
  f'{run_timestamp!r},'
1073
1088
  f'{resources_str!r})',
1089
+ '\nelse: '
1090
+ '\n result = job_lib.add_job('
1091
+ f'{job_name!r},'
1092
+ f'{username!r},'
1093
+ f'{run_timestamp!r},'
1094
+ f'{resources_str!r},'
1095
+ f'metadata={metadata!r})',
1074
1096
  ('\nif isinstance(result, tuple):'
1075
1097
  '\n print("Job ID: " + str(result[0]), flush=True)'
1076
1098
  '\n print("Log Dir: " + str(result[1]), flush=True)'
sky/skypilot_config.py CHANGED
@@ -52,6 +52,7 @@ import contextlib
52
52
  import copy
53
53
  import json
54
54
  import os
55
+ import pathlib
55
56
  import tempfile
56
57
  import threading
57
58
  import typing
@@ -573,7 +574,8 @@ def _reload_config_as_server() -> None:
573
574
  with _DB_USE_LOCK:
574
575
  sqlalchemy_engine = sqlalchemy.create_engine(db_url,
575
576
  poolclass=NullPool)
576
- Base.metadata.create_all(bind=sqlalchemy_engine)
577
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
578
+ sqlalchemy_engine)
577
579
 
578
580
  def _get_config_yaml_from_db(
579
581
  key: str) -> Optional[config_utils.Config]:
@@ -847,7 +849,9 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
847
849
 
848
850
  global_config_path = _resolve_server_config_path()
849
851
  if global_config_path is None:
850
- global_config_path = get_user_config_path()
852
+ # Fallback to ~/.sky/config.yaml, and make sure it exists.
853
+ global_config_path = os.path.expanduser(get_user_config_path())
854
+ pathlib.Path(global_config_path).touch(exist_ok=True)
851
855
 
852
856
  db_updated = False
853
857
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
@@ -859,7 +863,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
859
863
  with _DB_USE_LOCK:
860
864
  sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
861
865
  poolclass=NullPool)
862
- Base.metadata.create_all(bind=sqlalchemy_engine)
866
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
867
+ sqlalchemy_engine)
863
868
 
864
869
  def _set_config_yaml_to_db(key: str,
865
870
  config: config_utils.Config):
sky/task.py CHANGED
@@ -255,6 +255,7 @@ class Task:
255
255
  # Internal use only.
256
256
  file_mounts_mapping: Optional[Dict[str, str]] = None,
257
257
  volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
258
+ metadata: Optional[Dict[str, Any]] = None,
258
259
  ):
259
260
  """Initializes a Task.
260
261
 
@@ -313,6 +314,7 @@ class Task:
313
314
  is used.) The base docker image that this Task will be built on.
314
315
  Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
315
316
  blocked_resources: A set of resources that this task cannot run on.
317
+ metadata: A dictionary of metadata to be added to the task.
316
318
  """
317
319
  self.name = name
318
320
  self.run = run
@@ -369,6 +371,8 @@ class Task:
369
371
  self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
370
372
  volume_mounts)
371
373
 
374
+ self._metadata = metadata if metadata is not None else {}
375
+
372
376
  dag = sky.dag.get_current_dag()
373
377
  if dag is not None:
374
378
  dag.add(self)
@@ -503,6 +507,8 @@ class Task:
503
507
  'Workdir must be a valid directory (or '
504
508
  f'a symlink to a directory). {user_workdir} not found.')
505
509
 
510
+ self._metadata['git_commit'] = common_utils.get_git_commit(self.workdir)
511
+
506
512
  @staticmethod
507
513
  def from_yaml_config(
508
514
  config: Dict[str, Any],
@@ -599,6 +605,7 @@ class Task:
599
605
  event_callback=config.pop('event_callback', None),
600
606
  file_mounts_mapping=config.pop('file_mounts_mapping', None),
601
607
  volumes=config.pop('volumes', None),
608
+ metadata=config.pop('_metadata', None),
602
609
  )
603
610
 
604
611
  # Create lists to store storage objects inlined in file_mounts.
@@ -872,6 +879,14 @@ class Task:
872
879
  f'num_nodes should be a positive int. Got: {num_nodes}')
873
880
  self._num_nodes = num_nodes
874
881
 
882
+ @property
883
+ def metadata(self) -> Dict[str, Any]:
884
+ return self._metadata
885
+
886
+ @property
887
+ def metadata_json(self) -> str:
888
+ return json.dumps(self._metadata)
889
+
875
890
  @property
876
891
  def envs(self) -> Dict[str, str]:
877
892
  return self._envs
@@ -1588,6 +1603,8 @@ class Task:
1588
1603
  volume_mount.to_yaml_config()
1589
1604
  for volume_mount in self.volume_mounts
1590
1605
  ]
1606
+ # we manually check if its empty to not clog up the generated yaml
1607
+ add_if_not_none('_metadata', self._metadata if self._metadata else None)
1591
1608
  return config
1592
1609
 
1593
1610
  def get_required_cloud_features(