skypilot-nightly 1.0.0.dev20250709__py3-none-any.whl → 1.0.0.dev20250711__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +6 -4
- sky/clouds/kubernetes.py +137 -23
- sky/core.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1871-3a0f047988be65cd.js +6 -0
- sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-60070a62f55486a6.js} +1 -1
- sky/dashboard/out/_next/static/css/6cbd41a88d2e9e1c.css +3 -0
- sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +10 -11
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +7 -0
- sky/jobs/server/core.py +2 -1
- sky/jobs/server/utils.py +81 -0
- sky/jobs/state.py +58 -40
- sky/jobs/utils.py +45 -6
- sky/provision/kubernetes/instance.py +17 -0
- sky/provision/kubernetes/utils.py +134 -0
- sky/provision/provisioner.py +20 -0
- sky/skylet/constants.py +1 -6
- sky/skylet/job_lib.py +30 -8
- sky/skypilot_config.py +8 -3
- sky/task.py +17 -0
- sky/templates/kubernetes-ray.yml.j2 +298 -10
- sky/users/permission.py +18 -1
- sky/users/token_service.py +25 -3
- sky/utils/common_utils.py +13 -0
- sky/utils/db_utils.py +16 -0
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +2 -4
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/RECORD +55 -54
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-e6e82dc8abb50c4f.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
- /sky/dashboard/out/_next/static/{EqELoF4IXcALfWVihInou → ldZFQWCiYX_vZnIfB_o8S}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250709.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED

@@ -30,6 +30,7 @@ from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
+from sky.server import common as server_common
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib

@@ -38,6 +39,7 @@ from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import controller_utils
+from sky.utils import env_options
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils

@@ -128,9 +130,15 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
         time.sleep(backoff.current_backoff())


-def 
+def _validate_consolidation_mode_config(
         current_is_consolidation_mode: bool) -> None:
-    """
+    """Validate the consolidation mode config."""
+    if (current_is_consolidation_mode and
+            not env_options.Options.IS_DEVELOPER.get() and
+            server_common.is_api_server_local()):
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.NotSupportedError(
+                'Consolidation mode is not supported when running locally.')
     # Check whether the consolidation mode config is changed.
     if current_is_consolidation_mode:
         controller_cn = (

@@ -176,7 +184,7 @@ def _check_consolidation_mode_consistency(
 def is_consolidation_mode() -> bool:
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
-
+    _validate_consolidation_mode_config(consolidation_mode)
     return consolidation_mode


@@ -1249,7 +1257,14 @@ def format_job_table(
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
-        columns += [
+        columns += [
+            'STARTED',
+            'INFRA',
+            'RESOURCES',
+            'SCHED. STATE',
+            'DETAILS',
+            'GIT_COMMIT',
+        ]
     if tasks_have_k8s_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)

@@ -1362,6 +1377,7 @@ def format_job_table(
             '-',
             job_tasks[0]['schedule_state'],
             generate_details(details, failure_reason),
+            job_tasks[0].get('metadata', {}).get('git_commit', '-'),
         ])
         if tasks_have_k8s_user:
             job_values.insert(0, job_tasks[0].get('user', '-'))

@@ -1427,6 +1443,8 @@ def format_job_table(
                 generate_details(task.get('details'),
                                  task['failure_reason']),
             ])
+
+            values.append(task.get('metadata', {}).get('git_commit', '-'))
             if tasks_have_k8s_user:
                 values.insert(0, task.get('user', '-'))
             job_table.add_row(values)

@@ -1511,6 +1529,22 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)

+    @classmethod
+    def get_version_and_job_table(cls) -> str:
+        """Generate code to get controller version and raw job table."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+
+        # Get and print raw job table (load_managed_job_queue can parse this directly)
+        job_table = utils.dump_managed_job_queue()
+        print(job_table, flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\

@@ -1565,8 +1599,13 @@ class ManagedJobCodeGen:
             resources_str = backend_utils.get_task_resources_str(
                 task, is_managed_job=True)
             code += textwrap.dedent(f"""\
-
-
+                if managed_job_version < 7:
+                    managed_job_state.set_pending({job_id}, {task_id},
+                                      {task.name!r}, {resources_str!r})
+                else:
+                    managed_job_state.set_pending({job_id}, {task_id},
+                                      {task.name!r}, {resources_str!r},
+                                      {task.metadata_json!r})
                 """)
         return cls._build(code)
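The new ManagedJobCodeGen.get_version_and_job_table emits code that runs on the jobs controller and prints a "controller_version:<N>" line followed by the raw managed-job table. A caller can split those two parts with a simple prefix check; the sketch below is illustrative only (parse_controller_output is a hypothetical helper, not SkyPilot's actual parser), assuming the exact output format shown in the generated code above.

# Minimal sketch of splitting the output of the generated code above into the
# controller's skylet version and the raw job-table payload.
from typing import Tuple


def parse_controller_output(output: str) -> Tuple[int, str]:
    """Return (controller_version, raw job table payload)."""
    version = -1
    payload_lines = []
    for line in output.splitlines():
        if line.startswith('controller_version:'):
            version = int(line.split(':', 1)[1])
        else:
            payload_lines.append(line)
    return version, '\n'.join(payload_lines)


ver, table = parse_controller_output('controller_version:15\n<raw job table>')
assert ver == 15 and table == '<raw job table>'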
sky/provision/kubernetes/instance.py
CHANGED

@@ -825,6 +825,23 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             return
         pod_spec_copy['metadata']['name'] = pod_name
         pod_spec_copy['metadata']['labels']['component'] = pod_name
+
+        # We need to keep the following fields in the pod spec to be same for
+        # head and worker pods.
+        # So that Kueue can merge them into a single PodSet when creating
+        # ProvisioningRequest to trigger scale up of the cluster autoscaler,
+        # this is especially required for DWS queued provisioning mode in GKE.
+        # spec.containers[*].resources.requests
+        # spec.initContainers[*].resources.requests
+        # spec.resources
+        # spec.nodeSelector
+        # spec.tolerations
+        # spec.affinity
+        # resourceClaims
+        # Refer to the following links for more details:
+        # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
+        # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
+        if config.count > 1:
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
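The comment block added above lists the pod-spec fields that must stay identical across head and worker pods so that Kueue's ProvisioningRequest admission check can merge them into a single PodSet. Below is a hedged sketch of the kind of normalization this implies; the helper name, dict-based pod specs, and field list handling are illustrative, not SkyPilot's actual implementation.

import copy
from typing import Any, Dict

# Fields called out in the diff that Kueue compares when merging pods into a
# single PodSet (see the GKE ProvisioningRequest / Kueue podset-merge-policy
# links above).
_PODSET_MERGE_FIELDS = ('nodeSelector', 'tolerations', 'affinity',
                        'resources', 'resourceClaims')


def align_worker_spec(head_spec: Dict[str, Any],
                      worker_spec: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative only: make the Kueue-relevant fields match the head pod."""
    aligned = copy.deepcopy(worker_spec)
    for field in _PODSET_MERGE_FIELDS:
        if field in head_spec:
            aligned[field] = copy.deepcopy(head_spec[field])
        else:
            aligned.pop(field, None)
    # Container-level resource requests must match as well.
    for kind in ('containers', 'initContainers'):
        for head_c, worker_c in zip(head_spec.get(kind, []),
                                    aligned.get(kind, [])):
            worker_c['resources'] = copy.deepcopy(head_c.get('resources', {}))
    return aligned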
sky/provision/kubernetes/utils.py
CHANGED

@@ -1,5 +1,6 @@
 """Kubernetes utilities for SkyPilot."""
 import dataclasses
+import enum
 import functools
 import hashlib
 import json

@@ -57,6 +58,69 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'

+
+class KubernetesHighPerformanceNetworkType(enum.Enum):
+    """Enum for different Kubernetes cluster types with high performance
+    network configurations.
+
+    This enum defines cluster types that support optimized networking for
+    distributed ML workloads:
+    - GCP_TCPX: GKE clusters with GPUDirect-TCPX support
+      (A3 High instances: a3-highgpu-8g)
+    - GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
+      (A3 Mega instances: a3-megagpu-8g)
+    - GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
+      (A4/A3 Ultra instances)
+    - NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
+      low-latency networking
+    - NONE: Standard clusters without specialized networking optimizations
+
+    The network configurations align with corresponding VM-based
+    implementations:
+    - GCP settings match
+      sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
+    - Nebius settings match the InfiniBand configuration used in Nebius VMs
+    """
+
+    GCP_TCPX = 'gcp_tcpx'
+    GCP_TCPXO = 'gcp_tcpxo'
+    GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
+    NEBIUS = 'nebius'
+    NONE = 'none'
+
+    def get_network_env_vars(self) -> Dict[str, str]:
+        """Get network environment variables for this cluster type."""
+        if self == KubernetesHighPerformanceNetworkType.NEBIUS:
+            # Nebius cluster with InfiniBand - use InfiniBand optimizations
+            return {
+                'NCCL_IB_HCA': 'mlx5',
+                'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
+                                    'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
+            }
+        else:
+            # GCP clusters and generic clusters - environment variables are
+            # handled directly in the template
+            return {}
+
+    def supports_high_performance_networking(self) -> bool:
+        """Check if this cluster type supports high performance networking."""
+        return self is not KubernetesHighPerformanceNetworkType.NONE
+
+    def supports_gpu_direct(self) -> bool:
+        """Check if this cluster type supports GPUDirect networking."""
+        return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
+                        KubernetesHighPerformanceNetworkType.GCP_TCPXO,
+                        KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
+
+    def requires_ipc_lock_capability(self) -> bool:
+        """Check if this cluster type requires IPC_LOCK capability."""
+        return self.supports_high_performance_networking()
+
+    def requires_tcpxo_daemon(self) -> bool:
+        """Check if this cluster type requires TCPXO daemon."""
+        return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
+
+
 # TODO(romilb): Move constants to constants.py
 DEFAULT_NAMESPACE = 'default'

@@ -485,6 +549,8 @@ class GKELabelFormatter(GPULabelFormatter):
                 # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
                 # to distinguish between a3-high and a3-mega instances
                 return 'H100'
+            elif acc == 'H200-141GB':
+                return 'H200'
             return acc
         elif is_tpu_on_gke(value):
             return value

@@ -756,6 +822,74 @@ class GKEAutoscaler(Autoscaler):
                 return True
         return False

+    @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
+    def get_available_machine_types(cls, context: str) -> List[str]:
+        """Returns the list of machine types that are available in the cluster.
+        """
+        # Assume context naming convention of
+        # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        valid, project_id, location, cluster_name = cls._validate_context_name(
+            context)
+        if not valid:
+            # Context name is not in the format of
+            # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
+            # Cannot determine if the context can autoscale.
+            # Return empty list.
+            logger.debug(f'Context {context} is not in the format of '
+                         f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
+                         'Returning empty machine type list.')
+            return []
+        try:
+            logger.debug(
+                f'Attempting to get information about cluster {cluster_name}')
+            container_service = gcp.build('container',
+                                          'v1',
+                                          credentials=None,
+                                          cache_discovery=False)
+            cluster = container_service.projects().locations().clusters().get(
+                name=f'projects/{project_id}'
+                f'/locations/{location}'
+                f'/clusters/{cluster_name}').execute()
+        except ImportError:
+            # If the gcp module is not installed, return empty list.
+            # Remind the user once per day to install the gcp module for better
+            # pod scheduling with GKE autoscaler.
+            if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
+                logger.info(
+                    'Could not fetch autoscaler information from GKE. '
+                    'Run pip install "skypilot[gcp]" for more intelligent pod '
+                    'scheduling with GKE autoscaler.')
+                cls._pip_install_gcp_hint_last_sent = time.time()
+            return []
+        except gcp.http_error_exception() as e:
+            # Cluster information is not available.
+            # Return empty list.
+            logger.debug(f'{e.message}', exc_info=True)
+            return []
+
+        machine_types = []
+        # Get the list of machine types that are available in the cluster.
+        node_pools = cluster.get('nodePools', [])
+        for node_pool in node_pools:
+            name = node_pool.get('name', '')
+            logger.debug(f'Checking if node pool {name} '
+                         'has autoscaling enabled.')
+            autoscaling_enabled = (node_pool.get('autoscaling',
+                                                 {}).get('enabled', False))
+            if autoscaling_enabled:
+                logger.debug(f'Node pool {name} has autoscaling enabled.')
+                try:
+                    machine_type = node_pool.get('config',
+                                                 {}).get('machineType', '')
+                    if machine_type:
+                        machine_types.append(machine_type)
+                except KeyError:
+                    logger.debug(f'Encountered KeyError while checking machine '
+                                 f'type of node pool {name}.')
+                    continue
+        return machine_types
+
     @classmethod
     def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
         """Validates the context name is in the format of
sky/provision/provisioner.py
CHANGED

@@ -22,11 +22,13 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import aws
 from sky.backends import backend_utils
+from sky.jobs.server import utils as server_jobs_utils
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
 from sky.skylet import constants
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils

@@ -502,6 +504,24 @@ def _post_provision_setup(
         logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
                     f'Docker container is up.{colorama.Style.RESET_ALL}')

+    # Check version compatibility for jobs controller clusters
+    if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
+        # TODO(zeping): remove this in v0.12.0
+        # This only happens in upgrade from <0.9.3 to > 0.10.0
+        # After 0.10.0 no incompatibility issue
+        # See https://github.com/skypilot-org/skypilot/pull/6096
+        # For more details
+        status.update(
+            ux_utils.spinner_message(
+                'Checking controller version compatibility'))
+        try:
+            server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
+        except exceptions.ClusterNotUpError:
+            # Controller is not up yet during initial provisioning, that
+            # also means no non-terminal jobs, so no incompatibility in
+            # this case.
+            pass
+
     # We mount the metadata with sky wheel for speedup.
     # NOTE: currently we mount all credentials for all nodes, because
     # (1) jobs controllers need permission to launch/down nodes of
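The new sky/jobs/server/utils.py (+81 lines) backing check_version_mismatch_and_non_terminal_jobs is not shown in this diff, so the following is only a guess at the shape of such a compatibility check, not the real implementation: compare the controller's skylet version against the local one and refuse to proceed while non-terminal jobs exist.

# Hedged sketch of a version-compatibility check along these lines; names and
# error text are illustrative, not SkyPilot's actual code.
from sky.skylet import constants


def sketch_version_check(controller_version: int,
                         num_non_terminal_jobs: int) -> None:
    local_version = int(constants.SKYLET_VERSION)
    if controller_version != local_version and num_non_terminal_jobs > 0:
        raise RuntimeError(
            f'Jobs controller runs skylet version {controller_version}, but '
            f'this client expects {local_version}; wait for the '
            f'{num_non_terminal_jobs} non-terminal job(s) to finish before '
            'upgrading the controller.')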
sky/skylet/constants.py
CHANGED

@@ -89,18 +89,13 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '15'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
 SKYLET_LIB_VERSION = 3
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'

-# `sky jobs dashboard`-related
-#
-# Port on the remote jobs controller that the dashboard is running on.
-SPOT_DASHBOARD_REMOTE_PORT = 5000
-
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
 DEFAULT_DOCKER_PORT = 10022
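Bumping SKYLET_VERSION to '15' is what lets client-side code generators branch on the remote runtime's capabilities, as the job_lib codegen change later in this diff does. A tiny illustration of that gating pattern (the snippet itself is illustrative; only the version value comes from the hunk above):

# Generated code runs on the remote cluster and branches on the remote
# constants.SKYLET_VERSION, so older runtimes never see arguments they do not
# understand (here: the new job `metadata` argument).
from sky.skylet import constants

REMOTE_SUPPORTS_JOB_METADATA = int(constants.SKYLET_VERSION) >= 15
print(REMOTE_SUPPORTS_JOB_METADATA)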
sky/skylet/job_lib.py
CHANGED

@@ -63,6 +63,7 @@ class JobInfoLoc(enum.IntEnum):
     RESOURCES = 8
     PID = 9
     LOG_PATH = 10
+    METADATA = 11


 def create_table(cursor, conn):

@@ -103,7 +104,8 @@ def create_table(cursor, conn):
         end_at FLOAT DEFAULT NULL,
         resources TEXT DEFAULT NULL,
         pid INTEGER DEFAULT -1,
-        log_dir TEXT DEFAULT NULL
+        log_dir TEXT DEFAULT NULL,
+        metadata TEXT DEFAULT '{}')""")

     cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
         job_id INTEGER,

@@ -118,6 +120,12 @@ def create_table(cursor, conn):
                                  'INTEGER DEFAULT -1')
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
                                  'TEXT DEFAULT NULL')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'jobs',
+                                 'metadata',
+                                 'TEXT DEFAULT \'{}\'',
+                                 value_to_replace_existing_entries='{}')
     conn.commit()


@@ -338,16 +346,19 @@ def make_job_command_with_user_switching(username: str,


 @init_db
-def add_job(job_name: str,
-
+def add_job(job_name: str,
+            username: str,
+            run_timestamp: str,
+            resources_str: str,
+            metadata: str = '{}') -> Tuple[int, str]:
     """Atomically reserve the next available job id for the user."""
     assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
     _DB.cursor.execute(
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null)',
+        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
         (job_name, username, job_submitted_at, JobStatus.INIT.value,
-         run_timestamp, None, resources_str))
+         run_timestamp, None, resources_str, metadata))
     _DB.conn.commit()
     rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
                               (run_timestamp,))

@@ -569,6 +580,7 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
             'end_at': row[JobInfoLoc.END_AT.value],
             'resources': row[JobInfoLoc.RESOURCES.value],
             'pid': row[JobInfoLoc.PID.value],
+            'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
         })
     return records

@@ -839,7 +851,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
     """
     job_table = log_utils.create_table([
         'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG'
+        'STATUS', 'LOG', 'GIT COMMIT'
     ])
     for job in jobs:
         job_table.add_row([

@@ -854,6 +866,7 @@ def format_job_queue(jobs: List[Dict[str, Any]]):
             job['resources'],
             job['status'].colored_str(),
             job['log_path'],
+            job.get('metadata', {}).get('git_commit', '-'),
         ])
     return job_table

@@ -1055,7 +1068,7 @@ class JobLibCodeGen:

     @classmethod
     def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
-                resources_str: str) -> str:
+                resources_str: str, metadata: str) -> str:
         if job_name is None:
             job_name = '-'
         code = [

@@ -1066,11 +1079,20 @@ class JobLibCodeGen:
             '\nif int(constants.SKYLET_VERSION) < 9: '
            'raise RuntimeError("SkyPilot runtime is too old, which does not '
            'support submitting jobs.")',
-            '\nresult =
+            '\nresult = None',
+            '\nif int(constants.SKYLET_VERSION) < 15: '
+            '\n result = job_lib.add_job('
            f'{job_name!r},'
            f'{username!r},'
            f'{run_timestamp!r},'
            f'{resources_str!r})',
+            '\nelse: '
+            '\n result = job_lib.add_job('
+            f'{job_name!r},'
+            f'{username!r},'
+            f'{run_timestamp!r},'
+            f'{resources_str!r},'
+            f'metadata={metadata!r})',
             ('\nif isinstance(result, tuple):'
             '\n print("Job ID: " + str(result[0]), flush=True)'
             '\n print("Log Dir: " + str(result[1]), flush=True)'
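The jobs table gains a metadata TEXT column (JSON, default '{}'), migrated in place with db_utils.add_column_to_table, and add_job now accepts a serialized metadata string. A small usage sketch grounded in the signatures shown above; the job name, username, run timestamp, and resources string are made up, and in practice this call is issued on the cluster by the generated code, not by the client directly.

import json

from sky.skylet import job_lib

# add_job() now takes a JSON-serialized metadata string (default '{}').
metadata = json.dumps({'git_commit': 'abc1234'})
job_id, log_dir = job_lib.add_job('train', 'alice',
                                  'sky-2025-07-11-00-00-00-000000',
                                  '1x[CPU:1]', metadata=metadata)
print(job_id, log_dir)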
sky/skypilot_config.py
CHANGED

@@ -52,6 +52,7 @@ import contextlib
 import copy
 import json
 import os
+import pathlib
 import tempfile
 import threading
 import typing

@@ -573,7 +574,8 @@ def _reload_config_as_server() -> None:
         with _DB_USE_LOCK:
             sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                          poolclass=NullPool)
-            Base.metadata
+            db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                                 sqlalchemy_engine)

 def _get_config_yaml_from_db(
         key: str) -> Optional[config_utils.Config]:

@@ -847,7 +849,9 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:

     global_config_path = _resolve_server_config_path()
     if global_config_path is None:
-
+        # Fallback to ~/.sky/config.yaml, and make sure it exists.
+        global_config_path = os.path.expanduser(get_user_config_path())
+        pathlib.Path(global_config_path).touch(exist_ok=True)

     db_updated = False
     if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:

@@ -859,7 +863,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
         with _DB_USE_LOCK:
             sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                          poolclass=NullPool)
-            Base.metadata
+            db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                                 sqlalchemy_engine)

 def _set_config_yaml_to_db(key: str,
                            config: config_utils.Config):
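Both engine setups now call db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine) instead of the previous inline call. sky/utils/db_utils.py gains 16 lines in this release but they are not shown here, so the helper below is only a guess at the shape of such a function (create each mapped table if it does not already exist), not the real implementation.

# Hedged sketch; the real db_utils.add_tables_to_db_sqlalchemy() is added in
# this release but its body is not part of this diff.
import sqlalchemy
from sqlalchemy import exc as sqlalchemy_exc


def add_tables_to_db_sqlalchemy(metadata: sqlalchemy.MetaData,
                                engine: sqlalchemy.engine.Engine) -> None:
    """Create every table registered on `metadata`, tolerating tables that
    another API server process may have created concurrently."""
    for table in metadata.tables.values():
        try:
            table.create(bind=engine, checkfirst=True)
        except sqlalchemy_exc.OperationalError:
            # Table already created by a concurrent process; ignore.
            pass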
sky/task.py
CHANGED

@@ -255,6 +255,7 @@ class Task:
         # Internal use only.
         file_mounts_mapping: Optional[Dict[str, str]] = None,
         volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
     ):
         """Initializes a Task.

@@ -313,6 +314,7 @@ class Task:
             is used.) The base docker image that this Task will be built on.
             Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
           blocked_resources: A set of resources that this task cannot run on.
+          metadata: A dictionary of metadata to be added to the task.
         """
         self.name = name
         self.run = run

@@ -369,6 +371,8 @@ class Task:
         self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
             volume_mounts)

+        self._metadata = metadata if metadata is not None else {}
+
         dag = sky.dag.get_current_dag()
         if dag is not None:
             dag.add(self)

@@ -503,6 +507,8 @@ class Task:
                     'Workdir must be a valid directory (or '
                     f'a symlink to a directory). {user_workdir} not found.')

+        self._metadata['git_commit'] = common_utils.get_git_commit(self.workdir)
+
     @staticmethod
     def from_yaml_config(
             config: Dict[str, Any],

@@ -599,6 +605,7 @@ class Task:
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
             volumes=config.pop('volumes', None),
+            metadata=config.pop('_metadata', None),
         )

         # Create lists to store storage objects inlined in file_mounts.

@@ -872,6 +879,14 @@ class Task:
                 f'num_nodes should be a positive int. Got: {num_nodes}')
         self._num_nodes = num_nodes

+    @property
+    def metadata(self) -> Dict[str, Any]:
+        return self._metadata
+
+    @property
+    def metadata_json(self) -> str:
+        return json.dumps(self._metadata)
+
     @property
     def envs(self) -> Dict[str, str]:
         return self._envs

@@ -1588,6 +1603,8 @@ class Task:
                 volume_mount.to_yaml_config()
                 for volume_mount in self.volume_mounts
             ]
+        # we manually check if its empty to not clog up the generated yaml
+        add_if_not_none('_metadata', self._metadata if self._metadata else None)
         return config

     def get_required_cloud_features(