skypilot-nightly 1.0.0.dev20251001__py3-none-any.whl → 1.0.0.dev20251003__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -109
  3. sky/client/cli/command.py +2 -3
  4. sky/client/cli/table_utils.py +222 -1
  5. sky/clouds/cudo.py +1 -1
  6. sky/clouds/kubernetes.py +7 -19
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{webpack-4f0c389a4ce5fd9c.js → webpack-3286453d56f3c0a0.js} +1 -1
  13. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  14. sky/dashboard/out/clusters/[cluster].html +1 -1
  15. sky/dashboard/out/clusters.html +1 -1
  16. sky/dashboard/out/config.html +1 -1
  17. sky/dashboard/out/index.html +1 -1
  18. sky/dashboard/out/infra/[context].html +1 -1
  19. sky/dashboard/out/infra.html +1 -1
  20. sky/dashboard/out/jobs/[job].html +1 -1
  21. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  22. sky/dashboard/out/jobs.html +1 -1
  23. sky/dashboard/out/users.html +1 -1
  24. sky/dashboard/out/volumes.html +1 -1
  25. sky/dashboard/out/workspace/new.html +1 -1
  26. sky/dashboard/out/workspaces/[name].html +1 -1
  27. sky/dashboard/out/workspaces.html +1 -1
  28. sky/data/storage_utils.py +9 -0
  29. sky/global_user_state.py +16 -0
  30. sky/jobs/server/core.py +60 -53
  31. sky/jobs/state.py +21 -1
  32. sky/jobs/utils.py +29 -11
  33. sky/provision/kubernetes/config.py +0 -42
  34. sky/provision/kubernetes/instance.py +1 -33
  35. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  36. sky/provision/kubernetes/network_utils.py +0 -21
  37. sky/provision/kubernetes/utils.py +68 -322
  38. sky/schemas/api/responses.py +21 -0
  39. sky/server/requests/serializers/decoders.py +8 -0
  40. sky/server/requests/serializers/encoders.py +6 -0
  41. sky/templates/kubernetes-ray.yml.j2 +4 -13
  42. sky/utils/env_options.py +4 -0
  43. sky/utils/kubernetes_enums.py +2 -15
  44. sky/utils/schemas.py +17 -6
  45. sky/volumes/client/sdk.py +3 -2
  46. sky/volumes/server/core.py +3 -2
  47. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/METADATA +37 -37
  48. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/RECORD +53 -56
  49. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
  50. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
  51. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  52. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  53. sky/volumes/utils.py +0 -224
  54. sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ssh-jump.yml.j2 DELETED
@@ -1,94 +0,0 @@
1
- pod_spec:
2
- apiVersion: v1
3
- kind: Pod
4
- metadata:
5
- name: {{ name }}
6
- labels:
7
- component: {{ name }}
8
- parent: skypilot
9
- spec:
10
- serviceAccountName: sky-ssh-jump-sa
11
- volumes:
12
- - name: secret-volume
13
- secret:
14
- secretName: {{ secret }}
15
- containers:
16
- - name: {{ name }}
17
- imagePullPolicy: Always
18
- image: {{ image }}
19
- command: ["python3", "-u", "/skypilot/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py"]
20
- ports:
21
- - containerPort: 22
22
- volumeMounts:
23
- - name: secret-volume
24
- readOnly: true
25
- mountPath: /etc/secret-volume
26
- lifecycle:
27
- postStart:
28
- exec:
29
- command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys && sudo service ssh restart"]
30
- env:
31
- - name: MY_POD_NAME
32
- valueFrom:
33
- fieldRef:
34
- fieldPath: metadata.name
35
- - name: MY_POD_NAMESPACE
36
- valueFrom:
37
- fieldRef:
38
- fieldPath: metadata.namespace
39
- - name: ALERT_THRESHOLD
40
- # seconds
41
- value: "600"
42
- - name: RETRY_INTERVAL
43
- # seconds
44
- value: "60"
45
- terminationGracePeriodSeconds: 0
46
- service_spec:
47
- apiVersion: v1
48
- kind: Service
49
- metadata:
50
- name: {{ name }}
51
- labels:
52
- parent: skypilot
53
- spec:
54
- type: {{ service_type }}
55
- selector:
56
- component: {{ name }}
57
- ports:
58
- - protocol: TCP
59
- port: 22
60
- targetPort: 22
61
- # The following ServiceAccount/Role/RoleBinding sets up an RBAC for life cycle
62
- # management of the jump pod/service
63
- service_account:
64
- apiVersion: v1
65
- kind: ServiceAccount
66
- metadata:
67
- name: sky-ssh-jump-sa
68
- labels:
69
- parent: skypilot
70
- role:
71
- kind: Role
72
- apiVersion: rbac.authorization.k8s.io/v1
73
- metadata:
74
- name: sky-ssh-jump-role
75
- labels:
76
- parent: skypilot
77
- rules:
78
- - apiGroups: [""]
79
- resources: ["pods", "pods/status", "pods/exec", "services"]
80
- verbs: ["get", "list", "create", "delete"]
81
- role_binding:
82
- apiVersion: rbac.authorization.k8s.io/v1
83
- kind: RoleBinding
84
- metadata:
85
- name: sky-ssh-jump-rb
86
- labels:
87
- parent: skypilot
88
- subjects:
89
- - kind: ServiceAccount
90
- name: sky-ssh-jump-sa
91
- roleRef:
92
- kind: Role
93
- name: sky-ssh-jump-role
94
- apiGroup: rbac.authorization.k8s.io
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py DELETED
@@ -1,191 +0,0 @@
1
- """Manages lifecycle of ssh jump pod.
2
-
3
- This script runs inside ssh jump pod as the main process (PID 1).
4
-
5
- It terminates itself (by removing ssh jump service and pod via a call to
6
- kubeapi) if it does not see ray pods in the duration of 10 minutes. If the
7
- user re-launches a task before the duration is over, then ssh jump pod is being
8
- reused and will terminate itself when it sees that no ray clusters exist in
9
- that duration.
10
-
11
- To allow multiple users to the share the same SSH jump pod,
12
- this script also reloads SSH keys from the mounted secret volume on an
13
- interval and updates `~/.ssh/authorized_keys`.
14
- """
15
- import datetime
16
- import os
17
- import subprocess
18
- import sys
19
- import threading
20
- import time
21
-
22
- from kubernetes import client
23
- from kubernetes import config
24
-
25
- # Load kube config
26
- config.load_incluster_config()
27
-
28
- v1 = client.CoreV1Api()
29
-
30
- current_name = os.getenv('MY_POD_NAME')
31
- current_namespace = os.getenv('MY_POD_NAMESPACE')
32
-
33
- # The amount of time in seconds where no Ray pods exist in which after that time
34
- # ssh jump pod terminates itself
35
- alert_threshold = int(os.getenv('ALERT_THRESHOLD', '600'))
36
- # The amount of time in seconds to wait between Ray pods existence checks
37
- retry_interval = int(os.getenv('RETRY_INTERVAL', '60'))
38
- # The amount of time in seconds to wait between SSH key reloads
39
- reload_interval = int(os.getenv('RELOAD_INTERVAL', '5'))
40
-
41
- # Ray pods are labeled with this value i.e., ssh jump name which is unique per
42
- # user (based on user hash)
43
- label_selector = f'skypilot-ssh-jump={current_name}'
44
-
45
-
46
- def poll(interval, leading=True):
47
- """Decorator factory for polling function. To stop polling, return True.
48
-
49
- Args:
50
- interval (int): The amount of time to wait between function calls.
51
- leading (bool): Whether to wait before (rather than after) calls.
52
- """
53
-
54
- def decorator(func):
55
-
56
- def wrapper(*args, **kwargs):
57
- while True:
58
- if leading:
59
- time.sleep(interval)
60
- done = func(*args, **kwargs)
61
- if done:
62
- return
63
- if not leading:
64
- time.sleep(interval)
65
-
66
- return wrapper
67
-
68
- return decorator
69
-
70
-
71
- # Flag to terminate the reload keys thread when the lifecycle thread
72
- # terminates.
73
- terminated = False
74
-
75
-
76
- @poll(interval=reload_interval, leading=False)
77
- def reload_keys():
78
- """Reloads SSH keys from mounted secret volume."""
79
-
80
- if terminated:
81
- sys.stdout.write('[SSH Key Reloader] Terminated.\n')
82
- return True
83
-
84
- # Reload SSH keys from mounted secret volume if changed.
85
- tmpfile = '/tmp/sky-ssh-keys'
86
- try:
87
- subprocess.check_output(
88
- f'cat /etc/secret-volume/ssh-publickey* > {tmpfile}', shell=True)
89
- try:
90
- subprocess.check_output(f'diff {tmpfile} ~/.ssh/authorized_keys',
91
- shell=True)
92
- sys.stdout.write(
93
- '[SSH Key Reloader] No keys changed, continuing.\n')
94
- except subprocess.CalledProcessError as e:
95
- if e.returncode == 1:
96
- sys.stdout.write(
97
- '[SSH Key Reloader] Changes detected, reloading.\n')
98
- subprocess.check_output(f'mv {tmpfile} ~/.ssh/authorized_keys',
99
- shell=True)
100
- else:
101
- raise
102
- except Exception as e:
103
- sys.stdout.write(
104
- f'[SSH Key Reloader][ERROR] Failed to reload SSH keys: {e}\n')
105
- raise
106
-
107
-
108
- alert_delta = datetime.timedelta(seconds=alert_threshold)
109
- retry_interval_delta = datetime.timedelta(seconds=retry_interval)
110
- # Accumulated time of where no SkyPilot cluster exists. Compared
111
- # against alert_threshold.
112
- nocluster_delta = datetime.timedelta()
113
-
114
-
115
- @poll(interval=retry_interval)
116
- def manage_lifecycle():
117
- """Manages lifecycle of ssh jump pod."""
118
-
119
- global terminated, nocluster_delta
120
-
121
- try:
122
- ret = v1.list_namespaced_pod(current_namespace,
123
- label_selector=label_selector)
124
- except Exception as e:
125
- sys.stdout.write('[Lifecycle] [ERROR] listing pods failed with '
126
- f'error: {e}\n')
127
- raise
128
-
129
- if not ret.items:
130
- sys.stdout.write(
131
- f'[Lifecycle] Did not find pods with label '
132
- f'"{label_selector}" in namespace {current_namespace}\n')
133
- nocluster_delta = nocluster_delta + retry_interval_delta
134
- sys.stdout.write(
135
- f'[Lifecycle] Time since no pods found: {nocluster_delta}, alert '
136
- f'threshold: {alert_delta}\n')
137
- else:
138
- sys.stdout.write(
139
- f'[Lifecycle] Found pods with label "{label_selector}" in '
140
- f'namespace {current_namespace}\n')
141
- # reset ..
142
- nocluster_delta = datetime.timedelta()
143
- sys.stdout.write(
144
- f'[Lifecycle] nocluster_delta is reset: {nocluster_delta}\n')
145
-
146
- if nocluster_delta >= alert_delta:
147
- sys.stdout.write(
148
- f'[Lifecycle] nocluster_delta: {nocluster_delta} crossed alert '
149
- f'threshold: {alert_delta}. Time to terminate myself and my '
150
- 'service.\n')
151
- try:
152
- # ssh jump resources created under same name
153
- v1.delete_namespaced_service(current_name, current_namespace)
154
- v1.delete_namespaced_pod(current_name, current_namespace)
155
- except Exception as e:
156
- sys.stdout.write('[Lifecycle][ERROR] Deletion failed. Exiting '
157
- f'poll() with error: {e}\n')
158
- raise
159
-
160
- terminated = True
161
- return True
162
-
163
-
164
- def main():
165
- sys.stdout.write('SkyPilot SSH Jump Pod Lifecycle Manager\n')
166
- sys.stdout.write(f'current_name: {current_name}\n')
167
- sys.stdout.write(f'current_namespace: {current_namespace}\n')
168
- sys.stdout.write(f'alert_threshold time: {alert_threshold}\n')
169
- sys.stdout.write(f'retry_interval time: {retry_interval}\n')
170
- sys.stdout.write(f'reload_interval time: {reload_interval}\n')
171
- sys.stdout.write(f'label_selector: {label_selector}\n')
172
-
173
- if not current_name or not current_namespace:
174
- # Raise Exception with message to terminate pod
175
- raise Exception('Missing environment variables MY_POD_NAME or '
176
- 'MY_POD_NAMESPACE')
177
-
178
- threads = [
179
- threading.Thread(target=manage_lifecycle),
180
- threading.Thread(target=reload_keys)
181
- ]
182
- sys.stdout.write(f'Polling with {len(threads)} threads.\n')
183
- for t in threads:
184
- t.start()
185
- for t in threads:
186
- t.join()
187
- sys.stdout.write('Done.\n')
188
-
189
-
190
- if __name__ == '__main__':
191
- main()
sky/volumes/utils.py DELETED
@@ -1,224 +0,0 @@
1
- """Volume utils."""
2
- import abc
3
- from datetime import datetime
4
- from typing import Any, Dict, List, Optional
5
-
6
- import prettytable
7
-
8
- from sky import sky_logging
9
- from sky.skylet import constants
10
- from sky.utils import common_utils
11
- from sky.utils import log_utils
12
- from sky.utils import volume
13
-
14
- logger = sky_logging.init_logger(__name__)
15
-
16
- _BASIC_COLUMNS = [
17
- 'NAME',
18
- 'TYPE',
19
- 'INFRA',
20
- 'SIZE',
21
- 'USER',
22
- 'WORKSPACE',
23
- 'AGE',
24
- 'STATUS',
25
- 'LAST_USE',
26
- 'USED_BY',
27
- ]
28
-
29
-
30
- def _get_infra_str(cloud: Optional[str], region: Optional[str],
31
- zone: Optional[str]) -> str:
32
- """Get the infrastructure string for the volume."""
33
- infra = ''
34
- if cloud:
35
- infra += cloud
36
- if region:
37
- infra += f'/{region}'
38
- if zone:
39
- infra += f'/{zone}'
40
- return infra
41
-
42
-
43
- class VolumeTable(abc.ABC):
44
- """The volume table."""
45
-
46
- def __init__(self, volumes: List[Dict[str, Any]], show_all: bool = False):
47
- super().__init__()
48
- self.table = self._create_table(show_all)
49
- self._add_rows(volumes, show_all)
50
-
51
- def _get_row_base_columns(self,
52
- row: Dict[str, Any],
53
- show_all: bool = False) -> List[str]:
54
- """Get the base columns for a row."""
55
- # Convert last_attached_at timestamp to human readable string
56
- last_attached_at = row.get('last_attached_at')
57
- if last_attached_at is not None:
58
- last_attached_at_str = datetime.fromtimestamp(
59
- last_attached_at).strftime('%Y-%m-%d %H:%M:%S')
60
- else:
61
- last_attached_at_str = '-'
62
- size = row.get('size', '')
63
- if size:
64
- size = f'{size}Gi'
65
- usedby_str = '-'
66
- usedby_clusters = row.get('usedby_clusters')
67
- usedby_pods = row.get('usedby_pods')
68
- if usedby_clusters:
69
- usedby_str = f'{", ".join(usedby_clusters)}'
70
- elif usedby_pods:
71
- usedby_str = f'{", ".join(usedby_pods)}'
72
- if show_all:
73
- usedby = usedby_str
74
- else:
75
- usedby = common_utils.truncate_long_string(
76
- usedby_str, constants.USED_BY_TRUNC_LENGTH)
77
- infra = _get_infra_str(row.get('cloud'), row.get('region'),
78
- row.get('zone'))
79
- return [
80
- row.get('name', ''),
81
- row.get('type', ''),
82
- infra,
83
- size,
84
- row.get('user_name', '-'),
85
- row.get('workspace', '-'),
86
- log_utils.human_duration(row.get('launched_at', 0)),
87
- row.get('status', ''),
88
- last_attached_at_str,
89
- usedby,
90
- ]
91
-
92
- def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
93
- """Create the volume table."""
94
- raise NotImplementedError
95
-
96
- def _add_rows(self,
97
- volumes: List[Dict[str, Any]],
98
- show_all: bool = False) -> None:
99
- """Add rows to the volume table."""
100
- raise NotImplementedError
101
-
102
- @abc.abstractmethod
103
- def format(self) -> str:
104
- """Format the volume table for display."""
105
- raise NotImplementedError
106
-
107
-
108
- class PVCVolumeTable(VolumeTable):
109
- """The PVC volume table."""
110
-
111
- def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
112
- """Create the PVC volume table."""
113
- # If show_all is False, show the table with the columns:
114
- # NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
115
- # AGE, STATUS, LAST_USE, USED_BY
116
- # If show_all is True, show the table with the columns:
117
- # NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
118
- # AGE, STATUS, LAST_USE, USED_BY, NAME_ON_CLOUD
119
- # STORAGE_CLASS, ACCESS_MODE
120
-
121
- if show_all:
122
- columns = _BASIC_COLUMNS + [
123
- 'NAME_ON_CLOUD',
124
- 'STORAGE_CLASS',
125
- 'ACCESS_MODE',
126
- ]
127
- else:
128
- columns = _BASIC_COLUMNS
129
-
130
- table = log_utils.create_table(columns)
131
- return table
132
-
133
- def _add_rows(self,
134
- volumes: List[Dict[str, Any]],
135
- show_all: bool = False) -> None:
136
- """Add rows to the PVC volume table."""
137
- for row in volumes:
138
- table_row = self._get_row_base_columns(row, show_all)
139
- if show_all:
140
- table_row.append(row.get('name_on_cloud', ''))
141
- table_row.append(
142
- row.get('config', {}).get('storage_class_name', '-'))
143
- table_row.append(row.get('config', {}).get('access_mode', ''))
144
-
145
- self.table.add_row(table_row)
146
-
147
- def format(self) -> str:
148
- """Format the PVC volume table for display."""
149
- return 'Kubernetes PVCs:\n' + str(self.table)
150
-
151
-
152
- class RunPodVolumeTable(VolumeTable):
153
- """The RunPod volume table."""
154
-
155
- def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
156
- """Create the RunPod volume table."""
157
- # If show_all is False, show the table with the columns:
158
- # NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
159
- # AGE, STATUS, LAST_USE, USED_BY
160
- # If show_all is True, show the table with the columns:
161
- # NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
162
- # AGE, STATUS, LAST_USE, USED_BY, NAME_ON_CLOUD
163
-
164
- if show_all:
165
- columns = _BASIC_COLUMNS + ['NAME_ON_CLOUD']
166
- else:
167
- columns = _BASIC_COLUMNS
168
-
169
- table = log_utils.create_table(columns)
170
- return table
171
-
172
- def _add_rows(self,
173
- volumes: List[Dict[str, Any]],
174
- show_all: bool = False) -> None:
175
- """Add rows to the RunPod volume table."""
176
- for row in volumes:
177
- table_row = self._get_row_base_columns(row, show_all)
178
- if show_all:
179
- table_row.append(row.get('name_on_cloud', ''))
180
-
181
- self.table.add_row(table_row)
182
-
183
- def format(self) -> str:
184
- """Format the RunPod volume table for display."""
185
- return 'RunPod Network Volumes:\n' + str(self.table)
186
-
187
-
188
- def format_volume_table(volumes: List[Dict[str, Any]],
189
- show_all: bool = False) -> str:
190
- """Format the volume table for display.
191
-
192
- Args:
193
- volume_table (dict): The volume table.
194
-
195
- Returns:
196
- str: The formatted volume table.
197
- """
198
- volumes_per_type: Dict[str, List[Dict[str, Any]]] = {}
199
- supported_volume_types = [
200
- volume_type.value for volume_type in volume.VolumeType
201
- ]
202
- for row in volumes:
203
- volume_type = row.get('type', '')
204
- if volume_type in supported_volume_types:
205
- if volume_type not in volumes_per_type:
206
- volumes_per_type[volume_type] = []
207
- volumes_per_type[volume_type].append(row)
208
- else:
209
- logger.warning(f'Unknown volume type: {volume_type}')
210
- continue
211
- table_str = ''
212
- for volume_type, volume_list in volumes_per_type.items():
213
- if table_str:
214
- table_str += '\n\n'
215
- if volume_type == volume.VolumeType.PVC.value:
216
- pvc_table = PVCVolumeTable(volume_list, show_all)
217
- table_str += pvc_table.format()
218
- elif volume_type == volume.VolumeType.RUNPOD_NETWORK_VOLUME.value:
219
- runpod_table = RunPodVolumeTable(volume_list, show_all)
220
- table_str += runpod_table.format()
221
- if table_str:
222
- return table_str
223
- else:
224
- return 'No existing volumes.'