konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
kubernetes:
|
|
2
|
+
pod_config:
|
|
3
|
+
metadata:
|
|
4
|
+
labels:
|
|
5
|
+
parent: trainy
|
|
6
|
+
trainy.ai/username: {{ user }}
|
|
7
|
+
{% if accelerator_type %}
|
|
8
|
+
trainy.ai/accelerator: {{ accelerator_type }}
|
|
9
|
+
{% endif %}
|
|
10
|
+
{% if konduktor_debug %}
|
|
11
|
+
konduktor.ai/debug: "true"
|
|
12
|
+
{% else %}
|
|
13
|
+
konduktor.ai/debug: "false"
|
|
14
|
+
{% endif %}
|
|
15
|
+
{% if tailscale_secret %}
|
|
16
|
+
konduktor.ai/tailscale: "true"
|
|
17
|
+
{% else %}
|
|
18
|
+
konduktor.ai/tailscale: "false"
|
|
19
|
+
{% endif %}
|
|
20
|
+
spec:
|
|
21
|
+
restartPolicy: "Never"
|
|
22
|
+
# trigger this on GPU request
|
|
23
|
+
{% if num_gpus > 0 %}
|
|
24
|
+
tolerations:
|
|
25
|
+
- key: "nvidia.com/gpu"
|
|
26
|
+
operator: "Exists"
|
|
27
|
+
{% endif %}
|
|
28
|
+
containers:
|
|
29
|
+
# TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
|
|
30
|
+
- name: konduktor-container
|
|
31
|
+
{% if enable_ssh or serving %}
|
|
32
|
+
ports:
|
|
33
|
+
{% if enable_ssh %}
|
|
34
|
+
- name: ssh
|
|
35
|
+
containerPort: {{ konduktor_ssh_port }}
|
|
36
|
+
{% endif %}
|
|
37
|
+
|
|
38
|
+
{% if serving %}
|
|
39
|
+
- name: serving
|
|
40
|
+
containerPort: {{ ports }}
|
|
41
|
+
{% endif %}
|
|
42
|
+
{% endif %}
|
|
43
|
+
|
|
44
|
+
{% if serving and probe %}
|
|
45
|
+
# TODO (ryan): allow modification of thresholds and timings
|
|
46
|
+
livenessProbe:
|
|
47
|
+
httpGet:
|
|
48
|
+
path: {{ probe }}
|
|
49
|
+
port: {{ ports }}
|
|
50
|
+
scheme: HTTP
|
|
51
|
+
initialDelaySeconds: 60
|
|
52
|
+
failureThreshold: 3
|
|
53
|
+
periodSeconds: 10
|
|
54
|
+
successThreshold: 1
|
|
55
|
+
timeoutSeconds: 1
|
|
56
|
+
readinessProbe:
|
|
57
|
+
httpGet:
|
|
58
|
+
path: {{ probe }}
|
|
59
|
+
port: {{ ports }}
|
|
60
|
+
scheme: HTTP
|
|
61
|
+
initialDelaySeconds: 60
|
|
62
|
+
failureThreshold: 10
|
|
63
|
+
periodSeconds: 5
|
|
64
|
+
successThreshold: 1
|
|
65
|
+
timeoutSeconds: 1
|
|
66
|
+
startupProbe:
|
|
67
|
+
httpGet:
|
|
68
|
+
path: {{ probe }}
|
|
69
|
+
port: {{ ports }}
|
|
70
|
+
scheme: HTTP
|
|
71
|
+
failureThreshold: 60
|
|
72
|
+
periodSeconds: 30
|
|
73
|
+
successThreshold: 1
|
|
74
|
+
timeoutSeconds: 1
|
|
75
|
+
{% endif %}
|
|
76
|
+
image: {{ image_id }}
|
|
77
|
+
# this is set during jobset definition since we need to know the jobset
|
|
78
|
+
# name and number of nodes to set all the environment variables correctly here
|
|
79
|
+
# as well as the additional variables from the job definition
|
|
80
|
+
env:
|
|
81
|
+
# flush logs immediately to stdout for more reactive log streaming
|
|
82
|
+
- name: PYTHONUNBUFFERED
|
|
83
|
+
value: "0"
|
|
84
|
+
- name: KONDUKTOR_NODENAME
|
|
85
|
+
valueFrom:
|
|
86
|
+
fieldRef:
|
|
87
|
+
fieldPath: spec.nodeName
|
|
88
|
+
- name: KONDUKTOR_JOB_NAME
|
|
89
|
+
value: "{{ job_name }}"
|
|
90
|
+
- name: NODE_HOST_IPS
|
|
91
|
+
value: "{{ node_hostnames }}"
|
|
92
|
+
- name: MASTER_ADDR
|
|
93
|
+
value: "{{ master_addr }}"
|
|
94
|
+
- name: RANK
|
|
95
|
+
valueFrom:
|
|
96
|
+
fieldRef:
|
|
97
|
+
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
|
|
98
|
+
- name: LOCAL_ADDR
|
|
99
|
+
valueFrom:
|
|
100
|
+
fieldRef:
|
|
101
|
+
fieldPath: status.podIP
|
|
102
|
+
- name: NUM_NODES
|
|
103
|
+
value: "{{ num_nodes }}"
|
|
104
|
+
- name: NUM_GPUS_PER_NODE
|
|
105
|
+
value: "{{ num_gpus }}"
|
|
106
|
+
{% if tailscale_secret %}
|
|
107
|
+
- name: TS_USERSPACE
|
|
108
|
+
value: "true"
|
|
109
|
+
- name: TS_AUTHKEY
|
|
110
|
+
valueFrom:
|
|
111
|
+
secretKeyRef:
|
|
112
|
+
name: {{ tailscale_secret }}
|
|
113
|
+
key: TS_AUTHKEY
|
|
114
|
+
optional: true
|
|
115
|
+
- name: POD_NAME
|
|
116
|
+
valueFrom:
|
|
117
|
+
fieldRef:
|
|
118
|
+
fieldPath: metadata.name
|
|
119
|
+
- name: POD_UID
|
|
120
|
+
valueFrom:
|
|
121
|
+
fieldRef:
|
|
122
|
+
fieldPath: metadata.uid
|
|
123
|
+
{% endif %}
|
|
124
|
+
{% if enable_ssh %}
|
|
125
|
+
- name: KONDUKTOR_SSHPUB
|
|
126
|
+
valueFrom:
|
|
127
|
+
secretKeyRef:
|
|
128
|
+
name: {{ secret_name }}
|
|
129
|
+
key: PUBKEY
|
|
130
|
+
- name: KONDUKTOR_SSHPRIV
|
|
131
|
+
valueFrom:
|
|
132
|
+
secretKeyRef:
|
|
133
|
+
name: {{ secret_name }}
|
|
134
|
+
key: PRIVKEY
|
|
135
|
+
- name: KONDUKTOR_SSH_PORT
|
|
136
|
+
value: "{{ konduktor_ssh_port }}"
|
|
137
|
+
{% endif %}
|
|
138
|
+
{% if git_ssh %}
|
|
139
|
+
- name: GIT_SSH_COMMAND
|
|
140
|
+
value: "ssh -i /run/konduktor/git-ssh-secret/gitkey -o StrictHostKeyChecking=no"
|
|
141
|
+
{% endif %}
|
|
142
|
+
{% if default_secrets %}
|
|
143
|
+
- name: KONDUKTOR_DEFAULT_SECRETS
|
|
144
|
+
value: "/konduktor/default-secrets"
|
|
145
|
+
- name: KONDUKTOR_DEFAULT_SECRETS_EXPANDED
|
|
146
|
+
value: "/run/konduktor/expanded-default-secrets"
|
|
147
|
+
{% endif %}
|
|
148
|
+
# these are for compatibility with skypilot
|
|
149
|
+
- name: SKYPILOT_NODE_IPS
|
|
150
|
+
value: "{{ node_hostnames }}"
|
|
151
|
+
- name: SKYPILOT_NODE_RANK
|
|
152
|
+
valueFrom:
|
|
153
|
+
fieldRef:
|
|
154
|
+
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
|
|
155
|
+
- name: SKYPILOT_NUM_NODES
|
|
156
|
+
value: "{{ num_nodes }}"
|
|
157
|
+
- name: SKYPILOT_NUM_GPUS_PER_NODE
|
|
158
|
+
value: "{{ num_gpus }}"
|
|
159
|
+
- name: RESTART_ATTEMPT
|
|
160
|
+
valueFrom:
|
|
161
|
+
fieldRef:
|
|
162
|
+
fieldPath: metadata.labels['jobset.sigs.k8s.io/restart-attempt']
|
|
163
|
+
volumeMounts:
|
|
164
|
+
- name: shared-memory
|
|
165
|
+
mountPath: /dev/shm
|
|
166
|
+
- name: sync
|
|
167
|
+
mountPath: /tmp/konduktor
|
|
168
|
+
{% for secret_type, secret_name in mount_secrets.items() %}
|
|
169
|
+
- name: {{ secret_type }}-secret
|
|
170
|
+
mountPath: /run/konduktor/{{ secret_type }}-secret
|
|
171
|
+
{% endfor %}
|
|
172
|
+
{% for secret in default_secrets %}
|
|
173
|
+
- name: default-secret-{{ secret.mount_name }}
|
|
174
|
+
mountPath: /konduktor/default-secrets/{{ secret.mount_name }}
|
|
175
|
+
{% endfor %}
|
|
176
|
+
{% if default_secrets %}
|
|
177
|
+
- name: default-secrets-expanded
|
|
178
|
+
mountPath: /run/konduktor/expanded-default-secrets
|
|
179
|
+
{% endif %}
|
|
180
|
+
{% if git_ssh %}
|
|
181
|
+
- name: git-ssh-secret
|
|
182
|
+
mountPath: /run/konduktor/git-ssh-secret
|
|
183
|
+
{% endif %}
|
|
184
|
+
{% if tailscale_secret %}
|
|
185
|
+
- name: tailscale-state
|
|
186
|
+
mountPath: /var/lib/tailscale
|
|
187
|
+
{% endif %}
|
|
188
|
+
command: ["bash", "-c"]
|
|
189
|
+
args:
|
|
190
|
+
- |
|
|
191
|
+
# TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
|
|
192
|
+
# Mark rank 0 as the rendezvous host (is_host=true); the sudo helper prefix_cmd is defined further below
|
|
193
|
+
export RDZV_CONF=is_host=$(if [ "$RANK" == "0" ]; then echo "true"; else echo "false"; fi)
|
|
194
|
+
set -eo pipefail
|
|
195
|
+
{% if konduktor_debug %}
|
|
196
|
+
set -x
|
|
197
|
+
{% endif %}
|
|
198
|
+
mkdir -p ~/.konduktor/tmp
|
|
199
|
+
start_epoch=$(date +%s);
|
|
200
|
+
start_setup=$(date +%s);
|
|
201
|
+
echo "===== KONDUKTOR: Running setup and installing packages ====="
|
|
202
|
+
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
|
203
|
+
[ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
PACKAGES="";
|
|
207
|
+
{% if 'rsync' in run_cmd or 'rsync' in setup_cmd %}
|
|
208
|
+
PACKAGES="$PACKAGES rsync";
|
|
209
|
+
{% endif %}
|
|
210
|
+
{% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
|
|
211
|
+
PACKAGES="$PACKAGES curl";
|
|
212
|
+
{% endif %}
|
|
213
|
+
{% if 'gs' in mount_secrets or 's3' in mount_secrets or default_secrets %}
|
|
214
|
+
PACKAGES="$PACKAGES unzip wget";
|
|
215
|
+
{% endif %}
|
|
216
|
+
{% if 'git' in run_cmd or 'git' in setup_cmd %}
|
|
217
|
+
PACKAGES="$PACKAGES git";
|
|
218
|
+
{% endif %}
|
|
219
|
+
|
|
220
|
+
if [ ! -z "${PACKAGES}" ]; then
|
|
221
|
+
# Run apt update, install missing packages
|
|
222
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update 2>&1 | tee -a ~/.konduktor/tmp/apt-update.log 2>&1 || \
|
|
223
|
+
$(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
|
|
224
|
+
fi
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# Separate packages into two groups: packages that are installed first
|
|
228
|
+
# so that curl and rsync are available sooner to unblock the following
|
|
229
|
+
# conda installation and rsync.
|
|
230
|
+
INSTALL_FIRST="";
|
|
231
|
+
MISSING_PACKAGES="";
|
|
232
|
+
for pkg in $PACKAGES; do
|
|
233
|
+
if ! dpkg -l | grep -q "^ii $pkg "; then
|
|
234
|
+
if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
|
|
235
|
+
INSTALL_FIRST="$INSTALL_FIRST $pkg";
|
|
236
|
+
else
|
|
237
|
+
MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
|
|
238
|
+
fi
|
|
239
|
+
fi
|
|
240
|
+
done;
|
|
241
|
+
if [ ! -z "$INSTALL_FIRST" ]; then
|
|
242
|
+
$(prefix_cmd) echo "Installing core packages: $INSTALL_FIRST";
|
|
243
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
|
|
244
|
+
fi;
|
|
245
|
+
|
|
246
|
+
if [ ! -z "$MISSING_PACKAGES" ]; then
|
|
247
|
+
$(prefix_cmd) echo "Installing missing packages: $MISSING_PACKAGES";
|
|
248
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
|
|
249
|
+
fi;
|
|
250
|
+
end_epoch=$(date +%s);
|
|
251
|
+
|
|
252
|
+
echo "===== KONDUKTOR: Exposing ENV variables ====="
|
|
253
|
+
$(prefix_cmd) env -0 | awk -v RS='\0' '
|
|
254
|
+
{
|
|
255
|
+
gsub(/\\/,"\\\\"); # escape existing backslashes first
|
|
256
|
+
gsub(/"/,"\\\""); # escape any double quotes
|
|
257
|
+
gsub(/\n/,"\\n"); # turn real newlines into the two characters \n
|
|
258
|
+
sub(/=/,"=\""); # open the value-quoting
|
|
259
|
+
print $0 "\""; # close the quote and add a newline record separator
|
|
260
|
+
}
|
|
261
|
+
' > /etc/environment
|
|
262
|
+
$(prefix_cmd) echo "set -a; source /etc/environment; set +a;" >> $HOME/.bashrc
|
|
263
|
+
|
|
264
|
+
{% if enable_ssh %}
|
|
265
|
+
|
|
266
|
+
function InstallSSH {
|
|
267
|
+
export DEBIAN_FRONTEND=noninteractive
|
|
268
|
+
export TZ=Etc/UTC
|
|
269
|
+
set -u
|
|
270
|
+
if service sshd status > /dev/null 2>&1; then
|
|
271
|
+
$(prefix_cmd) echo "OpenSSH server is already started."
|
|
272
|
+
return
|
|
273
|
+
fi
|
|
274
|
+
# Check if OpenSSH server is already installed
|
|
275
|
+
if ! command -v sshd &> /dev/null; then
|
|
276
|
+
$(prefix_cmd) echo "OpenSSH server is not installed. Installing..."
|
|
277
|
+
|
|
278
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt update 2>&1 >> ~/.konduktor/tmp/apt-install.log;
|
|
279
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt install -y openssh-server >> ~/.konduktor/tmp/apt-install.log;
|
|
280
|
+
|
|
281
|
+
$(prefix_cmd) echo "OpenSSH server installation complete."
|
|
282
|
+
else
|
|
283
|
+
$(prefix_cmd) echo "OpenSSH server is already installed."
|
|
284
|
+
fi
|
|
285
|
+
|
|
286
|
+
# No root password is set; root may only log in with an SSH key
|
|
287
|
+
# Enable root login in SSH configuration
|
|
288
|
+
$(prefix_cmd) sed -i '/^#PermitRootLogin/c\PermitRootLogin without-password' /etc/ssh/sshd_config
|
|
289
|
+
$(prefix_cmd) sed -i '/^PermitRootLogin/c\PermitRootLogin without-password' /etc/ssh/sshd_config
|
|
290
|
+
$(prefix_cmd) echo "Root login is enabled."
|
|
291
|
+
|
|
292
|
+
# Create the .ssh directory and authorized_keys file if they don't exist
|
|
293
|
+
if [ ! -d "$HOME/.ssh" ]; then
|
|
294
|
+
$(prefix_cmd) mkdir -p "$HOME/.ssh"
|
|
295
|
+
$(prefix_cmd) chmod 0700 "$HOME/.ssh"
|
|
296
|
+
$(prefix_cmd) echo "Directory $HOME/.ssh created."
|
|
297
|
+
fi
|
|
298
|
+
if [ ! -f "$HOME/.ssh/authorized_keys" ]; then
|
|
299
|
+
$(prefix_cmd) touch "$HOME/.ssh/authorized_keys"
|
|
300
|
+
$(prefix_cmd) chmod 0600 "$HOME/.ssh/authorized_keys"
|
|
301
|
+
$(prefix_cmd) echo "File $HOME/.ssh/authorized_keys created."
|
|
302
|
+
fi
|
|
303
|
+
# Check if the public key is not already present in authorized_keys
|
|
304
|
+
if ! grep -q "${KONDUKTOR_SSHPUB}" "$HOME/.ssh/authorized_keys"; then
|
|
305
|
+
# Append the public key to authorized_keys
|
|
306
|
+
$(prefix_cmd) echo "${KONDUKTOR_SSHPUB}" >> "$HOME/.ssh/authorized_keys"
|
|
307
|
+
$(prefix_cmd) echo "Public key added."
|
|
308
|
+
fi
|
|
309
|
+
if [ ! -f "$HOME/.ssh/konduktor-key" ]; then
|
|
310
|
+
# write the private key to $HOME/.ssh/konduktor-key
|
|
311
|
+
$(prefix_cmd) touch "$HOME/.ssh/konduktor-key"
|
|
312
|
+
$(prefix_cmd) chmod 0600 "$HOME/.ssh/konduktor-key"
|
|
313
|
+
$(prefix_cmd) echo "${KONDUKTOR_SSHPRIV}" >> "$HOME/.ssh/konduktor-key"
|
|
314
|
+
$(prefix_cmd) echo "private key added."
|
|
315
|
+
fi
|
|
316
|
+
if [ ! -f "$HOME/.ssh/config" ]; then
|
|
317
|
+
# create an ssh config that disables strict host key checking for all hosts
|
|
318
|
+
$(prefix_cmd) touch "$HOME/.ssh/config"
|
|
319
|
+
$(prefix_cmd) chmod 0600 "$HOME/.ssh/config"
|
|
320
|
+
$(prefix_cmd) printf '\nHost *\n StrictHostKeyChecking no\n' >> "$HOME/.ssh/config"
|
|
321
|
+
$(prefix_cmd) echo "ssh config set"
|
|
322
|
+
fi
|
|
323
|
+
|
|
324
|
+
# turn off PAM to fix sshd login issue
|
|
325
|
+
$(prefix_cmd) sed -i 's/UsePAM yes/UsePAM no/' /etc/ssh/sshd_config
|
|
326
|
+
|
|
327
|
+
# set the sshd port to the configured konduktor_ssh_port (replaces the commented-out default "#Port 22")
|
|
328
|
+
$(prefix_cmd) sed -i 's/#Port 22/Port {{ konduktor_ssh_port }}/' /etc/ssh/sshd_config
|
|
329
|
+
|
|
330
|
+
$(prefix_cmd) mkdir /run/sshd
|
|
331
|
+
$(prefix_cmd) chmod 0755 /run/sshd
|
|
332
|
+
|
|
333
|
+
$(prefix_cmd) service ssh start
|
|
334
|
+
$(prefix_cmd) echo "sshd service started"
|
|
335
|
+
set +u
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
InstallSSH
|
|
339
|
+
{% endif %}
|
|
340
|
+
{% if tailscale_secret %}
|
|
341
|
+
export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
|
|
342
|
+
$(prefix_cmd) echo "TS_HOSTNAME=${TS_HOSTNAME}" >> /etc/environment
|
|
343
|
+
function InstallTailscale {
|
|
344
|
+
if ! command -v tailscale >/dev/null 2>&1; then
|
|
345
|
+
$(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
|
|
346
|
+
fi
|
|
347
|
+
if ! tailscale status >/dev/null 2>&1; then
|
|
348
|
+
$(prefix_cmd) mkdir -p /var/run/tailscale /var/cache/tailscale /var/lib/tailscale
|
|
349
|
+
$(prefix_cmd) nohup tailscaled --tun=userspace-networking >~/.konduktor/tmp/tailscaled.log 2>&1 &
|
|
350
|
+
fi
|
|
351
|
+
until tailscale status >/dev/null 2>&1; do
|
|
352
|
+
$(prefix_cmd) tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} --accept-dns=false || echo "tailscale up failed retrying"
|
|
353
|
+
done
|
|
354
|
+
$(prefix_cmd) echo "Tailscale is up"
|
|
355
|
+
$(prefix_cmd) tailscale status
|
|
356
|
+
$(prefix_cmd) tailscale netcheck
|
|
357
|
+
}
|
|
358
|
+
InstallTailscale | tee ~/.konduktor/tmp/tailscale-out.log
|
|
359
|
+
{% if konduktor_debug %}
|
|
360
|
+
$(prefix_cmd) cat ~/.konduktor/tmp/tailscale*.log
|
|
361
|
+
{% endif %}
|
|
362
|
+
{% endif %}
|
|
363
|
+
end_epoch=$(date +%s);
|
|
364
|
+
|
|
365
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
|
|
366
|
+
|
|
367
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary ====="
|
|
368
|
+
start_epoch=$(date +%s);
|
|
369
|
+
|
|
370
|
+
print_bucket () {
|
|
371
|
+
title="$1"; list="${2:-}"
|
|
372
|
+
echo "--- $title ---"
|
|
373
|
+
if [ -n "$list" ]; then
|
|
374
|
+
echo "$list" | tr ',' '\n' | sed "s/^/[$title] /"
|
|
375
|
+
else
|
|
376
|
+
echo "[none]"
|
|
377
|
+
fi
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
# Secrets: prefer detailed mapping if available
|
|
381
|
+
echo "--- env secret ---"
|
|
382
|
+
if [ -n "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
|
|
383
|
+
echo "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION}" \
|
|
384
|
+
| tr ',' '\n' \
|
|
385
|
+
| awk -F'=' '{ printf("[secret: %s] %s\n", $2, $1) }'
|
|
386
|
+
elif [ -n "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
|
|
387
|
+
echo "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION}" \
|
|
388
|
+
| tr ',' '\n' | sed 's/^/[secret] /'
|
|
389
|
+
else
|
|
390
|
+
echo "[none]"
|
|
391
|
+
fi
|
|
392
|
+
|
|
393
|
+
print_bucket "CLI + task.yaml" "${KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION}"
|
|
394
|
+
print_bucket "config.yaml" "${KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION}"
|
|
395
|
+
print_bucket "other" "${KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION}"
|
|
396
|
+
|
|
397
|
+
end_epoch=$(date +%s);
|
|
398
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary took $((end_epoch - start_epoch)) seconds ====="
|
|
399
|
+
|
|
400
|
+
# unpack secrets credentials
|
|
401
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking cloud storage secret credentials ====="
|
|
402
|
+
start_epoch=$(date +%s);
|
|
403
|
+
mkdir -p ~/.konduktor
|
|
404
|
+
mkdir -p {{ remote_workdir }}
|
|
405
|
+
{% for secret_type, secret_name in mount_secrets.items() %}
|
|
406
|
+
{% if secret_type == "gs" %}
|
|
407
|
+
$(prefix_cmd) echo "Unpacking GCP secret"
|
|
408
|
+
$(prefix_cmd) mkdir -p ~/.config
|
|
409
|
+
$(prefix_cmd) unzip /run/konduktor/gs-secret/gcpcredentials -d ~/.config/gcloud
|
|
410
|
+
{% elif secret_type == "s3" %}
|
|
411
|
+
$(prefix_cmd) echo "Unpacking AWS secret"
|
|
412
|
+
$(prefix_cmd) mkdir -p ~/.aws
|
|
413
|
+
$(prefix_cmd) unzip /run/konduktor/s3-secret/awscredentials -d ~/.aws
|
|
414
|
+
{% endif %}
|
|
415
|
+
{% endfor %}
|
|
416
|
+
|
|
417
|
+
{% if default_secrets %}
|
|
418
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking default secrets ====="
|
|
419
|
+
$(prefix_cmd) mkdir -p "${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
|
|
420
|
+
|
|
421
|
+
# For each mounted default secret folder:
|
|
422
|
+
# - if payload.zip exists, unzip it into the expanded dir
|
|
423
|
+
# - otherwise, copy the files as-is
|
|
424
|
+
for src in "${KONDUKTOR_DEFAULT_SECRETS}"/*; do
|
|
425
|
+
[ -d "$src" ] || continue
|
|
426
|
+
name="$(basename "$src")"
|
|
427
|
+
dst="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}/${name}"
|
|
428
|
+
$(prefix_cmd) mkdir -p "$dst"
|
|
429
|
+
|
|
430
|
+
if [ -f "${src}/payload.zip" ]; then
|
|
431
|
+
$(prefix_cmd) unzip -oq "${src}/payload.zip" -d "$dst"
|
|
432
|
+
else
|
|
433
|
+
$(prefix_cmd) cp -a "${src}/." "$dst/"
|
|
434
|
+
fi
|
|
435
|
+
done
|
|
436
|
+
|
|
437
|
+
# Point callers to the expanded (writable) path going forward
|
|
438
|
+
export KONDUKTOR_DEFAULT_SECRETS="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
|
|
439
|
+
$(prefix_cmd) echo "KONDUKTOR_DEFAULT_SECRETS=${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}" >> /etc/environment
|
|
440
|
+
{% endif %}
|
|
441
|
+
|
|
442
|
+
{% if git_ssh %}
|
|
443
|
+
$(prefix_cmd) echo "Unpacking GIT-SSH secret"
|
|
444
|
+
{% endif %}
|
|
445
|
+
end_epoch=$(date +%s);
|
|
446
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="
|
|
447
|
+
|
|
448
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Default secret summary ====="
|
|
449
|
+
start_epoch=$(date +%s)
|
|
450
|
+
|
|
451
|
+
root="${KONDUKTOR_DEFAULT_SECRETS:-}"
|
|
452
|
+
if [[ -z "$root" || ! -d "$root" ]]; then
|
|
453
|
+
$(prefix_cmd) echo "NO DEFAULT SECRETS FOUND."
|
|
454
|
+
else
|
|
455
|
+
for dir in "$root"/*; do
|
|
456
|
+
[ -d "$dir" ] || continue
|
|
457
|
+
name="$(basename "$dir")"
|
|
458
|
+
|
|
459
|
+
# Pretty header that mirrors the logical mount base:
|
|
460
|
+
$(prefix_cmd) echo "/konduktor/default-secrets/${name}:"
|
|
461
|
+
|
|
462
|
+
# Print relative paths only; skip macOS junk and k8s secret internals
|
|
463
|
+
(
|
|
464
|
+
cd "$dir"
|
|
465
|
+
out="$(find . \
|
|
466
|
+
\( -name '.DS_Store' -o -name '__MACOSX' -o -name '..data' -o -name '..*' \) -prune -o \
|
|
467
|
+
\( -type f -o -type l \) -print \
|
|
468
|
+
| sed 's|^\./||' \
|
|
469
|
+
| sort)"
|
|
470
|
+
if [ -n "$out" ]; then
|
|
471
|
+
printf "%s\n" "$out"
|
|
472
|
+
fi
|
|
473
|
+
)
|
|
474
|
+
done
|
|
475
|
+
fi
|
|
476
|
+
|
|
477
|
+
end_epoch=$(date +%s)
|
|
478
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Default secret summary took $((end_epoch - start_epoch)) seconds ====="
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# sync file mounts
|
|
482
|
+
{% for mkdir_command in mkdir_commands %}
|
|
483
|
+
$(prefix_cmd) {{ mkdir_command }}
|
|
484
|
+
{% endfor %}
|
|
485
|
+
{% if sync_commands|length > 0 %}
|
|
486
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Syncing files ====="
|
|
487
|
+
start_epoch=$(date +%s);
|
|
488
|
+
{% for sync_command in sync_commands %}
|
|
489
|
+
$(prefix_cmd) {{ sync_command }} >> ~/.konduktor/tmp/sync-files.log
|
|
490
|
+
{% endfor %}
|
|
491
|
+
end_epoch=$(date +%s);
|
|
492
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Syncing files took $((end_epoch - start_epoch)) seconds ====="
|
|
493
|
+
{% endif %}
|
|
494
|
+
end_epoch=$(date +%s);
|
|
495
|
+
end_setup_time=$((end_epoch - start_setup));
|
|
496
|
+
ulimit -Sc 0 && ulimit -Hc 0
|
|
497
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
|
|
498
|
+
set +eo pipefail
|
|
499
|
+
# run task
|
|
500
|
+
$(prefix_cmd) cd {{ remote_workdir }}
|
|
501
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
|
|
502
|
+
start_epoch=$(date +%s);
|
|
503
|
+
{{ run_cmd | indent( width=14 ) }}
|
|
504
|
+
# Capture the task's exit status first: any later command (including the
# timestamp assignment below) would overwrite $? with its own status.
exit_code=$?
end_epoch=$(date +%s);
|
|
506
|
+
set +ex
|
|
507
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Running task took $((end_epoch - start_epoch)) seconds and finished with exit code: $exit_code ====="
|
|
508
|
+
exit $exit_code
|
|
509
|
+
resources:
|
|
510
|
+
limits:
|
|
511
|
+
cpu: {{ cpu }}
|
|
512
|
+
memory: {{ memory }}Gi
|
|
513
|
+
# TODO(asaiacai): need to decide whether we include fabric configuration here
|
|
514
|
+
{% if num_gpus > 0 %}
|
|
515
|
+
nvidia.com/gpu: {{ num_gpus }}
|
|
516
|
+
{% endif %}
|
|
517
|
+
requests:
|
|
518
|
+
cpu: {{ cpu }}
|
|
519
|
+
memory: {{ memory }}Gi
|
|
520
|
+
{% if num_gpus > 0 %}
|
|
521
|
+
nvidia.com/gpu: {{num_gpus}}
|
|
522
|
+
{% endif %}
|
|
523
|
+
securityContext:
|
|
524
|
+
capabilities:
|
|
525
|
+
add:
|
|
526
|
+
- "IPC_LOCK" # May be needed for memlock
|
|
527
|
+
|
|
528
|
+
volumes:
|
|
529
|
+
- name: shared-memory
|
|
530
|
+
emptyDir:
|
|
531
|
+
medium: "Memory"
|
|
532
|
+
sizeLimit: 4Gi
|
|
533
|
+
{% if tailscale_secret %}
|
|
534
|
+
- name: tailscale-state
|
|
535
|
+
emptyDir: {}
|
|
536
|
+
{% endif %}
|
|
537
|
+
- name: sync
|
|
538
|
+
emptyDir: {}
|
|
539
|
+
{% for secret_type, secret_name in mount_secrets.items() %}
|
|
540
|
+
- name: {{ secret_type }}-secret
|
|
541
|
+
secret:
|
|
542
|
+
secretName: {{ secret_name }}
|
|
543
|
+
{% endfor %}
|
|
544
|
+
{% for secret in default_secrets %}
|
|
545
|
+
- name: default-secret-{{ secret.mount_name }}
|
|
546
|
+
secret:
|
|
547
|
+
secretName: {{ secret.k8s_name }}
|
|
548
|
+
{% endfor %}
|
|
549
|
+
{% if default_secrets %}
|
|
550
|
+
- name: default-secrets-expanded
|
|
551
|
+
emptyDir: {}
|
|
552
|
+
{% endif %}
|
|
553
|
+
{% if git_ssh %}
|
|
554
|
+
- name: git-ssh-secret
|
|
555
|
+
secret:
|
|
556
|
+
secretName: {{ git_ssh }}
|
|
557
|
+
defaultMode: 384
|
|
558
|
+
{% endif %}
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
# TODO(asaiacai): should we add nodeSelectors here or leave to
|
|
562
|
+
# kueue resource flavors. leaning towards defining
|
|
563
|
+
# in kueue and just querying for the kueue resource flavor
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Constants for usage collection."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Opt-out switch for usage collection. The raw environment string is kept as-is
# (no boolean parsing); when the variable is unset the value is False.
KONDUKTOR_DISABLE_USAGE_COLLECTION = os.getenv(
    'KONDUKTOR_DISABLE_USAGE_COLLECTION', False
)

# PostHog project key and host; each can be overridden through the environment
# variable of the same name.
POSTHOG_API_KEY = os.getenv(
    'POSTHOG_API_KEY', 'phc_4UgX80BfVNmYRZ2o3dJLyRMGkv1CxBozPAcPnD29uP4'
)
POSTHOG_HOST = os.getenv('POSTHOG_HOST', 'https://us.i.posthog.com')

# Notice describing what is collected and how to opt out.
USAGE_POLICY_MESSAGE = (
    'Konduktor collects usage data to improve its services. `run` commands '
    'are not collected to ensure privacy.\n'
    'Usage logging can be disabled by setting the environment variable '
    'KONDUKTOR_DISABLE_USAGE_COLLECTION=1.'
)
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Accelerator registry."""
|
|
2
|
+
|
|
3
|
+
# Canonical spellings of the accelerator names this registry accepts.
_ACCELERATORS = [
    'A100',
    'A100-80GB',
    'B200',
    'H100',
    'H200',
    'L40S',
    'T4',
    'L40',
]


def canonicalize_accelerator_name(accelerator: str) -> str:
    """Return the canonical spelling of a supported accelerator name.

    The comparison ignores letter case: both the input and the registry
    entries are lowered with ``str.lower`` before being compared.

    Args:
        accelerator: Accelerator name in any letter case.

    Returns:
        The matching entry of ``_ACCELERATORS``.

    Raises:
        ValueError: If no registry entry matches ``accelerator``.
    """
    wanted = accelerator.lower()
    for canonical in _ACCELERATORS:
        if canonical.lower() == wanted:
            return canonical

    raise ValueError(
        f'Accelerator name {accelerator!r} is not supported. '
        f'Please choose one of {_ACCELERATORS}.'
    )
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""Annotations for public APIs."""
|
|
14
|
+
|
|
15
|
+
import functools
|
|
16
|
+
from typing import Callable, Literal
|
|
17
|
+
|
|
18
|
+
# Process-wide flag: starts as True and is switched to False by the wrapper
# that `client_api` installs, the first time a client-side API is called.
is_on_api_server = True

# Cached callables registered by `lru_cache(scope='request')`; kept here so
# they can be found again later.
FUNCTIONS_NEED_RELOAD_CACHE = list()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def client_api(func):
    """Decorate ``func`` as a client-side API entry point.

    Every call through the returned wrapper first sets the module-level
    ``is_on_api_server`` flag to False and then forwards all arguments to
    ``func``, returning its result. The flag is not reset afterwards.
    Code that is not reached through a client API keeps seeing the flag as
    True and can apply server-side handling.
    """

    def _flag_client_then_call(*args, **kwargs):
        global is_on_api_server
        is_on_api_server = False
        return func(*args, **kwargs)

    # Copy name, docstring and other metadata from the wrapped function.
    return functools.update_wrapper(_flag_client_then_call, func)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def lru_cache(
    scope: Literal['global', 'request'], *lru_cache_args, **lru_cache_kwargs
) -> Callable:
    """Build an LRU-cache decorator that records request-scoped caches.

    The decorated function is always wrapped with ``functools.lru_cache``.
    Unless ``scope`` is ``'global'``, the cached function is also appended to
    ``FUNCTIONS_NEED_RELOAD_CACHE`` so that it can be reloaded for a new
    request.

    Args:
        scope: ``'global'`` for a cache that is never registered for reload;
            any other value (normally ``'request'``) registers the cache.
        lru_cache_args: Positional arguments for ``functools.lru_cache``.
        lru_cache_kwargs: Keyword arguments for ``functools.lru_cache``.

    Returns:
        A decorator that takes a function and returns its cached version.
    """

    def decorator(func: Callable) -> Callable:
        memoized = functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)(func)
        if scope != 'global':
            # Track request-scoped caches so they can be found again later.
            FUNCTIONS_NEED_RELOAD_CACHE.append(memoized)
        return memoized

    return decorator
|