konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,563 @@
1
+ kubernetes:
2
+ pod_config:
3
+ metadata:
4
+ labels:
5
+ parent: trainy
6
+ trainy.ai/username: {{ user }}
7
+ {% if accelerator_type %}
8
+ trainy.ai/accelerator: {{ accelerator_type }}
9
+ {% endif %}
10
+ {% if konduktor_debug %}
11
+ konduktor.ai/debug: "true"
12
+ {% else %}
13
+ konduktor.ai/debug: "false"
14
+ {% endif %}
15
+ {% if tailscale_secret %}
16
+ konduktor.ai/tailscale: "true"
17
+ {% else %}
18
+ konduktor.ai/tailscale: "false"
19
+ {% endif %}
20
+ spec:
21
+ restartPolicy: "Never"
22
+ # add the nvidia.com/gpu toleration only when the task requests GPUs
23
+ {% if num_gpus > 0 %}
24
+ tolerations:
25
+ - key: "nvidia.com/gpu"
26
+ operator: "Exists"
27
+ {% endif %}
28
+ containers:
29
+ # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
30
+ - name: konduktor-container
31
+ {% if enable_ssh or serving %}
32
+ ports:
33
+ {% if enable_ssh %}
34
+ - name: ssh
35
+ containerPort: {{ konduktor_ssh_port }}
36
+ {% endif %}
37
+
38
+ {% if serving %}
39
+ - name: serving
40
+ containerPort: {{ ports }}
41
+ {% endif %}
42
+ {% endif %}
43
+
44
+ {% if serving and probe %}
45
+ # TODO (ryan): allow modification of thresholds and timings
46
+ livenessProbe:
47
+ httpGet:
48
+ path: {{ probe }}
49
+ port: {{ ports }}
50
+ scheme: HTTP
51
+ initialDelaySeconds: 60
52
+ failureThreshold: 3
53
+ periodSeconds: 10
54
+ successThreshold: 1
55
+ timeoutSeconds: 1
56
+ readinessProbe:
57
+ httpGet:
58
+ path: {{ probe }}
59
+ port: {{ ports }}
60
+ scheme: HTTP
61
+ initialDelaySeconds: 60
62
+ failureThreshold: 10
63
+ periodSeconds: 5
64
+ successThreshold: 1
65
+ timeoutSeconds: 1
66
+ startupProbe:
67
+ httpGet:
68
+ path: {{ probe }}
69
+ port: {{ ports }}
70
+ scheme: HTTP
71
+ failureThreshold: 60
72
+ periodSeconds: 30
73
+ successThreshold: 1
74
+ timeoutSeconds: 1
75
+ {% endif %}
76
+ image: {{ image_id }}
77
+ # this is set during jobset definition since we need to know the jobset
78
+ # name and number of nodes to set all the environment variables correctly here
79
+ # as well as the additional from the job definition
80
+ env:
81
+ # flush logs immediately to stdout for more reactive log streaming
82
+ - name: PYTHONUNBUFFERED
83
+ value: "0"
84
+ - name: KONDUKTOR_NODENAME
85
+ valueFrom:
86
+ fieldRef:
87
+ fieldPath: spec.nodeName
88
+ - name: KONDUKTOR_JOB_NAME
89
+ value: "{{ job_name }}"
90
+ - name: NODE_HOST_IPS
91
+ value: "{{ node_hostnames }}"
92
+ - name: MASTER_ADDR
93
+ value: "{{ master_addr }}"
94
+ - name: RANK
95
+ valueFrom:
96
+ fieldRef:
97
+ fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
98
+ - name: LOCAL_ADDR
99
+ valueFrom:
100
+ fieldRef:
101
+ fieldPath: status.podIP
102
+ - name: NUM_NODES
103
+ value: "{{ num_nodes }}"
104
+ - name: NUM_GPUS_PER_NODE
105
+ value: "{{ num_gpus }}"
106
+ {% if tailscale_secret %}
107
+ - name: TS_USERSPACE
108
+ value: "true"
109
+ - name: TS_AUTHKEY
110
+ valueFrom:
111
+ secretKeyRef:
112
+ name: {{ tailscale_secret }}
113
+ key: TS_AUTHKEY
114
+ optional: true
115
+ - name: POD_NAME
116
+ valueFrom:
117
+ fieldRef:
118
+ fieldPath: metadata.name
119
+ - name: POD_UID
120
+ valueFrom:
121
+ fieldRef:
122
+ fieldPath: metadata.uid
123
+ {% endif %}
124
+ {% if enable_ssh %}
125
+ - name: KONDUKTOR_SSHPUB
126
+ valueFrom:
127
+ secretKeyRef:
128
+ name: {{ secret_name }}
129
+ key: PUBKEY
130
+ - name: KONDUKTOR_SSHPRIV
131
+ valueFrom:
132
+ secretKeyRef:
133
+ name: {{ secret_name }}
134
+ key: PRIVKEY
135
+ - name: KONDUKTOR_SSH_PORT
136
+ value: "{{ konduktor_ssh_port }}"
137
+ {% endif %}
138
+ {% if git_ssh %}
139
+ - name: GIT_SSH_COMMAND
140
+ value: "ssh -i /run/konduktor/git-ssh-secret/gitkey -o StrictHostKeyChecking=no"
141
+ {% endif %}
142
+ {% if default_secrets %}
143
+ - name: KONDUKTOR_DEFAULT_SECRETS
144
+ value: "/konduktor/default-secrets"
145
+ - name: KONDUKTOR_DEFAULT_SECRETS_EXPANDED
146
+ value: "/run/konduktor/expanded-default-secrets"
147
+ {% endif %}
148
+ # these are for compatibility with skypilot
149
+ - name: SKYPILOT_NODE_IPS
150
+ value: "{{ node_hostnames }}"
151
+ - name: SKYPILOT_NODE_RANK
152
+ valueFrom:
153
+ fieldRef:
154
+ fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
155
+ - name: SKYPILOT_NUM_NODES
156
+ value: "{{ num_nodes }}"
157
+ - name: SKYPILOT_NUM_GPUS_PER_NODE
158
+ value: "{{ num_gpus }}"
159
+ - name: RESTART_ATTEMPT
160
+ valueFrom:
161
+ fieldRef:
162
+ fieldPath: metadata.labels['jobset.sigs.k8s.io/restart-attempt']
163
+ volumeMounts:
164
+ - name: shared-memory
165
+ mountPath: /dev/shm
166
+ - name: sync
167
+ mountPath: /tmp/konduktor
168
+ {% for secret_type, secret_name in mount_secrets.items() %}
169
+ - name: {{ secret_type }}-secret
170
+ mountPath: /run/konduktor/{{ secret_type }}-secret
171
+ {% endfor %}
172
+ {% for secret in default_secrets %}
173
+ - name: default-secret-{{ secret.mount_name }}
174
+ mountPath: /konduktor/default-secrets/{{ secret.mount_name }}
175
+ {% endfor %}
176
+ {% if default_secrets %}
177
+ - name: default-secrets-expanded
178
+ mountPath: /run/konduktor/expanded-default-secrets
179
+ {% endif %}
180
+ {% if git_ssh %}
181
+ - name: git-ssh-secret
182
+ mountPath: /run/konduktor/git-ssh-secret
183
+ {% endif %}
184
+ {% if tailscale_secret %}
185
+ - name: tailscale-state
186
+ mountPath: /var/lib/tailscale
187
+ {% endif %}
188
+ command: ["bash", "-c"]
189
+ args:
190
+ - |
191
+ # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
192
+ # Record whether this pod is the rendezvous host: RDZV_CONF is "is_host=true" on rank 0 and "is_host=false" on every other rank
193
+ export RDZV_CONF=is_host=$(if [ "$RANK" == "0" ]; then echo "true"; else echo "false"; fi)
194
+ set -eo pipefail
195
+ {% if konduktor_debug %}
196
+ set -x
197
+ {% endif %}
198
+ mkdir -p ~/.konduktor/tmp
199
+ start_epoch=$(date +%s);
200
+ start_setup=$(date +%s);
201
+ echo "===== KONDUKTOR: Running setup and installing packages ====="
202
# Print "sudo" when the script runs as a non-root user and an empty string when
# it runs as root, so that `$(prefix_cmd) <command>` elevates only when needed.
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
# When already root, define `sudo` as a shell function that simply runs its
# arguments (presumably so literal `sudo ...` calls still work in images that
# ship no sudo binary). The trailing `|| true` keeps `set -e` from aborting the
# script when the uid test fails for non-root users.
[ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
204
+
205
+
206
+ PACKAGES="";
207
+ {% if 'rsync' in run_cmd or 'rsync' in setup_cmd %}
208
+ PACKAGES="$PACKAGES rsync";
209
+ {% endif %}
210
+ {% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
211
+ PACKAGES="$PACKAGES curl";
212
+ {% endif %}
213
+ {% if 'gs' in mount_secrets or 's3' in mount_secrets or default_secrets %}
214
+ PACKAGES="$PACKAGES unzip wget";
215
+ {% endif %}
216
+ {% if 'git' in run_cmd or 'git' in setup_cmd %}
217
+ PACKAGES="$PACKAGES git";
218
+ {% endif %}
219
+
220
+ if [ ! -z "${PACKAGES}" ]; then
221
+ # Run apt update, install missing packages
222
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update 2>&1 | tee -a ~/.konduktor/tmp/apt-update.log 2>&1 || \
223
+ $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
224
+ fi
225
+
226
+
227
+ # Separate packages into two groups: packages that are installed first
228
+ # so that curl and rsync are available sooner to unblock the following
229
+ # conda installation and rsync.
230
+ INSTALL_FIRST="";
231
+ MISSING_PACKAGES="";
232
+ for pkg in $PACKAGES; do
233
+ if ! dpkg -l | grep -q "^ii $pkg "; then
234
+ if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
235
+ INSTALL_FIRST="$INSTALL_FIRST $pkg";
236
+ else
237
+ MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
238
+ fi
239
+ fi
240
+ done;
241
+ if [ ! -z "$INSTALL_FIRST" ]; then
242
+ $(prefix_cmd) echo "Installing core packages: $INSTALL_FIRST";
243
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
244
+ fi;
245
+
246
+ if [ ! -z "$MISSING_PACKAGES" ]; then
247
+ $(prefix_cmd) echo "Installing missing packages: $MISSING_PACKAGES";
248
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
249
+ fi;
250
+ end_epoch=$(date +%s);
251
+
252
echo "===== KONDUKTOR: Exposing ENV variables ====="
# Dump the pod's environment as KEY="value" lines into /etc/environment.
# `env` runs as the current user on purpose: under sudo's default env_reset
# policy it would print a sanitized environment instead of the pod's.
# Only the final write is elevated (via tee), because a `>` redirect is opened
# by the calling shell and would fail for non-root users.
env -0 | awk -v RS='\0' '
{
  gsub(/\\/,"\\\\");   # escape existing backslashes first
  gsub(/"/,"\\\"");    # escape any double quotes
  gsub(/\n/,"\\n");    # turn real newlines into the two characters \n
  sub(/=/,"=\"");      # open the value-quoting
  print $0 "\"";       # close the quote and add a newline record separator
}
' | $(prefix_cmd) tee /etc/environment > /dev/null
# Make interactive shells (e.g. SSH sessions) export everything from the dump.
echo "set -a; source /etc/environment; set +a;" >> "$HOME/.bashrc"
263
+
264
+ {% if enable_ssh %}
265
+
266
function InstallSSH {
  # Install (when missing), configure and start an OpenSSH server that accepts
  # the key pair supplied through KONDUKTOR_SSHPUB / KONDUKTOR_SSHPRIV.
  # Files under $HOME are created as the current user (never through sudo) so
  # they stay owned by that user.
  export DEBIAN_FRONTEND=noninteractive
  export TZ=Etc/UTC
  set -u
  if service sshd status > /dev/null 2>&1; then
    echo "OpenSSH server is already started."
    # Restore the nounset setting on the early exit too, exactly like the
    # normal exit path at the bottom of this function does.
    set +u
    return
  fi
  # Check if OpenSSH server is already installed
  if ! command -v sshd &> /dev/null; then
    echo "OpenSSH server is not installed. Installing..."

    # Redirect stdout to the log first, then point stderr at it, so both
    # streams of `apt-get update` land in the log file.
    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> ~/.konduktor/tmp/apt-install.log 2>&1;
    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y openssh-server >> ~/.konduktor/tmp/apt-install.log;

    echo "OpenSSH server installation complete."
  else
    echo "OpenSSH server is already installed."
  fi

  # Allow key-based (password-less) root login in the sshd configuration,
  # whether the directive is currently commented out or already set.
  $(prefix_cmd) sed -i '/^#PermitRootLogin/c\PermitRootLogin without-password' /etc/ssh/sshd_config
  $(prefix_cmd) sed -i '/^PermitRootLogin/c\PermitRootLogin without-password' /etc/ssh/sshd_config
  echo "Root login is enabled."

  # Create the .ssh directory and authorized_keys file if they don't exist
  if [ ! -d "$HOME/.ssh" ]; then
    mkdir -p "$HOME/.ssh"
    chmod 0700 "$HOME/.ssh"
    echo "Directory $HOME/.ssh created."
  fi
  if [ ! -f "$HOME/.ssh/authorized_keys" ]; then
    touch "$HOME/.ssh/authorized_keys"
    chmod 0600 "$HOME/.ssh/authorized_keys"
    echo "File $HOME/.ssh/authorized_keys created."
  fi
  # Append the public key unless it is already present. -F matches the key as
  # a fixed string (keys contain regex metacharacters such as '.' and '+').
  if ! grep -qF -- "${KONDUKTOR_SSHPUB}" "$HOME/.ssh/authorized_keys"; then
    printf '%s\n' "${KONDUKTOR_SSHPUB}" >> "$HOME/.ssh/authorized_keys"
    echo "Public key added."
  fi
  if [ ! -f "$HOME/.ssh/konduktor-key" ]; then
    # Write the private key to its own 0600 file. It is written by the shell
    # itself so the key never appears in the argument list of a sudo process.
    touch "$HOME/.ssh/konduktor-key"
    chmod 0600 "$HOME/.ssh/konduktor-key"
    printf '%s\n' "${KONDUKTOR_SSHPRIV}" >> "$HOME/.ssh/konduktor-key"
    echo "private key added."
  fi
  if [ ! -f "$HOME/.ssh/config" ]; then
    # Create a client config that disables strict host key checking
    touch "$HOME/.ssh/config"
    chmod 0600 "$HOME/.ssh/config"
    printf '\nHost *\n StrictHostKeyChecking no\n' >> "$HOME/.ssh/config"
    echo "ssh config set"
  fi

  # turn off PAM to fix sshd login issue
  $(prefix_cmd) sed -i 's/UsePAM yes/UsePAM no/' /etc/ssh/sshd_config

  # Make sshd listen on the configured konduktor SSH port, whether the Port
  # directive is still commented out or already set to some other value.
  $(prefix_cmd) sed -i -E 's/^#?Port [0-9]+/Port {{ konduktor_ssh_port }}/' /etc/ssh/sshd_config

  # -p: do not fail (and abort under `set -e`) when the directory already exists
  $(prefix_cmd) mkdir -p /run/sshd
  $(prefix_cmd) chmod 0755 /run/sshd

  $(prefix_cmd) service ssh start
  echo "sshd service started"
  set +u
}
337
+
338
+ InstallSSH
339
+ {% endif %}
340
+ {% if tailscale_secret %}
341
# Tailscale hostname: the pod name with its last "-<suffix>" segment removed.
export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
# Append through an elevated tee: a plain `>>` redirect is opened by the
# calling shell and fails for non-root users.
echo "TS_HOSTNAME=${TS_HOSTNAME}" | $(prefix_cmd) tee -a /etc/environment > /dev/null
function InstallTailscale {
  # Install tailscale when missing, start tailscaled with userspace
  # networking, then retry `tailscale up` until `tailscale status` succeeds.
  if ! command -v tailscale >/dev/null 2>&1; then
    $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
  fi
  if ! tailscale status >/dev/null 2>&1; then
    $(prefix_cmd) mkdir -p /var/run/tailscale /var/cache/tailscale /var/lib/tailscale
    $(prefix_cmd) nohup tailscaled --tun=userspace-networking >~/.konduktor/tmp/tailscaled.log 2>&1 &
  fi
  until tailscale status >/dev/null 2>&1; do
    # Quote the key and hostname so unusual characters cannot split the
    # arguments, and pause after a failed attempt instead of retrying in a
    # tight loop while the daemon is still starting.
    $(prefix_cmd) tailscale up --auth-key="${TS_AUTHKEY}" --ssh --hostname="${TS_HOSTNAME}" --accept-dns=false \
      || { echo "tailscale up failed retrying"; sleep 1; }
  done
  $(prefix_cmd) echo "Tailscale is up"
  $(prefix_cmd) tailscale status
  $(prefix_cmd) tailscale netcheck
}
InstallTailscale | tee ~/.konduktor/tmp/tailscale-out.log
359
+ {% if konduktor_debug %}
360
+ $(prefix_cmd) cat ~/.konduktor/tmp/tailscale*.log
361
+ {% endif %}
362
+ {% endif %}
363
+ end_epoch=$(date +%s);
364
+
365
+ $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
366
+
367
+ $(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary ====="
368
+ start_epoch=$(date +%s);
369
+
370
# print_bucket TITLE [LIST]
# Print a "--- TITLE ---" header followed by one "[TITLE] item" line per
# comma-separated entry of LIST, or "[none]" when LIST is empty or omitted.
print_bucket () {
  title="$1"
  list="${2:-}"
  echo "--- $title ---"
  if [ -z "$list" ]; then
    echo "[none]"
    return
  fi
  echo "$list" | tr ',' '\n' | sed "s/^/[$title] /"
}
379
+
380
+ # Secrets: prefer detailed mapping if available
381
+ echo "--- env secret ---"
382
+ if [ -n "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
383
+ echo "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION}" \
384
+ | tr ',' '\n' \
385
+ | awk -F'=' '{ printf("[secret: %s] %s\n", $2, $1) }'
386
+ elif [ -n "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
387
+ echo "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION}" \
388
+ | tr ',' '\n' | sed 's/^/[secret] /'
389
+ else
390
+ echo "[none]"
391
+ fi
392
+
393
+ print_bucket "CLI + task.yaml" "${KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION}"
394
+ print_bucket "config.yaml" "${KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION}"
395
+ print_bucket "other" "${KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION}"
396
+
397
+ end_epoch=$(date +%s);
398
+ $(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary took $((end_epoch - start_epoch)) seconds ====="
399
+
400
+ # unpack secrets credentials
401
+ $(prefix_cmd) echo "===== KONDUKTOR: Unpacking cloud storage secret credentials ====="
402
+ start_epoch=$(date +%s);
403
+ mkdir -p ~/.konduktor
404
+ mkdir -p {{ remote_workdir }}
405
+ {% for secret_type, secret_name in mount_secrets.items() %}
406
+ {% if secret_type == "gs" %}
407
+ $(prefix_cmd) echo "Unpacking GCP secret"
408
+ $(prefix_cmd) mkdir -p ~/.config
409
+ $(prefix_cmd) unzip /run/konduktor/gs-secret/gcpcredentials -d ~/.config/gcloud
410
+ {% elif secret_type == "s3" %}
411
+ $(prefix_cmd) echo "Unpacking AWS secret"
412
+ $(prefix_cmd) mkdir -p ~/.aws
413
+ $(prefix_cmd) unzip /run/konduktor/s3-secret/awscredentials -d ~/.aws
414
+ {% endif %}
415
+ {% endfor %}
416
+
417
+ {% if default_secrets %}
418
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking default secrets ====="
$(prefix_cmd) mkdir -p "${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"

# For each mounted default secret folder:
# - if payload.zip exists, unzip it into the expanded dir
# - otherwise, copy the files as-is
for src in "${KONDUKTOR_DEFAULT_SECRETS}"/*; do
  [ -d "$src" ] || continue
  name="$(basename "$src")"
  dst="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}/${name}"
  $(prefix_cmd) mkdir -p "$dst"

  if [ -f "${src}/payload.zip" ]; then
    $(prefix_cmd) unzip -oq "${src}/payload.zip" -d "$dst"
  else
    $(prefix_cmd) cp -a "${src}/." "$dst/"
  fi
done

# Point callers to the expanded (writable) path going forward
export KONDUKTOR_DEFAULT_SECRETS="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
# Persist the override for later shells. The append goes through an elevated
# tee because a plain `>>` redirect is opened by the calling shell and fails
# for non-root users.
echo "KONDUKTOR_DEFAULT_SECRETS=${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}" | $(prefix_cmd) tee -a /etc/environment > /dev/null
440
+ {% endif %}
441
+
442
+ {% if git_ssh %}
443
+ $(prefix_cmd) echo "Unpacking GIT-SSH secret"
444
+ {% endif %}
445
+ end_epoch=$(date +%s);
446
+ $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="
447
+
448
+ $(prefix_cmd) echo "===== KONDUKTOR: Default secret summary ====="
449
+ start_epoch=$(date +%s)
450
+
451
+ root="${KONDUKTOR_DEFAULT_SECRETS:-}"
452
+ if [[ -z "$root" || ! -d "$root" ]]; then
453
+ $(prefix_cmd) echo "NO DEFAULT SECRETS FOUND."
454
+ else
455
+ for dir in "$root"/*; do
456
+ [ -d "$dir" ] || continue
457
+ name="$(basename "$dir")"
458
+
459
+ # Pretty header that mirrors the logical mount base:
460
+ $(prefix_cmd) echo "/konduktor/default-secrets/${name}:"
461
+
462
+ # Print relative paths only; skip macOS junk and k8s secret internals
463
+ (
464
+ cd "$dir"
465
+ out="$(find . \
466
+ \( -name '.DS_Store' -o -name '__MACOSX' -o -name '..data' -o -name '..*' \) -prune -o \
467
+ \( -type f -o -type l \) -print \
468
+ | sed 's|^\./||' \
469
+ | sort)"
470
+ if [ -n "$out" ]; then
471
+ printf "%s\n" "$out"
472
+ fi
473
+ )
474
+ done
475
+ fi
476
+
477
+ end_epoch=$(date +%s)
478
+ $(prefix_cmd) echo "===== KONDUKTOR: Default secret summary took $((end_epoch - start_epoch)) seconds ====="
479
+
480
+
481
+ # sync file mounts
482
+ {% for mkdir_command in mkdir_commands %}
483
+ $(prefix_cmd) {{ mkdir_command }}
484
+ {% endfor %}
485
+ {% if sync_commands|length > 0 %}
486
+ $(prefix_cmd) echo "===== KONDUKTOR: Syncing files ====="
487
+ start_epoch=$(date +%s);
488
+ {% for sync_command in sync_commands %}
489
+ $(prefix_cmd) {{ sync_command }} >> ~/.konduktor/tmp/sync-files.log
490
+ {% endfor %}
491
+ end_epoch=$(date +%s);
492
+ $(prefix_cmd) echo "===== KONDUKTOR: Syncing files took $((end_epoch - start_epoch)) seconds ====="
493
+ {% endif %}
494
+ end_epoch=$(date +%s);
495
+ end_setup_time=$((end_epoch - start_setup));
496
+ ulimit -Sc 0 && ulimit -Hc 0
497
+ $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
498
+ set +eo pipefail
499
# run task
# `cd` is a shell builtin: it must run in this shell without any sudo prefix,
# otherwise the working directory of the script does not change.
cd {{ remote_workdir }}
$(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
start_epoch=$(date +%s);
{{ run_cmd | indent( width=14 ) }}
# Capture the task's exit status immediately: any command executed in between
# (even the timestamp assignment below) would overwrite $?.
exit_code=$?
end_epoch=$(date +%s);
set +ex
$(prefix_cmd) echo "===== KONDUKTOR: Running task took $((end_epoch - start_epoch)) seconds and finished with exit code: $exit_code ====="
exit $exit_code
509
+ resources:
510
+ limits:
511
+ cpu: {{ cpu }}
512
+ memory: {{ memory }}Gi
513
+ # TODO(asaiacai): need to decide whether we include fabric configuration here
514
+ {% if num_gpus > 0 %}
515
+ nvidia.com/gpu: {{ num_gpus }}
516
+ {% endif %}
517
+ requests:
518
+ cpu: {{ cpu }}
519
+ memory: {{ memory }}Gi
520
+ {% if num_gpus > 0 %}
521
+ nvidia.com/gpu: {{num_gpus}}
522
+ {% endif %}
523
+ securityContext:
524
+ capabilities:
525
+ add:
526
+ - "IPC_LOCK" # May be needed for memlock
527
+
528
+ volumes:
529
+ - name: shared-memory
530
+ emptyDir:
531
+ medium: "Memory"
532
+ sizeLimit: 4Gi
533
+ {% if tailscale_secret %}
534
+ - name: tailscale-state
535
+ emptyDir: {}
536
+ {% endif %}
537
+ - name: sync
538
+ emptyDir: {}
539
+ {% for secret_type, secret_name in mount_secrets.items() %}
540
+ - name: {{ secret_type }}-secret
541
+ secret:
542
+ secretName: {{ secret_name }}
543
+ {% endfor %}
544
+ {% for secret in default_secrets %}
545
+ - name: default-secret-{{ secret.mount_name }}
546
+ secret:
547
+ secretName: {{ secret.k8s_name }}
548
+ {% endfor %}
549
+ {% if default_secrets %}
550
+ - name: default-secrets-expanded
551
+ emptyDir: {}
552
+ {% endif %}
553
+ {% if git_ssh %}
554
+ - name: git-ssh-secret
555
+ secret:
556
+ secretName: {{ git_ssh }}
557
+ defaultMode: 384
558
+ {% endif %}
559
+
560
+
561
+ # TODO(asaiacai): should we add nodeSelectors here or leave to
562
+ # kueue resource flavors. leaning towards defining
563
+ # in kueue and just querying for the kueue resource flavor
File without changes
@@ -0,0 +1,21 @@
1
+ """Constants for usage collection."""
2
+
3
+ import os
4
+
5
# Opt-out switch for usage collection, parsed into a real bool.
# The raw environment value is a string, so testing it directly would treat
# "0" or "false" as "disabled". Here an unset/empty variable and the explicit
# negatives ("0", "false", "no", "off", case-insensitive) keep collection on;
# any other value (e.g. "1") disables it.
KONDUKTOR_DISABLE_USAGE_COLLECTION = os.environ.get(
    'KONDUKTOR_DISABLE_USAGE_COLLECTION', ''
).strip().lower() not in ('', '0', 'false', 'no', 'off')
8
+
9
# PostHog API key for usage collection; the POSTHOG_API_KEY environment
# variable overrides the built-in default.
# NOTE(review): the default looks like a PostHog project key ("phc_" prefix),
# which is presumably safe to ship publicly — confirm.
POSTHOG_API_KEY = os.environ.get(
    'POSTHOG_API_KEY', 'phc_4UgX80BfVNmYRZ2o3dJLyRMGkv1CxBozPAcPnD29uP4'
)

# PostHog endpoint; the POSTHOG_HOST environment variable overrides the default.
POSTHOG_HOST = os.environ.get('POSTHOG_HOST', 'https://us.i.posthog.com')

# Human-readable notice describing the usage collection and how to opt out
# through KONDUKTOR_DISABLE_USAGE_COLLECTION.
USAGE_POLICY_MESSAGE = (
    'Konduktor collects usage data to improve its services. '
    '`run` commands are not collected to '
    'ensure privacy.\n'
    'Usage logging can be disabled by setting the '
    'environment variable KONDUKTOR_DISABLE_USAGE_COLLECTION=1.'
)
File without changes
@@ -0,0 +1,17 @@
1
+ """Accelerator registry."""
2
+
3
# Canonical spellings of the accelerator names accepted by
# canonicalize_accelerator_name.
_ACCELERATORS = ['A100', 'A100-80GB', 'B200', 'H100', 'H200', 'L40S', 'T4', 'L40']


def canonicalize_accelerator_name(accelerator: str) -> str:
    """Map an accelerator name to its canonical spelling, ignoring case.

    Args:
        accelerator: Accelerator name in any letter case, e.g. ``'h100'``.

    Returns:
        The matching entry of ``_ACCELERATORS``, e.g. ``'H100'``.

    Raises:
        ValueError: If the name matches no supported accelerator.
    """
    # Built from the in-memory list only; no catalog files are read.
    by_lowercase = dict((known.lower(), known) for known in _ACCELERATORS)
    canonical = by_lowercase.get(accelerator.lower())
    if canonical is None:
        raise ValueError(
            f'Accelerator name {accelerator!r} is not supported. '
            f'Please choose one of {_ACCELERATORS}.'
        )
    return canonical
@@ -0,0 +1,62 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Annotations for public APIs."""
14
+
15
+ import functools
16
+ from typing import Callable, Literal
17
+
18
# Whether the current process is a SkyPilot API server process.
# Starts as True; the wrapper created by `client_api` sets it to False the
# first time a client-side API function is called, and nothing in this module
# sets it back.
is_on_api_server = True
# Cached functions created by `lru_cache` with a scope other than 'global',
# i.e. those whose cache needs to be reloaded for a new request.
FUNCTIONS_NEED_RELOAD_CACHE = []
21
+
22
+
23
def client_api(func):
    """Decorate ``func`` as a client-side API entry point.

    Every call through the returned wrapper first sets the module-level
    ``is_on_api_server`` flag to False and then delegates to ``func``.
    Code reached from server-side functions therefore still sees the flag
    as True and can apply server-side handling.
    """

    def _client_side_call(*args, **kwargs):
        global is_on_api_server
        is_on_api_server = False
        return func(*args, **kwargs)

    # Same effect as decorating with functools.wraps(func): copies the
    # metadata of ``func`` onto the wrapper and returns the wrapper.
    return functools.update_wrapper(_client_side_call, func)
37
+
38
+
39
def lru_cache(
    scope: Literal['global', 'request'], *lru_cache_args, **lru_cache_kwargs
) -> Callable:
    """Build an LRU-cache decorator that records request-scoped caches.

    The ``scope`` argument tells us whether the cached function has to be
    reloaded for a new request, so that such functions can be tracked.

    Args:
        scope: 'global' for a cache that is never tracked; any other value
            (i.e. 'request') registers the cached function in
            ``FUNCTIONS_NEED_RELOAD_CACHE``.
        lru_cache_args: Positional arguments forwarded to functools.lru_cache.
        lru_cache_kwargs: Keyword arguments forwarded to functools.lru_cache.

    Returns:
        A decorator that wraps a function with functools.lru_cache.
    """

    def decorator(func: Callable) -> Callable:
        memoized = functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)(func)
        if scope != 'global':
            # Request-scoped: remember it so its cache can be reloaded later.
            FUNCTIONS_NEED_RELOAD_CACHE.append(memoized)
        return memoized

    return decorator