konduktor-nightly 0.1.0.dev20250915104603__py3-none-any.whl → 0.1.0.dev20251107104752__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +2 -2
- konduktor/backends/constants.py +1 -0
- konduktor/backends/deployment.py +27 -10
- konduktor/backends/deployment_utils.py +594 -358
- konduktor/backends/jobset_utils.py +6 -6
- konduktor/backends/pod_utils.py +133 -18
- konduktor/cli.py +61 -29
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +42 -9
- konduktor/manifests/apoxy-setup2.yaml +69 -5
- konduktor/resource.py +9 -2
- konduktor/serving.py +10 -6
- konduktor/task.py +8 -5
- konduktor/templates/deployment.yaml.j2 +96 -47
- konduktor/templates/pod.yaml.j2 +123 -9
- konduktor/utils/base64_utils.py +2 -0
- konduktor/utils/schemas.py +1 -1
- konduktor/utils/validator.py +12 -0
- {konduktor_nightly-0.1.0.dev20250915104603.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250915104603.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/RECORD +23 -23
- konduktor/templates/apoxy-deployment.yaml.j2 +0 -33
- {konduktor_nightly-0.1.0.dev20250915104603.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250915104603.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250915104603.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/entry_points.txt +0 -0
konduktor/task.py
CHANGED
|
@@ -29,7 +29,7 @@ import konduktor
|
|
|
29
29
|
from konduktor import constants, logging
|
|
30
30
|
from konduktor.data import data_utils
|
|
31
31
|
from konduktor.data import storage as storage_lib
|
|
32
|
-
from konduktor.utils import common_utils, exceptions, schemas, ux_utils
|
|
32
|
+
from konduktor.utils import common_utils, exceptions, schemas, ux_utils
|
|
33
33
|
|
|
34
34
|
logger = logging.get_logger(__name__)
|
|
35
35
|
|
|
@@ -388,10 +388,6 @@ class Task:
|
|
|
388
388
|
)
|
|
389
389
|
resources_config['_cluster_config_overrides'] = cluster_config_override
|
|
390
390
|
|
|
391
|
-
# Validate Docker image if specified in resources
|
|
392
|
-
if 'image_id' in resources_config and resources_config['image_id']:
|
|
393
|
-
validator.validate_and_warn_image(resources_config['image_id'], 'task')
|
|
394
|
-
|
|
395
391
|
task.set_resources(konduktor.Resources.from_yaml_config(resources_config))
|
|
396
392
|
|
|
397
393
|
# Parse serving field.
|
|
@@ -567,6 +563,13 @@ class Task:
|
|
|
567
563
|
f'less than min_replicas ({serving.min_replicas})'
|
|
568
564
|
)
|
|
569
565
|
|
|
566
|
+
if serving.max_replicas == 0 and serving.min_replicas == 0:
|
|
567
|
+
with ux_utils.print_exception_no_traceback():
|
|
568
|
+
raise ValueError(
|
|
569
|
+
f'max_replicas ({serving.max_replicas}) and '
|
|
570
|
+
f'min_replicas ({serving.min_replicas}) cannot both be 0'
|
|
571
|
+
)
|
|
572
|
+
|
|
570
573
|
if isinstance(serving, konduktor.Serving):
|
|
571
574
|
serving = serving
|
|
572
575
|
self.serving = serving
|
|
@@ -12,6 +12,10 @@ metadata:
|
|
|
12
12
|
{{ deployment_num_accelerators_label }}: "{{ num_accelerators }}"
|
|
13
13
|
trainy.ai/has-autoscaler: "{{ autoscaler }}"
|
|
14
14
|
trainy.ai/konduktor-managed: "true"
|
|
15
|
+
{% if autoscaler == 'true' %}
|
|
16
|
+
trainy.ai/original-min-replicas: "{{ min_replicas }}"
|
|
17
|
+
trainy.ai/original-max-replicas: "{{ max_replicas }}"
|
|
18
|
+
{% endif %}
|
|
15
19
|
name: {{ name }}
|
|
16
20
|
namespace: default
|
|
17
21
|
spec:
|
|
@@ -37,11 +41,9 @@ metadata:
|
|
|
37
41
|
{{ deployment_name_label }}: "{{ name }}"
|
|
38
42
|
{{ deployment_user_label }}: "{{ user }}"
|
|
39
43
|
trainy.ai/has-autoscaler: "{{ autoscaler }}"
|
|
40
|
-
{% if not general %}
|
|
41
44
|
annotations:
|
|
42
45
|
prometheus.io/scrape: "true"
|
|
43
|
-
prometheus.io/port: "
|
|
44
|
-
{% endif %}
|
|
46
|
+
prometheus.io/port: "9000"
|
|
45
47
|
name: {{ name }}
|
|
46
48
|
namespace: default
|
|
47
49
|
spec:
|
|
@@ -61,18 +63,15 @@ spec:
|
|
|
61
63
|
{{ model_name_label }}: {{ name }}
|
|
62
64
|
{% endif %}
|
|
63
65
|
{{ deployment_name_label }}: "{{ name }}"
|
|
64
|
-
{% if general %}
|
|
65
|
-
type: LoadBalancer
|
|
66
|
-
{% else %}
|
|
67
66
|
type: ClusterIP
|
|
68
|
-
{% endif %}
|
|
69
67
|
|
|
70
|
-
|
|
68
|
+
# AIBRIX PODAUTOSCALER STUFF (KPA)
|
|
69
|
+
{% if not general and autoscaler == 'true' %}
|
|
71
70
|
---
|
|
72
71
|
apiVersion: autoscaling.aibrix.ai/v1alpha1
|
|
73
72
|
kind: PodAutoscaler
|
|
74
73
|
metadata:
|
|
75
|
-
name: {{ name }}-
|
|
74
|
+
name: {{ name }}-pa
|
|
76
75
|
namespace: default
|
|
77
76
|
labels:
|
|
78
77
|
{{ model_name_label }}: {{ name }}
|
|
@@ -80,63 +79,113 @@ metadata:
|
|
|
80
79
|
app.kubernetes.io/managed-by: kustomize
|
|
81
80
|
{{ deployment_name_label }}: "{{ name }}"
|
|
82
81
|
{{ deployment_user_label }}: "{{ user }}"
|
|
83
|
-
annotations:
|
|
84
|
-
autoscaling.aibrix.ai/up-fluctuation-tolerance: '0.1'
|
|
85
|
-
autoscaling.aibrix.ai/down-fluctuation-tolerance: '0.2'
|
|
86
|
-
apa.autoscaling.aibrix.ai/window: 30s
|
|
87
82
|
spec:
|
|
88
|
-
scalingStrategy:
|
|
83
|
+
scalingStrategy: KPA
|
|
89
84
|
minReplicas: {{ min_replicas }}
|
|
90
85
|
maxReplicas: {{ max_replicas }}
|
|
91
86
|
metricsSources:
|
|
92
|
-
- metricSourceType:
|
|
87
|
+
- metricSourceType: domain
|
|
93
88
|
protocolType: http
|
|
94
|
-
|
|
95
|
-
path: metrics
|
|
96
|
-
targetMetric:
|
|
97
|
-
targetValue:
|
|
89
|
+
endpoint: aibrix-activator.aibrix-activator.svc.cluster.local:8080
|
|
90
|
+
path: /metrics/default/{{ name }}
|
|
91
|
+
targetMetric: vllm:deployment_replicas
|
|
92
|
+
targetValue: "1"
|
|
98
93
|
scaleTargetRef:
|
|
99
94
|
apiVersion: apps/v1
|
|
100
95
|
kind: Deployment
|
|
101
96
|
name: {{ name }}
|
|
102
97
|
{% endif %}
|
|
103
98
|
|
|
99
|
+
# KEDA HTTP ADD-ON STUFF (1 per deployment)
|
|
104
100
|
{% if general %}
|
|
101
|
+
{% if autoscaler == 'true' %}
|
|
102
|
+
# HTTPScaledObject (1 per deployment) - only when autoscaling enabled
|
|
105
103
|
---
|
|
106
|
-
apiVersion:
|
|
107
|
-
kind:
|
|
104
|
+
apiVersion: http.keda.sh/v1alpha1
|
|
105
|
+
kind: HTTPScaledObject
|
|
108
106
|
metadata:
|
|
109
|
-
name: {{ name }}-
|
|
107
|
+
name: {{ name }}-httpscaledobject
|
|
110
108
|
namespace: default
|
|
111
109
|
labels:
|
|
112
110
|
{{ deployment_name_label }}: "{{ name }}"
|
|
113
111
|
{{ deployment_user_label }}: "{{ user }}"
|
|
114
|
-
trainy.ai/has-autoscaler: "{{ autoscaler }}"
|
|
115
112
|
spec:
|
|
113
|
+
hosts:
|
|
114
|
+
- {{ name }}
|
|
115
|
+
pathPrefixes:
|
|
116
|
+
- "/"
|
|
117
|
+
{% if probe_path %}
|
|
118
|
+
- "{{ probe_path }}"
|
|
119
|
+
{% endif %}
|
|
116
120
|
scaleTargetRef:
|
|
117
|
-
|
|
121
|
+
name: "{{ name }}"
|
|
118
122
|
kind: Deployment
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
123
|
+
apiVersion: apps/v1
|
|
124
|
+
service: "{{ name }}"
|
|
125
|
+
port: {{ ports }}
|
|
126
|
+
replicas:
|
|
127
|
+
min: {{ min_replicas }}
|
|
128
|
+
max: {{ max_replicas }}
|
|
129
|
+
scaledownPeriod: 1200 # 20 minutes
|
|
130
|
+
scalingMetric:
|
|
131
|
+
requestRate:
|
|
132
|
+
targetValue: 4
|
|
133
|
+
granularity: "1s"
|
|
134
|
+
window: "30s"
|
|
135
|
+
{% endif %}
|
|
136
|
+
|
|
137
|
+
# INGRESS (1 per deployment)
|
|
138
|
+
---
|
|
139
|
+
apiVersion: networking.k8s.io/v1
|
|
140
|
+
kind: Ingress
|
|
141
|
+
metadata:
|
|
142
|
+
name: {{ name }}-ingress
|
|
143
|
+
labels:
|
|
144
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
145
|
+
{{ deployment_user_label }}: "{{ user }}"
|
|
146
|
+
trainy.ai/konduktor-managed: "true"
|
|
147
|
+
annotations:
|
|
148
|
+
nginx.ingress.kubernetes.io/use-regex: "true"
|
|
149
|
+
nginx.ingress.kubernetes.io/rewrite-target: /$1
|
|
150
|
+
{% if autoscaler == 'true' %}
|
|
151
|
+
nginx.ingress.kubernetes.io/upstream-vhost: "{{ name }}"
|
|
152
|
+
{% endif %}
|
|
153
|
+
spec:
|
|
154
|
+
ingressClassName: nginx
|
|
155
|
+
rules:
|
|
156
|
+
- host: {{ general_base_host }}
|
|
157
|
+
http:
|
|
158
|
+
paths:
|
|
159
|
+
- path: /{{ name }}(.*)
|
|
160
|
+
pathType: ImplementationSpecific
|
|
161
|
+
backend:
|
|
162
|
+
service:
|
|
163
|
+
{% if autoscaler == 'true' %}
|
|
164
|
+
# Use KEDA interceptor for autoscaling
|
|
165
|
+
name: keda-proxy
|
|
166
|
+
port:
|
|
167
|
+
number: 8080
|
|
168
|
+
{% else %}
|
|
169
|
+
# Direct to app service for fixed replicas
|
|
170
|
+
name: {{ name }}
|
|
171
|
+
port:
|
|
172
|
+
number: {{ ports }}
|
|
173
|
+
{% endif %}
|
|
174
|
+
# Direct access convenience rule (via LB IP + Host: {{ name }})
|
|
175
|
+
- host: {{ name }}
|
|
176
|
+
http:
|
|
177
|
+
paths:
|
|
178
|
+
- path: /(.*)
|
|
179
|
+
pathType: ImplementationSpecific
|
|
180
|
+
backend:
|
|
181
|
+
service:
|
|
182
|
+
{% if autoscaler == 'true' %}
|
|
183
|
+
name: keda-proxy
|
|
184
|
+
port:
|
|
185
|
+
number: 8080
|
|
186
|
+
{% else %}
|
|
187
|
+
name: {{ name }}
|
|
188
|
+
port:
|
|
189
|
+
number: {{ ports }}
|
|
190
|
+
{% endif %}
|
|
142
191
|
{% endif %}
|
konduktor/templates/pod.yaml.j2
CHANGED
|
@@ -28,16 +28,21 @@ kubernetes:
|
|
|
28
28
|
containers:
|
|
29
29
|
# TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
|
|
30
30
|
- name: konduktor-container
|
|
31
|
-
{% if enable_ssh %}
|
|
31
|
+
{% if enable_ssh or serving %}
|
|
32
32
|
ports:
|
|
33
|
+
{% if enable_ssh %}
|
|
33
34
|
- name: ssh
|
|
34
35
|
containerPort: {{ konduktor_ssh_port }}
|
|
36
|
+
{% endif %}
|
|
37
|
+
|
|
38
|
+
{% if serving %}
|
|
39
|
+
- name: serving
|
|
40
|
+
containerPort: {{ ports }}
|
|
41
|
+
{% endif %}
|
|
35
42
|
{% endif %}
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
- containerPort: {{ ports }}
|
|
43
|
+
|
|
44
|
+
{% if serving and probe %}
|
|
39
45
|
# TODO (ryan): allow modification of thresholds and timings
|
|
40
|
-
{% if probe %}
|
|
41
46
|
livenessProbe:
|
|
42
47
|
httpGet:
|
|
43
48
|
path: {{ probe }}
|
|
@@ -68,7 +73,6 @@ kubernetes:
|
|
|
68
73
|
successThreshold: 1
|
|
69
74
|
timeoutSeconds: 1
|
|
70
75
|
{% endif %}
|
|
71
|
-
{% endif %}
|
|
72
76
|
image: {{ image_id }}
|
|
73
77
|
# this is set during jobset definition since we need to know the jobset
|
|
74
78
|
# name and number of nodes to set all the environment variables correctly here
|
|
@@ -77,6 +81,10 @@ kubernetes:
|
|
|
77
81
|
# flush logs immediately to stdout for more reactive log streaming
|
|
78
82
|
- name: PYTHONUNBUFFERED
|
|
79
83
|
value: "0"
|
|
84
|
+
- name: KONDUKTOR_NODENAME
|
|
85
|
+
valueFrom:
|
|
86
|
+
fieldRef:
|
|
87
|
+
fieldPath: spec.nodeName
|
|
80
88
|
- name: KONDUKTOR_JOB_NAME
|
|
81
89
|
value: "{{ job_name }}"
|
|
82
90
|
- name: NODE_HOST_IPS
|
|
@@ -134,6 +142,8 @@ kubernetes:
|
|
|
134
142
|
{% if default_secrets %}
|
|
135
143
|
- name: KONDUKTOR_DEFAULT_SECRETS
|
|
136
144
|
value: "/konduktor/default-secrets"
|
|
145
|
+
- name: KONDUKTOR_DEFAULT_SECRETS_EXPANDED
|
|
146
|
+
value: "/run/konduktor/expanded-default-secrets"
|
|
137
147
|
{% endif %}
|
|
138
148
|
# these are for compatibility with skypilot
|
|
139
149
|
- name: SKYPILOT_NODE_IPS
|
|
@@ -146,6 +156,10 @@ kubernetes:
|
|
|
146
156
|
value: "{{ num_nodes }}"
|
|
147
157
|
- name: SKYPILOT_NUM_GPUS_PER_NODE
|
|
148
158
|
value: "{{ num_gpus }}"
|
|
159
|
+
- name: RESTART_ATTEMPT
|
|
160
|
+
valueFrom:
|
|
161
|
+
fieldRef:
|
|
162
|
+
fieldPath: metadata.labels['jobset.sigs.k8s.io/restart-attempt']
|
|
149
163
|
volumeMounts:
|
|
150
164
|
- name: shared-memory
|
|
151
165
|
mountPath: /dev/shm
|
|
@@ -159,6 +173,10 @@ kubernetes:
|
|
|
159
173
|
- name: default-secret-{{ secret.mount_name }}
|
|
160
174
|
mountPath: /konduktor/default-secrets/{{ secret.mount_name }}
|
|
161
175
|
{% endfor %}
|
|
176
|
+
{% if default_secrets %}
|
|
177
|
+
- name: default-secrets-expanded
|
|
178
|
+
mountPath: /run/konduktor/expanded-default-secrets
|
|
179
|
+
{% endif %}
|
|
162
180
|
{% if git_ssh %}
|
|
163
181
|
- name: git-ssh-secret
|
|
164
182
|
mountPath: /run/konduktor/git-ssh-secret
|
|
@@ -192,7 +210,7 @@ kubernetes:
|
|
|
192
210
|
{% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
|
|
193
211
|
PACKAGES="$PACKAGES curl";
|
|
194
212
|
{% endif %}
|
|
195
|
-
{% if 'gs' in mount_secrets or 's3' in mount_secrets %}
|
|
213
|
+
{% if 'gs' in mount_secrets or 's3' in mount_secrets or default_secrets %}
|
|
196
214
|
PACKAGES="$PACKAGES unzip wget";
|
|
197
215
|
{% endif %}
|
|
198
216
|
{% if 'git' in run_cmd or 'git' in setup_cmd %}
|
|
@@ -231,7 +249,7 @@ kubernetes:
|
|
|
231
249
|
fi;
|
|
232
250
|
end_epoch=$(date +%s);
|
|
233
251
|
|
|
234
|
-
echo "Exposing ENV variables"
|
|
252
|
+
echo "===== KONDUKTOR: Exposing ENV variables ====="
|
|
235
253
|
$(prefix_cmd) env -0 | awk -v RS='\0' '
|
|
236
254
|
{
|
|
237
255
|
gsub(/\\/,"\\\\"); # escape existing backslashes first
|
|
@@ -346,8 +364,41 @@ kubernetes:
|
|
|
346
364
|
|
|
347
365
|
$(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
|
|
348
366
|
|
|
367
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary ====="
|
|
368
|
+
start_epoch=$(date +%s);
|
|
369
|
+
|
|
370
|
+
print_bucket () {
|
|
371
|
+
title="$1"; list="${2:-}"
|
|
372
|
+
echo "--- $title ---"
|
|
373
|
+
if [ -n "$list" ]; then
|
|
374
|
+
echo "$list" | tr ',' '\n' | sed "s/^/[$title] /"
|
|
375
|
+
else
|
|
376
|
+
echo "[none]"
|
|
377
|
+
fi
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
# Secrets: prefer detailed mapping if available
|
|
381
|
+
echo "--- env secret ---"
|
|
382
|
+
if [ -n "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
|
|
383
|
+
echo "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION}" \
|
|
384
|
+
| tr ',' '\n' \
|
|
385
|
+
| awk -F'=' '{ printf("[secret: %s] %s\n", $2, $1) }'
|
|
386
|
+
elif [ -n "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
|
|
387
|
+
echo "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION}" \
|
|
388
|
+
| tr ',' '\n' | sed 's/^/[secret] /'
|
|
389
|
+
else
|
|
390
|
+
echo "[none]"
|
|
391
|
+
fi
|
|
392
|
+
|
|
393
|
+
print_bucket "CLI + task.yaml" "${KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION}"
|
|
394
|
+
print_bucket "config.yaml" "${KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION}"
|
|
395
|
+
print_bucket "other" "${KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION}"
|
|
396
|
+
|
|
397
|
+
end_epoch=$(date +%s);
|
|
398
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary took $((end_epoch - start_epoch)) seconds ====="
|
|
399
|
+
|
|
349
400
|
# unpack secrets credentials
|
|
350
|
-
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking
|
|
401
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking cloud storage secret credentials ====="
|
|
351
402
|
start_epoch=$(date +%s);
|
|
352
403
|
mkdir -p ~/.konduktor
|
|
353
404
|
mkdir -p {{ remote_workdir }}
|
|
@@ -362,12 +413,71 @@ kubernetes:
|
|
|
362
413
|
$(prefix_cmd) unzip /run/konduktor/s3-secret/awscredentials -d ~/.aws
|
|
363
414
|
{% endif %}
|
|
364
415
|
{% endfor %}
|
|
416
|
+
|
|
417
|
+
{% if default_secrets %}
|
|
418
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking default secrets ====="
|
|
419
|
+
$(prefix_cmd) mkdir -p "${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
|
|
420
|
+
|
|
421
|
+
# For each mounted default secret folder:
|
|
422
|
+
# - if payload.zip exists, unzip it into the expanded dir
|
|
423
|
+
# - otherwise, copy the files as-is
|
|
424
|
+
for src in "${KONDUKTOR_DEFAULT_SECRETS}"/*; do
|
|
425
|
+
[ -d "$src" ] || continue
|
|
426
|
+
name="$(basename "$src")"
|
|
427
|
+
dst="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}/${name}"
|
|
428
|
+
$(prefix_cmd) mkdir -p "$dst"
|
|
429
|
+
|
|
430
|
+
if [ -f "${src}/payload.zip" ]; then
|
|
431
|
+
$(prefix_cmd) unzip -oq "${src}/payload.zip" -d "$dst"
|
|
432
|
+
else
|
|
433
|
+
$(prefix_cmd) cp -a "${src}/." "$dst/"
|
|
434
|
+
fi
|
|
435
|
+
done
|
|
436
|
+
|
|
437
|
+
# Point callers to the expanded (writable) path going forward
|
|
438
|
+
export KONDUKTOR_DEFAULT_SECRETS="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
|
|
439
|
+
$(prefix_cmd) echo "KONDUKTOR_DEFAULT_SECRETS=${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}" >> /etc/environment
|
|
440
|
+
{% endif %}
|
|
441
|
+
|
|
365
442
|
{% if git_ssh %}
|
|
366
443
|
$(prefix_cmd) echo "Unpacking GIT-SSH secret"
|
|
367
444
|
{% endif %}
|
|
368
445
|
end_epoch=$(date +%s);
|
|
369
446
|
$(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="
|
|
370
447
|
|
|
448
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Default secret summary ====="
|
|
449
|
+
start_epoch=$(date +%s)
|
|
450
|
+
|
|
451
|
+
root="${KONDUKTOR_DEFAULT_SECRETS:-}"
|
|
452
|
+
if [[ -z "$root" || ! -d "$root" ]]; then
|
|
453
|
+
$(prefix_cmd) echo "NO DEFAULT SECRETS FOUND."
|
|
454
|
+
else
|
|
455
|
+
for dir in "$root"/*; do
|
|
456
|
+
[ -d "$dir" ] || continue
|
|
457
|
+
name="$(basename "$dir")"
|
|
458
|
+
|
|
459
|
+
# Pretty header that mirrors the logical mount base:
|
|
460
|
+
$(prefix_cmd) echo "/konduktor/default-secrets/${name}:"
|
|
461
|
+
|
|
462
|
+
# Print relative paths only; skip macOS junk and k8s secret internals
|
|
463
|
+
(
|
|
464
|
+
cd "$dir"
|
|
465
|
+
out="$(find . \
|
|
466
|
+
\( -name '.DS_Store' -o -name '__MACOSX' -o -name '..data' -o -name '..*' \) -prune -o \
|
|
467
|
+
\( -type f -o -type l \) -print \
|
|
468
|
+
| sed 's|^\./||' \
|
|
469
|
+
| sort)"
|
|
470
|
+
if [ -n "$out" ]; then
|
|
471
|
+
printf "%s\n" "$out"
|
|
472
|
+
fi
|
|
473
|
+
)
|
|
474
|
+
done
|
|
475
|
+
fi
|
|
476
|
+
|
|
477
|
+
end_epoch=$(date +%s)
|
|
478
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Default secret summary took $((end_epoch - start_epoch)) seconds ====="
|
|
479
|
+
|
|
480
|
+
|
|
371
481
|
# sync file mounts
|
|
372
482
|
{% for mkdir_command in mkdir_commands %}
|
|
373
483
|
$(prefix_cmd) {{ mkdir_command }}
|
|
@@ -436,6 +546,10 @@ kubernetes:
|
|
|
436
546
|
secret:
|
|
437
547
|
secretName: {{ secret.k8s_name }}
|
|
438
548
|
{% endfor %}
|
|
549
|
+
{% if default_secrets %}
|
|
550
|
+
- name: default-secrets-expanded
|
|
551
|
+
emptyDir: {}
|
|
552
|
+
{% endif %}
|
|
439
553
|
{% if git_ssh %}
|
|
440
554
|
- name: git-ssh-secret
|
|
441
555
|
secret:
|
konduktor/utils/base64_utils.py
CHANGED
|
@@ -44,6 +44,8 @@ def zip_base64encode(files: List[str]) -> str:
|
|
|
44
44
|
else:
|
|
45
45
|
for root, _, files in os.walk(item_path):
|
|
46
46
|
for file in files:
|
|
47
|
+
if file == '.DS_Store':
|
|
48
|
+
continue
|
|
47
49
|
file_path = os.path.join(root, file)
|
|
48
50
|
arcname = os.path.relpath(file_path, temp_dir)
|
|
49
51
|
zipf.write(file_path, arcname)
|
konduktor/utils/schemas.py
CHANGED
konduktor/utils/validator.py
CHANGED
|
@@ -37,6 +37,11 @@ SCHEMA_URLS = {
|
|
|
37
37
|
logger = logging.get_logger(__name__)
|
|
38
38
|
|
|
39
39
|
|
|
40
|
+
def _skip_image_checks() -> bool:
|
|
41
|
+
val = os.getenv('KONDUKTOR_SKIP_IMAGE_CHECK', '')
|
|
42
|
+
return val.lower() in ('1', 'true', 'yes', 'y')
|
|
43
|
+
|
|
44
|
+
|
|
40
45
|
def case_insensitive_enum(validator, enums, instance, schema):
|
|
41
46
|
del validator, schema # Unused.
|
|
42
47
|
if instance.lower() not in [enum.lower() for enum in enums]:
|
|
@@ -419,6 +424,13 @@ def validate_and_warn_image(image_id: str, context: str = 'task') -> None:
|
|
|
419
424
|
if not image_id:
|
|
420
425
|
return
|
|
421
426
|
|
|
427
|
+
if _skip_image_checks():
|
|
428
|
+
logger.info(
|
|
429
|
+
'Skipping Docker image validation for %s',
|
|
430
|
+
image_id,
|
|
431
|
+
)
|
|
432
|
+
return
|
|
433
|
+
|
|
422
434
|
status, message = validate_docker_image(image_id)
|
|
423
435
|
|
|
424
436
|
if status == 'invalid':
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
konduktor/__init__.py,sha256=
|
|
1
|
+
konduktor/__init__.py,sha256=mHmTi0owXeaxTt6NwGboUKlwfKWw6xwzbdcUjq9-1DM,1574
|
|
2
2
|
konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
|
|
4
4
|
konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
|
|
@@ -6,14 +6,14 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
|
|
|
6
6
|
konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
|
|
7
7
|
konduktor/backends/__init__.py,sha256=usWJ8HdZJEyg7MIsN8Zcz9rk9e2Lq5dWJ8dv6hCN3ys,199
|
|
8
8
|
konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
|
|
9
|
-
konduktor/backends/constants.py,sha256=
|
|
10
|
-
konduktor/backends/deployment.py,sha256=
|
|
11
|
-
konduktor/backends/deployment_utils.py,sha256=
|
|
9
|
+
konduktor/backends/constants.py,sha256=uAU-edQ_9DNYnu6x7fwNYXIEM7KMfJMOrnG74rlZ8mY,766
|
|
10
|
+
konduktor/backends/deployment.py,sha256=d0a3F7dxDbnRKIt4ZO_kQ0_vet0pZvg4bWYzVZ8DZIQ,6640
|
|
11
|
+
konduktor/backends/deployment_utils.py,sha256=9CmB9CYC_3wxIfIOmTSCN2hbURZ5MpEMTvPwYMUXBRM,49272
|
|
12
12
|
konduktor/backends/jobset.py,sha256=drt8Gc0iYQx18JWXBU6XfhUvC2xCKd8szSJ2JC4O20Q,8640
|
|
13
|
-
konduktor/backends/jobset_utils.py,sha256=
|
|
14
|
-
konduktor/backends/pod_utils.py,sha256=
|
|
13
|
+
konduktor/backends/jobset_utils.py,sha256=g49NY8RFhL_NNd4c1adRLG_Bq3UTFtRURxcAzxnMEYw,26524
|
|
14
|
+
konduktor/backends/pod_utils.py,sha256=kOi3cLbTI3abZFCNQswWrkrOiBBm3gW_9N4INjxeS-w,19276
|
|
15
15
|
konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
|
|
16
|
-
konduktor/cli.py,sha256=
|
|
16
|
+
konduktor/cli.py,sha256=B3Pp3RCwkGj8r9YgH-TgC85XU4zcc3eema1kpcDTQ3I,58452
|
|
17
17
|
konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
|
|
18
18
|
konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
|
|
19
19
|
konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -68,25 +68,25 @@ konduktor/data/storage_utils.py,sha256=n4GivkN0KMqmyOTDznF0Z-hzsJvm7KCEh5i5HgFAT
|
|
|
68
68
|
konduktor/execution.py,sha256=d0EP79iSrW2uFsoqn0YV_4kgIupPIqpMOParXx0y3kg,18519
|
|
69
69
|
konduktor/kube_client.py,sha256=HtM3d-_GigHnfGINRANchApR9_OigqczBgeYJ6Dj4j0,8504
|
|
70
70
|
konduktor/logging.py,sha256=xtcCdnecmC3rqMTyunK-klQRINojI7NI4Apag78i9jM,3221
|
|
71
|
-
konduktor/manifests/
|
|
72
|
-
konduktor/manifests/apoxy-
|
|
71
|
+
konduktor/manifests/aibrix-setup.yaml,sha256=Foe3M1C0zVt-CVSJGr5SlQkMaNGs9kU2CvIZoANm3f8,14133
|
|
72
|
+
konduktor/manifests/apoxy-setup.yaml,sha256=EipknCq33aBdxu9BIo6y5novjO0B_d_DCWqY44zYNuU,4262
|
|
73
|
+
konduktor/manifests/apoxy-setup2.yaml,sha256=fc1tDwVopPVFzvUygkqxDGVqHHeo1cF9ERTnzUdgaGs,2517
|
|
73
74
|
konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4bo3lrigRmhf8NXBHE,1730
|
|
74
75
|
konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
|
|
75
76
|
konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
|
|
76
77
|
konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
|
|
77
|
-
konduktor/resource.py,sha256=
|
|
78
|
-
konduktor/serving.py,sha256=
|
|
79
|
-
konduktor/task.py,sha256=
|
|
80
|
-
konduktor/templates/
|
|
81
|
-
konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
|
|
78
|
+
konduktor/resource.py,sha256=JqEE3LZiBBd5vqAiHDk-nlLve_VUQHhgdo6BIgx2Xfk,21215
|
|
79
|
+
konduktor/serving.py,sha256=4s8cQhsVjf-HByZF65pbMxuqaV319hUSQE9pC8gP4Sg,5405
|
|
80
|
+
konduktor/task.py,sha256=FIWm_rC_63GPBoe-Hi8a_eJ0H8Szw747SwXYPrNtOWE,37820
|
|
81
|
+
konduktor/templates/deployment.yaml.j2,sha256=0Cer53I8YHtYgUeEBQ_NVgC36FdOcjMNejgfP8teJC4,4964
|
|
82
82
|
konduktor/templates/jobset.yaml.j2,sha256=NQcVeRNsTLLmTnJRnkL1vr45mSeth-b11YShXn_RoSg,1323
|
|
83
|
-
konduktor/templates/pod.yaml.j2,sha256=
|
|
83
|
+
konduktor/templates/pod.yaml.j2,sha256=gGYwdXsPxStiua9Mm-OF7byVfuKjcH-TYNjoQmdpX_Q,24107
|
|
84
84
|
konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
85
85
|
konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
|
|
86
86
|
konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
87
87
|
konduktor/utils/accelerator_registry.py,sha256=ythz3ynulP1DSSU7Jj5VUsQeBzSYRkxCVDZ5oOg0xtc,560
|
|
88
88
|
konduktor/utils/annotations.py,sha256=oy2-BLydkFt3KWkXDuaGY84d6b7iISuy4eAT9uXk0Fc,2225
|
|
89
|
-
konduktor/utils/base64_utils.py,sha256=
|
|
89
|
+
konduktor/utils/base64_utils.py,sha256=TzKxe_SPHrurJ1lTOwOi4OmGUNkGxomstFcTWcWLQhw,3223
|
|
90
90
|
konduktor/utils/common_utils.py,sha256=8gBpzYiC1bQ8sbgHIFLkKCGT5nLs1afpejod60kVSos,15076
|
|
91
91
|
konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
|
|
92
92
|
konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
|
|
@@ -96,12 +96,12 @@ konduktor/utils/kubernetes_utils.py,sha256=XleYxzG64hciZb-CjzBDjX8BOMhFATIIHZlXD
|
|
|
96
96
|
konduktor/utils/log_utils.py,sha256=VUyTtN819BJnSwm33-73-h8aaD51Y5Gawt6ek2kU1tk,18181
|
|
97
97
|
konduktor/utils/loki_utils.py,sha256=eOGiD7dZNuwzmyXKiifyqz00EVh2nwcUPFSiPkac9y0,4050
|
|
98
98
|
konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
|
|
99
|
-
konduktor/utils/schemas.py,sha256=
|
|
99
|
+
konduktor/utils/schemas.py,sha256=cr39nEAgjluhXoUYnvIwCwLBH8rLds37MBsF1uQv1rw,19067
|
|
100
100
|
konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
|
|
101
101
|
konduktor/utils/ux_utils.py,sha256=LSH4b5lckD157qDF4keThxtkGdxNrAfGKmH1ewhZkm4,8646
|
|
102
|
-
konduktor/utils/validator.py,sha256=
|
|
103
|
-
konduktor_nightly-0.1.0.
|
|
104
|
-
konduktor_nightly-0.1.0.
|
|
105
|
-
konduktor_nightly-0.1.0.
|
|
106
|
-
konduktor_nightly-0.1.0.
|
|
107
|
-
konduktor_nightly-0.1.0.
|
|
102
|
+
konduktor/utils/validator.py,sha256=UcLvZCk9Cpbbhw8r_ZJtTpMSTfY1NKqcyciKsPzRPZM,17222
|
|
103
|
+
konduktor_nightly-0.1.0.dev20251107104752.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
|
|
104
|
+
konduktor_nightly-0.1.0.dev20251107104752.dist-info/METADATA,sha256=EEA9KjVBKhzBk4hO1-mWEacCmBul0d5GqMbB_VUKWbQ,4247
|
|
105
|
+
konduktor_nightly-0.1.0.dev20251107104752.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
106
|
+
konduktor_nightly-0.1.0.dev20251107104752.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
|
|
107
|
+
konduktor_nightly-0.1.0.dev20251107104752.dist-info/RECORD,,
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
# Apoxy Backend for general deployment
|
|
3
|
-
apiVersion: core.apoxy.dev/v1alpha
|
|
4
|
-
kind: Backend
|
|
5
|
-
metadata:
|
|
6
|
-
name: {{ unique_cluster_name }}-backend-{{ deployment_number }}
|
|
7
|
-
labels:
|
|
8
|
-
task_name: {{ name }}
|
|
9
|
-
endpoint_name: {{ cluster_name }}-{{ deployment_number }}.trainy.us
|
|
10
|
-
spec:
|
|
11
|
-
endpoints:
|
|
12
|
-
- fqdn: {{ name }}.default.{{ unique_cluster_name }}.tun.apoxy.net
|
|
13
|
-
---
|
|
14
|
-
# Apoxy Route for general deployment
|
|
15
|
-
apiVersion: gateway.apoxy.dev/v1
|
|
16
|
-
kind: HTTPRoute
|
|
17
|
-
metadata:
|
|
18
|
-
name: {{ unique_cluster_name }}-route-{{ deployment_number }}
|
|
19
|
-
labels:
|
|
20
|
-
task_name: {{ name }}
|
|
21
|
-
endpoint_name: {{ cluster_name }}-{{ deployment_number }}.trainy.us
|
|
22
|
-
spec:
|
|
23
|
-
parentRefs:
|
|
24
|
-
- name: default
|
|
25
|
-
kind: Gateway
|
|
26
|
-
port: 443
|
|
27
|
-
hostnames:
|
|
28
|
-
- '{{ cluster_name }}-{{ deployment_number }}.trainy.us'
|
|
29
|
-
rules:
|
|
30
|
-
- backendRefs:
|
|
31
|
-
- kind: Backend
|
|
32
|
-
name: {{ unique_cluster_name }}-backend-{{ deployment_number }}
|
|
33
|
-
port: {{ ports }}
|
|
File without changes
|
|
File without changes
|