konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
apiVersion: apps/v1
|
|
2
|
+
kind: Deployment
|
|
3
|
+
metadata:
|
|
4
|
+
labels:
|
|
5
|
+
{% if not general %}
|
|
6
|
+
{{ model_name_label }}: {{ name }}
|
|
7
|
+
model.aibrix.ai/port: "{{ ports }}"
|
|
8
|
+
{% endif %}
|
|
9
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
10
|
+
{{ deployment_user_label }}: "{{ user }}"
|
|
11
|
+
{{ deployment_accelerator_label }}: "{{ accelerator_type }}"
|
|
12
|
+
{{ deployment_num_accelerators_label }}: "{{ num_accelerators }}"
|
|
13
|
+
trainy.ai/has-autoscaler: "{{ autoscaler }}"
|
|
14
|
+
trainy.ai/konduktor-managed: "true"
|
|
15
|
+
{% if autoscaler == 'true' %}
|
|
16
|
+
trainy.ai/original-min-replicas: "{{ min_replicas }}"
|
|
17
|
+
trainy.ai/original-max-replicas: "{{ max_replicas }}"
|
|
18
|
+
{% endif %}
|
|
19
|
+
name: {{ name }}
|
|
20
|
+
namespace: default
|
|
21
|
+
spec:
|
|
22
|
+
replicas: {{ min_replicas }}
|
|
23
|
+
selector:
|
|
24
|
+
matchLabels:
|
|
25
|
+
{% if not general %}
|
|
26
|
+
{{ model_name_label }}: {{ name }}
|
|
27
|
+
{% endif %}
|
|
28
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
29
|
+
template: {}
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
apiVersion: v1
|
|
34
|
+
kind: Service
|
|
35
|
+
metadata:
|
|
36
|
+
labels:
|
|
37
|
+
{% if not general %}
|
|
38
|
+
{{ model_name_label }}: {{ name }}
|
|
39
|
+
{% endif %}
|
|
40
|
+
prometheus-discovery: "true"
|
|
41
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
42
|
+
{{ deployment_user_label }}: "{{ user }}"
|
|
43
|
+
trainy.ai/has-autoscaler: "{{ autoscaler }}"
|
|
44
|
+
annotations:
|
|
45
|
+
prometheus.io/scrape: "true"
|
|
46
|
+
prometheus.io/port: "9000"
|
|
47
|
+
name: {{ name }}
|
|
48
|
+
namespace: default
|
|
49
|
+
spec:
|
|
50
|
+
ports:
|
|
51
|
+
- name: serve
|
|
52
|
+
port: {{ ports }}
|
|
53
|
+
protocol: TCP
|
|
54
|
+
targetPort: {{ ports }}
|
|
55
|
+
{% if not general %}
|
|
56
|
+
- name: http
|
|
57
|
+
port: 8080
|
|
58
|
+
protocol: TCP
|
|
59
|
+
targetPort: 8080
|
|
60
|
+
{% endif %}
|
|
61
|
+
selector:
|
|
62
|
+
{% if not general %}
|
|
63
|
+
{{ model_name_label }}: {{ name }}
|
|
64
|
+
{% endif %}
|
|
65
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
66
|
+
type: ClusterIP
|
|
67
|
+
|
|
68
|
+
# AIBRIX PODAUTOSCALER STUFF (KPA)
|
|
69
|
+
{% if not general and autoscaler == 'true' %}
|
|
70
|
+
---
|
|
71
|
+
apiVersion: autoscaling.aibrix.ai/v1alpha1
|
|
72
|
+
kind: PodAutoscaler
|
|
73
|
+
metadata:
|
|
74
|
+
name: {{ name }}-pa
|
|
75
|
+
namespace: default
|
|
76
|
+
labels:
|
|
77
|
+
{{ model_name_label }}: {{ name }}
|
|
78
|
+
app.kubernetes.io/name: aibrix
|
|
79
|
+
app.kubernetes.io/managed-by: kustomize
|
|
80
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
81
|
+
{{ deployment_user_label }}: "{{ user }}"
|
|
82
|
+
spec:
|
|
83
|
+
scalingStrategy: KPA
|
|
84
|
+
minReplicas: {{ min_replicas }}
|
|
85
|
+
maxReplicas: {{ max_replicas }}
|
|
86
|
+
metricsSources:
|
|
87
|
+
- metricSourceType: domain
|
|
88
|
+
protocolType: http
|
|
89
|
+
endpoint: aibrix-activator.aibrix-activator.svc.cluster.local:8080
|
|
90
|
+
path: /metrics/default/{{ name }}
|
|
91
|
+
targetMetric: vllm:deployment_replicas
|
|
92
|
+
targetValue: "1"
|
|
93
|
+
scaleTargetRef:
|
|
94
|
+
apiVersion: apps/v1
|
|
95
|
+
kind: Deployment
|
|
96
|
+
name: {{ name }}
|
|
97
|
+
{% endif %}
|
|
98
|
+
|
|
99
|
+
# KEDA HTTP ADD-ON STUFF (1 per deployment)
|
|
100
|
+
{% if general %}
|
|
101
|
+
{% if autoscaler == 'true' %}
|
|
102
|
+
# HTTPScaledObject (1 per deployment) - only when autoscaling enabled
|
|
103
|
+
---
|
|
104
|
+
apiVersion: http.keda.sh/v1alpha1
|
|
105
|
+
kind: HTTPScaledObject
|
|
106
|
+
metadata:
|
|
107
|
+
name: {{ name }}-httpscaledobject
|
|
108
|
+
namespace: default
|
|
109
|
+
labels:
|
|
110
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
111
|
+
{{ deployment_user_label }}: "{{ user }}"
|
|
112
|
+
spec:
|
|
113
|
+
hosts:
|
|
114
|
+
- {{ name }}
|
|
115
|
+
pathPrefixes:
|
|
116
|
+
- "/"
|
|
117
|
+
{% if probe_path %}
|
|
118
|
+
- "{{ probe_path }}"
|
|
119
|
+
{% endif %}
|
|
120
|
+
scaleTargetRef:
|
|
121
|
+
name: "{{ name }}"
|
|
122
|
+
kind: Deployment
|
|
123
|
+
apiVersion: apps/v1
|
|
124
|
+
service: "{{ name }}"
|
|
125
|
+
port: {{ ports }}
|
|
126
|
+
replicas:
|
|
127
|
+
min: {{ min_replicas }}
|
|
128
|
+
max: {{ max_replicas }}
|
|
129
|
+
scaledownPeriod: 1200 # 20 minutes
|
|
130
|
+
scalingMetric:
|
|
131
|
+
requestRate:
|
|
132
|
+
targetValue: 4
|
|
133
|
+
granularity: "1s"
|
|
134
|
+
window: "30s"
|
|
135
|
+
{% endif %}
|
|
136
|
+
|
|
137
|
+
# INGRESS (1 per deployment)
|
|
138
|
+
---
|
|
139
|
+
apiVersion: networking.k8s.io/v1
|
|
140
|
+
kind: Ingress
|
|
141
|
+
metadata:
|
|
142
|
+
name: {{ name }}-ingress
|
|
143
|
+
labels:
|
|
144
|
+
{{ deployment_name_label }}: "{{ name }}"
|
|
145
|
+
{{ deployment_user_label }}: "{{ user }}"
|
|
146
|
+
trainy.ai/konduktor-managed: "true"
|
|
147
|
+
annotations:
|
|
148
|
+
nginx.ingress.kubernetes.io/use-regex: "true"
|
|
149
|
+
nginx.ingress.kubernetes.io/rewrite-target: /$1
|
|
150
|
+
{% if autoscaler == 'true' %}
|
|
151
|
+
nginx.ingress.kubernetes.io/upstream-vhost: "{{ name }}"
|
|
152
|
+
{% endif %}
|
|
153
|
+
spec:
|
|
154
|
+
ingressClassName: nginx
|
|
155
|
+
rules:
|
|
156
|
+
- host: {{ general_base_host }}
|
|
157
|
+
http:
|
|
158
|
+
paths:
|
|
159
|
+
- path: /{{ name }}(.*)
|
|
160
|
+
pathType: ImplementationSpecific
|
|
161
|
+
backend:
|
|
162
|
+
service:
|
|
163
|
+
{% if autoscaler == 'true' %}
|
|
164
|
+
# Use KEDA interceptor for autoscaling
|
|
165
|
+
name: keda-proxy
|
|
166
|
+
port:
|
|
167
|
+
number: 8080
|
|
168
|
+
{% else %}
|
|
169
|
+
# Direct to app service for fixed replicas
|
|
170
|
+
name: {{ name }}
|
|
171
|
+
port:
|
|
172
|
+
number: {{ ports }}
|
|
173
|
+
{% endif %}
|
|
174
|
+
# Direct access convenience rule (via LB IP + Host: {{ name }})
|
|
175
|
+
- host: {{ name }}
|
|
176
|
+
http:
|
|
177
|
+
paths:
|
|
178
|
+
- path: /(.*)
|
|
179
|
+
pathType: ImplementationSpecific
|
|
180
|
+
backend:
|
|
181
|
+
service:
|
|
182
|
+
{% if autoscaler == 'true' %}
|
|
183
|
+
name: keda-proxy
|
|
184
|
+
port:
|
|
185
|
+
number: 8080
|
|
186
|
+
{% else %}
|
|
187
|
+
name: {{ name }}
|
|
188
|
+
port:
|
|
189
|
+
number: {{ ports }}
|
|
190
|
+
{% endif %}
|
|
191
|
+
{% endif %}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
jobset:
|
|
2
|
+
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
3
|
+
kind: JobSet
|
|
4
|
+
metadata:
|
|
5
|
+
name: {{ job_name }}
|
|
6
|
+
labels:
|
|
7
|
+
{{ jobset_name_label }}: "{{ job_name }}"
|
|
8
|
+
{{ jobset_userid_label }}: "{{ user_id }}"
|
|
9
|
+
{{ jobset_user_label }}: "{{ user }}"
|
|
10
|
+
{% if accelerator_type %}
|
|
11
|
+
{{ jobset_accelerator_label }}: "{{ accelerator_type }}"
|
|
12
|
+
{{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
|
|
13
|
+
{% endif %}
|
|
14
|
+
{% if max_execution_time %}
|
|
15
|
+
{{ jobset_max_execution_time_label }}: "{{ max_execution_time }}"
|
|
16
|
+
{% endif %}
|
|
17
|
+
trainy.ai/konduktor-managed: "true"
|
|
18
|
+
parent: "trainy"
|
|
19
|
+
annotations: {}
|
|
20
|
+
spec:
|
|
21
|
+
ttlSecondsAfterFinished: 31536000 # 1 year (365 days)
|
|
22
|
+
{% if max_restarts is not none %}
|
|
23
|
+
failurePolicy:
|
|
24
|
+
maxRestarts: {{ max_restarts }}
|
|
25
|
+
{% endif %}
|
|
26
|
+
replicatedJobs:
|
|
27
|
+
- name: workers
|
|
28
|
+
template:
|
|
29
|
+
spec:
|
|
30
|
+
ttlSecondsAfterFinished: 600 # 10 minutes
|
|
31
|
+
parallelism: {{ num_nodes }}
|
|
32
|
+
{% if completions %}
|
|
33
|
+
completions: {{ completions }}
|
|
34
|
+
{% else %}
|
|
35
|
+
completions: {{ num_nodes }}
|
|
36
|
+
{% endif %}
|
|
37
|
+
backoffLimit: 0
|
|
38
|
+
template: {}
|
|
39
|
+
podFailurePolicy:
|
|
40
|
+
rules:
|
|
41
|
+
- action: FailJob
|
|
42
|
+
onPodConditions:
|
|
43
|
+
- type: ConfigIssue
|