konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# Apoxy Setup (Part 1/2) - Core Infrastructure
|
|
2
|
+
#
|
|
3
|
+
# This file sets up the core Apoxy infrastructure for external access to deployments:
|
|
4
|
+
# 1. Apoxy system namespace and RBAC
|
|
5
|
+
# 2. Kubeconfig secret for cluster access (populated by CI)
|
|
6
|
+
# 3. Apoxy tunnel controller and proxy services
|
|
7
|
+
# 4. Network policies for cross-namespace access
|
|
8
|
+
#
|
|
9
|
+
# Split into 2 files because:
|
|
10
|
+
# - apoxy-setup.yaml: Core infrastructure (1 per cluster) (needs to be applied first)
|
|
11
|
+
# - apoxy-setup2.yaml: All routing rules for both deployment types
|
|
12
|
+
|
|
13
|
+
apiVersion: v1
|
|
14
|
+
kind: Namespace
|
|
15
|
+
metadata:
|
|
16
|
+
name: apoxy-system
|
|
17
|
+
---
|
|
18
|
+
apiVersion: v1
|
|
19
|
+
kind: Secret
|
|
20
|
+
metadata:
|
|
21
|
+
name: trainy-kubeconfig
|
|
22
|
+
namespace: apoxy-system
|
|
23
|
+
type: Opaque
|
|
24
|
+
data:
|
|
25
|
+
# this gets replaced by buildkite CI secret APOXY_AUTH
|
|
26
|
+
kubeconfig.yaml: |
|
|
27
|
+
APOXY_AUTH
|
|
28
|
+
---
|
|
29
|
+
apiVersion: v1
|
|
30
|
+
kind: ServiceAccount
|
|
31
|
+
metadata:
|
|
32
|
+
name: kube-controller
|
|
33
|
+
namespace: apoxy-system
|
|
34
|
+
---
|
|
35
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
36
|
+
kind: ClusterRole
|
|
37
|
+
metadata:
|
|
38
|
+
name: kube-controller-role
|
|
39
|
+
rules:
|
|
40
|
+
- apiGroups: ["apiregistration.k8s.io"]
|
|
41
|
+
resources: ["apiservices"]
|
|
42
|
+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
|
|
43
|
+
---
|
|
44
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
45
|
+
kind: ClusterRoleBinding
|
|
46
|
+
metadata:
|
|
47
|
+
name: kube-controller-role-binding
|
|
48
|
+
roleRef:
|
|
49
|
+
apiGroup: rbac.authorization.k8s.io
|
|
50
|
+
kind: ClusterRole
|
|
51
|
+
name: kube-controller-role
|
|
52
|
+
subjects:
|
|
53
|
+
- kind: ServiceAccount
|
|
54
|
+
name: kube-controller
|
|
55
|
+
namespace: apoxy-system
|
|
56
|
+
---
|
|
57
|
+
apiVersion: apps/v1
|
|
58
|
+
kind: Deployment
|
|
59
|
+
metadata:
|
|
60
|
+
name: kube-controller
|
|
61
|
+
namespace: apoxy-system
|
|
62
|
+
labels:
|
|
63
|
+
app: kube-controller
|
|
64
|
+
spec:
|
|
65
|
+
replicas: 1
|
|
66
|
+
selector:
|
|
67
|
+
matchLabels:
|
|
68
|
+
app: kube-controller
|
|
69
|
+
template:
|
|
70
|
+
metadata:
|
|
71
|
+
labels:
|
|
72
|
+
app: kube-controller
|
|
73
|
+
spec:
|
|
74
|
+
containers:
|
|
75
|
+
- name: kube-controller
|
|
76
|
+
image: apoxy/kube-controller:v0.11.6
|
|
77
|
+
args:
|
|
78
|
+
- --dev
|
|
79
|
+
- --project_id=7ce458d7-e20c-443c-aeeb-dbc5663c1240
|
|
80
|
+
- --kubeconfig_path=/data/kubeconfig.yaml
|
|
81
|
+
env:
|
|
82
|
+
- name: POD_NAMESPACE
|
|
83
|
+
valueFrom:
|
|
84
|
+
fieldRef:
|
|
85
|
+
fieldPath: metadata.namespace
|
|
86
|
+
volumeMounts:
|
|
87
|
+
- name: kubeconfig-volume
|
|
88
|
+
mountPath: /data
|
|
89
|
+
readOnly: true
|
|
90
|
+
volumes:
|
|
91
|
+
- name: kubeconfig-volume
|
|
92
|
+
secret:
|
|
93
|
+
secretName: trainy-kubeconfig
|
|
94
|
+
items:
|
|
95
|
+
- key: kubeconfig.yaml
|
|
96
|
+
path: kubeconfig.yaml
|
|
97
|
+
mode: 0600
|
|
98
|
+
serviceAccountName: kube-controller
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
apiVersion: v1
|
|
102
|
+
kind: Service
|
|
103
|
+
metadata:
|
|
104
|
+
name: kube-controller
|
|
105
|
+
namespace: apoxy-system
|
|
106
|
+
labels:
|
|
107
|
+
app: kube-controller
|
|
108
|
+
spec:
|
|
109
|
+
selector:
|
|
110
|
+
app: kube-controller
|
|
111
|
+
ports:
|
|
112
|
+
- name: http
|
|
113
|
+
protocol: TCP
|
|
114
|
+
port: 8443
|
|
115
|
+
targetPort: 8443
|
|
116
|
+
---
|
|
117
|
+
apiVersion: v1
|
|
118
|
+
kind: ConfigMap
|
|
119
|
+
metadata:
|
|
120
|
+
name: apoxy-config
|
|
121
|
+
namespace: apoxy-system
|
|
122
|
+
data:
|
|
123
|
+
config.yaml: |
|
|
124
|
+
apiVersion: config.apoxy.dev/v1alpha1
|
|
125
|
+
kind: Config
|
|
126
|
+
currentProject: 7ce458d7-e20c-443c-aeeb-dbc5663c1240
|
|
127
|
+
projects:
|
|
128
|
+
- id: 7ce458d7-e20c-443c-aeeb-dbc5663c1240
|
|
129
|
+
kubernetesConfig:
|
|
130
|
+
kubeconfigPath: /root/kubeconfig.yaml
|
|
131
|
+
tunnel:
|
|
132
|
+
mode: userspace
|
|
133
|
+
---
|
|
134
|
+
apiVersion: apps/v1
|
|
135
|
+
kind: Deployment
|
|
136
|
+
metadata:
|
|
137
|
+
name: apoxy
|
|
138
|
+
namespace: apoxy-system
|
|
139
|
+
labels:
|
|
140
|
+
app: apoxy
|
|
141
|
+
spec:
|
|
142
|
+
replicas: 1
|
|
143
|
+
selector:
|
|
144
|
+
matchLabels:
|
|
145
|
+
app: apoxy
|
|
146
|
+
template:
|
|
147
|
+
metadata:
|
|
148
|
+
labels:
|
|
149
|
+
app: apoxy
|
|
150
|
+
spec:
|
|
151
|
+
containers:
|
|
152
|
+
- name: apoxy
|
|
153
|
+
image: apoxy/apoxy:v0.11.18
|
|
154
|
+
command: ["apoxy", "tunnel", "run", "UNIQUE-TEMPNAME", "--insecure-skip-verify"]
|
|
155
|
+
volumeMounts:
|
|
156
|
+
- name: kubeconfig-volume
|
|
157
|
+
mountPath: /root/kubeconfig.yaml
|
|
158
|
+
subPath: kubeconfig.yaml
|
|
159
|
+
- name: apoxy-config-volume
|
|
160
|
+
mountPath: /root/.apoxy/config.yaml
|
|
161
|
+
subPath: config.yaml
|
|
162
|
+
volumes:
|
|
163
|
+
- name: kubeconfig-volume
|
|
164
|
+
secret:
|
|
165
|
+
secretName: trainy-kubeconfig
|
|
166
|
+
- name: apoxy-config-volume
|
|
167
|
+
configMap:
|
|
168
|
+
name: apoxy-config
|
|
169
|
+
---
|
|
170
|
+
# NetworkPolicy to allow Apoxy to reach services in other namespaces
|
|
171
|
+
apiVersion: networking.k8s.io/v1
|
|
172
|
+
kind: NetworkPolicy
|
|
173
|
+
metadata:
|
|
174
|
+
name: apoxy-cross-namespace-access
|
|
175
|
+
namespace: apoxy-system
|
|
176
|
+
spec:
|
|
177
|
+
podSelector:
|
|
178
|
+
matchLabels:
|
|
179
|
+
app: apoxy
|
|
180
|
+
policyTypes:
|
|
181
|
+
- Egress
|
|
182
|
+
egress:
|
|
183
|
+
# Allow all egress traffic
|
|
184
|
+
- {}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Apoxy Setup (Part 2/2) - Deployment Routing
|
|
2
|
+
#
|
|
3
|
+
# This file sets up Apoxy routing for both vLLM and general deployments:
|
|
4
|
+
# 1. TunnelNode for secure tunnel connection
|
|
5
|
+
# 2. Backend for vLLM pointing to Envoy Gateway
|
|
6
|
+
# 3. HTTPRoute for company.trainy.us -> vLLM deployments
|
|
7
|
+
# 4. Backend for general deployments pointing to nginx ingress
|
|
8
|
+
# 5. HTTPRoute for company2.trainy.us -> general deployments
|
|
9
|
+
# 6. KEDA proxy service for HTTP autoscaling
|
|
10
|
+
# 7. 60s timeout for all requests
|
|
11
|
+
#
|
|
12
|
+
# Split into 2 files because:
|
|
13
|
+
# - apoxy-setup.yaml: Core infrastructure (1 per cluster) (needs to be applied first)
|
|
14
|
+
# - apoxy-setup2.yaml: All routing rules for both deployment types
|
|
15
|
+
|
|
16
|
+
# NOTE: TunnelNode should technically be in the first apoxy-setup.yaml but it
|
|
17
|
+
# needs to be created after the core infrastructure is created, so we put it here.
|
|
18
|
+
apiVersion: core.apoxy.dev/v1alpha
|
|
19
|
+
kind: TunnelNode
|
|
20
|
+
metadata:
|
|
21
|
+
name: UNIQUE-TEMPNAME
|
|
22
|
+
spec:
|
|
23
|
+
egressGateway:
|
|
24
|
+
enabled: true
|
|
25
|
+
---
|
|
26
|
+
# Backend for vLLM deployments
|
|
27
|
+
apiVersion: core.apoxy.dev/v1alpha
|
|
28
|
+
kind: Backend
|
|
29
|
+
metadata:
|
|
30
|
+
name: UNIQUE-TEMPNAME-backend
|
|
31
|
+
spec:
|
|
32
|
+
endpoints:
|
|
33
|
+
- fqdn: envoy-aibrix-system-aibrix-eg-903790dc.envoy-gateway-system.UNIQUE-TEMPNAME.tun.apoxy.net
|
|
34
|
+
---
|
|
35
|
+
# HTTPRoute for vLLM deployments
|
|
36
|
+
apiVersion: gateway.apoxy.dev/v1
|
|
37
|
+
kind: HTTPRoute
|
|
38
|
+
metadata:
|
|
39
|
+
name: UNIQUE-TEMPNAME-route
|
|
40
|
+
spec:
|
|
41
|
+
parentRefs:
|
|
42
|
+
- name: default
|
|
43
|
+
kind: Gateway
|
|
44
|
+
port: 443
|
|
45
|
+
hostnames:
|
|
46
|
+
- 'TEMPNAME.trainy.us'
|
|
47
|
+
rules:
|
|
48
|
+
- backendRefs:
|
|
49
|
+
- kind: Backend
|
|
50
|
+
name: UNIQUE-TEMPNAME-backend
|
|
51
|
+
port: 80
|
|
52
|
+
timeouts:
|
|
53
|
+
request: "60s"
|
|
54
|
+
---
|
|
55
|
+
# Backend for general deployments
|
|
56
|
+
apiVersion: core.apoxy.dev/v1alpha
|
|
57
|
+
kind: Backend
|
|
58
|
+
metadata:
|
|
59
|
+
name: UNIQUE-TEMPNAME-backend2
|
|
60
|
+
spec:
|
|
61
|
+
endpoints:
|
|
62
|
+
- fqdn: keda-ingress-nginx-controller.keda.UNIQUE-TEMPNAME.tun.apoxy.net
|
|
63
|
+
---
|
|
64
|
+
# HTTPRoute for general deployments
|
|
65
|
+
apiVersion: gateway.apoxy.dev/v1
|
|
66
|
+
kind: HTTPRoute
|
|
67
|
+
metadata:
|
|
68
|
+
name: UNIQUE-TEMPNAME-route2
|
|
69
|
+
spec:
|
|
70
|
+
parentRefs:
|
|
71
|
+
- name: default
|
|
72
|
+
kind: Gateway
|
|
73
|
+
port: 443
|
|
74
|
+
hostnames:
|
|
75
|
+
- 'TEMPNAME2.trainy.us'
|
|
76
|
+
rules:
|
|
77
|
+
- backendRefs:
|
|
78
|
+
- kind: Backend
|
|
79
|
+
name: UNIQUE-TEMPNAME-backend2
|
|
80
|
+
port: 80
|
|
81
|
+
timeouts:
|
|
82
|
+
request: "60s"
|
|
83
|
+
|
|
84
|
+
# KEDA proxy service (1 per cluster) (For general deployments)
|
|
85
|
+
---
|
|
86
|
+
apiVersion: v1
|
|
87
|
+
kind: Service
|
|
88
|
+
metadata:
|
|
89
|
+
name: keda-proxy
|
|
90
|
+
namespace: default
|
|
91
|
+
spec:
|
|
92
|
+
type: ExternalName
|
|
93
|
+
externalName: keda-add-ons-http-interceptor-proxy.keda
|
|
94
|
+
ports:
|
|
95
|
+
- name: http
|
|
96
|
+
port: 8080
|
|
97
|
+
protocol: TCP
|
|
98
|
+
targetPort: 8080
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: Namespace
|
|
3
|
+
metadata:
|
|
4
|
+
name: konduktor
|
|
5
|
+
---
|
|
6
|
+
apiVersion: v1
|
|
7
|
+
kind: ServiceAccount
|
|
8
|
+
metadata:
|
|
9
|
+
name: konduktor-controller-sa
|
|
10
|
+
namespace: konduktor
|
|
11
|
+
---
|
|
12
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
13
|
+
kind: ClusterRole
|
|
14
|
+
metadata:
|
|
15
|
+
namespace: konduktor
|
|
16
|
+
name: konduktor-controller-role
|
|
17
|
+
rules:
|
|
18
|
+
- apiGroups: [""]
|
|
19
|
+
resources: ["nodes"]
|
|
20
|
+
verbs: ["get", "list", "patch"]
|
|
21
|
+
---
|
|
22
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
23
|
+
kind: ClusterRoleBinding
|
|
24
|
+
metadata:
|
|
25
|
+
namespace: konduktor
|
|
26
|
+
name: konduktor-controller-rolebinding
|
|
27
|
+
subjects:
|
|
28
|
+
- kind: ServiceAccount
|
|
29
|
+
name: konduktor-controller-sa
|
|
30
|
+
namespace: konduktor
|
|
31
|
+
roleRef:
|
|
32
|
+
kind: ClusterRole
|
|
33
|
+
name: konduktor-controller-role
|
|
34
|
+
apiGroup: rbac.authorization.k8s.io
|
|
35
|
+
---
|
|
36
|
+
apiVersion: apps/v1
|
|
37
|
+
kind: Deployment
|
|
38
|
+
metadata:
|
|
39
|
+
name: konduktor-controller-deployment
|
|
40
|
+
namespace: konduktor
|
|
41
|
+
spec:
|
|
42
|
+
replicas: 1
|
|
43
|
+
selector:
|
|
44
|
+
matchLabels:
|
|
45
|
+
app: konduktor-controller
|
|
46
|
+
template:
|
|
47
|
+
metadata:
|
|
48
|
+
labels:
|
|
49
|
+
app: konduktor-controller
|
|
50
|
+
spec:
|
|
51
|
+
serviceAccountName: konduktor-controller-sa
|
|
52
|
+
affinity:
|
|
53
|
+
nodeAffinity:
|
|
54
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
55
|
+
nodeSelectorTerms:
|
|
56
|
+
- matchExpressions:
|
|
57
|
+
- key: nvidia.com/gpu
|
|
58
|
+
operator: DoesNotExist
|
|
59
|
+
containers:
|
|
60
|
+
- name: python
|
|
61
|
+
image: python:3.10
|
|
62
|
+
command: ["/bin/sh"]
|
|
63
|
+
args: ["-c", "pip install konduktor-nightly && python -m konduktor.controller.launch"]
|
|
64
|
+
## define what namespaces to watch for errors, comma separated.
|
|
65
|
+
# env:
|
|
66
|
+
# - name: WATCHED_NAMESPACES
|
|
67
|
+
# value: "default,othernamespace"
|
|
68
|
+
# - name: LOG_ENDPOINT
|
|
69
|
+
# value: "http://loki.loki.svc.cluster.local:3100"
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
---
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
kind: Namespace
|
|
4
|
+
metadata:
|
|
5
|
+
name: konduktor-dashboard
|
|
6
|
+
---
|
|
7
|
+
# ServiceAccount
|
|
8
|
+
apiVersion: v1
|
|
9
|
+
kind: ServiceAccount
|
|
10
|
+
metadata:
|
|
11
|
+
name: konduktor-service-account
|
|
12
|
+
namespace: konduktor-dashboard
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
# ClusterRole
|
|
16
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
17
|
+
kind: ClusterRole
|
|
18
|
+
metadata:
|
|
19
|
+
name: node-access-role
|
|
20
|
+
rules:
|
|
21
|
+
- apiGroups: [""]
|
|
22
|
+
resources: ["services", "pods", "nodes", "pods/portforward", "namespaces"]
|
|
23
|
+
verbs: ["get", "list", "patch", "watch", "create"]
|
|
24
|
+
|
|
25
|
+
# Add permissions to delete Kubernetes jobs
|
|
26
|
+
- apiGroups: ["batch"] # For Kubernetes native Jobs
|
|
27
|
+
resources: ["jobs"]
|
|
28
|
+
verbs: ["get", "list", "delete", "patch"]
|
|
29
|
+
|
|
30
|
+
# Add permissions to delete Kueue workloads
|
|
31
|
+
- apiGroups: ["kueue.x-k8s.io"] # For Kueue workloads
|
|
32
|
+
resources: ["workloads"]
|
|
33
|
+
verbs: ["get", "list", "delete", "patch"]
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
# ClusterRoleBinding
|
|
37
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
38
|
+
kind: ClusterRoleBinding
|
|
39
|
+
metadata:
|
|
40
|
+
name: node-access-binding
|
|
41
|
+
subjects:
|
|
42
|
+
- kind: ServiceAccount
|
|
43
|
+
name: konduktor-service-account # Must match the name of the service account
|
|
44
|
+
namespace: konduktor-dashboard
|
|
45
|
+
roleRef:
|
|
46
|
+
kind: ClusterRole
|
|
47
|
+
name: node-access-role # Must match the ClusterRole name
|
|
48
|
+
apiGroup: rbac.authorization.k8s.io
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
# Backend Deployment + Service
|
|
52
|
+
apiVersion: apps/v1
|
|
53
|
+
kind: Deployment
|
|
54
|
+
metadata:
|
|
55
|
+
name: backend
|
|
56
|
+
namespace: konduktor-dashboard
|
|
57
|
+
spec:
|
|
58
|
+
replicas: 1
|
|
59
|
+
selector:
|
|
60
|
+
matchLabels:
|
|
61
|
+
app: backend
|
|
62
|
+
template:
|
|
63
|
+
metadata:
|
|
64
|
+
labels:
|
|
65
|
+
app: backend
|
|
66
|
+
spec:
|
|
67
|
+
serviceAccountName: konduktor-service-account
|
|
68
|
+
containers:
|
|
69
|
+
- name: backend
|
|
70
|
+
image: ryanattrainy/konduktor-dashboard:backend1.16
|
|
71
|
+
imagePullPolicy: Always
|
|
72
|
+
ports:
|
|
73
|
+
- containerPort: 5001
|
|
74
|
+
command: ["/app/startup.sh"]
|
|
75
|
+
env:
|
|
76
|
+
- name: KONDUKTOR_DEBUG
|
|
77
|
+
value: "0" # Set debug mode: 1 (DEBUG) or 0
|
|
78
|
+
- name: LOGS_URL # Set loki logs URL
|
|
79
|
+
value: "http://loki.loki.svc.cluster.local:3100/loki/api/v1/query_range"
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
apiVersion: v1
|
|
83
|
+
kind: Service
|
|
84
|
+
metadata:
|
|
85
|
+
name: backend
|
|
86
|
+
namespace: konduktor-dashboard
|
|
87
|
+
spec:
|
|
88
|
+
ports:
|
|
89
|
+
- name: backend-port
|
|
90
|
+
port: 5001
|
|
91
|
+
targetPort: 5001
|
|
92
|
+
selector:
|
|
93
|
+
app: backend
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
# Frontend Deployment + Service
|
|
97
|
+
apiVersion: apps/v1
|
|
98
|
+
kind: Deployment
|
|
99
|
+
metadata:
|
|
100
|
+
name: frontend
|
|
101
|
+
namespace: konduktor-dashboard
|
|
102
|
+
spec:
|
|
103
|
+
replicas: 1
|
|
104
|
+
selector:
|
|
105
|
+
matchLabels:
|
|
106
|
+
app: frontend
|
|
107
|
+
template:
|
|
108
|
+
metadata:
|
|
109
|
+
labels:
|
|
110
|
+
app: frontend
|
|
111
|
+
spec:
|
|
112
|
+
containers:
|
|
113
|
+
- name: frontend
|
|
114
|
+
image: ryanattrainy/konduktor-dashboard:frontend1.16
|
|
115
|
+
imagePullPolicy: Always
|
|
116
|
+
ports:
|
|
117
|
+
- containerPort: 5173
|
|
118
|
+
---
|
|
119
|
+
apiVersion: v1
|
|
120
|
+
kind: Service
|
|
121
|
+
metadata:
|
|
122
|
+
name: frontend
|
|
123
|
+
namespace: konduktor-dashboard
|
|
124
|
+
spec:
|
|
125
|
+
ports:
|
|
126
|
+
- name: frontend-port
|
|
127
|
+
port: 5173
|
|
128
|
+
targetPort: 5173
|
|
129
|
+
selector:
|
|
130
|
+
app: frontend
|
|
131
|
+
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: Namespace
|
|
3
|
+
metadata:
|
|
4
|
+
name: dmesg-logging
|
|
5
|
+
---
|
|
6
|
+
apiVersion: apps/v1
|
|
7
|
+
kind: DaemonSet
|
|
8
|
+
metadata:
|
|
9
|
+
name: dmesg
|
|
10
|
+
namespace: dmesg-logging
|
|
11
|
+
labels:
|
|
12
|
+
k8s-app: dmesg-logging
|
|
13
|
+
spec:
|
|
14
|
+
selector:
|
|
15
|
+
matchLabels:
|
|
16
|
+
name: dmesg
|
|
17
|
+
template:
|
|
18
|
+
metadata:
|
|
19
|
+
labels:
|
|
20
|
+
name: dmesg
|
|
21
|
+
spec:
|
|
22
|
+
tolerations:
|
|
23
|
+
# these tolerations are to have the daemonset runnable on control plane nodes
|
|
24
|
+
# remove them if your control plane nodes should not run pods
|
|
25
|
+
- key: "nvidia.com/gpu"
|
|
26
|
+
operator: "Equal"
|
|
27
|
+
value: "present"
|
|
28
|
+
effect: "NoSchedule"
|
|
29
|
+
- key: "trainy.konduktor.ai/faulty"
|
|
30
|
+
operator: "Equal"
|
|
31
|
+
value: "true"
|
|
32
|
+
effect: "NoSchedule"
|
|
33
|
+
- key: "cloud.google.com/gke-queued"
|
|
34
|
+
operator: "Equal"
|
|
35
|
+
value: "true"
|
|
36
|
+
effect: "NoSchedule"
|
|
37
|
+
containers:
|
|
38
|
+
- name: dmesg
|
|
39
|
+
image: ubuntu:22.04
|
|
40
|
+
# required for running `dmesg`
|
|
41
|
+
securityContext:
|
|
42
|
+
privileged: true
|
|
43
|
+
command:
|
|
44
|
+
- sh
|
|
45
|
+
- -c
|
|
46
|
+
- >
|
|
47
|
+
dmesg -w
|
|
48
|
+
resources:
|
|
49
|
+
limits:
|
|
50
|
+
memory: 200Mi
|
|
51
|
+
requests:
|
|
52
|
+
cpu: 100m
|
|
53
|
+
memory: 200Mi
|
|
54
|
+
# it may be desirable to set a high priority class to ensure that a DaemonSet Pod
|
|
55
|
+
# preempts running Pods
|
|
56
|
+
# priorityClassName: important
|
|
57
|
+
terminationGracePeriodSeconds: 30
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: ServiceAccount
|
|
3
|
+
metadata:
|
|
4
|
+
name: pod-cleanup-controller
|
|
5
|
+
namespace: default
|
|
6
|
+
---
|
|
7
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
8
|
+
kind: Role
|
|
9
|
+
metadata:
|
|
10
|
+
name: pod-cleanup-controller
|
|
11
|
+
namespace: default
|
|
12
|
+
rules:
|
|
13
|
+
- apiGroups: [""]
|
|
14
|
+
resources: ["pods", "pods/status", "events"]
|
|
15
|
+
verbs: ["get", "list", "watch", "delete", "patch", "update"]
|
|
16
|
+
---
|
|
17
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
18
|
+
kind: RoleBinding
|
|
19
|
+
metadata:
|
|
20
|
+
name: pod-cleanup-controller
|
|
21
|
+
namespace: default
|
|
22
|
+
subjects:
|
|
23
|
+
- kind: ServiceAccount
|
|
24
|
+
name: pod-cleanup-controller
|
|
25
|
+
namespace: default
|
|
26
|
+
roleRef:
|
|
27
|
+
kind: Role
|
|
28
|
+
name: pod-cleanup-controller
|
|
29
|
+
apiGroup: rbac.authorization.k8s.io
|
|
30
|
+
---
|
|
31
|
+
apiVersion: apps/v1
|
|
32
|
+
kind: Deployment
|
|
33
|
+
metadata:
|
|
34
|
+
name: pod-cleanup-controller
|
|
35
|
+
namespace: default
|
|
36
|
+
spec:
|
|
37
|
+
replicas: 1
|
|
38
|
+
selector:
|
|
39
|
+
matchLabels:
|
|
40
|
+
app: pod-cleanup-controller
|
|
41
|
+
template:
|
|
42
|
+
metadata:
|
|
43
|
+
labels:
|
|
44
|
+
app: pod-cleanup-controller
|
|
45
|
+
spec:
|
|
46
|
+
serviceAccountName: pod-cleanup-controller
|
|
47
|
+
containers:
|
|
48
|
+
- name: controller
|
|
49
|
+
image: python:3.10
|
|
50
|
+
command: ["/bin/sh", "-c"]
|
|
51
|
+
args: ["pip install kubernetes && echo 'starting controller' && python /app/controller.py"]
|
|
52
|
+
env:
|
|
53
|
+
- name: PYTHONUNBUFFERED
|
|
54
|
+
value: "0"
|
|
55
|
+
volumeMounts:
|
|
56
|
+
- name: controller-code
|
|
57
|
+
mountPath: /app
|
|
58
|
+
volumes:
|
|
59
|
+
- name: controller-code
|
|
60
|
+
configMap:
|
|
61
|
+
name: pod-cleanup-controller-code
|
|
62
|
+
---
|
|
63
|
+
apiVersion: v1
|
|
64
|
+
kind: ConfigMap
|
|
65
|
+
metadata:
|
|
66
|
+
name: pod-cleanup-controller-code
|
|
67
|
+
namespace: default
|
|
68
|
+
data:
|
|
69
|
+
controller.py: |
|
|
70
|
+
from kubernetes import client, config, watch
|
|
71
|
+
from collections import defaultdict
|
|
72
|
+
from datetime import datetime
|
|
73
|
+
import time
|
|
74
|
+
|
|
75
|
+
FAILURE_MODES = ['ErrImagePull', 'InvalidImageName']
|
|
76
|
+
|
|
77
|
+
def check_failure_mode(message, modes=None):
    """Return the first known failure mode found in *message*.

    Args:
        message: Kubernetes event message text to scan.
        modes: Optional iterable of failure-mode substrings to look for.
            Defaults to the module-level ``FAILURE_MODES`` list
            (e.g. ``'ErrImagePull'``, ``'InvalidImageName'``).

    Returns:
        The first matching mode string, or ``''`` when none appear.
    """
    if modes is None:
        modes = FAILURE_MODES
    for mode in modes:
        if mode in message:
            return mode
    return ''
|
|
82
|
+
|
|
83
|
+
def main():
    """Watch default-namespace events and clean up pods with image errors.

    Streams Kubernetes events forever; for each ``Warning``/``Failed`` event
    whose message matches one of ``FAILURE_MODES``, patches the offending
    pod's status with a ``ConfigIssue`` condition (so observers can see why
    it died) and then deletes the pod. When the watch stream ends, sleeps
    briefly and restarts it.
    """
    # Load kube config: in-cluster first, falling back to a local
    # kubeconfig for development outside a pod. Catch only the config
    # error, not a bare except that would swallow KeyboardInterrupt.
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()

    v1 = client.CoreV1Api()

    w = watch.Watch()
    while True:
        for event in w.stream(v1.list_namespaced_event, namespace="default"):
            obj = event['object']
            if obj.type != 'Warning' or obj.reason != 'Failed':
                continue
            mode = check_failure_mode(obj.message)
            if not mode:
                continue
            pod_name = obj.involved_object.name
            pod_namespace = obj.involved_object.namespace
            # Report the mode that actually matched (the original
            # hard-coded 'ErrImagePull' even for InvalidImageName).
            print(f"Pod {pod_namespace}/{pod_name} has failed with "
                  f"{mode}. Patching and deleting...")
            try:
                # Timezone-aware UTC timestamp in RFC3339 format;
                # avoids the deprecated datetime.utcnow().
                current_time = time.strftime(
                    "%Y-%m-%dT%H:%M:%SZ", time.gmtime())

                # Status patch recording why the pod is being removed.
                body = {
                    "status": {
                        "conditions": [
                            {
                                "type": "ConfigIssue",
                                "status": "True",
                                "reason": mode,
                                "lastTransitionTime": current_time,
                            }
                        ]
                    }
                }

                # Patch pod status, then delete the pod.
                v1.patch_namespaced_pod_status(pod_name, pod_namespace, body)
                v1.delete_namespaced_pod(pod_name, pod_namespace)
            except Exception as e:
                # Best-effort cleanup: log and keep processing events.
                print(f"Error handling pod: {e}")
        print("Finished event stream... waiting for another stream...")
        time.sleep(5)
|
|
127
|
+
|
|
128
|
+
if __name__ == '__main__':
|
|
129
|
+
main()
|