konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0

konduktor/manifests/apoxy-setup.yaml
@@ -0,0 +1,184 @@
+# Apoxy Setup (Part 1/2) - Core Infrastructure
+#
+# This file sets up the core Apoxy infrastructure for external access to deployments:
+# 1. Apoxy system namespace and RBAC
+# 2. Kubeconfig secret for cluster access (populated by CI)
+# 3. Apoxy tunnel controller and proxy services
+# 4. Network policies for cross-namespace access
+#
+# Split into 2 files because:
+# - apoxy-setup.yaml: Core infrastructure (1 per cluster) (needs to be applied first)
+# - apoxy-setup2.yaml: All routing rules for both deployment types
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: apoxy-system
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: trainy-kubeconfig
+  namespace: apoxy-system
+type: Opaque
+data:
+  # this gets replaced by buildkite CI secret APOXY_AUTH
+  kubeconfig.yaml: |
+    APOXY_AUTH
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kube-controller
+  namespace: apoxy-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kube-controller-role
+rules:
+- apiGroups: ["apiregistration.k8s.io"]
+  resources: ["apiservices"]
+  verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kube-controller-role-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kube-controller-role
+subjects:
+- kind: ServiceAccount
+  name: kube-controller
+  namespace: apoxy-system
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-controller
+  namespace: apoxy-system
+  labels:
+    app: kube-controller
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: kube-controller
+  template:
+    metadata:
+      labels:
+        app: kube-controller
+    spec:
+      containers:
+      - name: kube-controller
+        image: apoxy/kube-controller:v0.11.6
+        args:
+        - --dev
+        - --project_id=7ce458d7-e20c-443c-aeeb-dbc5663c1240
+        - --kubeconfig_path=/data/kubeconfig.yaml
+        env:
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        volumeMounts:
+        - name: kubeconfig-volume
+          mountPath: /data
+          readOnly: true
+      volumes:
+      - name: kubeconfig-volume
+        secret:
+          secretName: trainy-kubeconfig
+          items:
+          - key: kubeconfig.yaml
+            path: kubeconfig.yaml
+            mode: 0600
+      serviceAccountName: kube-controller
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: kube-controller
+  namespace: apoxy-system
+  labels:
+    app: kube-controller
+spec:
+  selector:
+    app: kube-controller
+  ports:
+  - name: http
+    protocol: TCP
+    port: 8443
+    targetPort: 8443
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: apoxy-config
+  namespace: apoxy-system
+data:
+  config.yaml: |
+    apiVersion: config.apoxy.dev/v1alpha1
+    kind: Config
+    currentProject: 7ce458d7-e20c-443c-aeeb-dbc5663c1240
+    projects:
+    - id: 7ce458d7-e20c-443c-aeeb-dbc5663c1240
+      kubernetesConfig:
+        kubeconfigPath: /root/kubeconfig.yaml
+    tunnel:
+      mode: userspace
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: apoxy
+  namespace: apoxy-system
+  labels:
+    app: apoxy
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: apoxy
+  template:
+    metadata:
+      labels:
+        app: apoxy
+    spec:
+      containers:
+      - name: apoxy
+        image: apoxy/apoxy:v0.11.18
+        command: ["apoxy", "tunnel", "run", "UNIQUE-TEMPNAME", "--insecure-skip-verify"]
+        volumeMounts:
+        - name: kubeconfig-volume
+          mountPath: /root/kubeconfig.yaml
+          subPath: kubeconfig.yaml
+        - name: apoxy-config-volume
+          mountPath: /root/.apoxy/config.yaml
+          subPath: config.yaml
+      volumes:
+      - name: kubeconfig-volume
+        secret:
+          secretName: trainy-kubeconfig
+      - name: apoxy-config-volume
+        configMap:
+          name: apoxy-config
+---
+# NetworkPolicy to allow Apoxy to reach services in other namespaces
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: apoxy-cross-namespace-access
+  namespace: apoxy-system
+spec:
+  podSelector:
+    matchLabels:
+      app: apoxy
+  policyTypes:
+  - Egress
+  egress:
+  # Allow all egress traffic
+  - {}
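
The Secret above ships with an `APOXY_AUTH` placeholder that, per the inline comment, buildkite CI replaces before the manifest is applied. A minimal sketch of what that substitution step could look like (file paths are illustrative; Kubernetes Secret `data` values must be base64-encoded):

```python
import base64
import pathlib

# Hypothetical CI step: splice the base64-encoded kubeconfig into the manifest
# in place of the APOXY_AUTH placeholder shown in the Secret above.
manifest = pathlib.Path("konduktor/manifests/apoxy-setup.yaml").read_text()
kubeconfig_b64 = base64.b64encode(pathlib.Path("kubeconfig.yaml").read_bytes()).decode()
pathlib.Path("apoxy-setup.rendered.yaml").write_text(
    manifest.replace("APOXY_AUTH", kubeconfig_b64)
)
```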

konduktor/manifests/apoxy-setup2.yaml
@@ -0,0 +1,98 @@
+# Apoxy Setup (Part 2/2) - Deployment Routing
+#
+# This file sets up Apoxy routing for both vLLM and general deployments:
+# 1. TunnelNode for secure tunnel connection
+# 2. Backend for vLLM pointing to Envoy Gateway
+# 3. HTTPRoute for company.trainy.us -> vLLM deployments
+# 4. Backend for general deployments pointing to nginx ingress
+# 5. HTTPRoute for company2.trainy.us -> general deployments
+# 6. KEDA proxy service for HTTP autoscaling
+# 7. 60s timeout for all requests
+#
+# Split into 2 files because:
+# - apoxy-setup.yaml: Core infrastructure (1 per cluster) (needs to be applied first)
+# - apoxy-setup2.yaml: All routing rules for both deployment types
+
+# NOTE: TunnelNode should technically be in the first apoxy-setup.yaml but it
+# needs to be created after the core infrastructure is created, so we put it here.
+apiVersion: core.apoxy.dev/v1alpha
+kind: TunnelNode
+metadata:
+  name: UNIQUE-TEMPNAME
+spec:
+  egressGateway:
+    enabled: true
+---
+# Backend for vLLM deployments
+apiVersion: core.apoxy.dev/v1alpha
+kind: Backend
+metadata:
+  name: UNIQUE-TEMPNAME-backend
+spec:
+  endpoints:
+  - fqdn: envoy-aibrix-system-aibrix-eg-903790dc.envoy-gateway-system.UNIQUE-TEMPNAME.tun.apoxy.net
+---
+# HTTPRoute for vLLM deployments
+apiVersion: gateway.apoxy.dev/v1
+kind: HTTPRoute
+metadata:
+  name: UNIQUE-TEMPNAME-route
+spec:
+  parentRefs:
+  - name: default
+    kind: Gateway
+    port: 443
+  hostnames:
+  - 'TEMPNAME.trainy.us'
+  rules:
+  - backendRefs:
+    - kind: Backend
+      name: UNIQUE-TEMPNAME-backend
+      port: 80
+    timeouts:
+      request: "60s"
+---
+# Backend for general deployments
+apiVersion: core.apoxy.dev/v1alpha
+kind: Backend
+metadata:
+  name: UNIQUE-TEMPNAME-backend2
+spec:
+  endpoints:
+  - fqdn: keda-ingress-nginx-controller.keda.UNIQUE-TEMPNAME.tun.apoxy.net
+---
+# HTTPRoute for general deployments
+apiVersion: gateway.apoxy.dev/v1
+kind: HTTPRoute
+metadata:
+  name: UNIQUE-TEMPNAME-route2
+spec:
+  parentRefs:
+  - name: default
+    kind: Gateway
+    port: 443
+  hostnames:
+  - 'TEMPNAME2.trainy.us'
+  rules:
+  - backendRefs:
+    - kind: Backend
+      name: UNIQUE-TEMPNAME-backend2
+      port: 80
+    timeouts:
+      request: "60s"
+
+# KEDA proxy service (1 per cluster) (For general deployments)
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: keda-proxy
+  namespace: default
+spec:
+  type: ExternalName
+  externalName: keda-add-ons-http-interceptor-proxy.keda
+  ports:
+  - name: http
+    port: 8080
+    protocol: TCP
+    targetPort: 8080
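
Once both files are applied and DNS resolves, the two HTTPRoutes can be smoke-tested from outside the cluster. A hedged sketch, keeping the `TEMPNAME`/`TEMPNAME2` placeholders from the manifest and assuming the first route fronts an OpenAI-compatible vLLM server:

```python
import requests

# Both routes configure a 60s request timeout (`timeouts.request` above).
vllm = requests.get("https://TEMPNAME.trainy.us/v1/models", timeout=60)  # vLLM route
general = requests.get("https://TEMPNAME2.trainy.us/", timeout=60)       # general route
print(vllm.status_code, general.status_code)
```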

konduktor/manifests/controller_deployment.yaml
@@ -0,0 +1,69 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: konduktor
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: konduktor-controller-sa
+  namespace: konduktor
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  namespace: konduktor
+  name: konduktor-controller-role
+rules:
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["get", "list", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  namespace: konduktor
+  name: konduktor-controller-rolebinding
+subjects:
+- kind: ServiceAccount
+  name: konduktor-controller-sa
+  namespace: konduktor
+roleRef:
+  kind: ClusterRole
+  name: konduktor-controller-role
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: konduktor-controller-deployment
+  namespace: konduktor
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: konduktor-controller
+  template:
+    metadata:
+      labels:
+        app: konduktor-controller
+    spec:
+      serviceAccountName: konduktor-controller-sa
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: nvidia.com/gpu
+                operator: DoesNotExist
+      containers:
+      - name: python
+        image: python:3.10
+        command: ["/bin/sh"]
+        args: ["-c", "pip install konduktor-nightly && python -m konduktor.controller.launch"]
+        ## define what namespaces to watch for errors, comma separated.
+        # env:
+        # - name: WATCHED_NAMESPACES
+        #   value: "default,othernamespace"
+        # - name: LOG_ENDPOINT
+        #   value: "http://loki.loki.svc.cluster.local:3100"

konduktor/manifests/dashboard_deployment.yaml
@@ -0,0 +1,131 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: konduktor-dashboard
+---
+# ServiceAccount
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: konduktor-service-account
+  namespace: konduktor-dashboard
+
+---
+# ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: node-access-role
+rules:
+- apiGroups: [""]
+  resources: ["services", "pods", "nodes", "pods/portforward", "namespaces"]
+  verbs: ["get", "list", "patch", "watch", "create"]
+
+# Add permissions to delete Kubernetes jobs
+- apiGroups: ["batch"] # For Kubernetes native Jobs
+  resources: ["jobs"]
+  verbs: ["get", "list", "delete", "patch"]
+
+# Add permissions to delete Kueue workloads
+- apiGroups: ["kueue.x-k8s.io"] # For Kueue workloads
+  resources: ["workloads"]
+  verbs: ["get", "list", "delete", "patch"]
+
+---
+# ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: node-access-binding
+subjects:
+- kind: ServiceAccount
+  name: konduktor-service-account # Must match the name of the service account
+  namespace: konduktor-dashboard
+roleRef:
+  kind: ClusterRole
+  name: node-access-role # Must match the ClusterRole name
+  apiGroup: rbac.authorization.k8s.io
+
+---
+# Backend Deployment + Service
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: backend
+  namespace: konduktor-dashboard
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: backend
+  template:
+    metadata:
+      labels:
+        app: backend
+    spec:
+      serviceAccountName: konduktor-service-account
+      containers:
+      - name: backend
+        image: ryanattrainy/konduktor-dashboard:backend1.16
+        imagePullPolicy: Always
+        ports:
+        - containerPort: 5001
+        command: ["/app/startup.sh"]
+        env:
+        - name: KONDUKTOR_DEBUG
+          value: "0" # Set debug mode: 1 (DEBUG) or 0
+        - name: LOGS_URL # Set loki logs URL
+          value: "http://loki.loki.svc.cluster.local:3100/loki/api/v1/query_range"
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: backend
+  namespace: konduktor-dashboard
+spec:
+  ports:
+  - name: backend-port
+    port: 5001
+    targetPort: 5001
+  selector:
+    app: backend
+
+---
+# Frontend Deployment + Service
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: frontend
+  namespace: konduktor-dashboard
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: frontend
+  template:
+    metadata:
+      labels:
+        app: frontend
+    spec:
+      containers:
+      - name: frontend
+        image: ryanattrainy/konduktor-dashboard:frontend1.16
+        imagePullPolicy: Always
+        ports:
+        - containerPort: 5173
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: frontend
+  namespace: konduktor-dashboard
+spec:
+  ports:
+  - name: frontend-port
+    port: 5173
+    targetPort: 5173
+  selector:
+    app: frontend
+
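
`LOGS_URL` points the dashboard backend at Loki's `query_range` endpoint. For reference, a minimal sketch of the request shape that endpoint expects (LogQL selector plus nanosecond timestamps; the selector here is only an example):

```python
import time

import requests

LOGS_URL = "http://loki.loki.svc.cluster.local:3100/loki/api/v1/query_range"
now_ns = int(time.time() * 1e9)
resp = requests.get(LOGS_URL, params={
    "query": '{namespace="default"}',   # example LogQL selector
    "start": now_ns - int(3600 * 1e9),  # last hour, in nanoseconds
    "end": now_ns,
    "limit": 100,
})
for stream in resp.json()["data"]["result"]:
    for ts, line in stream["values"]:   # [timestamp_ns, log line] pairs
        print(ts, line)
```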

konduktor/manifests/dmesg_daemonset.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: dmesg-logging
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: dmesg
+  namespace: dmesg-logging
+  labels:
+    k8s-app: dmesg-logging
+spec:
+  selector:
+    matchLabels:
+      name: dmesg
+  template:
+    metadata:
+      labels:
+        name: dmesg
+    spec:
+      tolerations:
+      # these tolerations let the daemonset run on GPU nodes, nodes marked
+      # faulty, and GKE queued-provisioning nodes; remove any that don't apply
+      - key: "nvidia.com/gpu"
+        operator: "Equal"
+        value: "present"
+        effect: "NoSchedule"
+      - key: "trainy.konduktor.ai/faulty"
+        operator: "Equal"
+        value: "true"
+        effect: "NoSchedule"
+      - key: "cloud.google.com/gke-queued"
+        operator: "Equal"
+        value: "true"
+        effect: "NoSchedule"
+      containers:
+      - name: dmesg
+        image: ubuntu:22.04
+        # privileged is required for running `dmesg`
+        securityContext:
+          privileged: true
+        command:
+        - sh
+        - -c
+        - >
+          dmesg -w
+        resources:
+          limits:
+            memory: 200Mi
+          requests:
+            cpu: 100m
+            memory: 200Mi
+      # it may be desirable to set a high priority class to ensure that a
+      # DaemonSet Pod preempts running Pods
+      # priorityClassName: important
+      terminationGracePeriodSeconds: 30
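
Since the DaemonSet just streams `dmesg -w` to stdout, per-node kernel logs are readable through the normal pod-log API. A small sketch using the kubernetes Python client, run from a machine with cluster credentials:

```python
from kubernetes import client, config

# Print the last few kernel log lines from each node's dmesg pod.
config.load_kube_config()
v1 = client.CoreV1Api()
for pod in v1.list_namespaced_pod("dmesg-logging", label_selector="name=dmesg").items:
    print(f"--- {pod.spec.node_name} ---")
    print(v1.read_namespaced_pod_log(pod.metadata.name, "dmesg-logging", tail_lines=20))
```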

konduktor/manifests/pod_cleanup_controller.yaml
@@ -0,0 +1,129 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: pod-cleanup-controller
+  namespace: default
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: pod-cleanup-controller
+  namespace: default
+rules:
+- apiGroups: [""]
+  resources: ["pods", "pods/status", "events"]
+  verbs: ["get", "list", "watch", "delete", "patch", "update"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: pod-cleanup-controller
+  namespace: default
+subjects:
+- kind: ServiceAccount
+  name: pod-cleanup-controller
+  namespace: default
+roleRef:
+  kind: Role
+  name: pod-cleanup-controller
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: pod-cleanup-controller
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: pod-cleanup-controller
+  template:
+    metadata:
+      labels:
+        app: pod-cleanup-controller
+    spec:
+      serviceAccountName: pod-cleanup-controller
+      containers:
+      - name: controller
+        image: python:3.10
+        command: ["/bin/sh", "-c"]
+        args: ["pip install kubernetes && echo 'starting controller' && python /app/controller.py"]
+        env:
+        - name: PYTHONUNBUFFERED
+          value: "1" # unbuffered output so controller prints show up in pod logs
+        volumeMounts:
+        - name: controller-code
+          mountPath: /app
+      volumes:
+      - name: controller-code
+        configMap:
+          name: pod-cleanup-controller-code
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: pod-cleanup-controller-code
+  namespace: default
+data:
+  controller.py: |
+    import time
+    from datetime import datetime
+
+    from kubernetes import client, config, watch
+
+    FAILURE_MODES = ['ErrImagePull', 'InvalidImageName']
+
+    def check_failure_mode(message):
+        for mode in FAILURE_MODES:
+            if mode in message:
+                return mode
+        return ''
+
+    def main():
+        # Load in-cluster config when running as a pod; fall back to local kubeconfig
+        try:
+            config.load_incluster_config()
+        except config.ConfigException:
+            config.load_kube_config()
+
+        v1 = client.CoreV1Api()
+
+        w = watch.Watch()
+        while True:
+            for event in w.stream(v1.list_namespaced_event, namespace="default"):
+                obj = event['object']
+                if obj.type != 'Warning' or obj.reason != 'Failed':
+                    continue
+                failure_mode = check_failure_mode(obj.message)
+                if not failure_mode:
+                    continue
+                pod_name = obj.involved_object.name
+                pod_namespace = obj.involved_object.namespace
+                print(f"Pod {pod_namespace}/{pod_name} has failed with {failure_mode}. Patching and deleting...")
+                try:
+                    # Get current time in UTC
+                    current_time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
+
+                    # Mark the pod with a ConfigIssue condition before deleting it
+                    body = {
+                        "status": {
+                            "conditions": [
+                                {
+                                    "type": "ConfigIssue",
+                                    "status": "True",
+                                    "reason": failure_mode,
+                                    "lastTransitionTime": current_time
+                                }
+                            ]
+                        }
+                    }
+
+                    # Patch pod status
+                    v1.patch_namespaced_pod_status(pod_name, pod_namespace, body)
+
+                    # Delete the pod
+                    v1.delete_namespaced_pod(pod_name, pod_namespace)
+                except Exception as e:
+                    print(f"Error handling pod: {e}")
+            print("Finished event stream... waiting for another stream...")
+            time.sleep(5)
+
+    if __name__ == '__main__':
+        main()
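
The matcher at the top of `controller.py` is plain substring matching over event messages, which is easy to sanity-check outside the cluster:

```python
# Standalone copy of the matcher from controller.py, with a few spot checks.
FAILURE_MODES = ['ErrImagePull', 'InvalidImageName']

def check_failure_mode(message: str) -> str:
    for mode in FAILURE_MODES:
        if mode in message:
            return mode
    return ''

assert check_failure_mode('Failed to pull image "x": ErrImagePull') == 'ErrImagePull'
assert check_failure_mode("InvalidImageName: couldn't parse reference") == 'InvalidImageName'
assert check_failure_mode('Back-off restarting failed container') == ''
```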