konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/logging.py
CHANGED
@@ -1,12 +1,22 @@
|
|
1
1
|
"""Logging utilities."""
|
2
2
|
|
3
|
+
import contextlib
|
3
4
|
import logging
|
4
5
|
import os
|
6
|
+
import threading
|
7
|
+
from datetime import datetime
|
5
8
|
|
6
9
|
import colorama
|
7
10
|
|
8
|
-
|
9
|
-
|
11
|
+
from konduktor import constants
|
12
|
+
|
13
|
+
CHECK_MARK_EMOJI = '\U00002714' # Heavy check mark unicode
|
14
|
+
PARTY_POPPER_EMOJI = '\U0001f389' # Party popper unicode
|
15
|
+
|
16
|
+
_FORMAT = '[%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
|
17
|
+
_DATE_FORMAT = '%m-%d %H:%M:%S'
|
18
|
+
|
19
|
+
_logging_config = threading.local()
|
10
20
|
|
11
21
|
|
12
22
|
class NewLineFormatter(logging.Formatter):
|
@@ -18,9 +28,9 @@ class NewLineFormatter(logging.Formatter):
|
|
18
28
|
|
19
29
|
def format(self, record):
    """Format *record*, repeating the log prefix on every continuation line.

    The record is first rendered by ``logging.Formatter.format``. When the
    message is non-empty, each newline in the rendered text is replaced by
    a carriage return + newline followed by whatever precedes the message
    in the rendered text. If ``self.dim`` is truthy, the result is wrapped
    in colorama's DIM / RESET_ALL style codes.
    """
    rendered = logging.Formatter.format(self, record)
    if record.message != '':
        # Text rendered ahead of the message is the per-line prefix.
        prefix, _, _ = rendered.partition(record.message)
        rendered = rendered.replace('\n', '\r\n' + prefix)
    if not self.dim:
        return rendered
    return colorama.Style.DIM + rendered + colorama.Style.RESET_ALL
|
@@ -29,10 +39,21 @@ class NewLineFormatter(logging.Formatter):
|
|
29
39
|
FORMATTER = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
|
30
40
|
|
31
41
|
|
42
|
+
@contextlib.contextmanager
def set_logging_level(logger: str, level: int):
    """Temporarily set the level of the named logger.

    Args:
        logger: Name of the logger, as accepted by ``logging.getLogger``.
        level: Logging level applied while the ``with`` body runs.

    Yields:
        None. The logger's previous level is restored on exit, including
        when the body raises.
    """
    # Keep the Logger object under its own name instead of rebinding the
    # ``str``-annotated parameter to a value of a different type.
    target = logging.getLogger(logger)
    original_level = target.level
    target.setLevel(level)
    try:
        yield
    finally:
        target.setLevel(original_level)
|
51
|
+
|
52
|
+
|
32
53
|
def get_logger(name: str):
|
33
54
|
# Determine the logging level based on the KONDUKTOR_DEBUG environment variable
|
34
55
|
log_level = logging.INFO
|
35
|
-
if os.environ.get(
|
56
|
+
if os.environ.get('KONDUKTOR_DEBUG', None) == '1':
|
36
57
|
log_level = logging.DEBUG
|
37
58
|
|
38
59
|
# Configure the logger
|
@@ -45,3 +66,26 @@ def get_logger(name: str):
|
|
45
66
|
logger.addHandler(ch)
|
46
67
|
logger.propagate = False
|
47
68
|
return logger
|
69
|
+
|
70
|
+
|
71
|
+
def is_silent():
    """Return the ``is_silent`` flag stored on ``_logging_config``.

    If the attribute has not been set yet it is initialised to ``False``
    first.
    """
    try:
        return _logging_config.is_silent
    except AttributeError:
        # The default is assigned lazily here rather than at module level:
        # a module-level assignment runs only once, in the thread that
        # imports the module, and would not be executed in other threads.
        _logging_config.is_silent = False
        return _logging_config.is_silent
|
79
|
+
|
80
|
+
|
81
|
+
def get_run_timestamp() -> str:
    """Return ``'konduktor-'`` followed by the current time.

    The time comes from ``datetime.now()`` and is formatted as
    ``YYYY-MM-DD-HH-MM-SS-ffffff`` (microsecond resolution).
    """
    stamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    return 'konduktor-' + stamp
|
83
|
+
|
84
|
+
|
85
|
+
def generate_tmp_logging_file_path(file_name: str) -> str:
    """Build a log file path inside a fresh per-run directory.

    The directory is ``constants.KONDUKTOR_LOGS_DIRECTORY`` joined with a
    new run timestamp; ``file_name`` is appended and ``~`` is expanded.
    Only the path string is computed here; nothing is created on disk.
    """
    run_dir = os.path.join(
        constants.KONDUKTOR_LOGS_DIRECTORY, get_run_timestamp()
    )
    return os.path.expanduser(os.path.join(run_dir, file_name))
|
@@ -26,6 +26,14 @@ spec:
|
|
26
26
|
operator: "Equal"
|
27
27
|
value: "present"
|
28
28
|
effect: "NoSchedule"
|
29
|
+
- key: "trainy.konduktor.ai/faulty"
|
30
|
+
operator: "Equal"
|
31
|
+
value: "true"
|
32
|
+
effect: "NoSchedule"
|
33
|
+
- key: "cloud.google.com/gke-queued"
|
34
|
+
operator: "Equal"
|
35
|
+
value: "true"
|
36
|
+
effect: "NoSchedule"
|
29
37
|
containers:
|
30
38
|
- name: dmesg
|
31
39
|
image: ubuntu:22.04
|
@@ -0,0 +1,129 @@
|
|
1
|
+
apiVersion: v1
|
2
|
+
kind: ServiceAccount
|
3
|
+
metadata:
|
4
|
+
name: pod-cleanup-controller
|
5
|
+
namespace: default
|
6
|
+
---
|
7
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
8
|
+
kind: Role
|
9
|
+
metadata:
|
10
|
+
name: pod-cleanup-controller
|
11
|
+
namespace: default
|
12
|
+
rules:
|
13
|
+
- apiGroups: [""]
|
14
|
+
resources: ["pods", "pods/status", "events"]
|
15
|
+
verbs: ["get", "list", "watch", "delete", "patch", "update"]
|
16
|
+
---
|
17
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
18
|
+
kind: RoleBinding
|
19
|
+
metadata:
|
20
|
+
name: pod-cleanup-controller
|
21
|
+
namespace: default
|
22
|
+
subjects:
|
23
|
+
- kind: ServiceAccount
|
24
|
+
name: pod-cleanup-controller
|
25
|
+
namespace: default
|
26
|
+
roleRef:
|
27
|
+
kind: Role
|
28
|
+
name: pod-cleanup-controller
|
29
|
+
apiGroup: rbac.authorization.k8s.io
|
30
|
+
---
|
31
|
+
apiVersion: apps/v1
|
32
|
+
kind: Deployment
|
33
|
+
metadata:
|
34
|
+
name: pod-cleanup-controller
|
35
|
+
namespace: default
|
36
|
+
spec:
|
37
|
+
replicas: 1
|
38
|
+
selector:
|
39
|
+
matchLabels:
|
40
|
+
app: pod-cleanup-controller
|
41
|
+
template:
|
42
|
+
metadata:
|
43
|
+
labels:
|
44
|
+
app: pod-cleanup-controller
|
45
|
+
spec:
|
46
|
+
serviceAccountName: pod-cleanup-controller
|
47
|
+
containers:
|
48
|
+
- name: controller
|
49
|
+
image: python:3.10
|
50
|
+
command: ["/bin/sh", "-c"]
|
51
|
+
args: ["pip install kubernetes && echo 'starting controller' && python /app/controller.py"]
|
52
|
+
env:
|
53
|
+
- name: PYTHONUNBUFFERED
|
54
|
+
value: "0"
|
55
|
+
volumeMounts:
|
56
|
+
- name: controller-code
|
57
|
+
mountPath: /app
|
58
|
+
volumes:
|
59
|
+
- name: controller-code
|
60
|
+
configMap:
|
61
|
+
name: pod-cleanup-controller-code
|
62
|
+
---
|
63
|
+
apiVersion: v1
|
64
|
+
kind: ConfigMap
|
65
|
+
metadata:
|
66
|
+
name: pod-cleanup-controller-code
|
67
|
+
namespace: default
|
68
|
+
data:
|
69
|
+
controller.py: |
|
70
|
+
from kubernetes import client, config, watch
|
71
|
+
from collections import defaultdict
|
72
|
+
from datetime import datetime
|
73
|
+
import time
|
74
|
+
|
75
|
+
# Substrings of event messages that identify an image-related pod failure.
FAILURE_MODES = ['ErrImagePull', 'InvalidImageName']


def check_failure_mode(message):
    """Return the first entry of FAILURE_MODES contained in *message*.

    Args:
        message: Event message text. May be ``None`` or empty, in which
            case no failure mode is reported.

    Returns:
        The matching failure-mode string, or ``''`` when nothing matches.
    """
    # A missing message would otherwise make the ``in`` test below raise
    # TypeError.
    if not message:
        return ''
    return next((mode for mode in FAILURE_MODES if mode in message), '')
|
82
|
+
|
83
|
+
def main():
    """Watch events in the "default" namespace and clean up failing pods.

    For every ``Warning`` event with reason ``Failed`` whose message
    matches one of FAILURE_MODES, the involved pod gets a ``ConfigIssue``
    status condition (with the detected failure mode as its reason) and
    is then deleted. Errors while patching or deleting a pod are printed
    and do not stop the loop. When an event stream ends, a new one is
    opened after a 5 second pause. This function never returns.
    """
    # Local import: the surrounding file only imports ``datetime`` itself.
    from datetime import timezone

    # Prefer in-cluster credentials; fall back to the local kubeconfig only
    # when the in-cluster configuration is unavailable.
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()

    v1 = client.CoreV1Api()

    w = watch.Watch()
    while True:
        for event in w.stream(v1.list_namespaced_event, namespace="default"):
            obj = event['object']
            if obj.type != 'Warning' or obj.reason != 'Failed':
                continue
            # Guard against a missing message so matching cannot raise.
            mode = check_failure_mode(obj.message or '')
            if not mode:
                continue

            pod_name = obj.involved_object.name
            pod_namespace = obj.involved_object.namespace
            print(f"Pod {pod_namespace}/{pod_name} has failed with {mode}. Patching and deleting...")
            try:
                # Current time in UTC, formatted as an RFC 3339 timestamp.
                current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

                # Status patch recording why the pod is being removed.
                body = {
                    "status": {
                        "conditions": [
                            {
                                "type": "ConfigIssue",
                                "status": "True",
                                "reason": mode,
                                "lastTransitionTime": current_time
                            }
                        ]
                    }
                }

                # Patch the pod status first, then delete the pod.
                v1.patch_namespaced_pod_status(pod_name, pod_namespace, body)
                v1.delete_namespaced_pod(pod_name, pod_namespace)
            except Exception as e:
                # Best effort: report the failure and keep watching.
                print(f"Error handling pod: {e}")
        print("Finished event stream... waiting for another stream...")
        time.sleep(5)
|
127
|
+
|
128
|
+
if __name__ == '__main__':
|
129
|
+
main()
|