konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/logging.py CHANGED
@@ -1,12 +1,22 @@
1
1
  """Logging utilities."""
2
2
 
3
+ import contextlib
3
4
  import logging
4
5
  import os
6
+ import threading
7
+ from datetime import datetime
5
8
 
6
9
  import colorama
7
10
 
8
- _FORMAT = "[%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
9
- _DATE_FORMAT = "%m-%d %H:%M:%S"
11
+ from konduktor import constants
12
+
13
+ CHECK_MARK_EMOJI = '\U00002714' # Heavy check mark unicode
14
+ PARTY_POPPER_EMOJI = '\U0001f389' # Party popper unicode
15
+
16
+ _FORMAT = '[%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
17
+ _DATE_FORMAT = '%m-%d %H:%M:%S'
18
+
19
+ _logging_config = threading.local()
10
20
 
11
21
 
12
22
  class NewLineFormatter(logging.Formatter):
@@ -18,9 +28,9 @@ class NewLineFormatter(logging.Formatter):
18
28
 
19
29
  def format(self, record):
20
30
  msg = logging.Formatter.format(self, record)
21
- if record.message != "":
31
+ if record.message != '':
22
32
  parts = msg.partition(record.message)
23
- msg = msg.replace("\n", "\r\n" + parts[0])
33
+ msg = msg.replace('\n', '\r\n' + parts[0])
24
34
  if self.dim:
25
35
  msg = colorama.Style.DIM + msg + colorama.Style.RESET_ALL
26
36
  return msg
@@ -29,10 +39,21 @@ class NewLineFormatter(logging.Formatter):
29
39
  FORMATTER = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
30
40
 
31
41
 
42
@contextlib.contextmanager
def set_logging_level(logger: str, level: int):
    """Temporarily set the level of the named logger.

    Args:
        logger: Name of the logger, as passed to ``logging.getLogger``.
        level: Logging level to apply while the ``with`` body runs.

    Yields:
        None. The level the logger had on entry is restored on exit,
        including when the body raises.
    """
    # Use a separate local name instead of rebinding the ``str`` parameter
    # to a ``Logger`` object, which confuses readers and type checkers.
    target = logging.getLogger(logger)
    original_level = target.level
    target.setLevel(level)
    try:
        yield
    finally:
        target.setLevel(original_level)
51
+
52
+
32
53
  def get_logger(name: str):
33
54
  # Determine the logging level based on the KONDUKTOR_DEBUG environment variable
34
55
  log_level = logging.INFO
35
- if os.environ.get("KONDUKTOR_DEBUG", None) == "1":
56
+ if os.environ.get('KONDUKTOR_DEBUG', None) == '1':
36
57
  log_level = logging.DEBUG
37
58
 
38
59
  # Configure the logger
@@ -45,3 +66,26 @@ def get_logger(name: str):
45
66
  logger.addHandler(ch)
46
67
  logger.propagate = False
47
68
  return logger
69
+
70
+
71
def is_silent():
    """Return the calling thread's ``is_silent`` flag, defaulting to False.

    The flag lives on the thread-local ``_logging_config`` object. It is
    initialised lazily here rather than at import time because a
    module-level assignment would only run in the importing thread and
    would be missing in every other thread.
    """
    try:
        return _logging_config.is_silent
    except AttributeError:
        # First access from this thread: install the default.
        _logging_config.is_silent = False
        return _logging_config.is_silent
79
+
80
+
81
def get_run_timestamp() -> str:
    """Return ``'konduktor-'`` followed by the current local time.

    The time is formatted as ``YYYY-MM-DD-HH-MM-SS-ffffff`` (microseconds).
    """
    stamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    return f'konduktor-{stamp}'
83
+
84
+
85
def generate_tmp_logging_file_path(file_name: str) -> str:
    """Build the path of a log file inside a fresh per-run directory.

    The directory is ``constants.KONDUKTOR_LOGS_DIRECTORY`` joined with a
    new run timestamp; a leading ``~`` in the result is expanded. Nothing
    is created on disk by this function.
    """
    stamp = get_run_timestamp()
    return os.path.expanduser(
        os.path.join(constants.KONDUKTOR_LOGS_DIRECTORY, stamp, file_name)
    )
@@ -26,6 +26,14 @@ spec:
26
26
  operator: "Equal"
27
27
  value: "present"
28
28
  effect: "NoSchedule"
29
+ - key: "trainy.konduktor.ai/faulty"
30
+ operator: "Equal"
31
+ value: "true"
32
+ effect: "NoSchedule"
33
+ - key: "cloud.google.com/gke-queued"
34
+ operator: "Equal"
35
+ value: "true"
36
+ effect: "NoSchedule"
29
37
  containers:
30
38
  - name: dmesg
31
39
  image: ubuntu:22.04
@@ -0,0 +1,129 @@
1
+ apiVersion: v1
2
+ kind: ServiceAccount
3
+ metadata:
4
+ name: pod-cleanup-controller
5
+ namespace: default
6
+ ---
7
+ apiVersion: rbac.authorization.k8s.io/v1
8
+ kind: Role
9
+ metadata:
10
+ name: pod-cleanup-controller
11
+ namespace: default
12
+ rules:
13
+ - apiGroups: [""]
14
+ resources: ["pods", "pods/status", "events"]
15
+ verbs: ["get", "list", "watch", "delete", "patch", "update"]
16
+ ---
17
+ apiVersion: rbac.authorization.k8s.io/v1
18
+ kind: RoleBinding
19
+ metadata:
20
+ name: pod-cleanup-controller
21
+ namespace: default
22
+ subjects:
23
+ - kind: ServiceAccount
24
+ name: pod-cleanup-controller
25
+ namespace: default
26
+ roleRef:
27
+ kind: Role
28
+ name: pod-cleanup-controller
29
+ apiGroup: rbac.authorization.k8s.io
30
+ ---
31
+ apiVersion: apps/v1
32
+ kind: Deployment
33
+ metadata:
34
+ name: pod-cleanup-controller
35
+ namespace: default
36
+ spec:
37
+ replicas: 1
38
+ selector:
39
+ matchLabels:
40
+ app: pod-cleanup-controller
41
+ template:
42
+ metadata:
43
+ labels:
44
+ app: pod-cleanup-controller
45
+ spec:
46
+ serviceAccountName: pod-cleanup-controller
47
+ containers:
48
+ - name: controller
49
+ image: python:3.10
50
+ command: ["/bin/sh", "-c"]
51
+ args: ["pip install kubernetes && echo 'starting controller' && python /app/controller.py"]
52
+ env:
53
+ - name: PYTHONUNBUFFERED
54
+ value: "0"
55
+ volumeMounts:
56
+ - name: controller-code
57
+ mountPath: /app
58
+ volumes:
59
+ - name: controller-code
60
+ configMap:
61
+ name: pod-cleanup-controller-code
62
+ ---
63
+ apiVersion: v1
64
+ kind: ConfigMap
65
+ metadata:
66
+ name: pod-cleanup-controller-code
67
+ namespace: default
68
+ data:
69
+ controller.py: |
70
+ from kubernetes import client, config, watch
71
+ from collections import defaultdict
72
+ from datetime import datetime
73
+ import time
74
+
75
# Substrings of an event message that identify an image-related failure.
FAILURE_MODES = ['ErrImagePull', 'InvalidImageName']


def check_failure_mode(message):
    """Return the first entry of FAILURE_MODES contained in ``message``.

    Args:
        message: Event message text. ``None`` or an empty string is
            treated as "no match" instead of raising ``TypeError``
            (the Kubernetes Event ``message`` field is optional).

    Returns:
        The matching failure-mode string, or ``''`` when none matches.
    """
    if not message:
        return ''
    return next((mode for mode in FAILURE_MODES if mode in message), '')
82
+
83
def main():
    """Watch events in the ``default`` namespace and clean up failing pods.

    For every ``Warning`` event with reason ``Failed`` whose message
    matches one of the known failure modes, the involved pod's status is
    patched with a ``ConfigIssue`` condition and the pod is deleted.
    The event stream is re-opened after a 5 second pause whenever it ends.
    This function never returns.
    """
    # Imported locally so this function does not rely on a change to the
    # script's top-level imports.
    from datetime import timezone

    # Prefer in-cluster credentials; fall back to a local kubeconfig only
    # when the in-cluster configuration is unavailable. Catching just the
    # client's ConfigException keeps KeyboardInterrupt/SystemExit intact.
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()

    v1 = client.CoreV1Api()

    w = watch.Watch()
    while True:
        for event in w.stream(v1.list_namespaced_event, namespace="default"):
            obj = event['object']
            if obj.type != 'Warning' or obj.reason != 'Failed':
                continue
            # ``message`` is optional on events; treat a missing one as empty.
            failure_mode = check_failure_mode(obj.message or '')
            if not failure_mode:
                continue

            pod_name = obj.involved_object.name
            pod_namespace = obj.involved_object.namespace
            # Report the failure mode that actually matched rather than
            # always claiming ErrImagePull.
            print(f"Pod {pod_namespace}/{pod_name} has failed with {failure_mode}. Patching and deleting...")
            try:
                # Current time in UTC, RFC 3339 style with a literal Z suffix
                current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

                # Create the status patch
                # NOTE(review): "reason" stays fixed at ErrImagePull even for
                # InvalidImageName, in case consumers match on it - confirm
                # whether it should carry the detected failure mode instead.
                body = {
                    "status": {
                        "conditions": [
                            {
                                "type": "ConfigIssue",
                                "status": "True",
                                "reason": "ErrImagePull",
                                "lastTransitionTime": current_time
                            }
                        ]
                    }
                }

                # Patch pod status
                v1.patch_namespaced_pod_status(pod_name, pod_namespace, body)

                # Delete the pod
                v1.delete_namespaced_pod(pod_name, pod_namespace)
            except Exception as e:
                # Best effort: a failure on one pod (e.g. it is already
                # gone) must not stop the controller loop.
                print(f"Error handling pod: {e}")
        print("Finished event stream... waiting for another stream...")
        time.sleep(5)
127
+
128
+ if __name__ == '__main__':
129
+ main()